<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
from collections import Counter
import nltk
import spacy
import re
import sqlite3
from nltk.corpus import cmudict  # >>> nltk.download('cmudict')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


# Shared SQLite connection and cursor; get_term_by_termname() runs its
# lookups through this module-level cursor.
conn = sqlite3.connect('./wiki_info.db')
cursor = conn.cursor()

# spaCy pipeline loaded from a local model directory; getContentLemmas()
# keeps only the entities it labels 'TERM'.
nlp = spacy.load(r"../03_spaCy_ner/output/G_2018/model-last/") 


# WordNet lemmatizer used by term_lemmatized().
wnl = WordNetLemmatizer()


def starts_with_vowel_sound(word, pronunciations=cmudict.dict()):
    """Tell whether *word* starts with a vowel sound according to CMUdict.

    Args:
        word: The word to look up.
        pronunciations: Mapping from word to a list of pronunciations, each
            pronunciation being a list of phoneme strings.  The default is
            built once, when the function is defined, so the pronouncing
            dictionary is loaded a single time; it is only read here.

    Returns:
        True or False when a pronunciation is known (only the first listed
        pronunciation is consulted), or None when the word is not in the
        dictionary, so that callers can fall back to another heuristic.
    """
    # Try the spelling as given first (keeps caller-supplied mappings with
    # mixed-case keys working), then the lower-cased form: NLTK's CMUdict
    # keys are lower-case, so capitalised terms would otherwise never match.
    variants = pronunciations.get(word) or pronunciations.get(word.lower())
    if not variants:
        return None
    first_phoneme = variants[0][0]
    # In CMUdict's phoneme notation only vowels end in a stress digit.
    return first_phoneme[-1].isdigit()

def get_term_by_termname(termname):
    """Look up *termname* in the ``algodraftapp_wiki_info`` table.

    The query runs on the module-level ``cursor`` with a bound parameter.

    Returns:
        The first row whose ``term_name`` equals *termname*, or None when
        no row matches.
    """
    query = "SELECT * FROM algodraftapp_wiki_info WHERE term_name= :term_name"
    params = {'term_name': termname}
    # Cursor.execute() returns the cursor itself, so the fetch can be chained.
    return cursor.execute(query, params).fetchone()

def is_uncountable(word):
    """Return True when the stored entry for *word* is marked '(uncountable)'.

    The marker is searched for in the third column of the row returned by
    get_term_by_termname(); a word with no row is reported as countable.
    """
    row = get_term_by_termname(word)
    if not row:
        return False
    return '(uncountable)' in row[2]
    
def term_lemmatized(term):
    """Lemmatize the last word of *term* and return the re-joined term.

    The term is split on whitespace, so the result always has its words
    separated by single spaces.  Only the final word is passed to the
    WordNet lemmatizer; any preceding words are kept as they are.

    Raises:
        IndexError: if *term* contains no words (empty or whitespace only).
    """
    words = term.split()
    # Indexing before touching the lemmatizer keeps the IndexError that
    # callers rely on for blank terms.
    last_word = words[-1]
    words[-1] = wnl.lemmatize(last_word)
    return ' '.join(words)
    
def get_article(word):
    """Choose the capitalised article (with trailing space) for *word*.

    Returns:
        'The ' when the word is recorded as uncountable; otherwise 'An ' or
        'A ' depending on whether the word starts with a vowel sound.  When
        no pronunciation is known, the decision falls back to whether the
        first letter is one of a, e, i, o, u.
    """
    if is_uncountable(word):
        return 'The '

    vowel_sound = starts_with_vowel_sound(word)
    if vowel_sound is None:
        # No pronunciation available: use the spelling instead.
        vowel_sound = word[0].lower() in "aeiou"
    return 'An ' if vowel_sound else 'A '
    
# Lead-in shared by the generated definition sentences.
_DEFINITION_LEAD_IN = ' is a and/or means and/or designates one or more of:'


def _lowercase_first_word(text):
    """Lower-case the first word of *text* when that word is title-cased."""
    words = text.split()
    if words and words[0].istitle():
        return text.replace(words[0], words[0].lower(), 1)
    return text


def _wiktionary_definition(term_summary):
    """Format a summary that opens with a parenthesised sense label."""
    sense, closing, term_def = term_summary[1:].partition(')')
    if not closing or not sense:
        # Unbalanced parenthesis or empty label: there is no usable sense,
        # so present the summary as it stands.
        return f'{_DEFINITION_LEAD_IN} {term_summary}'
    # Everything after the first ')' is the definition; it is not cut short
    # at a later ')' inside the definition text.
    term_def = _lowercase_first_word(term_def)
    return f' (in the sense of {sense}){_DEFINITION_LEAD_IN}{term_def}'


def _wikipedia_definition(term_summary):
    """Format a free-text summary; expects at least one non-blank character."""
    term_summary = term_summary.replace('\n', '')
    fst_word = term_summary.split()[0]
    if fst_word in ('A', 'The'):
        # "The X is a Y ..." -> keep only " a Y ...".  The word boundary
        # stops the pattern from matching inside words such as "This a...".
        match = re.search(r'\bis a', term_summary)
        if match:
            return f'{_DEFINITION_LEAD_IN}{term_summary[match.start() + 2:]}'
        # NOTE(review): this path returns the summary without the usual
        # lead-in, as the original code did -- confirm that is intended.
        return term_summary.replace(fst_word, fst_word.lower(), 1)
    return f'{_DEFINITION_LEAD_IN} {_lowercase_first_word(term_summary)}'


def MWE_definition(MWE):
    """Build the definition text for a multi-word expression or acronym.

    The stored row for *MWE* is fetched with get_term_by_termname().  Column
    2 holds the summary; per the original comments it comes either from
    Wiktionary (optionally starting with a "(sense)" label) or from
    Wikipedia.  When the summary is blank, column 5 is used instead
    (presumably a short description -- confirm against the table schema).

    Args:
        MWE: The term name to look up.

    Returns:
        The definition fragment to append after the term, or None when the
        term is unknown or has no usable text.
    """
    term_query = get_term_by_termname(MWE)
    if not term_query:
        return None

    # A NULL summary is treated like an empty one; countability markers are
    # not part of the definition text.
    term_summary = (term_query[2] or '').replace('(uncountable) ', '').replace('(countable) ', '')

    if not term_summary.strip():
        fallback = term_query[5]
        if fallback:
            return f'{_DEFINITION_LEAD_IN} {fallback}'
        return None

    if term_summary[0] == '(':
        return _wiktionary_definition(term_summary)
    return _wikipedia_definition(term_summary)

    
def getContentLemmas(text, nlp):
    """Count the lemmatized 'TERM' entities that *nlp* finds in *text*.

    Args:
        text: The text to analyse.
        nlp: Callable applied to *text*; its result must expose ``ents``,
            whose items have ``text`` and ``label_`` attributes.

    Returns:
        A Counter mapping each retained term to its number of occurrences.

    Terms found in the stop list below are skipped, both before and after
    clean-up.  Clean-up removes trailing punctuation and whitespace one
    character at a time, then lemmatizes the last word of the term.  A term
    whose first character is '.' is also shortened from the right until
    nothing is left, which means it is dropped altogether.
    """
    uselesslemmas = {
        'a', 'about', 'accord', 'accordance', 'according', 'act', 'activity',
        'adapt', 'adjustment', 'amount', 'an', 'and', 'another', 'any',
        'apparatus', 'are', 'article', 'article of manufacture', 'as',
        'assembly', 'associate', 'at', 'base', 'basis', 'be', 'between',
        'business method', 'by', 'cell', 'characterize', 'chemical',
        'circuit', 'claim', 'claims', 'co', 'communication', 'composition',
        'compound', 'compounds', 'comprise', 'computer',
        'computer program product', 'computer-implemented method',
        'computer-readable medium comprising instructions which when executed by a computer perform a method',
        'consist', 'consists', 'control', 'couple', 'crystal', 'data',
        'data structure', 'datum', 'define', 'denote', 'design',
        'device of claim', 'device', 'different', 'display', 'each',
        'element', 'end', 'equal', 'first', 'fluid', 'follow', 'for', 'form',
        'forth', 'fourth', 'from', 'further', 'gene', 'great', 'greater',
        'group', 'have', 'in', 'include', 'inferior', 'information', 'last',
        'least', 'less', 'machine', 'manner', 'manufacture', 'material',
        'may', 'mechanism', 'medical device', 'medium', 'member', 'memory',
        'method', 'method of claim', 'mixture', 'more', 'non', 'object',
        'obtain', 'obtainable', 'of', 'on', 'one', 'operate', 'optionally',
        'or', 'ornamental', 'other', 'part', 'plant', 'plurality',
        'printed matter', 'process', 'processor', 'product',
        'product-by-process', 'profile', 'range', 'recite', 'record',
        'respect', 'respective', 'said', 'same', 'say', 'second', 'select',
        'set', 'signal', 'smaller', 'so', 'step', 'steps', 'storage',
        'substance', 'substitute', 'such', 'superior', 'system',
        'system of claim', 'take', 'than', 'that', 'the', 'thereof',
        'thereon', 'third', 'to', 'two', 'unit', 'use', 'user interface',
        'value', 'voice', 'way', 'when', 'wherein', 'which', 'window', 'with',
    }
    trailing_junk = ('.', '_', ';', '\n', ' ', ',')

    def needs_trim(candidate):
        # Raises IndexError once the candidate has been emptied.
        if candidate[0] == '.':
            return True
        if candidate[-1] not in trailing_junk:
            return False
        # All-uppercase terms longer than four characters keep their tail.
        return not candidate.isupper() or len(candidate) <= 4

    counts = Counter()
    doc = nlp(text)
    for raw_term in [ent.text for ent in doc.ents if ent.label_ == 'TERM']:
        if raw_term in uselesslemmas:
            continue
        term = raw_term
        try:
            while needs_trim(term):
                term = term[:-1]
            term = term_lemmatized(term)
        except IndexError:
            # Nothing usable was left of the term.
            continue
        if term not in uselesslemmas:
            counts[term] += 1

    return counts



def makeDefinitions(lems):
    """Turn counted terms into ``(prefix, term, definition)`` triples.

    Args:
        lems: A Counter of terms; they are processed from most to least
            frequent.

    Returns:
        A list of 3-tuples of strings (the original comment describes them
        as "normal bold normal").  Terms in the ignore list are skipped.
        Multi-word or all-uppercase terms are defined through
        MWE_definition().  Any other term is looked up in WordNet and may
        yield up to three triples: one for its noun senses, one for its
        verb senses, and one for its adjective/adverb senses.  Synsets with
        any other part-of-speech tag are ignored.
    """
    to_ignore = {
        'conductive material forms electrodes',
        'successful detection',
        'smaller ratio',
        'successful watermark detection',
        'unsupervised',
        'detected',
        'K',
        'm=2Ap',
        'knowledge of K=(KS',
        'depth log2(N',
        'images N',
        'cm2',
        'K.\n',
        'quantum code C;<br/',
        'codeword c(M',
        'by-letter encryption U(KS',
        'priority ranks',
    }

    def other_names(ranked, headword):
        # Lemma names in ranked order, underscores shown as spaces, without
        # the headword itself.
        return ', '.join([name.replace('_', ' ') for name, _ in ranked if name != headword])

    definitions = []

    for lem, _count in lems.most_common():
        if lem in to_ignore:
            continue

        if len(lem.split()) > 1 or lem.isupper():
            definition = MWE_definition(lem)
            if definition:
                definitions.append((get_article(lem.split()[0]), lem, definition))
            continue

        syns = wordnet.synsets(lem)
        if not syns:
            # Retry once with a derivational suffix removed; the shortened
            # form is also the one reported in the output.
            for suffix in ('able', 'ly'):
                if lem.endswith(suffix):
                    lem = lem[:-len(suffix)]
                    syns = wordnet.synsets(lem)
                    break

        # Lemma names and glosses grouped as noun / verb / adjective+adverb.
        names = {'n': [], 'v': [], 'a': []}
        glosses = {'n': [], 'v': [], 'a': []}
        for syn in syns:
            pos = syn.pos()
            if pos == 'n' or pos == 'v':
                group = pos
            elif pos in 'ar':
                group = 'a'
            else:
                continue
            names[group].extend(syn.lemma_names())
            glosses[group].append(syn.definition())

        ns = Counter(names['n']).most_common()
        vs = Counter(names['v']).most_common()
        ars = Counter(names['a']).most_common()

        if ns:
            noun_sense = '(in the sense of ' + other_names(ns, lem) + ')' if len(ns) > 1 else ''
            definitions.append((
                get_article(lem),
                lem,
                " {inthesenseof} is a and/or means and/or designates one or more of: {defs}.".format(
                    inthesenseof=noun_sense, defs='; '.join(glosses['n'])),
            ))
        if vs:
            verb_sense = ' (in the sense of ' + other_names(vs, lem) + ') ' if len(vs) > 1 else ' '
            definitions.append((
                "In some embodiments the verb to ",
                lem,
                "{inthesenseof}can be replaced by: {defs}.".format(
                    inthesenseof=verb_sense, defs='; '.join(glosses['v'])),
            ))
        if ars:
            adj_sense = '(in the sense of ' + other_names(ars, lem) + ')' if len(ars) > 1 else ''
            definitions.append((
                "",
                lem.title(),
                " {inthesenseof} means: {defs}.".format(
                    inthesenseof=adj_sense, defs='; '.join(glosses['a'])),
            ))

    return definitions

In [2]:
# Read the claims file, then drop the paragraph tags around each claim.
with open('./claims.txt', mode='r', encoding='utf-8') as f:
    claims = f.read()
claims = claims.replace('<p>', '').replace('</p>', '')

In [3]:
# Count the lemmatized TERM entities found in the claims text.
lem = getContentLemmas(claims, nlp)
# Left as a bare expression so that the notebook displays the resulting
# list of (prefix, term, definition) triples as the cell output.
makeDefinitions(lem)

[('A ',
  'sensor',
  ' (in the sense of detector, sensing element) is a and/or means and/or designates one or more of: any device that receives a signal or stimulus (as heat or pressure or light or motion etc.) and responds to it in a distinctive manner.'),
 ('A ',
  'host content',
  ' is a and/or means and/or designates one or more of: service for hosting websites'),
 ('A ',
  'KS',
  ' is a and/or means and/or designates one or more of: kyrgyzstan'),
 ('A ',
  'network',
  ' (in the sense of web, net, mesh, meshing, meshwork, electronic network) is a and/or means and/or designates one or more of: an interconnected system of things or people; (broadcasting) a communication system consisting of a group of broadcasting stations that all transmit the same programs; an open fabric of string or rope or wire woven together at regular intervals; a system of intersecting lines or channels; (electronics) a system of interconnected electronic components or circuits.'),
 ('In some embodiments 