<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import re
import spacy
import wikipedia
import warnings
import urllib 
import time
import json
import sqlite3
import requests
import mwparserfromhell


import pandas as pd
import wikitextparser as wtp

from spacy import displacy
from bs4 import BeautifulSoup
from tqdm import tqdm
from wasabi import msg
from collections import defaultdict   

warnings.filterwarnings('ignore')

In [2]:
class Term:
    """A term together with the title and summary looked up for it.

    Attributes:
        term_name: the term as it was recognised in the text.
        title: read-only; the stored title rendered as a string.
        summary: read-only; the stored summary rendered as a string.

    ``None`` titles/summaries are exposed as the empty string, matching the
    ``Term(term, '', '')`` convention used for terms without a page.
    """
    def __init__(self, term_name, title, summary):
        self.term_name = term_name
        # Kept under private names: ``title`` and ``summary`` are getter-only
        # properties, so assigning to ``self.title`` would raise AttributeError
        # and a getter reading ``self.title`` would recurse into itself.
        self._title = title
        self._summary = summary

    @property
    def title(self):
        """Return the title as a string ('' when no title was given)."""
        if self._title is None:
            return ''
        return '{}'.format(self._title)

    @property
    def summary(self):
        """Return the summary as a string ('' when no summary was given)."""
        if self._summary is None:
            return ''
        return '{}'.format(self._summary)

    def __repr__(self):
        return "Term({}: {}, {})".format(self.term_name, self.title, self.summary)
    
    
    
def insert_term(term):
    """Insert ``term`` as a new row of ``algodraftapp_wiki_info``.

    Args:
        term: object exposing ``term_name``, ``title`` and ``summary``.

    The target columns are named explicitly so the statement keeps working
    when the table has more than these three columns (the ``update_*`` helpers
    in this notebook write to ``hyponyms``, ``synonyms`` and ``hyperonyms``).
    Relies on the module-level ``conn`` and ``cursor``; the statement runs
    inside ``with conn``.
    """
    with conn:
        cursor.execute(
            "INSERT INTO algodraftapp_wiki_info (term_name, title, summary) "
            "VALUES (:term_name, :title, :summary)",
            {'term_name': term.term_name, 'title': term.title, 'summary': term.summary})

        
def remove_term_by_termname(termname):
    """Delete the rows of ``algodraftapp_wiki_info`` whose ``term_name`` is ``termname``.

    Uses the module-level ``conn`` and ``cursor``; the statement runs inside
    ``with conn``.
    """
    bindings = {'term_name': termname}
    statement = "DELETE from algodraftapp_wiki_info WHERE term_name = :term_name"
    with conn:
        cursor.execute(statement, bindings)

        
def remove_term_by_title(title):
    """Delete the rows of ``algodraftapp_wiki_info`` whose ``title`` is ``title``.

    Uses the module-level ``conn`` and ``cursor``; the statement runs inside
    ``with conn``.
    """
    bindings = {'title': title}
    statement = "DELETE from algodraftapp_wiki_info WHERE title = :title"
    with conn:
        cursor.execute(statement, bindings)


def update_summary(term_name, summary):
    """Set the ``summary`` column of the rows matching ``term_name``.

    Uses the module-level ``conn`` and ``cursor``; the statement runs inside
    ``with conn``.
    """
    bindings = {'summary': summary, 'term_name': term_name}
    with conn:
        cursor.execute("""UPDATE algodraftapp_wiki_info SET summary = :summary
                        WHERE term_name = :term_name""", bindings)

        
def update_title(term_name, title):
    """Set the ``title`` column of the rows matching ``term_name``.

    Uses the module-level ``conn`` and ``cursor``; the statement runs inside
    ``with conn``.
    """
    bindings = {'title': title, 'term_name': term_name}
    with conn:
        cursor.execute("""UPDATE algodraftapp_wiki_info SET title = :title
                        WHERE term_name = :term_name""", bindings)

        
def update_hyponyms(term_name, hyponyms):
    """Set the ``hyponyms`` column of the rows matching ``term_name``.

    Uses the module-level ``conn`` and ``cursor``; the statement runs inside
    ``with conn``.
    """
    bindings = {'hyponyms': hyponyms, 'term_name': term_name}
    with conn:
        cursor.execute("""UPDATE algodraftapp_wiki_info SET hyponyms = :hyponyms
                        WHERE term_name = :term_name""", bindings)
        
        
def update_synonyms(term_name, synonyms):
    """Set the ``synonyms`` column of the rows matching ``term_name``.

    Uses the module-level ``conn`` and ``cursor``; the statement runs inside
    ``with conn``.
    """
    bindings = {'synonyms': synonyms, 'term_name': term_name}
    with conn:
        cursor.execute("""UPDATE algodraftapp_wiki_info SET synonyms = :synonyms
                        WHERE term_name = :term_name""", bindings)

        
def update_hypernyms(term_name, hyperonyms):
    """Set the ``hyperonyms`` column of the rows matching ``term_name``.

    Note the column (and parameter) is spelled ``hyperonyms`` while the
    function name uses ``hypernyms``. Uses the module-level ``conn`` and
    ``cursor``; the statement runs inside ``with conn``.
    """
    bindings = {'hyperonyms': hyperonyms, 'term_name': term_name}
    with conn:
        cursor.execute("""UPDATE algodraftapp_wiki_info SET hyperonyms = :hyperonyms
                        WHERE term_name = :term_name""", bindings)
        

def get_term_by_termname(termname):
    """Return one row of ``algodraftapp_wiki_info`` for ``termname``.

    Returns whatever ``cursor.fetchone()`` yields after the SELECT, using the
    module-level ``cursor``.
    """
    query = "SELECT * FROM algodraftapp_wiki_info WHERE term_name= :term_name"
    cursor.execute(query, {'term_name': termname})
    row = cursor.fetchone()
    return row


def get_terms_by_title(title):
    """Return all rows of ``algodraftapp_wiki_info`` whose ``title`` is ``title``.

    Returns whatever ``cursor.fetchall()`` yields after the SELECT, using the
    module-level ``cursor``.
    """
    query = "SELECT * FROM algodraftapp_wiki_info WHERE title= :title"
    cursor.execute(query, {'title': title})
    rows = cursor.fetchall()
    return rows


def find_wiki_title(term):
    """
    Search Wikipedia for ``term`` through the ``wikipedia`` package and return
    the first result, or None when the search returns nothing.
    """
    results = wikipedia.search(term)
    return results[0] if results else None


def find_wiki_summary(term, nb_sent=2):
    """
    Fetch the first ``nb_sent`` sentences of the Wikipedia summary for ``term``
    through the ``wikipedia`` package and strip two whitespace/formula residues
    from it.

    Returns None when the package raises a ``WikipediaException`` (for example
    for an ambiguous term).
    """
    try:
        raw_summary = wikipedia.summary(term, sentences=nb_sent)
    except wikipedia.exceptions.WikipediaException:
        return None

    without_padding = raw_summary.replace('\n  \n    \n      \n        ', '')
    # NOTE(review): the pattern's `.` matches exactly one character, so only
    # single-character {\displaystyle x} residues are removed — confirm intent.
    return re.sub(r'\n      \n    \n    {\\displaystyle .}\n  ', '', without_padding)
    

def parse(title, API_URL, timeout=30):
    """Fetch the current wikitext of ``title`` from a MediaWiki API and parse it.

    Args:
        title: page title to look up (redirects are followed via ``redirects``).
        API_URL: ``api.php`` endpoint of the wiki to query.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the notebook indefinitely.

    Returns:
        The result of ``mwparserfromhell.parse`` on the main-slot content of
        the first returned revision.

    Raises:
        KeyError / IndexError: when the JSON response does not contain
            ``query -> pages[0] -> revisions[0] -> slots -> main -> content``
            (presumably the case for missing pages — callers catch KeyError).
        requests.exceptions.RequestException: on network failure or timeout.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "rvlimit": 1,
        "titles": title,
        "format": "json",
        "formatversion": "2",
        "redirects" : 1
    }
    headers = {"User-Agent": "My-Bot-Name/1.0"}
    response = requests.get(API_URL, headers=headers, params=params, timeout=timeout)
    payload = response.json()
    revision = payload["query"]["pages"][0]['revisions'][0]
    text = revision["slots"]["main"]["content"]
    return mwparserfromhell.parse(text)


def is_plural_or_initialism(ret1):
    """Return the base term when a definition line is only a "form of" pointer.

    Args:
        ret1: wikitext of one definition line.

    Returns:
        The second argument (leading ``|`` removed) of the first template whose
        text contains one of the form-of markers, or None when the line has no
        such template or that template has fewer than two arguments.

    The template is selected by its own text rather than by position, so a
    preceding template such as a label no longer supplies the wrong argument,
    and a marker that only appears in plain text no longer raises IndexError.
    """
    form_of_markers = ('plural of', 'initialism of', 'present participle of',
                       'alternative spelling of', 'abbreviation of', 'inflection of')

    # Cheap substring test first: most lines contain no marker at all.
    if not any(marker in ret1 for marker in form_of_markers):
        return None

    for template in wtp.parse(ret1).templates:
        if not any(marker in template.string for marker in form_of_markers):
            continue
        arguments = template.arguments
        if len(arguments) > 1:
            # Argument.string includes the leading '|'; drop it.
            return arguments[1].string[1:]
    return None


def sub_temps(description):
    """Replace label and link templates in ``description`` with their text.

    A template whose text contains ``lb`` becomes ``(<second argument>)`` and
    one whose text contains ``l|en`` becomes ``<second argument>``. The text is
    re-parsed after each pass so templates exposed by a substitution are
    handled as well.

    Returns:
        The rewritten wikitext. Templates that match neither rule (or have
        fewer than two arguments) are left in place.

    The loop stops as soon as a pass changes nothing; the previous recursive
    version returned None after recursing and never terminated when an
    unhandled template remained.
    """
    while True:
        templates = wtp.parse(description).templates
        if not templates:
            return description

        rewritten = description
        for temp in templates:
            arguments = temp.arguments
            if len(arguments) < 2:
                continue
            # Argument.string includes the leading '|'; drop it.
            inner_text = arguments[1].string[1:]
            if 'lb' in temp.string:
                rewritten = rewritten.replace(temp.string, f'({inner_text})')
            if 'l|en' in temp.string:
                rewritten = rewritten.replace(temp.string, f'{inner_text}')

        if rewritten == description:
            # No progress possible: only unhandled templates are left.
            return description
        description = rewritten


def get_description_wiktionary(term, _seen=None):
    """Return the first noun definition of ``term`` from English Wiktionary.

    Args:
        term: page title to look up.
        _seen: internal; terms already visited while following "plural of" /
            "initialism of" style pointers, used to stop redirect cycles.

    Returns:
        The first item of the first list in the first section titled "Noun" or
        "Proper noun", with templates substituted/stripped and a trailing ':'
        removed. None when the page cannot be fetched, has no such section or
        list, or when a form-of pointer leads back to a term already visited.
    """
    seen = set() if _seen is None else _seen
    seen.add(term)
    try:
        wikicode = parse(term, "https://en.wiktionary.org/w/api.php")
        parsed = wtp.parse(str(wikicode))

        # Without a noun section there is nothing to report; previously the
        # loop variable leaked and the last section of the page was used.
        noun_section = None
        for sec in parsed.sections:
            if str(sec.title).strip().lower() in ('noun', 'proper noun'):
                noun_section = sec
                break
        if noun_section is None:
            return None

        lists = noun_section.get_lists()
        if not lists or not lists[0].items:
            return None
        description = lists[0].items[0]

        # check if it is the plural or the initialism of another term
        base_term = is_plural_or_initialism(description)
        if base_term:
            if base_term in seen:
                return None
            return get_description_wiktionary(base_term, seen)

        description = sub_temps(description)
        description = wtp.parse(description).plain_text().strip()

        if description.endswith(':'):
            description = description[:-1]
        return description

    except (KeyError, IndexError):
        # KeyError: the API response had no revision for this title.
        # IndexError: a template/list on the page was shorter than expected.
        return None


def get_description_wikipedia(term):
    """Return the value of the page's "short description" template, if any.

    Args:
        term: Wikipedia page title to look up.

    Returns:
        The first positional parameter of the first template named
        "short description" (compared case-insensitively, since the template
        is commonly written ``{{Short description|...}}``), stripped of
        surrounding whitespace. None when the page cannot be fetched, has no
        such template, or the template has no first parameter.
    """
    try:
        wikicode = parse(term, "https://en.wikipedia.org/w/api.php")
        for temp in wikicode.filter_templates():
            if str(temp.name).strip().lower() == 'short description':
                # .value excludes a possible explicit "1=" key.
                return str(temp.get(1).value).strip()
    # `except KeyError or ValueError` only caught KeyError; a tuple catches both.
    except (KeyError, ValueError):
        return None
    return None
     
# Entity texts that text_to_json skips when collecting TERM entities.
to_remove = [
    'conductive material forms electrodes',
    'successful detection',
    'smaller ratio',
    'successful watermark detection',
    'unsupervised',
    'detected',
    'K',
    'm=2Ap',
    'knowledge of K=(KS',
    'depth log2(N',
    'images N',
    'cm2',
    'K.\n',
    'quantum code C;<br/',
    'codeword c(M',
    'by-letter encryption U(KS',
    'priority ranks',
]
        
# def text_to_html(text, nlp): # gives the html in BeautifulSoup format
#     doc = nlp(text)

#     html = displacy.render(doc, style="ent", options={"ents": ["TERM"]}, jupyter=False, page=True)
#     soup = BeautifulSoup(html)
#     marks =  soup.find_all('mark')
#     url = ''
    
#     for mark in tqdm(marks):
#         try:
#             term = mark.get_text(strip=True,separator=', ').split(', ')[0] # get the term annotated
#             if term in to_remove: continue
            
#             wiki_info = DICT_PAGE_TITLE[term] # get wikipedia pagetitle and summary from json file
#             url = f'https://en.wikipedia.org/wiki/{"_".join(wiki_info["title"].split())}' 
#             summary = wiki_info['summary']
            
#         except KeyError:
#             wiki_title = find_wiki_title(term)  
            
#             if wiki_title:
#                 url = f'https://en.wikipedia.org/wiki/{"_".join(wiki_title.split())}' 
#                 wiki_summary = find_wiki_summary(wiki_title)
#                 DICT_PAGE_TITLE.update({term:{'title':wiki_title, 'summary': wiki_summary}})
                    
#         link = soup.new_tag('a', href=url) # create the html tag for link       
#         mark.wrap(link) #add html tag <a> (the one to make links) to around our annotated word
#     return soup

# Shared lemmatizer instance used by term_lemmatized.
# NOTE(review): this import sits mid-notebook rather than in the import cell.
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

def term_lemmatized(term):
    """Lemmatize the last word of ``term`` and return the re-joined term.

    The term is split on whitespace, the final word is passed through
    ``wnl.lemmatize`` and the words are joined with single spaces (so a
    single-word term is simply its lemma).

    Returns:
        The lemmatized term, or '' when ``term`` is empty or whitespace only
        (this previously raised IndexError).
    """
    words = term.split()
    if not words:
        return ''
    words[-1] = wnl.lemmatize(words[-1])
    return ' '.join(words)


def _lookup_summary(term_lem, wiki_title):
    """Return the first description found for a term, trying each source in turn.

    Order: Wiktionary definition of ``term_lem``; Wikipedia short description
    of ``term_lem`` (also used when the Wiktionary text still contains '|');
    then, only if nothing was found and ``wiki_title`` is set, the Wiktionary
    definition, Wikipedia short description and Wikipedia abstract of
    ``wiki_title``. Returns a falsy value when every source comes back empty.
    """
    summary = get_description_wiktionary(term_lem)
    if not summary or '|' in summary:
        summary = get_description_wikipedia(term_lem)
    if not summary and wiki_title:
        summary = (get_description_wiktionary(wiki_title)
                   or get_description_wikipedia(wiki_title)
                   or find_wiki_summary(wiki_title))
    return summary


def text_to_json(text, nlp): # gives the ner results in json format
    """Run ``nlp`` over ``text`` and describe every recognised entity.

    Args:
        text: the text to analyse.
        nlp: callable returning a document whose ``ents`` expose ``label_``,
            ``text``, ``start_char`` and ``end_char``.

    Returns:
        A dict keyed by entity text. Entities labelled ``TERM`` map to
        ``{'label': 'TERM', 'position', 'wikilink', 'summary'}``; every other
        entity maps to ``{'label': 'ERROR', 'position'}``. ``position`` is a
        list of ``(start_char, end_char)`` tuples.

    Side effects:
        Terms not yet present in the database are looked up online and stored
        through ``insert_term``.
    """
    trailing_noise = ['.', '_', ';', '\n', ' ', ',']

    dict_position = defaultdict(list)
    dict_position_trigger = defaultdict(list)
    dict_res = {}

    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ != 'TERM': # trigger word
            dict_position_trigger[ent.text].append((ent.start_char, ent.end_char))
            continue

        term = ent.text
        # Track the trimmed end offset locally instead of writing to
        # ent.end_char, which would modify the entity held by the document.
        end_char = ent.end_char
        try:
            # Trim one trailing character at a time while the term starts with
            # '.' or ends with noise; all-uppercase terms longer than four
            # characters are left untouched.
            # NOTE(review): a leading '.' also trims from the END, so such a
            # term shrinks to '' and is dropped via IndexError — confirm intent.
            while ((term[0] == '.' or term[-1] in trailing_noise)
                   and end_char > 0
                   and (not term.isupper() or len(term) <= 4)):
                end_char -= 1
                term = term[:-1]
        except IndexError: # the term was trimmed down to nothing
            continue
        if term in to_remove:
            continue
        dict_position[term].append((ent.start_char, end_char))

    dict_position = dict(dict_position)

    msg.info("Analysing errors...")
    for error, pos_l in tqdm(dict_position_trigger.items()): # only for ERROR
        dict_res[error] = {'label': 'ERROR', 'position': pos_l}
    msg.good("Done!")

    msg.info("Analysing terms...")
    cnt = 0
    for term, pos_l in tqdm(dict_position.items()): # only for TERM

        cached_row = get_term_by_termname(term) # row layout: (term_name, title, summary, ...)
        if cached_row: # already in the database
            url = f'https://en.wikipedia.org/wiki/{"_".join(cached_row[1].split())}'
            wiki_summary = cached_row[2]
        else:
            term_lem = term_lemmatized(term)
            wiki_title = find_wiki_title(term_lem)

            try:
                wiki_summary = _lookup_summary(term_lem, wiki_title)

                # Nothing found for the full term: drop leading words one at a
                # time and retry with the shorter term.
                remaining_words = term_lem.split()
                remaining_words.pop(0)
                while not (wiki_summary and wiki_title) and remaining_words:
                    term_lem = ' '.join(remaining_words)
                    wiki_title = find_wiki_title(term_lem)
                    wiki_summary = _lookup_summary(term_lem, wiki_title)
                    remaining_words.pop(0)

            except KeyError: # for terms of ERROR
                continue

            if wiki_title:
                url = f'https://en.wikipedia.org/wiki/{"_".join(wiki_title.split())}'
                # insert new term into database
                insert_term(Term(term, wiki_title, wiki_summary))
            else: # fall back to the wikipedia main page
                url = 'https://www.wikipedia.org'
                insert_term(Term(term, '', ''))
            cnt += 1

        dict_res[term] = {'label': 'TERM', 'position': pos_l, 'wikilink': url, 'summary': wiki_summary}
    msg.good("Done!")

    if cnt:
        msg.good(f"Found wikipedia information for {cnt} new terms.")

    return dict_res

In [3]:
# Open the SQLite database file and create the cursor; the database helper
# functions defined above read these two module-level names.
conn = sqlite3.connect('./wiki_info.db')
cursor = conn.cursor()

In [4]:
# Read the claims text as UTF-8 and drop the <p> / </p> tags.
with open('./claims.txt', encoding = 'utf-8', mode='r') as f:
    claims = f.read().replace('<p>', '').replace('</p>','')

In [6]:
# Load the spaCy pipeline from the local model directory (path is relative to this notebook).
nlp = spacy.load(r"../03_spaCy_ner/output/G_2018/model-last/") 

In [7]:
# open('./demo.html', 'w', encoding="utf-8").write(str(text_to_html(claims,nlp))) #create file with html data

In [8]:
# Save results: run text_to_json over the claims text and write the
# resulting dict to demo.json with 4-space indentation.
with open('demo.json', "w") as f: 
    json.dump(text_to_json(claims, nlp), f, indent = 4) # test with the first patent 

100%|██████████| 6/6 [00:00<00:00, 22290.37it/s]
 25%|██▍       | 160/653 [00:00<00:00, 1596.60it/s]

[38;5;4mℹ Analysing errors...[0m
[38;5;2m✔ Done![0m
[38;5;4mℹ Analysing terms...[0m


100%|██████████| 653/653 [00:00<00:00, 2308.53it/s]


[38;5;2m✔ Done![0m
