<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import re
import spacy
import wikipedia
import warnings
import urllib 
import time
import json
import sqlite3
import requests
import mwparserfromhell


import pandas as pd
import wikitextparser as wtp

from spacy import displacy
from bs4 import BeautifulSoup
from tqdm import tqdm
from wasabi import msg
from collections import defaultdict   

warnings.filterwarnings('ignore')

In [2]:
class Term:
    """A recognised term with its Wikipedia page title and short summary.

    Attributes:
        term_name: the term as it appears in the analysed text.
        title: title of the matching Wikipedia page, or None if none is known.
        summary: short description of the term, or None if none is known.
    """

    def __init__(self, term_name, title, summary):
        self.term_name = term_name
        # Plain attributes on purpose: the former read-only `title`/`summary`
        # properties made these assignments raise AttributeError, and their
        # getters returned `self.title`/`self.summary`, i.e. called themselves.
        self.title = title
        self.summary = summary

    def __repr__(self):
        return "Term({}: {}, {})".format(self.term_name, self.title, self.summary)
    
    
    
def insert_term(term):
    """Add `term` as a new row of the `data` table.

    Uses the module-level `conn` and `cursor`; the statement runs inside
    `with conn`, which commits on success and rolls back on an exception.
    """
    with conn:
        row = {
            'term_name': term.term_name,
            'title': term.title,
            'summary': term.summary,
        }
        cursor.execute("INSERT INTO data VALUES (:term_name, :title, :summary)", row)

        
def remove_term_by_termname(termname):
    """Delete every row of `data` whose term_name equals `termname`.

    Uses the module-level `conn` and `cursor`.
    """
    bindings = {'term_name': termname}
    with conn:
        cursor.execute("DELETE from data WHERE term_name = :term_name", bindings)

        
def remove_term_by_title(title):
    """Delete every row of `data` whose title equals `title`.

    Uses the module-level `conn` and `cursor`.
    """
    bindings = {'title': title}
    with conn:
        cursor.execute("DELETE from data WHERE title = :title", bindings)


def update_summary(term_name, summary):
    """Set the summary of the rows of `data` whose term_name equals `term_name`.

    Uses the module-level `conn` and `cursor`.
    """
    bindings = {'term_name': term_name, 'summary': summary}
    with conn:
        cursor.execute("""UPDATE data SET summary = :summary
                        WHERE term_name = :term_name""", bindings)

        
def update_title(term_name, title):
    """Set the title of the rows of `data` whose term_name equals `term_name`.

    Uses the module-level `conn` and `cursor`.
    """
    bindings = {'term_name': term_name, 'title': title}
    with conn:
        cursor.execute("""UPDATE data SET title = :title
                        WHERE term_name = :term_name""", bindings)
        

def get_term_by_termname(termname):
    """Return the first row of `data` whose term_name equals `termname`.

    Uses the module-level `cursor`; the result is whatever `fetchone()` yields.
    """
    query = "SELECT * FROM data WHERE term_name= :term_name"
    cursor.execute(query, {'term_name': termname})
    return cursor.fetchone()


def get_terms_by_title(title):
    """Return all rows of `data` whose title equals `title`.

    Uses the module-level `cursor`; the result is whatever `fetchall()` yields.
    """
    query = "SELECT * FROM data WHERE title= :title"
    cursor.execute(query, {'title': title})
    return cursor.fetchall()



def find_wiki_title(term):
    """
    Search Wikipedia for `term` through the wikipedia package and return the
    first search hit, or None when the search comes back empty
    """
    hits = wikipedia.search(term)
    return hits[0] if hits else None


def find_wiki_summary(term):
    """
    Return the Wikipedia summary of `term` cut at its first '.', with the '.'
    put back at the end. Returns None when the wikipedia package raises a
    WikipediaException for the lookup (e.g. an ambiguous term)
    """
    try:
        first_part = wikipedia.summary(term).partition('.')[0]
        return first_part + '.'
    except wikipedia.exceptions.WikipediaException:
        return None
    

def parse(title, API_URL, timeout=10):
    """Fetch the wikitext of a page from a MediaWiki API and parse it.

    Args:
        title: page title sent as the `titles` query parameter.
        API_URL: URL of the wiki's `api.php` endpoint.
        timeout: seconds `requests` waits for the server before giving up.
            Without a timeout a stalled connection blocks the caller forever.

    Returns:
        The result of `mwparserfromhell.parse` on the revision content.

    Raises:
        KeyError: when the JSON answer lacks one of the keys indexed below
            (callers in this file treat that as "no description found").
        requests.exceptions.RequestException: on network failure or timeout.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "rvlimit": 1,
        "titles": title,
        "format": "json",
        "formatversion": "2",
        "redirects" : 1
    }
    headers = {"User-Agent": "My-Bot-Name/1.0"}
    req = requests.get(API_URL, headers=headers, params=params, timeout=timeout)
    res = req.json()
    # Only the first page and its first revision are read.
    revision = res["query"]["pages"][0]['revisions'][0]
    text = revision["slots"]["main"]["content"]
    return mwparserfromhell.parse(text)


def is_plural_or_initialism(ret1):
    """Return the text following a leading "... of" marker in `ret1`.

    The recognised markers are 'plural of', 'initialism of',
    'present participle of' and 'alternative spelling of'. The marker and the
    single character after it are dropped. Returns None when `ret1` starts
    with none of them.
    """
    markers = ('plural of', 'initialism of', 'present participle of', 'alternative spelling of')
    for marker in markers:
        if ret1.startswith(marker):
            return ret1[len(marker) + 1:]
    return None


def get_description_wiktionary(term):
    """Build a one-line description of `term` from its English Wiktionary page.

    The lower-cased term is fetched with `parse`. The description is taken
    from the first item of the first list of the first section titled 'Noun'
    or 'Proper noun'. When the page has no such section, the loop leaves `sec`
    on the last section and that one is used instead.

    Templates of the item are rendered into a prefix:
      * templates containing 'lb' become "(<second argument>) ",
      * templates containing 'defdate' are dropped,
      * every other template is flattened to plain text.
    If the prefix says the term is a form of another term (see
    `is_plural_or_initialism`), that other term is looked up instead.

    Returns:
        The description string, or None when the page or the expected
        section/list/template structure is missing.
    """
    term = term.lower()
    try:
        wikicode = parse(term, "https://en.wiktionary.org/w/api.php")
        parsed = wtp.parse(str(wikicode))

        for sec in parsed.sections:
            if sec.title in ['Noun', 'Proper noun'] :
                break
        description = sec.get_lists()[0].items[0]

        ret1 = []
        for temp in wtp.parse(description).templates:
            if 'lb' in temp.string:
                # second argument without its leading '|'
                ret1.append('('+temp.arguments[1].string[1:]+') ')
            elif 'defdate' in temp.string:
                pass
            else:
                ret1.append(temp.string.replace('|en|', ' ').strip("{").strip("}").replace('[','').replace(']',''))
        ret1 = ' '.join(ret1)

        # check if it is the plural or the initialism of another term;
        # a definition pointing back at the same term would recurse forever,
        # so it is only followed when it names a different term
        check = is_plural_or_initialism(ret1)
        if check and check.lower() != term:
            return get_description_wiktionary(check)

        ret2 = wtp.parse(description).plain_text().strip()
        if ret2.endswith(':'):
            ret2 = ret2[:-1]
        # an empty `ret2` leaves just the template prefix
        return ret1 + ret2

    # KeyError: page not found by `parse`.
    # IndexError: section without a list, list without items, or an 'lb'
    # template with fewer than two arguments. Previously uncaught, which
    # aborted the caller's whole loop.
    except (KeyError, IndexError):
        return None


def get_description_wikipedia(term):
    """Return the "short description" of the Wikipedia page titled `term`.

    The page is fetched with `parse` and its templates are scanned for one
    named 'short description' / 'Short description'. The first positional
    parameter of the first such template is returned as a string.

    Returns:
        The short description, or None when the page is missing, has no such
        template, or the template has no first parameter.
    """
    try:
        wikicode = parse(term, "https://en.wikipedia.org/w/api.php")
        for temp in wikicode.filter_templates():
            if temp.name in ['short description', 'Short description']:
                return str(temp.get(1))
    # `except KeyError or ValueError` only ever caught KeyError, because the
    # expression `KeyError or ValueError` evaluates to `KeyError`.
    except (KeyError, ValueError):
        return None
    return None
     
# Entity texts that text_to_json skips: a TERM entity whose (trimmed) text is
# in this list is not added to the results.
to_remove = ['conductive material forms electrodes',
             'successful detection',
             'smaller ratio',
             'successful watermark detection',
             'unsupervised',
             'detected',
             'K',
             'm=2Ap',
             'knowledge of K=(KS',
             'depth log2(N',
             'images N',
             'cm2',
             'K.\n',
             'quantum code C;<br/',
             'codeword c(M',
             'by-letter encryption U(KS',
             'priority ranks'
            ]
        
# def text_to_html(text, nlp): # gives the html under BeautifulSoup format
#     doc = nlp(text)

#     html = displacy.render(doc, style="ent", options={"ents": ["TERM"]}, jupyter=False, page=True)
#     soup = BeautifulSoup(html)
#     marks =  soup.find_all('mark')
#     url = ''
    
#     for mark in tqdm(marks):
#         try:
#             term = mark.get_text(strip=True,separator=', ').split(', ')[0] # get the term annotated
#             if term in to_remove: continue
            
#             wiki_info = DICT_PAGE_TITLE[term] # get wikipedia pagetitle and summary from json file
#             url = f'https://en.wikipedia.org/wiki/{"_".join(wiki_info["title"].split())}' 
#             summary = wiki_info['summary']
            
#         except KeyError:
#             wiki_title = find_wiki_title(term)  
            
#             if wiki_title:
#                 url = f'https://en.wikipedia.org/wiki/{"_".join(wiki_title.split())}' 
#                 wiki_summary = find_wiki_summary(wiki_title)
#                 DICT_PAGE_TITLE.update({term:{'title':wiki_title, 'summary': wiki_summary}})
                    
#         link = soup.new_tag('a', href=url) # create the html tag for link       
#         mark.wrap(link) #add html tag <a> (the one to make links) to around our annotated word
#     return soup


def text_to_json(text, nlp):
    """Run the NER pipeline on `text` and return the results as a JSON-ready dict.

    Args:
        text: the text to analyse.
        nlp: callable returning a document whose `.ents` provide `label_`,
            `text`, `start_char` and `end_char` (the notebook passes a loaded
            spaCy pipeline).

    Returns:
        dict keyed by entity text. Entities labelled 'TERM' map to
        {'label': 'TERM', 'position': [(start, end), ...], 'wikilink': url,
        'summary': text-or-None}; every other entity maps to
        {'label': 'ERROR', 'position': [(start, end), ...]}.

    Side effects:
        Terms missing from the database are looked up online and inserted
        through `insert_term`. The shared module-level `conn` is committed but
        deliberately left open, so the function (and the other database
        helpers) can be called again afterwards.
    """
    dict_position = defaultdict(list)
    dict_position_trigger = defaultdict(list)
    dict_res = {}

    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'TERM':
            term = ent.text
            # Keep the trimmed end offset in a local instead of writing it
            # back to the entity, so the span object is left untouched.
            end_char = ent.end_char
            try:
                # Drop trailing punctuation/whitespace one character at a time.
                # All-uppercase terms longer than 4 characters are left as is.
                while ((term[-1] in ['.', '_', ';', '\n', ' '] or term[0] == '.')
                       and end_char > 0
                       and (not term.isupper() or len(term) <= 4)):
                    end_char -= 1
                    term = term[:-1]
            except IndexError: # the term was trimmed down to nothing
                continue
            if term in to_remove:
                continue
            dict_position[term].append((ent.start_char, end_char))

        else: # if it is a trigger word
            dict_position_trigger[ent.text].append((ent.start_char, ent.end_char))

    dict_position = dict(dict_position)

    msg.info("Analysing errors...")
    for error, pos_l in tqdm(dict_position_trigger.items()): # only for ERROR
        dict_res.update({error: {'label': 'ERROR', 'position': pos_l}})
    msg.good("Done!")

    msg.info("Analysing terms...")
    cnt = 0
    for term, pos_l in tqdm(dict_position.items()): # only for TERM

        wiki_info = get_term_by_termname(term) # get wikipedia pagetitle and summary from database
        if wiki_info: # if exists in database
            if wiki_info[1]:
                url = f'https://en.wikipedia.org/wiki/{"_".join(wiki_info[1].split())}'
            else:
                # terms stored without a page title link to the main page,
                # matching what a first-time lookup produces below
                url = 'https://www.wikipedia.org'
            wiki_summary = wiki_info[2]

        else:
            print(term)
            wiki_title = find_wiki_title(term)
            # find summary
            try:
                ## wiktionary
                wiki_summary = get_description_wiktionary(term)
                if not wiki_summary or '|' in wiki_summary:
                    ## wikipedia brief summary
                    wiki_summary = get_description_wikipedia(term)
                if (not wiki_summary) and wiki_title:
                    # retry with the page title found by the search
                    wiki_summary = get_description_wiktionary(wiki_title)
                    if not wiki_summary:
                        wiki_summary = get_description_wikipedia(wiki_title)
                    if not wiki_summary: ## wikipedia page abstract
                        wiki_summary = find_wiki_summary(wiki_title)
            except KeyError: # for terms of ERROR
                continue

            # find wiki title
            if wiki_title:
                url = f'https://en.wikipedia.org/wiki/{"_".join(wiki_title.split())}'
                # insert new term into database
                new_term = Term(term, wiki_title, wiki_summary)
                print(new_term)
                insert_term(new_term)

            else: # returns the wikipedia main page
                url = 'https://www.wikipedia.org'
                insert_term(Term(term, None, None))

            cnt += 1

        dict_res.update({term:{'label': 'TERM', 'position': pos_l,'wikilink': url,'summary': wiki_summary}})
    msg.good("Done!")

    # update the database data if there is new term recognised
    if cnt:
        # The connection is a shared global created outside this function, so
        # it is committed here but not closed. Closing it made every later
        # database call fail.
        conn.commit()
        msg.good(f"Found wikipedia information for {cnt} new terms.")

    return dict_res

In [3]:
# Module-level SQLite connection and cursor. The database helper functions
# defined above (insert_term, get_term_by_termname, ...) read these globals
# when they are called, so this cell must run before any of them is used.
conn = sqlite3.connect('./wiki_info.db')
cursor = conn.cursor()

In [4]:
# Read the claims text and drop the <p> / </p> markers around its paragraphs.
with open('./claims.txt', mode='r', encoding='utf-8') as f:
    claims = f.read()
claims = claims.replace('<p>', '')
claims = claims.replace('</p>', '')

In [5]:
# Load the spaCy pipeline saved at this path; it is passed to text_to_json below.
nlp = spacy.load(r"../03_spaCy_ner/output/G_2018/model-last/")

In [6]:
# open('./demo.html', 'w', encoding="utf-8").write(str(text_to_html(claims,nlp))) #create file with html data

In [7]:
# save results
# Run the analysis before opening the output file: opening with "w" truncates
# demo.json immediately, so a failure inside text_to_json would otherwise
# leave an empty file behind.
ner_results = text_to_json(claims, nlp) # test with the first patent
with open('demo.json', "w") as f:
    json.dump(ner_results, f, indent = 4)

100%|██████████| 6/6 [00:00<00:00, 66576.25it/s]
 38%|███▊      | 247/653 [00:00<00:00, 2466.57it/s]

[38;5;4mℹ Analysing errors...[0m
[38;5;2m✔ Done![0m
[38;5;4mℹ Analysing terms...[0m


100%|██████████| 653/653 [00:00<00:00, 3165.26it/s]

[38;5;2m✔ Done![0m



