<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import argparse
import spacy 
import json
import requests 
import urllib
import re
import wikipedia
import numpy as np
import pandas as pd

from wasabi import msg
from pathlib import Path
from tqdm import tqdm
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Wikidata properties to harvest: property ID -> property label.
target_properties = dict([
    ('P31', 'instance of'),
    ('P279', 'subclass of'),
    ('P361', 'part of'),
    ('P366', 'use'),
    ('P527', 'has part'),
    ('P1269', 'facet of'),
])

In [3]:
def find_wiki_title(term):
    """Return the first hit of ``wikipedia.search(term)``, or None when there are no hits."""
    hits = wikipedia.search(term)
    return hits[0] if hits else None
    
def find_wiki_summary(term):
    """Return ``wikipedia.summary(term)``, or None if the lookup fails.

    Any ``wikipedia.exceptions.WikipediaException`` (for example, raised for an
    ambiguous term) is turned into a None result; other exceptions propagate.
    """
    try:
        summary = wikipedia.summary(term)
    except wikipedia.exceptions.WikipediaException:
        # e.g. an ambiguous term: report "no summary" instead of failing
        summary = None
    return summary
    
def get_wikidata_id(term):
    """Look up the Wikidata item ID attached to an English Wikipedia page title.

    Calls the en.wikipedia.org API (``action=query``, ``prop=pageprops``,
    ``ppprop=wikibase_item``, ``redirects=1``) and reads the ID from the parsed
    JSON response rather than pattern-matching the raw response text, so fields
    that follow ``wikibase_item`` in the payload cannot leak into the result.

    Args:
        term: Wikipedia page title to look up.

    Returns:
        The ``wikibase_item`` value of the first returned page that has one,
        or None when no page carries one or the response is not valid JSON.
    """
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'pageprops',
        'ppprop': 'wikibase_item',
        'redirects': 1,
        'titles': term,
    }
    # requests URL-encodes the query parameters, including the title.
    response = requests.get("https://en.wikipedia.org/w/api.php", params=params)

    try:
        payload = response.json()
    except ValueError:
        # Not JSON (e.g. an HTML error page): treat as "no ID found".
        return None

    pages = (payload.get('query') or {}).get('pages') or {}
    for page in pages.values():
        wikidata_id = (page.get('pageprops') or {}).get('wikibase_item')
        if wikidata_id:
            return wikidata_id
    return None
    
def not_disambiguation_page(wikidata_id):
    """Tell whether a Wikidata item is something other than a disambiguation page.

    Fetches the item's page from www.wikidata.org and inspects the text of the
    ``div`` with class ``wikibase-entitytermsview-heading-description``.

    Args:
        wikidata_id: Wikidata item ID, appended to ``https://www.wikidata.org/wiki/``.

    Returns:
        False when that description reads 'Wikimedia disambiguation page';
        True otherwise, including when the page has no such element.
    """
    url = "https://www.wikidata.org/wiki/" + wikidata_id
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')

    description = soup.find("div", {"class": "wikibase-entitytermsview-heading-description"})
    if description is None:
        # find() returns None when the element is absent; calling .text on it
        # would raise AttributeError, so treat "no description" as "not a
        # disambiguation page".
        return True

    return description.text.strip() != 'Wikimedia disambiguation page'


def retrieve_value_P(P):
    """Translate a property ID into its label via ``target_properties``.

    Raises KeyError when ``P`` is not one of the target properties.
    """
    label = target_properties[P]
    return label


def retrieve_value_Q(Q, reference):
    """Resolve a Wikidata item ID to a label.

    ``reference`` is consulted first; on a miss the English label is requested
    from the Wikidata ``wbgetentities`` API.

    Args:
        Q: Wikidata item ID.
        reference: Mapping checked for ``Q`` before any request is made.

    Returns:
        ``reference[Q]`` when ``Q`` is a key of ``reference``; otherwise the
        English label from the API, or None when the response has no such label
        (including API error responses, which carry no ``entities`` key).
    """
    # NOTE(review): this treats `reference` as keyed by item ID; confirm that
    # matches how callers populate the mapping they pass in.
    if Q in reference:
        return reference[Q]

    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=labels&languages=en&ids={Q}"
    json_response = requests.get(url).json()

    # Every level may be missing; fall back to an empty mapping so the chain
    # ends in None instead of raising AttributeError on a None intermediate.
    entities = json_response.get('entities') or {}
    entity = entities.get(Q) or {}
    labels = entity.get('labels') or {}
    en = labels.get('en') or {}
    return en.get('value')


def get_target_ItemProperties(wikidata_item, wikidata_id, reference):
    """Collect (item, property label, value label) rows for an item's claims.

    Fetches the claims of ``wikidata_id`` through the Wikidata ``wbgetclaims``
    API and keeps those whose main snak is a concrete ``wikibase-item`` value
    of a property listed in ``target_properties``. The value's item ID is
    resolved with ``retrieve_value_Q``; claims whose value cannot be resolved
    are skipped.

    Args:
        wikidata_item: Name placed in the first column of every row.
        wikidata_id: Wikidata entity ID whose claims are fetched.
        reference: Lookup passed through to ``retrieve_value_Q``.

    Returns:
        None when the API response contains an ``error`` key; otherwise a 2-D
        array with three columns (``wikidata_item``, property label, value
        label), of shape (0, 3) when no claim qualifies.
    """
    url = "https://www.wikidata.org/w/api.php?action=wbgetclaims&format=json&entity="+wikidata_id
    json_response = requests.get(url).json()

    # Detect an error response by key presence, not by which key comes first.
    if 'error' in json_response:
        return None

    rows = []
    for statements in (json_response.get('claims') or {}).values():
        for statement in statements:
            snak = statement['mainsnak']

            # ignore if not a wikibase item, not a concrete value, or not a target property
            if (snak.get('datatype') != 'wikibase-item'
                    or snak.get('snaktype') != 'value'
                    or snak.get('property') not in target_properties):
                continue

            # replace the value's Wikidata item ID by its label
            property_value = retrieve_value_Q(snak['datavalue']['value']['id'], reference)
            if property_value is None:
                continue

            rows.append([wikidata_item, retrieve_value_P(snak['property']), property_value])

    # Build the array once instead of growing it with np.append per row; keep
    # the (0, 3) shape for the empty case so callers can still assign columns.
    if not rows:
        return np.empty(shape=[0, 3])
    return np.array(rows)

In [5]:
# Load the patent input file; individual patents are separated by '\n\n\n'.
input_path = Path('../data/G06F0011160000.txt')

if not input_path.exists():
    # Report the missing path itself (the previous `in_file` name was never
    # defined, so this branch raised NameError instead of printing the message).
    msg.fail("Can't find input file", input_path, exits=1)
else:
    with input_path.open("r", encoding="utf8") as f:
        patents = f.read().split('\n\n\n')

In [6]:
# Load the spaCy pipeline stored under the model-last output directory,
# then log which pipeline object is in use.
nlp = spacy.load(
    '../03_spaCy_ner/output/G_2018/model-last/'
)
msg.info(f"Using spaCy model {nlp}")

[38;5;4mℹ Using spaCy model <spacy.lang.en.English object at
0x7fcb55a1d850>[0m


In [7]:
# Read the two persisted resources this notebook extends: the Wikidata ID
# lookup (JSON) and the property table (tab-separated text).
msg.text("Loading wikidata id lookup table...")
DICT_WIKIDATA_ID = json.loads(Path('./wikidata_id.json').read_text(encoding='utf-8'))

msg.text("Loading wikidata property file...")
wikidata_property = pd.read_csv('./wikidata_properties.txt', sep='\t')

Loading wikidata id lookup table...
Loading wikidata property file...


In [8]:
# Load the Wikipedia page title / summary lookup from its JSON file.
DICT_PAGE_TITLE = json.loads(
    Path('../01_make_matching_list/title_summary.json').read_text(encoding='utf-8')
)

In [10]:
def ner2wiki(text, nlp, wikidata_property):
    """Extend the property table with Wikidata facts for entities found in ``text``.

    Runs ``nlp`` over ``text``. For every distinct entity text that is not
    already in ``wikidata_property.term1``, a Wikipedia title is taken from
    ``DICT_PAGE_TITLE`` or, failing that, from ``find_wiki_title``. The title's
    Wikidata ID is then resolved and the rows returned by
    ``get_target_ItemProperties`` are appended to the table.

    Args:
        text: Text to run the pipeline on.
        nlp: Callable returning a document whose ``.ents`` items expose ``.text``.
        wikidata_property: DataFrame with a ``term1`` column; new rows are
            created with this frame's existing columns.

    Returns:
        ``wikidata_property`` itself when nothing was appended; otherwise a new
        DataFrame holding the old and new rows with exact duplicate rows removed
        and a fresh 0..n-1 index. The frame passed in is not modified.

    Side effects:
        Records ``{wiki_title: wikidata_id}`` in ``DICT_WIKIDATA_ID`` for every
        title that resolves to a non-disambiguation item.
    """
    doc = nlp(text)
    ents = {ent.text for ent in doc.ents} - set(wikidata_property.term1.values)
    for term in ents:
        try:
            wiki_title = DICT_PAGE_TITLE[term]['title']
        except KeyError:
            # Not in the cached lookup: fall back to a live search. The result
            # is not written back to DICT_PAGE_TITLE.
            wiki_title = find_wiki_title(term)

        # NOTE(review): titles are stored below as *keys* of DICT_WIKIDATA_ID,
        # while this test looks for them among its *values* — confirm which
        # orientation the lookup file is meant to have.
        if (not wiki_title
                or wiki_title in wikidata_property.term1.values
                or wiki_title in DICT_WIKIDATA_ID.values()):
            continue

        # find wikidata id and properties
        wikidata_id = get_wikidata_id(wiki_title)
        if not wikidata_id or not not_disambiguation_page(wikidata_id):
            continue

        DICT_WIKIDATA_ID.update({wiki_title: wikidata_id})

        rows = get_target_ItemProperties(wiki_title, wikidata_id, DICT_WIKIDATA_ID)
        if rows is None or len(rows) == 0:
            # API error or no qualifying claims: nothing to append.
            continue

        new_rows = pd.DataFrame(rows, columns=wikidata_property.columns)
        # pd.concat replaces DataFrame.append (removed in pandas 2.0), and the
        # de-duplicated frame is kept — drop_duplicates() is not in-place.
        wikidata_property = (
            pd.concat([wikidata_property, new_rows], ignore_index=True)
            .drop_duplicates()
            .reset_index(drop=True)
        )
    return wikidata_property

# Index of the first patent to process; entries before it are skipped.
# NOTE(review): presumably 285 resumes an earlier, interrupted run — confirm,
# or set it to 0 to process every patent.
START_INDEX = 285

msg.text('Extracting wiki information for entities in patent file:')
for patent in tqdm(patents[START_INDEX:]):
    wikidata_property = ner2wiki(patent, nlp, wikidata_property)

  0%|          | 0/356 [00:00<?, ?it/s]

Extracting wiki information for entities in patent file:


100%|██████████| 356/356 [4:09:17<00:00, 42.02s/it]   


In [11]:
# Persist the updated Wikidata ID lookup (JSON) and property table (tab-separated).
with Path('./wikidata_id.json').open("w", encoding='utf-8') as id_file:
    json.dump(DICT_WIKIDATA_ID, id_file, indent=4)

wikidata_property.to_csv('./wikidata_properties.txt', sep='\t', index=False)

In [None]:
# Write the title/summary lookup back to the file it was loaded from.
# (`args` is never defined in this notebook, so `args.title_summary` raised NameError.)
with open('../01_make_matching_list/title_summary.json', "w", encoding='utf-8') as f:
    json.dump(DICT_PAGE_TITLE, f, indent = 4)