In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

  from IPython.core.display import display, HTML


### Notable wiki ids of entities

In [None]:
wikidata_business_ids = {"Q4830453": "business",
                         "Q783794": "company",
                         "Q6881511":"enterprise"}

wikidata_proffesion_ids = {"Q12737077":"occupation",
                            "Q28640": "profession",
                            "Q4164871": "position",
                            "Q255274": "white-collar worker"}

wikidata_lang_ids = {"Q34770": "language",
                     "Q1288568": "modern language"}

wikidata_country_ids = {"Q6256": "country"}

### Imports and df loading

In [1]:
import re
from pathlib import Path 
import os
import pickle 
import pandas as pd 
import numpy as np
PATH = Path("/home/akinwilson/data/mana/quora/quora")

cols = ['creds_list', 'bio', 'profile_url']
df = pickle.load(open(PATH / "user-info.p", "rb"))[cols]
# df = df[cols]

df["creds_list"] =  (df.creds_list
                     .apply(lambda x: " ".join(x))
                     .apply(lambda x: np.nan if len(x)==0 else x)
                     .apply(lambda x: np.nan if str(x) == 'None' else x) )

df['bio'] = (df.bio
             .apply(lambda x: np.nan if len(x)==0 else x[0])
             .apply(lambda x: np.nan if str(x) == 'None' else x))

df['profile_url'] = (df.profile_url
                    .apply(lambda x: " ".join(y for y in x.split("/")[-1].split("-") if not y.isdigit())))
# column rename 
df.rename(mapper={"profile_url":"profile", "creds_list":"creds"},axis=1,inplace=True)

df.dropna(axis=0, inplace=True)


df

Unnamed: 0,creds,bio,profile
0,an American by choice. · American Ethnicity an...,.,Jae Yang
1,"MD, Internal Medicine, MS, Epidemiology, autho...",Well the steroids could decrease the efficacy ...,Amy Chai
3,stoic writer Big Data Practice Lead (2011-pres...,dad. data engineer. stoic writer. entrepreneur...,Michael David Cobb Bowen
4,Fighting the Forces of Evil (1907-present) Far...,"As someone who was once ten years old, I can t...",Thomas Oregon
6,"Analyst, Lawyer Works at U.S. Department of Ve...",Find anyone willing to buy them for more than ...,Douglas Peterson
...,...,...,...
2651,I am non-binary · LGBTQ+,you should use their proper pronouns please wh...,Saturns Fern
2652,Writer (2020-present) Studied Government in th...,"I’m a patriotic American who loves life, liber...",Lucas Quidera
2653,Works at Self-Employment Worked at Google Stud...,I was a chemist for the U.S. federal governmen...,Chuck Bluestein
2656,"BS Mechanical Engineering, University of Flori...",Graduate Engineer voted “Most Outstanding Mech...,Dean Rojas


### Data preprocessing

In [2]:

boundaries_creds = (df.creds
                      .str
                      .len()
                      .describe(percentiles=[.05,.95])[["5%", "95%"]])

boundaries_bios = (df.bio
                  .str
                  .len()
                  .describe(percentiles=[.10,.95])[["10%", "95%"]])

df = df[(df.creds.str.len() > min(boundaries_creds)) & (df.creds.str.len() < max(boundaries_creds))]
df = df[(df.bio.str.len() > min(boundaries_bios)) & (df.bio.str.len() < max(boundaries_bios))]


txt = " ".join( df.iloc[1])

### Statistical entity linking

In [3]:
import spacy  # version 3.0.6'
# initialize language model

MODEL = "en_core_web_md"

nlp = spacy.load(MODEL)
# add pipeline (declared through entry_points in setup.py)
nlp.add_pipe("entityLinker", last=True)

doc = nlp(txt)

def entity_linking(input_text):
    # initialize language model
    nlp = spacy.load(MODEL)
    # add pipeline (declared through entry_points in setup.py)
    nlp.add_pipe("entityLinker", last=True)
    doc = nlp(input_text)
    # returns all entities in the whole document
    all_linked_entities = doc._.linkedEntities
    # iterates over sentences and prints linked entities
    for sent in doc.sents:
        sent = sent 
        sent._.linkedEntities.pretty_print()
        sent._.linkedEntities.print_super_entities()
        print("*"*30)
    return all_linked_entities, sent 

ents,sent = entity_linking(txt)

<EntityElement: https://www.wikidata.org/wiki/Q858467 Big                       1988 film by Penny Marshall                       >
<EntityElement: https://www.wikidata.org/wiki/Q9887 Α                         letter of the Greek alphabet                      >
<EntityElement: https://www.wikidata.org/wiki/Q674 phosphorus                chemical element with the atomic number of 15     >
<EntityElement: https://www.wikidata.org/wiki/Q152433 Xerox                     American document management corporation          >
<EntityElement: https://www.wikidata.org/wiki/Q7554175 Software                  album by Grace Slick                              >
<EntityElement: https://www.wikidata.org/wiki/Q4614 University of Southern California private research university in Los Angeles, California, United States>
<EntityElement: https://www.wikidata.org/wiki/Q844837 Southern California       southern portion of US state of California        >
<EntityElement: https://www.wikidata.org/wiki/Q100 Bost

In [5]:


sparql_hobbies = """
PREFIX dbo:    <http://dbpedia.org/ontology/>
PREFIX rdfs:   <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?label ?desc ?qid
WHERE  {
?page dbo:wikiPageID 31257416 .
?page dbo:wikiPageWikiLink  ?hobbyList.
 ?hobbyList rdfs:label ?label;
 dbo:wikiPageID ?qid ;
dbo:abstract ?desc.
FILTER (LANG( ?label) = 'en')
FILTER (LANG( ?desc) = 'en')
 } 
"""

sparql_occuptations = """
PREFIX dbo:    <http://dbpedia.org/ontology/>
PREFIX rdfs:   <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?label ?desc ?qid
WHERE  {
?page dbo:wikiPageID 35283370; 
dbo:wikiPageWikiLink ?ocs.
?ocs  dbo:wikiPageWikiLink ?jobs. 
?jobs rdfs:label ?label;
 dbo:wikiPageID ?qid ;
dbo:abstract ?desc.
FILTER (LANG( ?label) = 'en')
FILTER (LANG( ?desc) = 'en')
 } 
"""

### Entity Tagging
See github repo: https://github.com/egerber/spaCy-entity-linker

In [9]:
import spacy
MODEL = "en_core_web_md"
nlp = spacy.load(MODEL)



nlp.add_pipe("entity_linker", last=True)

doc = nlp(txt)

def entity_linking(input_text):
    # initialize language model
    nlp = spacy.load(MODEL)
    # add pipeline (declared through entry_points in setup.py)
    nlp.add_pipe("entity_linker", last=True)
    doc = nlp(input_text)
    # returns all entities in the whole document
    all_linked_entities = doc._.linkedEntities
    # iterates over sentences and prints linked entities
    for sent in doc.sents:
        print("*"*30)
        sent._.linkedEntities.pretty_print()
        sent._.linkedEntities.print_super_entities()
        print("*"*30)
    return all_linked_entities 


ents = entity_linking(txt)

ValueError: [E139] Knowledge base for component 'entity_linker' is empty. Use the methods `kb.add_entity` and `kb.add_alias` to add entries.

### Entity linking via KB in spacy

In [10]:
import pandas as pd
import sparql_dataframe

endpoint = "http://dbpedia.org/sparql"
q = sparql_hobbies
df = sparql_dataframe.get(endpoint, q)
df.head()

Unnamed: 0,label,desc,qid
0,Conservation and restoration of road vehicles,Conservation and restoration of road vehicles ...,11790244
1,Acroyoga,Acroyoga (also written Acro-Yoga or Acro Yoga)...,17452177
2,Golf,Golf is a club-and-ball sport in which players...,19568112
3,Television show,A television show – or simply TV show – is any...,19508643
4,Wine tasting,Wine tasting is the sensory examination and ev...,19719473


In [12]:
import csv
from pathlib import Path

def load_entities(df):
    df = df.set_index('qid')
    descs = df.to_dict()['desc']
    names = df.to_dict()['label']
    return names, descs

id2name, id2desc = load_entities(df)

### Knowledge base construction

In [17]:
from spacy.kb import KnowledgeBase
import spacy
MODEL = "en_core_web_md"
nlp = spacy.load(MODEL)
kb = KnowledgeBase(vocab=nlp.vocab,
                   entity_vector_length=300)

In [18]:
# PRIOR_FREQ = 300
for qid, desc in id2desc.items():
    
    desc_doc = nlp(desc)
    desc_enc = desc_doc.vector
    
    qid = f"Q{qid}"
    
    kb.add_entity(entity=qid, entity_vector=desc_enc, freq=PRIOR_FREQ) 

### Need to find aliases via query