In [19]:
import csv
import time
from urllib.parse import unquote

import requests
import spacy
from SPARQLWrapper import SPARQLWrapper, JSON

In [20]:
import urllib.request

# Install a process-wide urllib opener built around ProxyHandler({}).
# NOTE(review): per the urllib.request documentation an explicit empty dict
# *disables* autodetected proxies; proxy settings are only read from the
# environment when the argument is omitted (None). Confirm which behaviour is
# intended here before changing the call.
proxy_support = urllib.request.ProxyHandler({})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

# Shared SPARQL client pointed at the public DBpedia endpoint; results are
# requested in JSON format.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setReturnFormat(JSON)
# Bare DBpedia ontology class names (no "dbo:" prefix).
place = 'Place'
person = 'Person'
org = 'Organisation'
product = 'Work'  # open question from the author: should this be 'Organisation'?
work = 'Work'
event = 'Event'
language = "Language"

norp = ["EthnicGroup", "PoliticalParty"]  # low coverage
fac = ['Infrastructure', 'Airport', "Building", 'Bridge', "Highway"]  # low coverage

# Candidate class names per entity label. The keys look like spaCy NER labels
# -- NOTE(review): "Person" is the only key that is not upper-case ("PERSON");
# confirm before relying on it for lookups. Entries marked "#?" are the
# author's unresolved guesses. Empty lists mean no class has been chosen.
groups_dict = {
    "Person"        : ["Person"],
    "GPE"           : ["Location", "Place", "Country", "SpatialThing", "Geo"],  #? Yago:GeoEntity/Region or geo:SpatialThing (this for all spatial things)
    "LOC"           : ["Location"],
    "PRODUCT"       : ["Work", "Organisation"],
    "EVENT"         : ["Event"],
    "FAC"           : ["Infrastructure", "Airport", "Bridge", "Highway", "Building"],  #? geo:SpatialThing
    "LANGUAGE"      : ["Language"],
    # Fixed typo: was "EthincGroup", which disagreed with `norp` above.
    "NORP"          : ["EthnicGroup", "PoliticalParty", "Country"],  #?
    "WORK_OF_ART"   : ["Work"],
    "LAW"           : [],
    "MONEY"         : ["Currency"],  #?
    "DATE"          : ["Year", "Month", "Day", "Time"],  #?
    "TIME"          : ["Time"],  #?
    "CARDINAL"      : [],  #?
    "ORDINAL"       : [],  #?
    "PERCENT"       : []  #?
}

# Prefixed class names, selected by index elsewhere in the notebook
# (0 person, 1 spatial thing, 2 organisation, 3 work, 4 event, 5 language).
groups = ["dbo:Person", "geo:SpatialThing", "dbo:Organisation", "dbo:Work", "dbo:Event", "dbo:Language"]


In [21]:

def dbpedia_format(mention):
    """Normalise a mention into the two spellings used to query DBpedia.

    The mention is title-cased and split on whitespace, so leading, trailing
    and repeated whitespace disappear.

    Returns:
        A ``(label, resource_name)`` tuple: the words joined by single
        spaces, and the same words joined by underscores.
    """
    words = mention.title().split()
    return " ".join(words), "_".join(words)


In [22]:

def build_query(mention, group):
    """Build the SPARQL query that looks up DBpedia candidates for a mention.

    Candidates are resources whose English ``rdfs:label`` equals the
    normalised mention, or resources listed on the mention's
    ``..._(disambiguation)`` page. They are then restricted to ``group``.

    Args:
        mention: Raw surface form; normalised with ``dbpedia_format``.
        group: Prefixed class name inserted verbatim as the ``rdf:type``
            object (e.g. ``"dbo:Person"``). It is NOT escaped, so it must
            come from trusted code, never from document text.

    Returns:
        The query text selecting ``?item``, ``?name`` and ``?page``.
    """
    mention_1, mention_2 = dbpedia_format(mention)

    # Mentions come from arbitrary text: escape what would terminate or
    # corrupt the double-quoted SPARQL literal. Newlines cannot occur because
    # dbpedia_format collapses all whitespace.
    label_literal = mention_1.replace("\\", "\\\\").replace('"', '\\"')

    # Percent-encode characters that are illegal or structurally meaningful
    # inside an <...> IRI reference. Non-ASCII letters are left untouched.
    # NOTE(review): the set mirrors the characters DBpedia is understood to
    # percent-encode in resource names -- confirm against a live resource.
    iri_unsafe = '"#%<>?[\\]^`{|}'
    resource_name = "".join(
        f"%{ord(ch):02X}" if ch in iri_unsafe or ord(ch) < 0x21 else ch
        for ch in mention_2
    )

    return f"""
        PREFIX rdfs:    <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf:     <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf:    <http://xmlns.com/foaf/0.1/>
        PREFIX dbpedia: <http://dbpedia.org/>
        PREFIX dbo:     <http://dbpedia.org/ontology/>
        PREFIX geo:     <http://www.w3.org/2003/01/geo/wgs84_pos#>
        SELECT DISTINCT ?item ?name ?page WHERE {{
            # VALUES ?groups {{dbo:Person dbo:Location}}
        {{
            # [Case 1] no disambiguation at all (eg. Twitter)
            ?item rdfs:label "{label_literal}"@en .
        }}

        UNION
        {{
            # [Case 2] a dedicated disambiguation page (eg. Michael Jordan)
            <http://dbpedia.org/resource/{resource_name}_(disambiguation)> dbo:wikiPageDisambiguates ?item.
        }}

        # Filter by entity class
        ?item rdf:type {group} .
        # Grab wikipedia link
        ?item foaf:isPrimaryTopicOf ?page .
        # Get name
        ?item rdfs:label ?name .
        FILTER (langMatches(lang(?name),"en"))
        # ?item rdf:type ?group .
        # ?group rdfs:label ?group_name
        # FILTER (STR(?group_name) IN ("Building", "Airport"))
    }}
    """



In [23]:
def generate_candidates(mention, group, max_attempts=2, retry_delay=15):
    """Run the candidate query for ``mention`` against the shared ``sparql`` client.

    Args:
        mention: Surface form of the entity.
        group: Prefixed class name passed through to ``build_query``.
        max_attempts: How many times to try the request (default 2).
        retry_delay: Seconds to wait between attempts (default 15).

    Returns:
        The converted query result, or ``None`` when every attempt failed.
        Failures are printed, not raised, so one bad lookup cannot abort a
        long run.
    """
    sparql.setQuery(build_query(mention, group))
    sparql.setTimeout(1000)  # request timeout, in seconds
    for attempt in range(max_attempts):
        try:
            return sparql.query().convert()
        except (ConnectionError, TimeoutError):
            print("Will retry again in a little bit")
        except Exception as e:
            # Deliberately broad: endpoint and parsing errors are reported
            # and retried like network errors.
            print(e)
        # Only pause when another attempt follows.
        if attempt < max_attempts - 1:
            time.sleep(retry_delay)
    return None


In [24]:

def get_most_popular(results, session=None):
    """Pick the candidate whose Wikipedia page has the most backlinks.

    For each binding in ``results`` the Wikipedia page title is taken from
    the ``page`` URL and the MediaWiki ``list=backlinks`` API is asked for
    the pages linking to it. The candidate with the highest count wins; ties
    go to the earliest candidate.

    Args:
        results: Converted SPARQL JSON result (a dict with
            ``results -> bindings``, each binding carrying ``page.value``),
            or ``None``.
        session: Optional object with a requests-style
            ``get(url=..., params=...)`` method whose return value has
            ``.json()``. A new ``requests.Session`` is created when omitted.

    Returns:
        The winning candidate's ``page`` URL, or ``None`` when ``results`` is
        ``None``, has no bindings, or no candidate produced a usable response.
    """
    if results is None:
        return None
    bindings = results["results"]["bindings"]
    if not bindings:
        return None
    if session is None:
        session = requests.Session()

    api_url = "https://en.wikipedia.org/w/api.php"
    best_page = None
    best_count = -1

    for result in bindings:
        page_url = result["page"]["value"]

        # Take everything after "/wiki/" so titles containing "/" survive;
        # fall back to the last path segment for unexpected URL shapes.
        _, found, title = page_url.partition("/wiki/")
        if not found:
            title = page_url.rsplit("/", 1)[-1]
        # The API expects the decoded title; the session re-encodes it.
        title = unquote(title)

        params = {
            "action": "query",
            "format": "json",
            "list": "backlinks",
            "bltitle": title,
            # A single response is capped by the API, so very popular pages
            # saturate at the same count; continuation is not followed.
            "bllimit": "max",
            # NOTE(review): namespace 4 is the project ("Wikipedia:")
            # namespace, not articles (0) -- confirm this is intended.
            "blnamespace": 4,
            # "blredirect" is deliberately omitted: MediaWiki treats a boolean
            # parameter as true whenever it is present, so the previous
            # "blredirect": "False" actually switched redirect-following on.
        }

        data = session.get(url=api_url, params=params).json()
        if "query" not in data:
            # Error or unexpected payload: this candidate cannot be ranked.
            continue

        count = len(data["query"]["backlinks"])
        if count > best_count:
            best_count = count
            best_page = page_url

    return best_page



In [None]:
import time

nlp = spacy.load("en_core_web_trf")

# Cache shared across all rows: lower-cased mention -> Wikipedia URL, or the
# string "None" when a lookup found nothing (so it is not queried again).
# NOTE(review): the cache key ignores the entity type, so a mention first seen
# as ORG is reused for later PERSON/GPE occurrences.
dictionary = {}
# The next names are not used by this cell; kept in case other cells read them.
disambiguated_pages = []
record_dict = {}
count = 0
count_warx = 0  # number of input rows processed
valid_entity = False
valid_entity_types = ["PERSON", "GPE", "LOC", "ORG", "PRODUCT", "WORK_OF_ART", "LANGUAGE"]

# spaCy entity label -> class used to filter DBpedia candidates.
group_for_entity_type = {
    "PERSON": groups[0],
    "GPE": groups[1],
    "LOC": groups[1],
    "ORG": groups[2],
    "PRODUCT": groups[2],
    "WORK_OF_ART": groups[3],
    "LANGUAGE": groups[5],
}

with open('popular_page2.csv', 'w', newline='', encoding='UTF-8') as out_file:
    writer = csv.writer(out_file)
    # NOTE(review): the recorded output shows mojibake in accented words,
    # which suggests the input may not really be cp850 -- confirm the encoding.
    with open("warcs-20221207-182114.csv", newline='', encoding='cp850') as in_file:
        csv_reader = csv.reader(in_file, quoting=csv.QUOTE_NONE, escapechar='\\')
        for row in csv_reader:
            if not row:
                # Blank lines parse to [], which would break row[-1] below.
                continue

            # Links found in this row only: mention -> Wikipedia URL.
            record_dict = {}
            document = row[-1]
            doc = nlp(document)

            for ent in doc.ents:
                entity_type = ent.label_
                if entity_type not in valid_entity_types:
                    continue
                entity_name = ent.text.lower()

                if entity_name in dictionary:
                    cached_link = dictionary[entity_name]
                    if cached_link != "None" and entity_name not in record_dict:
                        record_dict[entity_name] = cached_link
                    continue

                results = generate_candidates(entity_name, group_for_entity_type[entity_type])
                link = get_most_popular(results)
                if link is not None:
                    dictionary[entity_name] = link
                    record_dict[entity_name] = link
                else:
                    dictionary[entity_name] = "None"

            count_warx = count_warx + 1
            # Emit only rows with at least one link: first column plus the
            # mention -> URL dict (written as its repr).
            if len(record_dict) > 0:
                writer.writerow([row[0], record_dict])
        print(dictionary)


xxxx france GPE
done
xxxx pool ORG
found france
found pool
found france
xxxx tunis GPE
done
done
xxxx tunisia GPE
done
xxxx wp ORG
xxxx cumulus ORG
done
xxxx roy tanck PERSON
xxxx luke morton PERSON
xxxx flash player 9 PRODUCT
xxxx ocular professor PRODUCT
xxxx wordpress ORG
xxxx le canigou dans la brume ORG
xxxx le canigou ORG
xxxx bleumarie ORG
xxxx baby blue car PERSON
found tunis
found tunisia
found wp
found cumulus
found roy tanck
found luke morton
found flash player 9
found ocular professor
xxxx le manoir derri├â le palmier WORK_OF_ART
found bleumarie
found baby blue car
found tunis
found tunisia
found roy tanck
found luke morton
found flash player 9
found ocular professor
xxxx le village derri├â les branches. GPE
found le village derri├â les branches.
found bleumarie
found baby blue car
found tunis
found tunisia
found wp
found cumulus
found roy tanck
found luke morton
found flash player 9
found ocular professor
xxxx tourelle face au soleil levant PERSON
xxxx tourelle ORG
found b