In [19]:
import WikidataObject as wdo
import pandas as pd
import os
import json

# Root of the local T-Res checkout. Set the T_RES_PATH environment variable to
# point at another checkout; the previous hard-coded location stays the default.
# os.path.join(..., '') guarantees the trailing separator that the
# `path + 'resources/...'` concatenations in this notebook rely on.
path = os.path.join(
    os.environ.get('T_RES_PATH', '/home/antoine/Documents/GitHub/T-Res/'), ''
)

In [2]:
def load_resources(method="mostpopular", resources_path="../resources/"):
    """Load and de-noise the mention <-> Wikidata lookup tables.

    Args:
        method: Ranking method name. For ``"partialmatch"`` and
            ``"levenshtein"`` pandarallel is initialised with 10 workers and
            ``TOKENIZERS_PARALLELISM`` is set to ``"true"``.
        resources_path: Directory containing
            ``wikidata/mentions_to_wikidata_normalized.json`` and
            ``wikidata/wikidata_to_mentions_normalized.json``.

    Returns:
        A ``(mentions_to_wikidata, wikidata_to_mentions)`` tuple of nested
        dicts. Both are rebuilt from the ``wikidata_to_mentions`` file after
        filtering: ``{mention: {wikidata_id: value}}`` and
        ``{wikidata_id: {mention: value}}``.

    Raises:
        FileNotFoundError: If either JSON file is missing.
        ImportError: If a parallel method is requested and pandarallel is
            not installed.
    """
    print("*** Loading the ranker resources.")

    # Load files
    files = {
        "mentions_to_wikidata": os.path.join(
            resources_path, "wikidata/mentions_to_wikidata_normalized.json"
        ),
        "wikidata_to_mentions": os.path.join(
            resources_path, "wikidata/wikidata_to_mentions_normalized.json"
        ),
    }

    # NOTE: this mapping is still read (so a missing file fails as before) but
    # it is replaced further down by the one rebuilt from the filtered table.
    with open(files["mentions_to_wikidata"], "r", encoding="utf-8") as f:
        mentions_to_wikidata = json.load(f)

    with open(files["wikidata_to_mentions"], "r", encoding="utf-8") as f:
        wikidata_to_mentions = json.load(f)

    # Filter mentions to remove noise:
    wikidata_to_mentions_filtered = dict()
    mentions_to_wikidata_filtered = dict()
    for wk, wikipedia_mentions in wikidata_to_mentions.items():
        # Drop mentions containing ", " or " (" ...
        stripped = {
            mention: value
            for mention, value in wikipedia_mentions.items()
            if ", " not in mention and " (" not in mention
        }
        # ... unless that would leave the entity without any mention at all.
        kept = stripped if stripped else dict(wikipedia_mentions)
        wikidata_to_mentions_filtered[wk] = kept

        # Mirror the kept entries into the inverse (mention -> entity) table.
        for mention, value in kept.items():
            mentions_to_wikidata_filtered.setdefault(mention, {})[wk] = value

    mentions_to_wikidata = mentions_to_wikidata_filtered
    wikidata_to_mentions = wikidata_to_mentions_filtered

    # Parallelize if ranking method is one of the following:
    if method in ["partialmatch", "levenshtein"]:
        # Imported here because no cell of the notebook imports pandarallel,
        # which made this branch fail with NameError.
        from pandarallel import pandarallel

        pandarallel.initialize(nb_workers=10)
        os.environ["TOKENIZERS_PARALLELISM"] = "true"

    return mentions_to_wikidata, wikidata_to_mentions

# Build both lookup tables with the defaults
# (method="mostpopular", resources_path="../resources/").
mentions_to_wikidata, wikidata_to_mentions = load_resources()

*** Loading the ranker resources.


# Tests texte edit distance

In [4]:
from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance as damlev
from t_res.geoparser import geode_pipe,ranking,linking

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Load VILLESFR.json from the current working directory; lines=True makes
# pandas read the file as one JSON record per line.
filepath = 'VILLESFR.json'
VILLESFR = pd.read_json(filepath, orient='records', lines=True)

In [7]:
# Ranker configured with the "levenshtein" method, reading from ../resources/.
myranker = ranking.Ranker(
    method="levenshtein",
    resources_path="../resources/",
)
# Linker configured with the "mostpopular" method on the same resources directory.
mylinker = linking.Linker(
    method="mostpopular",
    resources_path="../resources/",
)

In [9]:
# Build the geode_pipe Pipeline from the ranker and linker created earlier.
# NOTE(review): NER_path is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel unless it is defined first.
geoparser = geode_pipe.Pipeline(geodeNERpath=NER_path,
                                myranker=myranker,
                                mylinker=mylinker,
                                )

*** Loading the ranker resources.
INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
*** Load linking resources.
  > Loading mentions to wikidata mapping.
  > Loading gazetteer.
*** Linking resources loaded!



In [13]:
# Normalized Damerau-Levenshtein distance between the two lowercased spellings
# (damlev is pyxdameraulevenshtein.normalized_damerau_levenshtein_distance).
damlev('ROQUEMADOUR'.lower(),
       'Rocamadour'.lower())

0.27272728085517883

# Tests entity wikidata

In [2]:
import pandas as pd
from ast import literal_eval
import json
from collections import Counter
from argparse import ArgumentParser
import os


In [3]:
# Load the Wikidata gazetteer CSV from the T-Res checkout. os.path.join is used
# instead of `path + '/resources/...'` so the result has no doubled separator
# when `path` already ends with "/", and still works when it does not.
gaz = pd.read_csv(
    os.path.join(path, 'resources', 'wikidata', 'wikidata_gazetteer.csv'),
    low_memory=False,
)
# Distinct Wikidata ids present in the gazetteer.
gazetteer_ids = set(gaz.wikidata_id)
print(len(gazetteer_ids))

NameError: name 'path' is not defined

In [23]:
# Show five randomly sampled gazetteer rows (no random_state is passed, so the
# selection changes from run to run).
gaz.sample(5)

Unnamed: 0,wikidata_id,english_label,instance_of,alias_dict,nativelabel,hcounties,countries,latitude,longitude
373127,Q69802,Camorino,"['Q685309', 'Q123705']","{'en': ['Camorino TI', 'Camorino'], 'de': ['Ca...",,[],"{'Q39': ('', '')}",46.15775,9.01185
505889,Q627780,Bornich,['Q116457956'],"{'pt': ['Bornich'], 'pl': ['Bornich'], 'fr': [...",,[],"{'Q183': ('', '')}",50.128611,7.769167
913722,Q3950956,Battle of Qatwan,['Q178561'],"{'tr': ['Katvan Savaşı', 'Katvan Muharebesi'],...",,[],{},39.8,66.9
133480,Q536551,Hemme,['Q116457956'],"{'de': ['Hemmerwurth', 'Zennhusen', 'Hemme'], ...",,[],"{'Q183': ('', '')}",54.266667,9.016667
354798,Q18109849,Asilmetta,['Q123705'],"{'en': ['Asilmetta'], 'nl': ['Asilmetta']}",,[],"{'Q668': ('', '')}",17.7425,83.3169


In [25]:
def eval_with_exception(string):
    """Parse the text form of a Python literal, falling back to an empty list.

    Args:
        string: Text such as ``"['Q515', 'Q1549591']"``. A value that is
            already a list is returned unchanged, so applying this function
            a second time to an already-parsed column does not wipe it.

    Returns:
        The evaluated literal, or ``[]`` when the input cannot be parsed
        (for example a NaN float, an empty string or a truncated literal).
    """
    if isinstance(string, list):
        return string
    try:
        return literal_eval(string)
    # literal_eval reports malformed input with any of these three types.
    except (ValueError, TypeError, SyntaxError):
        return []


# Parse the stringified class lists, then gather every class id they contain.
gaz["instance_of"] = gaz["instance_of"].apply(eval_with_exception)

class_lists = gaz.loc[gaz["instance_of"].notnull(), "instance_of"]
instances_all = []
for class_list in class_lists:
    # Extending with an empty list adds nothing, so no emptiness test is needed.
    instances_all.extend(class_list)

# Occurrence count per class, and the set of distinct classes.
instance_counter = Counter(instances_all)
instances = set(instances_all)

print("\nSize of gazetteer:", len(gazetteer_ids))
print("Number of classes:", len(instances))

print("\nStart!")



Size of gazetteer: 987945
Number of classes: 15293

Start!


In [None]:

# Map every Wikidata id to the most frequent (per instance_counter) of its
# classes; ids without any class are left out of the mapping.
dict_id_to_class = dict()
# Iterating the two columns directly avoids building one Series per row, which
# is what iterrows() does and is slow on a gazetteer of this size.
for wikidata_id, entity_classes in zip(gaz["wikidata_id"], gaz["instance_of"]):
    # Get most common class:
    keep_most_common_class = ""
    top_class_relv = 0
    for ec in entity_classes:
        # Counter indexing yields 0 for an unknown class, whereas .get() would
        # yield None and make the comparison below raise TypeError.
        current_class_relv = instance_counter[ec]
        # Strict ">" keeps the first class encountered when counts are tied.
        if current_class_relv > top_class_relv:
            top_class_relv = current_class_relv
            keep_most_common_class = ec
    if keep_most_common_class:
        dict_id_to_class[wikidata_id] = keep_most_common_class

# Test ajout des alias

In [5]:
# Establish a connection to the Solr index served at
# http://localhost:8983/solr/frenchtapioca5.
from pysolr import Solr
solr = Solr('http://localhost:8983/solr/frenchtapioca5')

In [6]:
# Query Solr for the document whose id is Q90, then check that
# 'la ciutat de la llum' appears in the 'mention' field of the first hit.
results = solr.search('id:Q90')
'la ciutat de la llum' in results.docs[0]['mention']

True

In [12]:
# Re-read the gazetteer CSV from disk, rebinding `gaz` and `gazetteer_ids`.
# NOTE(review): any in-memory change made to `gaz` by an earlier cell (such as
# the parsed "instance_of" column) is discarded by this reload.
gaz = pd.read_csv(path + 'resources/wikidata/wikidata_gazetteer.csv', low_memory=False)
gazetteer_ids = set(gaz.wikidata_id)
print(len(gazetteer_ids))

987945


In [13]:
# List the column names of the gazetteer frame.
gaz.columns

Index(['wikidata_id', 'english_label', 'instance_of', 'alias_dict',
       'nativelabel', 'hcounties', 'countries', 'latitude', 'longitude'],
      dtype='object')

In [14]:
def get_list_labels_aliases(wd):
    """Collect the distinct labels and aliases of a Wikidata object.

    Args:
        wd: Object exposing ``_get_label(lang='all')`` and
            ``_get_aliases(lang='all')``. Both results are read through
            ``.values()``: each label value is taken as one name, each alias
            value as an iterable of names.

    Returns:
        A list of the unique names. They are de-duplicated through a set, so
        the order of the list is not meaningful.
    """
    # Start from the labels ...
    names = set(wd._get_label(lang='all').values())

    # ... then merge in every group of aliases.
    for alias_group in wd._get_aliases(lang='all').values():
        names.update(alias_group)

    return list(names)

In [17]:
import tqdm

In [24]:
# Maps each gazetteer wikidata_id to its list of known names.
gaz2 = dict()

# Function to dump the dict as a json file
def save_progress(gaz_dict):
    """Write gaz_dict as JSON to resources/wikidata/gaz2.json under `path`.

    The data is first written to a sibling ".tmp" file which then replaces the
    target, so interrupting the run in the middle of a checkpoint cannot leave
    a truncated gaz2.json behind.
    """
    out_file = path + 'resources/wikidata/gaz2.json'
    tmp_file = out_file + '.tmp'
    with open(tmp_file, 'w') as f:
        json.dump(gaz_dict, f)
    os.replace(tmp_file, out_file)

# Iterate over gaz dataframe
for i, row in tqdm.tqdm(gaz.iterrows(), total=len(gaz)):
    wikidata_id = row["wikidata_id"]
    doc = solr.search('id:' + wikidata_id).docs
    if len(doc) > 1:
        # Ambiguous id: report it and store no names. This must be part of the
        # same if/elif chain as the branches below, otherwise the empty list
        # is immediately overwritten by the first document's mentions.
        print("ERROR" + wikidata_id)
        aka_list = []
    elif len(doc) == 0:
        # Not present in Solr: fall back to the WikidataObject labels/aliases.
        wd = wdo.WikidataObject(wikidata_id)
        aka_list = get_list_labels_aliases(wd)
    else:
        aka_list = doc[0]['mention']

    gaz2[wikidata_id] = aka_list

    # Save progress every 1000 rows
    if (i + 1) % 1000 == 0:
        save_progress(gaz2)

# Save final progress
save_progress(gaz2)


  0%|          | 4571/987945 [02:21<18:02:18, 15.14it/s]

In [52]:
# Four lookup tables built from gaz2:
#   wikidata_to_aka[qid][aka]            = 1
#   wikidata_to_aka_normalized[qid][aka] = 1 / (number of akas of qid)
#   aka_to_wikidata[aka][qid]            = 1
#   aka_to_wikidata_normalized[aka][qid] = 1 / (number of akas of qid)
wikidata_to_aka = dict()
wikidata_to_aka_normalized = dict()
aka_to_wikidata = dict()
aka_to_wikidata_normalized = dict()

# The cell that fills gaz2 in this notebook creates a plain
# {wikidata_id: [aka, ...]} dict, which has no .iterrows(). Accept that form,
# and keep accepting a DataFrame with "wikidata_id" and "aka" columns.
if isinstance(gaz2, dict):
    qid_aka_pairs = gaz2.items()
else:
    qid_aka_pairs = zip(gaz2["wikidata_id"], gaz2["aka"])

for qid, akas in qid_aka_pairs:

    # Check for emptiness before len() so an empty or None entry is reported
    # and skipped instead of being measured first.
    if not akas:
        print("No aliases for", qid)
        continue

    n = len(akas)
    weight = 1 / n

    wikidata_to_aka[qid] = {aka: 1 for aka in akas}
    wikidata_to_aka_normalized[qid] = {aka: weight for aka in akas}

    for aka in akas:
        aka_to_wikidata.setdefault(aka, {})[qid] = 1
        aka_to_wikidata_normalized.setdefault(aka, {})[qid] = weight

        

In [54]:
# Number of Wikidata ids that received at least one alias.
len(wikidata_to_aka)

1000