# Starting

In [1]:
import os
import sys
import sqlite3
from pathlib import Path
import json

import pandas as pd
import WikidataObject as wdo

from t_res.geoparser import geode_pipe,ranking,linking
path = '/home/antoine/Documents/GitHub/T-Res/'
NER_path = path + 'resources/fr_spacy_custom_spancat_edda'

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Import VILLESFR

In [6]:
filepath = 'VILLESFR.json'
VILLESFR = pd.read_json(filepath, orient='records', lines=True)

 Quelle proportion des QIds "gold" de VILLESFR retrouve-t-on dans l'index utilisé par T-RES ? 

In [7]:
print(VILLESFR.sample(15)['head'].values[:])

['RETHEL' 'BRIGNOLES' 'AUTUN' 'Brie-Comte-Braine' 'Chateau-Salins' 'CONTY'
 'VOUGLE' 'BARBONNE' 'NIONS' 'SORGUE' 'NEUFCHATEL en Bray'
 "Chateau-d'Oleron" 'VILLEPINTE' 'ANGLURE' 'Dun-le-Roi']


In [8]:
def load_resources(method="mostpopular",
                   resources_path="../resources/"
                   ) :

    print("*** Loading the ranker resources.")

    # Load files
    files = {
        "mentions_to_wikidata": os.path.join(
            resources_path, "wikidata/mentions_to_wikidata_normalized.json"
        ),
        "wikidata_to_mentions": os.path.join(
            resources_path, "wikidata/wikidata_to_mentions_normalized.json"
        ),
    }

    with open(files["mentions_to_wikidata"], "r") as f:
        mentions_to_wikidata = json.load(f)

    with open(files["wikidata_to_mentions"], "r") as f:
        wikidata_to_mentions = json.load(f)

    # Filter mentions to remove noise:
    wikidata_to_mentions_filtered = dict()
    mentions_to_wikidata_filtered = dict()
    for wk in wikidata_to_mentions:
        wikipedia_mentions = wikidata_to_mentions.get(wk)
        wikipedia_mentions_stripped = dict(
            [
                (x, wikipedia_mentions[x])
                for x in wikipedia_mentions
                if not ", " in x and not " (" in x
            ]
        )

        if wikipedia_mentions_stripped:
            wikipedia_mentions = wikipedia_mentions_stripped

        wikidata_to_mentions_filtered[wk] = dict(
            [(x, wikipedia_mentions[x]) for x in wikipedia_mentions]
        )

        for m in wikidata_to_mentions_filtered[wk]:
            if m in mentions_to_wikidata_filtered:
                mentions_to_wikidata_filtered[m][
                    wk
                ] = wikidata_to_mentions_filtered[wk][m]
            else:
                mentions_to_wikidata_filtered[m] = {
                    wk: wikidata_to_mentions_filtered[wk][m]
                }

    mentions_to_wikidata = mentions_to_wikidata_filtered
    wikidata_to_mentions = wikidata_to_mentions_filtered

    del mentions_to_wikidata_filtered
    del wikidata_to_mentions_filtered

    # Parallelize if ranking method is one of the following:
    if method in ["partialmatch", "levenshtein"]:
        pandarallel.initialize(nb_workers=10)
        os.environ["TOKENIZERS_PARALLELISM"] = "true"

    return mentions_to_wikidata, wikidata_to_mentions

mentions_to_wikidata, wikidata_to_mentions = load_resources()

*** Loading the ranker resources.


In [9]:
df = VILLESFR.copy(deep=True)
df['related_mentions'] = None

for idx,row in df.iterrows():
    related_mentions = []
    # check if wikidata_to_mentions.get(row['gold']) doesn't yeld a keyerror :
    if wikidata_to_mentions.get(row['gold']) is None:
        df.at[idx,'related_mentions'] = None
        #print(row['head'])
        #print(WDO.WikidataObject(row['gold']))
        continue
    else:
        related_mentions = wikidata_to_mentions.get(row['gold'])
        #print(row['head'])
        #print(related_mentions)
        df.at[idx,'related_mentions'] = related_mentions

#count of None in related_mentions
df['related_mentions'].isnull().sum()
        

36

In [10]:
# Saint-Étienne-de-Carlat
mentions_to_wikidata.get('Saint-Étienne-de-Carlat')

In [11]:
wikidata_to_mentions.get('Q270192')

> 36 ressources de Wikidata ne sont pas présentes dans l'index ! simplement parce qu'elles n'ont pas de page wikipedia en anglais et donc leur ressource Wikidata n'est pas consultée

In [12]:
subVILLESFR = df[df['related_mentions'].notnull()]
subVILLESFR.shape

(790, 5)

# DeezyMatch + REL

In [2]:
# --------------------------------------
# Instantiate the ranker:
myranker = ranking.Ranker(
    method="deezymatch",
    resources_path= path + "resources/",
    strvar_parameters={"overwrite_dataset": False,},
    deezy_parameters={
        # Paths and filenames of DeezyMatch models and data:
        "dm_path": str(Path( path + "resources/deezymatch/").resolve()),
        "dm_cands": "wkdtalts",
        "dm_model": "w2v_ocr",
        "dm_output": "deezymatch_on_the_fly",
        # Ranking measures:
        "ranking_metric": "faiss",
        "selection_threshold": 50,
        "num_candidates": 10,
        "verbose": False,
        # DeezyMatch training:
        "overwrite_training": False,
        "do_test": False,
    },
)

# --------------------------------------
# Instantiate the Linker:
with sqlite3.connect( path + "resources/rel_db/embeddings_database.db") as conn:
    cursor = conn.cursor()
    mylinker = linking.Linker(
        method="reldisamb",
        resources_path= path + "resources/",
        rel_params={
            "model_path": path + "resources/models/disambiguation/",
            "data_path":  path + "experiments/outputs/data/lwm/",
            "training_split": "originalsplit",
            "db_embeddings": cursor,
            "with_publication": False,
            "without_microtoponyms": True,
            "do_test": False,
            "default_publname": "",
            "default_publwqid": "",
        },
        overwrite_training=False,
    )

In [3]:
geoparser = geode_pipe.Pipeline(geodeNERpath=NER_path,
                              myranker=myranker,
                              mylinker=mylinker)

*** Loading the ranker resources.
The DeezyMatch model is already trained!
*** Load linking resources.
  > Loading mentions to wikidata mapping.
  > Loading gazetteer.
*** Linking resources loaded!

Model path: /home/antoine/Documents/GitHub/T-Res/resources/models/disambiguation/deezymatch+10+50_originalsplit+wmtops/model eval
Loading model from given path: /home/antoine/Documents/GitHub/T-Res/resources/models/disambiguation/deezymatch+10+50_originalsplit+wmtops/model
{'mode': 'eval', 'model_path': '/home/antoine/Documents/GitHub/T-Res/resources/models/disambiguation/deezymatch+10+50_originalsplit+wmtops/model', 'prerank_ctx_window': 50, 'keep_p_e_m': 4, 'keep_ctx_ent': 3, 'ctx_window': 100, 'tok_top_n': 25, 'mulrel_type': 'ment-norm', 'n_rels': 3, 'hid_dims': 100, 'emb_dims': 300, 'snd_local_ctx_window': 6, 'dropout_rate': 0.3, 'n_epochs': 1000, 'dev_f1_change_lr': 0.915, 'n_not_inc': 10, 'eval_after_n_epochs': 5, 'learning_rate': 0.0001, 'margin': 0.01, 'df': 0.5, 'n_loops': 10, 'n_c

In [7]:
import time
start = time.time()

sentence = "* ALBI, (Géog.) ville de France, capitale de l'Albigeois, dans le haut Languedoc : elle est sur le Tarn. Long. 19. 49. lat. 43. 55. 44."
sentence.find('ALBI')
resolved = geoparser.run_sentence(sentence, HEAD='ALBI', verbose=False)
for r in resolved:
    print(json.dumps(r, indent=2))

print(time.time() - start)

{
  "mention": "ALBI",
  "ner_score": 1,
  "pos": 2,
  "sent_idx": 0,
  "end_pos": 5,
  "tag": "B-LOC",
  "sentence": "* ALBI, (G\u00e9og.) ville de France, capitale de l'Albigeois, dans le haut Languedoc : elle est sur le Tarn. Long. 19. 49. lat. 43. 55. 44.",
  "prediction": "Q132801",
  "ed_score": 0.006,
  "cross_cand_score": {
    "Q132801": 0.181,
    "Q700379": 0.168,
    "Q53749": 0.164,
    "Q951428": 0.163,
    "Q284698": 0.162,
    "Q94744": 0.162
  },
  "string_match_score": {
    "Albi": [
      1.0,
      [
        "Q94744",
        "Q132801",
        "Q951428",
        "Q53749",
        "Q284698",
        "Q700379"
      ]
    ]
  },
  "prior_cand_score": {
    "Q132801": 0.882,
    "Q53749": 0.467,
    "Q951428": 0.324,
    "Q700379": 0.317,
    "Q284698": 0.265,
    "Q94744": 0.244
  },
  "latlon": [
    43.928056,
    2.145833
  ],
  "wkdt_class": "Q484170"
}
{
  "mention": "France",
  "ner_score": 1,
  "pos": 25,
  "sent_idx": 0,
  "end_pos": 31,
  "tag": "B-LOC",
  

In [None]:
import pandas as pd
import WikidataObject as wdo

sample2 = VILLESFR.copy(deep=True)
verbose = False

if 'resolved' not in sample2.columns:
    sample2['resolved'] = None

for i, row in sample2.iterrows():
    resolved = geoparser.run_sentence(row['fullcontent'], HEAD=row['head'])

    skyline = row['gold'] in resolved[0]['cross_cand_score'].keys()
    best_pred = resolved[0]['prediction']

    if best_pred[0] != 'Q':
        acc10 = False
    else:
        wd_pred = wdo.WikidataObject(best_pred, coordinates=resolved[0]['latlon'])
        acc10 = wd_pred._distance_to(row['gold']) <= 10

    sample2.at[i, 'resolved'] = resolved[0]
    sample2.at[i, 'skyline'] = skyline
    sample2.at[i, 'bestPred'] = best_pred
    sample2.at[i, 'acc10'] = acc10

    if verbose:
        print(f"Head: {row['head']}")
        print(f"Gold: {row['gold']}")
        print(f"Prediction: {best_pred}")
        print(f"Skyline: {skyline}")
        print(resolved[0]['cross_cand_score'])
        print(f"Accuracy 10: {acc10}\n")


### perf

In [None]:
sample2.to_json('t_res_results_140424.json', orient='records', lines=True)

In [9]:
sample2 = pd.read_json('t_res_deezy+rel_140424.json', orient='records', lines=True)   

In [10]:

print(sample2.skyline.value_counts(normalize=True), '\n')
print(sample2.acc10.value_counts(normalize=True), '\n')

True     0.617433
False    0.382567
Name: skyline, dtype: float64 

False    0.553269
True     0.446731
Name: acc10, dtype: float64 



# Perfect match

In [5]:
myranker = ranking.Ranker(
    method="perfectmatch",
    resources_path="../resources/",
)

In [6]:
mylinker = linking.Linker(
    method="mostpopular",
    resources_path="../resources/",
)

In [7]:
geoparser = geode_pipe.Pipeline(geodeNERpath=NER_path,
                              myranker=myranker,
                              mylinker=mylinker)

*** Loading the ranker resources.
*** Load linking resources.
  > Loading mentions to wikidata mapping.
  > Loading gazetteer.
*** Linking resources loaded!



In [8]:
sentence = "* ALBI, (Géog.) ville dans le haut Languedoc : elle est sur le Tarn. Long. 19. 49. lat. 43. 55. 44."
resolved = geoparser.run_sentence(sentence, HEAD='ALBI', verbose=False)
for r in resolved:
    print(json.dumps(r, indent=2))

{
  "mention": "ALBI",
  "ner_score": 1,
  "pos": 2,
  "sent_idx": 0,
  "end_pos": 5,
  "tag": "B-LOC",
  "sentence": "* ALBI, (G\u00e9og.) ville dans le haut Languedoc : elle est sur le Tarn. Long. 19. 49. lat. 43. 55. 44.",
  "prediction": "Q132801",
  "ed_score": 0.921,
  "string_match_score": {
    "Albi": [
      1.0,
      [
        "Q94744",
        "Q132801",
        "Q951428",
        "Q53749",
        "Q284698",
        "Q700379"
      ]
    ]
  },
  "prior_cand_score": {},
  "cross_cand_score": {
    "Q132801": 0.921,
    "Q53749": 0.036,
    "Q951428": 0.028,
    "Q94744": 0.005,
    "Q284698": 0.005,
    "Q700379": 0.005
  },
  "latlon": [
    43.928056,
    2.145833
  ],
  "wkdt_class": "Q484170"
}
{
  "mention": "Tarn",
  "ner_score": 1,
  "pos": 63,
  "sent_idx": 0,
  "end_pos": 67,
  "tag": "B-LOC",
  "sentence": "* ALBI, (G\u00e9og.) ville dans le haut Languedoc : elle est sur le Tarn. Long. 19. 49. lat. 43. 55. 44.",
  "prediction": "Q12772",
  "ed_score": 0.856,
  "

In [16]:
sample3 = VILLESFR.copy(deep=True)
verbose = False

if 'resolved' not in sample3.columns:
    sample3['resolved'] = None

for i, row in sample3.iterrows():
    resolved = geoparser.run_sentence(row['fullcontent'], HEAD=row['head'])

    skyline = row['gold'] in resolved[0]['cross_cand_score'].keys()
    best_pred = resolved[0]['prediction']

    if best_pred[0] != 'Q':
        acc10 = False
    else:
        wd_pred = wdo.WikidataObject(best_pred, coordinates=resolved[0]['latlon'])
        acc10 = wd_pred._distance_to(row['gold']) <= 10

    sample3.at[i, 'resolved'] = resolved[0]
    sample3.at[i, 'skyline'] = skyline
    sample3.at[i, 'bestPred'] = best_pred
    sample3.at[i, 'acc10'] = acc10

    if verbose:
        print(f"Head: {row['head']}")
        print(f"Gold: {row['gold']}")
        print(f"Prediction: {best_pred}")
        print(f"Skyline: {skyline}")
        print(resolved[0]['cross_cand_score'])
        print(f"Accuracy 10: {acc10}\n")


In [17]:
sample3.to_json('t_res_perfect+mostpop_170424.json', orient='records', lines=True)

# Partial match - containement / levenshtein

In [13]:
myranker = ranking.Ranker(
    method="levenshtein",
    resources_path="../resources/",
)

In [19]:
mylinker = linking.Linker(
    method="mostpopular",
    resources_path="../resources/",
)

In [15]:
# Instantiate the Linker:
with sqlite3.connect( path + "resources/rel_db/embeddings_database.db") as conn:
    cursor = conn.cursor()
    mylinker = linking.Linker(
        method="reldisamb",
        resources_path= path + "resources/",
        rel_params={
            "model_path": path + "resources/models/disambiguation/",
            "data_path":  path + "experiments/outputs/data/lwm/",
            "training_split": "originalsplit",
            "db_embeddings": cursor,
            "with_publication": False,
            "without_microtoponyms": True,
            "do_test": False,
            "default_publname": "",
            "default_publwqid": "",
        },
        overwrite_training=False,
    )

In [16]:
geoparser = geode_pipe.Pipeline(geodeNERpath=NER_path,
                              myranker=myranker,
                              mylinker=mylinker)

*** Loading the ranker resources.
INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
*** Load linking resources.
  > Loading mentions to wikidata mapping.
  > Loading gazetteer.
*** Linking resources loaded!

The entity disambiguation model does not exist or overwrite_training is set to True.
Creating the dataset.


KeyboardInterrupt: 

In [None]:
import time
start = time.time()

sentence = "* ALBI, (Géog.) capitale de l'Albigeois, dans le haut Languedoc : elle est sur le Tarn. Long. 19. 49. lat. 43. 55. 44."
resolved = geoparser.run_sentence(sentence, HEAD='ALBI', verbose=False)
for r in resolved:
    print(json.dumps(r, indent=2))

print(time.time() - start)

{
  "mention": "ALBI",
  "ner_score": 1,
  "pos": 2,
  "sent_idx": 0,
  "end_pos": 5,
  "tag": "B-LOC",
  "sentence": "* ALBI, (G\u00e9og.) capitale de l'Albigeois, dans le haut Languedoc : elle est sur le Tarn. Long. 19. 49. lat. 43. 55. 44.",
  "prediction": "Q132801",
  "ed_score": 0.921,
  "string_match_score": {
    "Albi": [
      1.0,
      [
        "Q94744",
        "Q132801",
        "Q951428",
        "Q53749",
        "Q284698",
        "Q700379"
      ]
    ]
  },
  "prior_cand_score": {},
  "cross_cand_score": {
    "Q132801": 0.921,
    "Q53749": 0.036,
    "Q951428": 0.028,
    "Q94744": 0.005,
    "Q284698": 0.005,
    "Q700379": 0.005
  },
  "latlon": [
    43.928056,
    2.145833
  ],
  "wkdt_class": "Q484170"
}
{
  "mention": "Albigeois",
  "ner_score": 1,
  "pos": 30,
  "sent_idx": 0,
  "end_pos": 39,
  "tag": "B-LOC",
  "sentence": "* ALBI, (G\u00e9og.) capitale de l'Albigeois, dans le haut Languedoc : elle est sur le Tarn. Long. 19. 49. lat. 43. 55. 44.",
  "pred

In [None]:
sample4 = subVILLESFR.copy(deep=True)
verbose = False

if 'resolved' not in sample4.columns:
    sample4['resolved'] = None

for i, row in sample4.iterrows():
    resolved = geoparser.run_sentence(row['fullcontent'], HEAD=row['head'])

    skyline = row['gold'] in resolved[0]['cross_cand_score'].keys()
    best_pred = resolved[0]['prediction']

    if best_pred[0] != 'Q':
        acc10 = False
    else:
        wd_pred = wdo.WikidataObject(best_pred, coordinates=resolved[0]['latlon'])
        acc10 = wd_pred._distance_to(row['gold']) <= 10

    sample4.at[i, 'resolved'] = resolved[0]
    sample4.at[i, 'skyline'] = skyline
    sample4.at[i, 'bestPred'] = best_pred
    sample4.at[i, 'acc10'] = acc10

    if verbose:
        print(f"Head: {row['head']}")
        print(f"Gold: {row['gold']}")
        print(f"Prediction: {best_pred}")
        print(f"Skyline: {skyline}")
        print(resolved[0]['cross_cand_score'])
        print(f"Accuracy 10: {acc10}\n")


Head: ROQUEMADOUR
Gold: Q382628
Prediction: Q7271216
Skyline: False
{'Q7271216': 0.605, 'Q969251': 0.395}
Accuracy 10: False

Head: LISIEUX
Gold: Q188743
Prediction: Q188743
Skyline: True
{'Q188743': 0.9, 'Q927533': 0.037, 'Q1332597': 0.033, 'Q701531': 0.012, 'Q14874972': 0.012, 'Q810098': 0.004}
Accuracy 10: True

Head: NEUVILLER
Gold: Q21355
Prediction: Q195340
Skyline: False
{'Q195340': 1.0}
Accuracy 10: False

Head: CHAVANNES
Gold: Q845009
Prediction: Q740710
Skyline: False
{'Q740710': 0.5, 'Q455485': 0.5}
Accuracy 10: False

Head: MONTEREAU-FAUT-YONNE
Gold: Q242191
Prediction: Q242191
Skyline: True
{'Q242191': 0.603, 'Q1470564': 0.247, 'Q976655': 0.11, 'Q2117756': 0.041}
Accuracy 10: True

Head: COURMONTERAL
Gold: Q196035
Prediction: Q5740928
Skyline: False
{'Q5740928': 1.0}
Accuracy 10: False

Head: BOURGES
Gold: Q132404
Prediction: Q132404
Skyline: True
{'Q132404': 0.986, 'Q207985': 0.009, 'Q660934': 0.002, 'Q700723': 0.002, 'Q60791320': 0.001}
Accuracy 10: True

Head: Treves
Go

In [None]:
sample4.to_json('t_res_damlev+REL_220424.json', orient='records', lines=True)

In [None]:
sample4.resolved.apply(lambda x: len(x['cross_cand_score'])).describe()

In [None]:
print('Skyline : \n', sample4.skyline.value_counts(normalize=True), '\n')
print('Acc10 : \n', sample4.acc10.value_counts(normalize=True), '\n')

# Comparaison

In [25]:
df = pd.read_json('t_res_damlev+mostpop_170424.json', orient='records', lines=True)

### perfect-

In [26]:
df = df[df.index.isin(subVILLESFR.index)]
df.shape

(790, 8)

In [27]:
df.resolved.apply(lambda x: len(x['cross_cand_score'])).describe()

count    790.000000
mean       2.744304
std        2.096031
min        1.000000
25%        1.000000
50%        2.000000
75%        4.000000
max        7.000000
Name: resolved, dtype: float64

In [28]:
print('Skyline : \n', df.skyline.value_counts(normalize=True), '\n')
print('Acc10 : \n', df.acc10.value_counts(normalize=True), '\n')

Skyline : 
 True     0.720253
False    0.279747
Name: skyline, dtype: float64 

Acc10 : 
 True     0.662025
False    0.337975
Name: acc10, dtype: float64 



### containment

In [11]:
partial_mostpop = pd.read_json('t_res_partial+mostpop_140424.json', orient='records', lines=True)
partial_mostpop = partial_mostpop[partial_mostpop.index.isin(subVILLESFR.index)]
print(partial_mostpop.shape, perfect_mostpop.shape, partial_mostpop.shape)

(826, 8) (826, 8) (826, 8)


In [12]:
# keep only indexes from subVILLESFR
deezy_rel = deezy_rel[deezy_rel.index.isin(subVILLESFR.index)]


print(deezy_rel.shape, perfect_mostpop.shape, partial_mostpop.shape)

(790, 8) (790, 8) (790, 8)


In [21]:
partial_mostpop.resolved.apply(lambda x: len(x['cross_cand_score'])).describe()

count    790.000000
mean       3.051899
std        2.183842
min        1.000000
25%        1.000000
50%        2.000000
75%        4.000000
max        7.000000
Name: resolved, dtype: float64

In [14]:
print('Skyline : \n', partial_mostpop.skyline.value_counts(normalize=True), '\n')
print('Acc10 : \n', partial_mostpop.acc10.value_counts(normalize=True), '\n')

Skyline : 
 True     0.621519
False    0.378481
Name: skyline, dtype: float64 

Acc10 : 
 True     0.556962
False    0.443038
Name: acc10, dtype: float64 



### deezy

In [None]:
deezy_rel = pd.read_json('t_res_deezy+rel_140424.json', orient='records', lines=True)

In [20]:
deezy_rel.resolved.apply(lambda x: len(x['cross_cand_score'])).describe()

count    790.000000
mean       5.808861
std        2.132534
min        0.000000
25%        6.000000
50%        7.000000
75%        7.000000
max        7.000000
Name: resolved, dtype: float64

In [15]:
print('Skyline : \n', deezy_rel.skyline.value_counts(normalize=True), '\n')
print('Acc10 : \n', deezy_rel.acc10.value_counts(normalize=True), '\n')

Skyline : 
 True     0.64557
False    0.35443
Name: skyline, dtype: float64 

Acc10 : 
 False    0.54557
True     0.45443
Name: acc10, dtype: float64 

