# T-Res: DeezyMatch candidate ranking with the `mostpopular` linker

In [1]:
import os
import sys
from pathlib import Path

from t_res.geoparser import pipeline, ranking, linking

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# --------------------------------------
# Instantiate the ranker (DeezyMatch method).

# Settings used to create the string pair dataset.
strvar_settings = {
    "ocr_threshold": 60,
    "top_threshold": 85,
    "min_len": 5,
    "max_len": 15,
    # resolve() makes the relative path absolute before it is handed over.
    "w2v_ocr_path": str(Path("../resources/models/w2v/").resolve()),
    "w2v_ocr_model": "w2v_*_news",
    "overwrite_dataset": False,
}

# DeezyMatch settings, grouped by purpose.
deezy_settings = {
    # -- paths and filenames of DeezyMatch models and data
    "dm_path": str(Path("../resources/deezymatch/").resolve()),
    "dm_cands": "wkdtalts",
    "dm_model": "w2v_ocr",
    "dm_output": "deezymatch_on_the_fly",
    # -- ranking measures
    "ranking_metric": "faiss",
    "selection_threshold": 50,
    "num_candidates": 1,
    "verbose": False,
    # -- DeezyMatch training
    "overwrite_training": False,
    "do_test": False,
}

myranker = ranking.Ranker(
    method="deezymatch",
    resources_path="../resources/",
    strvar_parameters=strvar_settings,
    deezy_parameters=deezy_settings,
)

In [3]:
# --------------------------------------
# Instantiate the linker ("mostpopular" method):
mylinker = linking.Linker(method="mostpopular", resources_path="../resources/")

In [4]:
# Assemble the geoparsing pipeline from the ranker and linker built above.
# NOTE: in the recorded output of this cell, generating the candidate vectors
# took roughly 619 seconds, so expect this step to be slow on a first run.
geoparser = pipeline.Pipeline(
    myranker=myranker,
    mylinker=mylinker,
)

*** Creating and loading a NER pipeline.
*** Loading the ranker resources.
The DeezyMatch model is already trained!
[92m2024-04-03 11:40:28[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mread input file: /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/models/w2v_ocr/input_dfm.yaml[0m
[92m2024-04-03 11:40:28[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;32mpytorch will use: cpu[0m
[92m2024-04-03 11:40:28[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mread CSV file: /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/data/wkdtalts.txt[0m
[92m2024-04-03 11:40:33[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;32mnumber of labels, True: 1639965 and False: 0[0m
[92m2024-04-03 11:40:49[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mskipping 2 lines[0m


                                                                         

[92m2024-04-03 11:40:57[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32msave test-data-class: /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/candidate_vectors/wkdtalts_w2v_ocr/dataframe.df[0m


  0%|          | 0/51249 [00:00<?, ?it/s]

--- 619.2517399787903 seconds ---
Generate candidate vectors: 619.3354473114014
[92m2024-04-03 11:50:47[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mread input file: /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/candidate_vectors/wkdtalts_w2v_ocr/input_dfm.yaml[0m
[92m2024-04-03 11:50:47[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;32mpytorch will use: cpu[0m


-- Combine vectors
Reading vectors from /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/candidate_vectors/wkdtalts_w2v_ocr/embeddings/rnn_fwd*
0000000 /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/candidate_vectors/wkdtalts_w2v_ocr/embeddings/rnn_fwd_0
0000100 /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/candidate_vectors/wkdtalts_w2v_ocr/embeddings/rnn_fwd_100
0000200 /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/candidate_vectors/wkdtalts_w2v_ocr/embeddings/rnn_fwd_200
0000300 /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/candidate_vector

In [5]:
# Run the pipeline on a single example sentence and print every returned
# item on its own line.
sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield."
resolved = geoparser.run_sentence(sentence)
for resolved_mention in resolved:
    print(resolved_mention)

{'mention': 'Sheffield', 'ner_score': 1.0, 'pos': 74, 'sent_idx': 0, 'end_pos': 83, 'tag': 'LOC', 'sentence': 'A remarkable case of rattening has just occurred in the building trade at Sheffield.', 'prediction': 'Q42448', 'ed_score': 0.896, 'string_match_score': {'Sheffield': (1.0, ['Q6707254', 'Q823917', 'Q5953687', 'Q7492778', 'Q1421317', 'Q7492594', 'Q897533', 'Q42448', 'Q7492565', 'Q1862179', 'Q4834926', 'Q17643392', 'Q7492570', 'Q1950928', 'Q2277715', 'Q79568', 'Q518864', 'Q7492591', 'Q2306176', 'Q7492775', 'Q741640', 'Q7492686', 'Q3577611', 'Q12956644', 'Q547824', 'Q7684835', 'Q3365926', 'Q7492719', 'Q7492566', 'Q7492567', 'Q4523493', 'Q3028626', 'Q7492607', 'Q7492568', 'Q1984238', 'Q1184547', 'Q925542', 'Q4664093', 'Q2892594', 'Q1916592', 'Q371969', 'Q1141915', 'Q6986914', 'Q7114883', 'Q1915446', 'Q5224096', 'Q7492766', 'Q15277074', 'Q4065168', 'Q1548891', 'Q7492772', 'Q977409', 'Q1752117', 'Q7492586', 'Q5035049', 'Q108940076'])}, 'prior_cand_score': {}, 'cross_cand_score': {'Q4