# T-RES using DeezyMatch with REL disambiguation

This notebook runs the T-Res pipeline with DeezyMatch candidate ranking and REL disambiguation, **filtering out microtoponyms** and without adding the place of publication.

In [1]:
import os
import sys
import sqlite3
from pathlib import Path

from t_res.geoparser import pipeline, ranking, linking

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Root directory of the local T-Res checkout. Set the T_RES_PATH environment
# variable to point the notebook at a different checkout; otherwise the
# original location is used. `os.path.join(..., "")` guarantees a trailing
# separator, because the cells below build paths with `path + "resources/"`.
path = os.path.join(
    os.environ.get("T_RES_PATH") or "/home/antoine/Documents/GitHub/T-Res/",
    "",
)

In [12]:
# --------------------------------------
# Candidate ranker: DeezyMatch.
# The settings are collected in named dictionaries first so that the
# constructor call itself stays short.
string_variation_settings = {"overwrite_dataset": False}

deezymatch_dir = Path(path + "resources/deezymatch/").resolve()
deezymatch_settings = {
    # Where the DeezyMatch models and data live, and what they are called:
    "dm_path": str(deezymatch_dir),
    "dm_cands": "wkdtalts",
    "dm_model": "w2v_ocr",
    "dm_output": "deezymatch_on_the_fly",
    # Candidate ranking options:
    "ranking_metric": "faiss",
    "selection_threshold": 50,
    "num_candidates": 10,
    "verbose": False,
    # DeezyMatch training options:
    "overwrite_training": False,
    "do_test": False,
}

myranker = ranking.Ranker(
    method="deezymatch",
    resources_path=path + "resources/",
    strvar_parameters=string_variation_settings,
    deezy_parameters=deezymatch_settings,
)

In [9]:
# Entity linker: REL disambiguation, backed by the embeddings database.
# NOTE: sqlite3's connection context manager only commits or rolls back the
# transaction on exit; it does not close the connection. That is why `cursor`
# is still usable by the linker in later cells, after this block has ended.
embeddings_db_file = path + "resources/rel_db/embeddings_database.db"
with sqlite3.connect(embeddings_db_file) as conn:
    cursor = conn.cursor()
    rel_settings = {
        "model_path": path + "resources/models/disambiguation/",
        "data_path": path + "experiments/outputs/data/lwm/",
        "training_split": "originalsplit",
        "db_embeddings": cursor,
        "with_publication": False,
        "without_microtoponyms": True,
        "do_test": False,
        "default_publname": "",
        "default_publwqid": "",
    }
    mylinker = linking.Linker(
        method="reldisamb",
        resources_path=path + "resources/",
        rel_params=rel_settings,
        overwrite_training=False,
    )

In [10]:
# Assemble the geoparsing pipeline from the ranker and the linker.
geoparser = pipeline.Pipeline(
    myranker=myranker,
    mylinker=mylinker,
)

*** Creating and loading a NER pipeline.
*** Loading the ranker resources.
The DeezyMatch model is already trained!
*** Load linking resources.
  > Loading mentions to wikidata mapping.
  > Loading gazetteer.
*** Linking resources loaded!

Model path: /home/antoine/Documents/GitHub/T-Res/resources/models/disambiguation/deezymatch+10+50_originalsplit+wmtops/model eval
Loading model from given path: /home/antoine/Documents/GitHub/T-Res/resources/models/disambiguation/deezymatch+10+50_originalsplit+wmtops/model
{'mode': 'eval', 'model_path': '/home/antoine/Documents/GitHub/T-Res/resources/models/disambiguation/deezymatch+10+50_originalsplit+wmtops/model', 'prerank_ctx_window': 50, 'keep_p_e_m': 4, 'keep_ctx_ent': 3, 'ctx_window': 100, 'tok_top_n': 25, 'mulrel_type': 'ment-norm', 'n_rels': 3, 'hid_dims': 100, 'emb_dims': 300, 'snd_local_ctx_window': 6, 'dropout_rate': 0.3, 'n_epochs': 1000, 'dev_f1_change_lr': 0.915, 'n_not_inc': 10, 'eval_after_n_epochs': 5, 'learning_rate': 0.0001, 'marg

In [6]:
import json

In [7]:
# Multi-sentence sample input for the pipeline.
sample_text = "A remarkable case of rattening has just occurred in the building trade next to the Market-street of Shefrield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!"
resolved = geoparser.run_text(sample_text)

# Show every returned entry as indented JSON, each followed by a blank line.
for entry in resolved:
    print(json.dumps(entry, indent=2), end="\n\n")

                                                                                                

  0%|          | 0/1 [00:00<?, ?it/s]

{
  "mention": "Market-street",
  "ner_score": 0.999,
  "pos": 83,
  "sent_idx": 0,
  "end_pos": 96,
  "tag": "STREET",
  "sentence": "A remarkable case of rattening has just occurred in the building trade next to the Market-street of Shefrield, but also in Lancaster.",
  "prediction": "NIL",
  "ed_score": 0.0,
  "cross_cand_score": {},
  "string_match_score": {},
  "prior_cand_score": {},
  "latlon": null,
  "wkdt_class": null
}

{
  "mention": "Shefrield",
  "ner_score": 1.0,
  "pos": 100,
  "sent_idx": 0,
  "end_pos": 109,
  "tag": "LOC",
  "sentence": "A remarkable case of rattening has just occurred in the building trade next to the Market-street of Shefrield, but also in Lancaster.",
  "prediction": "Q42448",
  "ed_score": 0.005,
  "cross_cand_score": {
    "Q42448": 0.664,
    "Q1862179": 0.144,
    "Q5953687": 0.071,
    "Q3461338": 0.066,
    "Q7492568": 0.028,
    "Q7492567": 0.028,
    "Q665346": 0.0
  },
  "string_match_score": {
    "Sherfield": [
      0.859,
      [
    

In [8]:
# Single-sentence variant: run the pipeline on one sentence and print each
# returned entry on its own line.
sample_sentence = "A remarkable case of rattening has just occurred in the building trade at Sheffield."
resolved = geoparser.run_sentence(sample_sentence)
for entry in resolved:
    print(entry)

{'mention': 'Sheffield', 'ner_score': 1.0, 'pos': 74, 'sent_idx': 0, 'end_pos': 83, 'tag': 'LOC', 'sentence': 'A remarkable case of rattening has just occurred in the building trade at Sheffield.', 'prediction': 'Q42448', 'ed_score': 0.006, 'cross_cand_score': {'Q42448': 0.437, 'Q7492778': 0.174, 'Q79568': 0.133, 'Q1862179': 0.116, 'Q7492567': 0.079, 'Q7492568': 0.061, 'Q2277715': 0.0}, 'string_match_score': {'Sheffield': (1.0, ['Q6707254', 'Q823917', 'Q5953687', 'Q7492778', 'Q1421317', 'Q7492594', 'Q897533', 'Q42448', 'Q7492565', 'Q1862179', 'Q4834926', 'Q17643392', 'Q7492570', 'Q1950928', 'Q2277715', 'Q79568', 'Q518864', 'Q7492591', 'Q2306176', 'Q7492775', 'Q741640', 'Q7492686', 'Q3577611', 'Q12956644', 'Q547824', 'Q7684835', 'Q3365926', 'Q7492719', 'Q7492566', 'Q7492567', 'Q4523493', 'Q3028626', 'Q7492607', 'Q7492568', 'Q1984238', 'Q1184547', 'Q925542', 'Q4664093', 'Q2892594', 'Q1916592', 'Q371969', 'Q1141915', 'Q6986914', 'Q7114883', 'Q1915446', 'Q5224096', 'Q7492766', 'Q15277074',