# T-RES using DeezyMatch with REL disambiguation

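This notebook runs the T-Res pipeline with a DeezyMatch candidate ranker and REL-based disambiguation (`reldisamb`), without filtering out microtoponyms and without adding the place of publication.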

In [1]:
import os
import sys
import sqlite3
from pathlib import Path

from t_res.geoparser import pipeline, ranking, linking

In [4]:
# --------------------------------------
# Candidate ranker (DeezyMatch).
#
# The two parameter groups are built first so that the Ranker call itself
# stays short and each group can be inspected on its own.

# Settings used to create the string-pair dataset:
ranker_strvar_params = {
    "ocr_threshold": 60,
    "top_threshold": 85,
    "min_len": 5,
    "max_len": 15,
    "w2v_ocr_path": str(Path("../resources/models/w2v/").resolve()),
    "w2v_ocr_model": "w2v_*_news",
    "overwrite_dataset": False,
}

ranker_deezy_params = {
    # Where the DeezyMatch models and data live, and how they are named:
    "dm_path": str(Path("../resources/deezymatch/").resolve()),
    "dm_cands": "wkdtalts",
    "dm_model": "w2v_ocr",
    "dm_output": "deezymatch_on_the_fly",
    # Candidate ranking settings:
    "ranking_metric": "faiss",
    "selection_threshold": 25,
    "num_candidates": 3,
    "search_size": 3,
    "verbose": False,
    # DeezyMatch training settings:
    "overwrite_training": False,
    "do_test": False,
}

myranker = ranking.Ranker(
    method="deezymatch",
    resources_path="../resources/",
    mentions_to_wikidata={},
    wikidata_to_mentions={},
    strvar_parameters=ranker_strvar_params,
    deezy_parameters=ranker_deezy_params,
)

In [5]:
# Entity linker (REL disambiguation), backed by the SQLite embeddings database.
#
# NOTE: sqlite3's connection context manager only commits (or rolls back) the
# pending transaction when the block exits; it does not close the connection.
# That is why the cursor handed to the linker is still usable in later cells.
with sqlite3.connect("../resources/rel_db/embeddings_database.db") as conn:
    cursor = conn.cursor()

    rel_disamb_params = {
        "model_path": "../resources/models/disambiguation/",
        "data_path": "../experiments/outputs/data/lwm/",
        "training_split": "originalsplit",
        "context_length": 100,
        "db_embeddings": cursor,
        # Both switches are off, matching the scenario described at the top
        # of the notebook (no place of publication, microtoponyms kept).
        "with_publication": False,
        "without_microtoponyms": False,
        "do_test": False,
        "default_publname": "",
        "default_publwqid": "",
    }

    mylinker = linking.Linker(
        method="reldisamb",
        resources_path="../resources/",
        linking_resources={},
        rel_params=rel_disamb_params,
        overwrite_training=False,
    )

In [6]:
# Assemble the full geoparsing pipeline from the ranker and linker above.
# This step can be slow: in the recorded run below it trained the DeezyMatch
# model on CPU for over an hour (presumably because no trained model existed).
geoparser = pipeline.Pipeline(
    myranker=myranker,
    mylinker=mylinker,
)

*** Creating and loading a NER pipeline.
*** Loading the ranker resources.
The string match dataset already exists!
[92m2024-03-28 16:45:21[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mread input file: /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/inputs/input_dfm.yaml[0m
[92m2024-03-28 16:45:21[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;32mpytorch will use: cpu[0m
[92m2024-03-28 16:45:21[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mread CSV file: /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/data/w2v_ocr_pairs.txt[0m
[92m2024-03-28 16:45:25[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;32mnumber of labels, True: 610031 and False: 475483[0m
[92m2024-03-28 16:45:25[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mSplitting the Dataset[0m
[92m2024-03-28 16:45:25[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mfinish splitting the Dataset. User time: 0.3364291191101074[0m
[92m2024-03-28 16:45:25[0m [95mantoin

                                                                                                          




[92m2024-03-28 16:45:42[0m [95mantoine-liris[0m [1m[90m[INFO][0m [95m******************************[0m
[92m2024-03-28 16:45:42[0m [95mantoine-liris[0m [1m[90m[INFO][0m [95m**** (Bi-directional) GRU ****[0m
[92m2024-03-28 16:45:42[0m [95mantoine-liris[0m [1m[90m[INFO][0m [95m******************************[0m
[92m2024-03-28 16:45:42[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mread inputs[0m
[92m2024-03-28 16:45:42[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mcreate a two_parallel_rnns model[0m
[92m2024-03-28 16:45:42[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;32mstart fitting parameters[0m
[92m2024-03-28 16:45:42[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mNumber of batches: 28834[0m
[92m2024-03-28 16:45:42[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mNumber of epochs: 5[0m


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/28834 [00:00<?, ?it/s]




Total number of params: 627963

two_parallel_rnns (
  (emb): Embedding(7554, 60), weights=((7554, 60),), parameters=453240
  (rnn_1): GRU(60, 60, num_layers=2, dropout=0.1, bidirectional=True), weights=((180, 60), (180, 60), (180,), (180,), (180, 60), (180, 60), (180,), (180,), (180, 120), (180, 60), (180,), (180,), (180, 120), (180, 60), (180,), (180,)), parameters=109440
  (attn_step1): Linear(in_features=120, out_features=60, bias=True), weights=((60, 120), (60,)), parameters=7260
  (attn_step2): Linear(in_features=60, out_features=1, bias=True), weights=((1, 60), (1,)), parameters=61
  (fc1): Linear(in_features=480, out_features=120, bias=True), weights=((120, 480), (120,)), parameters=57720
  (fc2): Linear(in_features=120, out_features=2, bias=True), weights=((2, 120), (2,)), parameters=242
)


[92m2024-03-28 17:35:27[0m [95mantoine-liris[0m [1m[90m[INFO][0m [0;33m03/28/2024_17:35:27 -- Epoch: 1/5; Train; loss: 0.037; acc: 0.988; precision: 0.989, recall: 0.990, macrof1

  0%|          | 0/5089 [00:00<?, ?it/s]

[92m2024-03-28 17:36:30[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;31m03/28/2024_17:36:30 -- Epoch: 1/5; Valid; loss: 0.019; acc: 0.993; precision: 0.987, recall: 1.000, macrof1: 0.992, weightedf1: 0.993[0m
[92m2024-03-28 17:36:30[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;32msaving the model[0m


  0%|          | 0/28834 [00:00<?, ?it/s]

[92m2024-03-28 18:03:47[0m [95mantoine-liris[0m [1m[90m[INFO][0m [0;33m03/28/2024_18:03:47 -- Epoch: 2/5; Train; loss: 0.027; acc: 0.991; precision: 0.992, recall: 0.992, macrof1: 0.991, weightedf1: 0.991[0m


  0%|          | 0/5089 [00:00<?, ?it/s]

[92m2024-03-28 18:04:56[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;31m03/28/2024_18:04:56 -- Epoch: 2/5; Valid; loss: 0.025; acc: 0.990; precision: 0.983, recall: 1.000, macrof1: 0.990, weightedf1: 0.990[0m
[92m2024-03-28 18:04:56[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;32msaving the model (early stopped) with least valid loss (checkpoint: 1) at ../resources/deezymatch/models/w2v_ocr/w2v_ocr.model[0m
[92m2024-03-28 18:04:56[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;32msaving the model[0m
[92m2024-03-28 18:04:56[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mEarly stopping at epoch: 2, selected epoch: 1[0m



User time: 4753.9567
[92m2024-03-28 18:04:59[0m [95mantoine-liris[0m [1m[90m[INFO][0m [2;32mread input file: /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/models/w2v_ocr/input_dfm.yaml[0m
[92m2024-03-28 18:05:00[0m [95mantoine-liris[0m [1m[90m[INFO][0m [1;32mpytorch will use: cpu[0m
[92m2024-03-28 18:05:00[0

KeyboardInterrupt: 

In [None]:
# Resolve the toponyms of a multi-sentence text and show each result on its
# own line. The misspelling "Shefrield" in the input is kept as-is.
resolved = geoparser.run_text(
    "A remarkable case of rattening has just occurred in the building trade at Shefrield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!"
)

for resolved_mention in resolved:
    print(resolved_mention)

In [None]:
# Resolve the toponyms of a single sentence and show each result on its own line.
resolved = geoparser.run_sentence(
    "A remarkable case of rattening has just occurred in the building trade at Sheffield."
)

for resolved_mention in resolved:
    print(resolved_mention)