# My NER    

In [1]:
import spacy
from spacy.tokens import DocBin, Doc

# Load the spaCy pipeline stored at `modelpath`; `spacy_model` is read as a global by later cells.
# NOTE(review): absolute, machine-specific path — this only runs where that directory exists.
modelpath = '/home/antoine/Documents/GitHub/ner-spancat-edda/models/fr_spacy_custom_spancat_edda'
spacy_model = spacy.load(modelpath)
# Register a `metadata` extension on Doc (read later as `doc._.metadata`).
# NOTE(review): the default is a mutable dict — presumably shared by every doc that never gets
# its own value assigned; confirm against the spaCy extension-attribute documentation.
Doc.set_extension("metadata", default={}, force=True)

In [2]:
from transformers import PreTrainedModel, PretrainedConfig, Pipeline

class CustomModel(PreTrainedModel):
    """`PreTrainedModel` subclass whose `forward` delegates to the global `spacy_model`."""

    def __init__(self, config: PretrainedConfig):
        super().__init__(config)

    def forward(self, inputs, **kwargs):
        """Run `spacy_model` on `inputs` and return the result as one flat dict.

        The dict holds the entries of the doc's `metadata` extension, overlaid with the
        entries of `doc.to_json()` (the latter win on key clashes). `kwargs` is accepted
        but not used.
        """
        doc = spacy_model(inputs)
        merged = dict(doc._.metadata)
        merged.update(doc.to_json())
        return merged
    
class MyPipeline(Pipeline):
    """Pipeline that turns the wrapped model's output dict into a list of location entities.

    `preprocess` passes the input through untouched, `_forward` calls the wrapped model's
    `forward`, and `postprocess` keeps the spans labelled 'NP-Spatial' and reports them in a
    token-classification-like format.
    """

    def _sanitize_parameters(self, **kwargs):
        """Split call-time keyword arguments between the three pipeline stages.

        Only `maybe_arg` is recognised and it is forwarded to `preprocess`; `_forward` and
        `postprocess` receive no extra parameters.

        Returns:
            A 3-tuple of dicts: (preprocess kwargs, forward kwargs, postprocess kwargs).
        """
        preprocess_kwargs = {}
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, maybe_arg=2):
        """Return `inputs` unchanged; `maybe_arg` is accepted but not used."""
        return inputs

    def _forward(self, model_inputs):
        """Call the wrapped model's `forward` on `model_inputs` and return its result."""
        return self.model.forward(model_inputs)

    @staticmethod
    def _strip_leading_non_capital(text):
        """Drop leading characters until one that is alphabetic and not lowercase.

        Returns:
            A `(stripped_text, shift)` pair where `shift` is the number of characters removed.
            `stripped_text` is empty when `text` is empty or contains no such character.
        """
        shift = 0
        # Bounded by len(text): the previous `while token[0]...` loop raised IndexError as
        # soon as the text was exhausted.
        while shift < len(text) and (text[shift].islower() or not text[shift].isalpha()):
            shift += 1
        return text[shift:], shift

    def postprocess(self, model_outputs):
        """Convert the model output dict into a list of entity dicts.

        Reads `model_outputs['text']` and the span group `model_outputs['spans']['sc']`; each
        span provides `start`, `end` and `label`. Only spans labelled 'NP-Spatial' are kept.
        Leading lowercase characters, symbols and spaces are removed from the span text and
        `start` is shifted accordingly. Spans with nothing left after stripping are skipped.

        Returns:
            A list of dicts such as:
            [{'entity': 'B-LOC', 'score': 1, 'word': 'France', 'start': 25, 'end': 31},
             {'entity': 'B-LOC', 'score': 1, 'word': 'Tarn', 'start': 99, 'end': 103}]
            `entity` is always 'B-LOC' and `score` is the constant 1 (no model confidence is
            propagated).
        """
        text = model_outputs['text']
        # Tolerate an output without the 'sc' span group instead of raising KeyError.
        spans = model_outputs.get('spans', {}).get('sc', [])

        output_list = []
        for span in spans:
            if span['label'] != 'NP-Spatial':
                continue
            word, shift = self._strip_leading_non_capital(text[span['start']:span['end']])
            if not word:
                # Nothing usable in this span (e.g. all lowercase or punctuation).
                continue
            output_list.append({'entity': 'B-LOC',
                                'score': 1,
                                'word': word,
                                'start': span['start'] + shift,
                                'end': span['end']
                                })
        return output_list

In [3]:
# Build the model with a default config, wrap it in the custom pipeline and run it on one
# sample sentence; the last line displays the resulting list of entities.
my_ner_model = CustomModel(config = PretrainedConfig())  
my_ner_pipeline = MyPipeline(model=my_ner_model)  # Using the custom model as a placeholder
sentence = "* ALBI, (Géog.) ville de France, capitale de l'Albigeois, dans le haut Languedoc : elle est sur le Tarn. Long. 19. 49. lat. 43. 55. 44."
result = my_ner_pipeline(sentence)
result


[{'entity': 'B-LOC', 'score': 1, 'word': 'France', 'start': 25, 'end': 31},
 {'entity': 'B-LOC', 'score': 1, 'word': 'Albigeois', 'start': 47, 'end': 56},
 {'entity': 'B-LOC', 'score': 1, 'word': 'Tarn', 'start': 99, 'end': 103},
 {'entity': 'B-LOC', 'score': 1, 'word': 'Languedoc', 'start': 71, 'end': 80}]

# Basic T-Res pipeline

An example of how to run the basic pipeline (with default values).

In [73]:
import os
import sys

from t_res.geoparser import pipeline

Once the `pipeline` script has been imported (in the previous cell), we create a new object of the `Pipeline` class. Since we only pass the path to the resources, every other parameter takes its default value: it will detect toponyms using the `Livingwithmachines/toponym-19thC-en` NER model, it will find candidates using the perfect match approach, and it will disambiguate them using the most popular approach. You can see the default `Pipeline` values [here](https://living-with-machines.github.io/T-Res/reference/geoparser/pipeline.html).

In [7]:
# Create the T-Res geoparser; only the resources directory is given explicitly.
# NOTE(review): the relative path assumes the notebook is run from its own directory.
geoparser = pipeline.Pipeline(resources_path="../resources/")

*** Creating and loading a NER pipeline.
*** Loading the ranker resources.
*** Load linking resources.
  > Loading mentions to wikidata mapping.
  > Loading gazetteer.
*** Linking resources loaded!



### Using the pipeline: end-to-end

The pipeline can take either a sentence (`run_sentence`) or a document (`run_text`). If the latter, the text is split into sentences using the `sentence-splitter` library. See an example of how to run each:

In [8]:
# Run the end-to-end pipeline on a short text; the last line displays the resolved mentions.
resolved = geoparser.run_text(" The city of Valence.")
resolved

[{'mention': 'Valence',
  'ner_score': 1.0,
  'pos': 12,
  'sent_idx': 0,
  'end_pos': 19,
  'tag': 'LOC',
  'sentence': 'The city of Valence.',
  'prediction': 'Q8848',
  'ed_score': 0.876,
  'string_match_score': {'Valence': (1.0,
    ['Q8818',
     'Q2875976',
     'Q8848',
     'Q2052261',
     'Q1467944',
     'Q1361868',
     'Q3097931',
     'Q702697',
     'Q495485'])},
  'prior_cand_score': {},
  'cross_cand_score': {'Q8848': 0.876,
   'Q1467944': 0.053,
   'Q1361868': 0.046,
   'Q702697': 0.007,
   'Q2052261': 0.005,
   'Q3097931': 0.005,
   'Q8818': 0.002},
  'latlon': [44.9325, 4.890833],
  'wkdt_class': 'Q484170'}]

In [9]:
# Run the end-to-end pipeline on a single sentence; this rebinds `resolved` from the cell above.
resolved = geoparser.run_sentence("A remarkable case of rattening has just occurred in the building trade at Sheffield.")
resolved

[{'mention': 'Sheffield',
  'ner_score': 1.0,
  'pos': 74,
  'sent_idx': 0,
  'end_pos': 83,
  'tag': 'LOC',
  'sentence': 'A remarkable case of rattening has just occurred in the building trade at Sheffield.',
  'prediction': 'Q42448',
  'ed_score': 0.896,
  'string_match_score': {'Sheffield': (1.0,
    ['Q6707254',
     'Q823917',
     'Q5953687',
     'Q7492778',
     'Q1421317',
     'Q7492594',
     'Q897533',
     'Q42448',
     'Q7492565',
     'Q1862179',
     'Q4834926',
     'Q17643392',
     'Q7492570',
     'Q1950928',
     'Q2277715',
     'Q79568',
     'Q518864',
     'Q7492591',
     'Q2306176',
     'Q7492775',
     'Q741640',
     'Q7492686',
     'Q3577611',
     'Q12956644',
     'Q547824',
     'Q7684835',
     'Q3365926',
     'Q7492719',
     'Q7492566',
     'Q7492567',
     'Q4523493',
     'Q3028626',
     'Q7492607',
     'Q7492568',
     'Q1984238',
     'Q1184547',
     'Q925542',
     'Q4664093',
     'Q2892594',
     'Q1916592',
     'Q371969',
     'Q114

### Using the pipeline: step-wise

Instead of using the end-to-end pipeline, the pipeline can be used step-wise.

For example, it can be used to perform only toponym recognition (i.e. NER):

In [11]:
# Step 1: toponym recognition only. `mentions` is reused by the candidate selection and
# disambiguation cells below.
mentions = geoparser.run_text_recognition("A remarkable case, like in London, of rattening has just occurred in the building trade at Sheffield.")
mentions

[{'mention': 'London',
  'context': ['', ''],
  'candidates': [],
  'gold': ['NONE'],
  'ner_score': 0.951,
  'pos': 27,
  'sent_idx': 0,
  'end_pos': 33,
  'ngram': 'London',
  'conf_md': 0.951,
  'tag': 'LOC',
  'sentence': 'A remarkable case, like in London, of rattening has just occurred in the building trade at Sheffield.',
  'place': '',
  'place_wqid': ''},
 {'mention': 'Sheffield',
  'context': ['', ''],
  'candidates': [],
  'gold': ['NONE'],
  'ner_score': 1.0,
  'pos': 91,
  'sent_idx': 0,
  'end_pos': 100,
  'ngram': 'Sheffield',
  'conf_md': 1.0,
  'tag': 'LOC',
  'sentence': 'A remarkable case, like in London, of rattening has just occurred in the building trade at Sheffield.',
  'place': '',
  'place_wqid': ''}]

The pipeline can then be used to just perform candidate selection given the output of NER:

In [60]:
# Step 2: candidate selection for the mentions found by the recognition step.
# NOTE(review): the execution counts in this notebook are out of order, so the recorded output
# may come from a different `mentions` value than the one produced by the cell directly above.
candidates = geoparser.run_candidate_selection(mentions)
candidates

{'Sheffield': {'Sheffield': {'Score': 1.0,
   'Candidates': {'Q6707254': 0.038461538461538464,
    'Q823917': 0.04389027431421446,
    'Q5953687': 0.25,
    'Q7492778': 0.21153846153846156,
    'Q1421317': 0.044117647058823525,
    'Q7492594': 0.05,
    'Q897533': 0.026007802340702213,
    'Q42448': 0.9632482747552559,
    'Q7492565': 0.7058823529411764,
    'Q1862179': 0.6057347670250897,
    'Q4834926': 0.043478260869565216,
    'Q17643392': 0.047619047619047616,
    'Q7492570': 0.7391304347826086,
    'Q1950928': 0.6666666666666666,
    'Q2277715': 0.7636363636363636,
    'Q79568': 0.2857142857142857,
    'Q518864': 0.6551724137931034,
    'Q7492591': 0.20454545454545456,
    'Q2306176': 0.3943661971830986,
    'Q7492775': 0.125,
    'Q741640': 0.16666666666666666,
    'Q7492686': 0.25,
    'Q3577611': 0.1,
    'Q12956644': 0.22988505747126436,
    'Q547824': 0.09090909090909091,
    'Q7684835': 0.1,
    'Q3365926': 0.47058823529411764,
    'Q7492719': 0.125,
    'Q7492566': 0.60714

And finally, the pipeline can be used to perform entity disambiguation, given the output from the previous two steps:

In [16]:
# Step 3: entity disambiguation from the mentions and their candidates.
# NOTE(review): printed rather than displayed, so the output is one long unformatted line.
disamb_output = geoparser.run_disambiguation(mentions, candidates)
print(disamb_output)

[{'mention': 'Sheffield', 'ner_score': 1.0, 'pos': 74, 'sent_idx': 0, 'end_pos': 83, 'tag': 'LOC', 'sentence': 'A remarkable case of rattening has just occurred in the building trade at Sheffield.', 'prediction': 'Q42448', 'ed_score': 0.896, 'string_match_score': {'Sheffield': (1.0, ['Q6707254', 'Q823917', 'Q5953687', 'Q7492778', 'Q1421317', 'Q7492594', 'Q897533', 'Q42448', 'Q7492565', 'Q1862179', 'Q4834926', 'Q17643392', 'Q7492570', 'Q1950928', 'Q2277715', 'Q79568', 'Q518864', 'Q7492591', 'Q2306176', 'Q7492775', 'Q741640', 'Q7492686', 'Q3577611', 'Q12956644', 'Q547824', 'Q7684835', 'Q3365926', 'Q7492719', 'Q7492566', 'Q7492567', 'Q4523493', 'Q3028626', 'Q7492607', 'Q7492568', 'Q1984238', 'Q1184547', 'Q925542', 'Q4664093', 'Q2892594', 'Q1916592', 'Q371969', 'Q1141915', 'Q6986914', 'Q7114883', 'Q1915446', 'Q5224096', 'Q7492766', 'Q15277074', 'Q4065168', 'Q1548891', 'Q7492772', 'Q977409', 'Q1752117', 'Q7492586', 'Q5035049', 'Q108940076'])}, 'prior_cand_score': {}, 'cross_cand_score': {'Q

# Test ranking

    Valence, (Géograph. mod.) petite ville, disons mieux, bourg de France dans l'Agénois, sur la rive droite de la Garonne, vis-à-vis d'Aurignac. (D. J.) 

    LOURDE, Laperdum, (Géog.) petite ville de France en Gascogne, ville unique, & chef-lieu du Lavedan, avec un ancien château sur un rocher. Elle est sur le Gave de Pau, à 4 lieues de Bagnieres. Long. 17. 30. lat. 43. 8. (D. J.) 

### Desamb rapide

In [83]:
import json

sentence = ' Etoile, (Géog. mod.) petite ville du Dauphiné. '

# Hand-written mentions: the candidate selection and disambiguation calls below run on this list.
my_mention_list = [{'mention': 'Etoile', 'tag': 'LOC'},
                   {'mention': 'Dauphiné', 'tag': 'LOC'}]

# Template with the same keys as the mention dicts displayed by `run_text_recognition` earlier
# in this notebook; every field starts out as None.
mention_dict = {'mention': None,
                'context': None,
                'candidates': None,
                'gold': None,
                'ner_score': None,
                'pos': None,
                'sent_idx': None,
                'end_pos': None,
                'ngram': None,
                'conf_md': None,
                'tag': None,
                'sentence': None,
                'place': None,
                'place_wqid': None}

# Convert the custom NER output into mention dicts. The NER pipeline is run exactly once (the
# earlier stray call whose result was discarded has been removed). Only `mention` and `tag` are
# filled in; the other fields stay None.
# NOTE(review): `mention_list` is not consumed below — the T-Res calls still use the
# hand-written `my_mention_list`. Confirm which of the two is intended.
mention_list = []
for ner_span in my_ner_pipeline(sentence):  # was `np`, which shadows the usual numpy alias
    np_to_mention = mention_dict.copy()
    np_to_mention['mention'] = ner_span['word']
    np_to_mention['tag'] = 'LOC' if ner_span['entity'] == 'B-LOC' else 'MISC'
    mention_list.append(np_to_mention)

candidates = geoparser.run_candidate_selection(my_mention_list)
print('#####################  CANDIDATES')
print(json.dumps(candidates, indent=2))

disamb_output = geoparser.run_disambiguation(my_mention_list, candidates)
print('#####################  DISAMBIGUATION')
print(json.dumps(disamb_output, indent=2))

#####################  CANDIDATES
{
  "Etoile": {
    "Etoile": {
      "Score": 1.0,
      "Candidates": {
        "Q2944781": 0.009345794392523364,
        "Q390243": 0.03773584905660377,
        "Q3592468": 0.2,
        "Q5404767": 0.75,
        "Q14157452": 0.42857142857142855,
        "Q3297654": 0.1,
        "Q5404758": 1.0,
        "Q1097938": 0.05263157894736842
      }
    }
  },
  "Dauphin\u00e9": {
    "Dauphin\u00e9": {
      "Score": 1.0,
      "Candidates": {
        "Q151999": 0.942857142857143,
        "Q743382": 0.01694915254237288
      }
    }
  }
}
#####################  DISAMBIGUATION
[
  {
    "mention": "Etoile",
    "tag": "LOC",
    "prediction": "Q5404767",
    "ed_score": 0.286,
    "string_match_score": {
      "Etoile": [
        1.0,
        [
          "Q2944781",
          "Q390243",
          "Q3592468",
          "Q5404767",
          "Q14157452",
          "Q3297654",
          "Q5404758",
          "Q1097938"
        ]
      ]
    },
    "prior_cand_

In [None]:
# Same two hand-written mentions as in the previous cell.
my_mention_list = [{'mention': 'Etoile','tag': 'LOC',},
                   {'mention': 'Dauphiné', 'tag': 'LOC'}
                   ]
# Query the ranker directly; the recorded result is a 2-tuple of candidate dicts.
# NOTE(review): `myranker` is only instantiated in a later cell of this notebook, so this cell
# fails on a fresh top-to-bottom run.
myranker.find_candidates(mentions = my_mention_list)

({'Dauphiné': {'Dauphiné': {'Score': 1.0,
    'Candidates': {'Q151999': 0.942857142857143,
     'Q743382': 0.01694915254237288}}},
  'Etoile': {'Etoile': {'Score': 1.0,
    'Candidates': {'Q2944781': 0.009345794392523364,
     'Q390243': 0.03773584905660377,
     'Q3592468': 0.2,
     'Q5404767': 0.75,
     'Q14157452': 0.42857142857142855,
     'Q3297654': 0.1,
     'Q5404758': 1.0,
     'Q1097938': 0.05263157894736842}}}},
 {'Dauphiné': {'Dauphiné': {'Score': 1.0,
    'Candidates': {'Q151999': 0.942857142857143,
     'Q743382': 0.01694915254237288}}},
  'Etoile': {'Etoile': {'Score': 1.0,
    'Candidates': {'Q2944781': 0.009345794392523364,
     'Q390243': 0.03773584905660377,
     'Q3592468': 0.2,
     'Q5404767': 0.75,
     'Q14157452': 0.42857142857142855,
     'Q3297654': 0.1,
     'Q5404758': 1.0,
     'Q1097938': 0.05263157894736842}}}})

### Inspection du gazetteer

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
import sqlite3
from pathlib import Path

from t_res.geoparser import pipeline, ranking, linking

# Instantiate the ranker:
# `method` presumably selects the candidate-ranking strategy; 'levenshtein' is used here.
# NOTE(review): `resources_path` is an absolute, machine-specific path.
myranker = ranking.Ranker(
    method='levenshtein',
    resources_path="/home/antoine/Documents/GitHub/T-Res/resources/",
)

# Keep the value returned by `load_resources()` on the ranker so that later cells can read it.
myranker.mentions_to_wikidata = myranker.load_resources()

In [4]:
# Short alias for the mapping stored on the ranker; used by the inspection cells below.
mentions_to_wikidata = myranker.mentions_to_wikidata

In [10]:
# Number of mention strings, and a flat list of every entry they map to (duplicates kept).
num_mentions = len(mentions_to_wikidata)
ressources = [
    resource
    for linked in mentions_to_wikidata.values()
    for resource in linked
]


In [15]:
# Mentions, total entries, distinct entries.
total_entries = len(ressources)
distinct_entries = len(set(ressources))
print(num_mentions, total_entries, distinct_entries)

# Average number of entries per mention.
total_entries / num_mentions

1776221 2260707 987945


1.2727622294748233

In [19]:
# Direct lookup of a mention string. In the recorded run this raised KeyError: the string is
# not a key of the mapping.
mentions_to_wikidata['Cité phocéenne']

KeyError: 'Cité phocéenne'

### Dam-lev & partial match

In [44]:
import pandas as pd

# Toggle: True scores every mention with `damlev_dist`, False with `check_if_contained`.
damlev = False
query = "étoile"

# One row per key of the mention mapping.
mention_df = pd.DataFrame({"mentions": myranker.mentions_to_wikidata.keys()})

# Pick the scoring function once, then apply it row by row in parallel.
score_fn = myranker.damlev_dist if damlev else myranker.check_if_contained
mention_df["score"] = mention_df.parallel_apply(lambda row: score_fn(query, row), axis=1)

In [45]:
# Drop the mentions that received no score, printing the shape before and after.
# NOTE(review): this cell rebinds `mention_df` (first to the filtered frame, finally to a dict),
# so it cannot be re-run without re-running the scoring cell above first.
print(mention_df.shape)
mention_df = mention_df.dropna()
print(mention_df.shape)

############################
# cutoff: keep the k highest distinct score values
# (not k rows — every mention tied on a kept score survives)
############################
k = 10

# The slice now uses `k`; it was previously hard-coded to 10, leaving `k` unused.
top_scores = sorted(mention_df["score"].unique(), reverse=True)[:k]
mention_df = mention_df[mention_df["score"].isin(top_scores)]
# Final result: plain dict mapping mention string -> score.
mention_df = mention_df.set_index("mentions").to_dict()["score"]

(1776221, 2)
(53, 2)


In [46]:
# Display the retained mention -> score mapping (a plain dict at this point, not a DataFrame).
mention_df

{'IL': 0.3333333333333333,
 'oil': 0.5,
 'TO': 0.3333333333333333,
 'Ile': 0.5,
 "L'Étoile AOC": 0.5,
 "L'Étoile": 0.75,
 'Étoile': 1.0,
 "Place de l'Étoile": 0.35294117647058826,
 "place de l'Étoile": 0.35294117647058826,
 'Parisian Étoile': 0.4,
 'Charles de Gaulle–Étoile': 0.25,
 'Charles de Gaulle-Étoile': 0.25,
 'Oil': 0.5,
 'Toi': 0.5,
 "Jeu de Paume de l'Étoile": 0.25,
 'ILE': 0.5,
 '"Étoile"': 0.75,
 "Théâtre de l'Étoile": 0.3157894736842105,
 'Étoile du Nord': 0.42857142857142855,
 "L'Étoile de Kléber": 0.3333333333333333,
 "Massif de l'Étoile": 0.3333333333333333,
 "massif de l'Étoile": 0.3333333333333333,
 "l'Étoile": 0.75,
 'Salle des Étoiles': 0.35294117647058826,
 'Il': 0.3333333333333333,
 "Marcy-l'Étoile": 0.42857142857142855,
 'Cerisy-Belle-Étoile': 0.3157894736842105,
 'Étoile Civique': 0.42857142857142855,
 "Montceaux-l'Étoile": 0.3333333333333333,
 "tunnel de l'Étoile": 0.3333333333333333,
 'Étoile-sur-Rhône': 0.375}

### Deezymatch

In [50]:
import sqlite3
from pathlib import Path

from t_res.geoparser import pipeline, ranking, linking

# Instantiate the ranker:
# Same setup as the earlier ranker cell, but with method='deezymatch'; this rebinds `myranker`.
# NOTE(review): `resources_path` is an absolute, machine-specific path.
myranker = ranking.Ranker(
    method='deezymatch',
    resources_path="/home/antoine/Documents/GitHub/T-Res/resources/",)

# Keep the value returned by `load_resources()` on the ranker so that later cells can read it.
myranker.mentions_to_wikidata = myranker.load_resources()

*** Loading the ranker resources.


In [53]:
import torch
# Report the installed PyTorch version.
print(torch.__version__)

2.2.1+cu121


In [51]:
query = "étoile"

# `deezy_on_the_fly` returns a pair, unpacked here.
# NOTE(review): in the recorded run this call aborted with SystemExit because no *.yaml input
# file was found under resources/deezymatch/combined/wkdtalts_w2v_ocr.
candidates, already_collected = myranker.deezy_on_the_fly(query)



SystemExit: [ERROR] no input file (*.yaml file) could be found in the dir: /home/antoine/Documents/GitHub/T-Res/resources/deezymatch/combined/wkdtalts_w2v_ocr

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
