In [1]:
from opentapioca.wikidatagraph import WikidataGraph
from opentapioca.languagemodel import BOWLanguageModel
from opentapioca.tagger import Tagger
from opentapioca.classifier import SimpleTagClassifier
import settings
import numpy as np
import bz2
import json

### Class `WikidataObject`

In [None]:
import requests
import json
import geopy

class WikidataObject:
    """Lightweight handle on a Wikidata entity identified by its QID.

    Attributes:
        uri: The entity identifier as a string starting with 'Q', or None.
        label: Optional label supplied by the caller (stored as given).
        coordinates: Optional (latitude, longitude) pair; filled in lazily by
            ``get_coordinates`` from the entity's P625 claim when missing.
        json: The entity JSON document, either supplied by the caller or
            downloaded by ``request_json``.
        link: URL of the entity page on wikidata.org, or None without a uri.
    """

    # Seconds to wait for the Wikidata endpoint before giving up on a request.
    REQUEST_TIMEOUT = 30

    def __init__(self, uri=None, label=None, jsondata=None, coordinates=None):
        """Build the object from a QID given as a string or as a number.

        Args:
            uri: 'Q...' string, or a number that is converted to 'Q<int>'.
            label: Optional label.
            jsondata: Optional already-downloaded entity JSON.
            coordinates: Optional (latitude, longitude) pair.

        Raises:
            ValueError: if ``uri`` is a string that does not start with 'Q'
                (including the empty string).
            TypeError: if ``uri`` is neither None, a string nor a number.
        """
        if uri is None:
            self.uri = None
        elif isinstance(uri, str):
            # startswith() also rejects the empty string, which uri[0] could not.
            if not uri.startswith('Q'):
                raise ValueError("URI should start with 'Q' if it's a string.")
            self.uri = uri
        elif isinstance(uri, (int, float)) or hasattr(uri, '__index__'):
            # __index__ covers integer-like objects that are not int subclasses.
            self.uri = 'Q' + str(int(uri))
        else:
            # Previously self.uri was silently left unset for such values and
            # the constructor failed a few lines later with AttributeError.
            raise TypeError(
                f"URI should be None, a string or a number, not {type(uri).__name__}."
            )

        self.label = label
        self.coordinates = coordinates
        self.json = jsondata

        if self.uri is not None:
            self.link = f'https://www.wikidata.org/wiki/{self.uri}'
        else:
            self.link = None

    def __repr__(self):
        """Return the entity link, or a placeholder when no uri is set."""
        # __repr__ must return a str; self.link is None for uri-less objects.
        if self.link is not None:
            return self.link
        return f'{type(self).__name__}(uri=None)'

    def __iter__(self):
        """Iterate over the stored JSON (empty iteration when none is stored)."""
        return iter(self.json if self.json is not None else {})

    def request_json(self):
        """Return the entity JSON, downloading it from Wikidata if needed.

        Returns:
            The JSON document (also stored on ``self.json``), or None when the
            request or the JSON decoding failed; the error is printed.

        Raises:
            AttributeError: if no uri is set.
        """
        if self.uri is None:
            raise AttributeError("URI is not set.")

        if self.json:
            return self.json

        url = f'https://www.wikidata.org/wiki/Special:EntityData/{self.uri}.json'
        try:
            # The timeout prevents a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except Exception as e:
            # Deliberate best effort: report the failure and let callers see None.
            print('Error requesting JSON at URL:', url)
            print(e)
            return None

        self.json = data
        return data

    def get_coordinates(self):
        """Return (latitude, longitude) for the entity, or None if unavailable.

        Coordinates passed to the constructor are returned as-is. Otherwise
        they are read from the first P625 claim of the first entity found in
        the JSON document, which is downloaded on demand.
        """
        if isinstance(self.coordinates, (tuple, list)) and len(self.coordinates) == 2:
            return self.coordinates

        if not self.json and not self.request_json():
            return None

        try:
            entity = next(iter(self.json['entities'].values()))
            value = entity['claims']['P625'][0]['mainsnak']['datavalue']['value']
            self.coordinates = (value['latitude'], value['longitude'])
        except (KeyError, IndexError, TypeError, AttributeError, StopIteration):
            # No usable P625 claim in the document.
            return None
        return self.coordinates

    def distance_to(self, other):
        """Return the distance in kilometres between this entity and ``other``.

        Args:
            other: another WikidataObject, a QID (string or number), or a
                2-element (latitude, longitude) tuple/list.

        Returns:
            The ``.km`` value of ``geopy.distance.distance``, or None when
            either side has no coordinates or ``other`` has an unsupported type.
        """
        self.get_coordinates()
        origin = self.coordinates

        if isinstance(other, (int, str, float)):
            other = WikidataObject(uri=other)

        if isinstance(other, WikidataObject):
            other.get_coordinates()
            target = other.coordinates
        elif isinstance(other, (tuple, list)) and len(other) == 2:
            target = other
        else:
            return None

        # Applies to every branch: the coordinate-pair branch used to hand a
        # None origin straight to geopy.
        if not origin or not target:
            return None

        # A bare `import geopy` is not guaranteed to load the `distance`
        # submodule, so import it explicitly where it is used.
        import geopy.distance
        return geopy.distance.distance(origin, target).km


In [None]:
# Manual smoke test of WikidataObject.distance_to.
# NOTE(review): this cell reads VILLESFR, which is only loaded by a later cell
# of this notebook, so it fails on a fresh kernel when run top to bottom.
# q1, q2, q3 and CHABLIS are not used again in this cell.
q1 = 'Q1613150' # canton of Vivonne
q2 = 'Q1727971' # 1  :  Vivonne
q3 = 'Q2314920' # 2  :  Vivonne railway station
paris  = 'Q90' # Paris (QID string; rebound to a WikidataObject below)


CHABLIS = WikidataObject(uri = 'Q331232')

wd1 = WikidataObject(uri = 'Q1613150')
# Coordinates are supplied explicitly, so no download is needed for wd2.
wd2 = WikidataObject(uri = 'Q704', coordinates=VILLESFR.loc[254,'truthyV_coords'])
paris = WikidataObject(uri = 'Q90')
versailles = WikidataObject(uri = 'Q621')    

# Object-to-object distance, then object-to-coordinate-pair distance (both in km).
print(wd1.distance_to(versailles))
print(paris.distance_to(VILLESFR.loc[254,'truthyV_coords']))

299.0405117165616
157.75798052056592


In [None]:
# distance_to also accepts a bare QID string, which it wraps in a WikidataObject.
print(wd2.distance_to('Q25278623'))

12530.782567194068


### Get small labels

In [2]:
# Read the bz2-compressed dump (one JSON document per line) and collect the
# French label of every entity that has one.
filepath = 'data/small/small.json.bz2'
french_labels = []

# Explicit UTF-8: without it, text mode falls back to the locale's default
# encoding, which can garble accented labels on some systems.
with bz2.open(filepath, 'rt', encoding='utf-8') as bz_file:
    for line in bz_file:
        data = json.loads(line)

        # Keep only entities that carry a French label.
        if 'labels' in data and 'fr' in data['labels']:
            french_labels.append(data['labels']['fr']['value'])

# Single join instead of repeated string concatenation; every label is
# followed by ' % ', exactly as before (including after the last one).
french_label = ''.join(label + ' % ' for label in french_labels)

print(french_label)


Berre-l'Étang % Varambon % Saint-Agrève % Châteauvillain % Castelnaudary % Barbonne-Fayel % Lagnieu % Olliergues % Rougemont % Saint-Aignan % Barre-des-Cévennes % Saint-Marcellin % Denain % Aubigny % Carhaix-Plouguer % Alès % Tours % Château-Chinon % Wintzenheim-Kochersberg % Conches-en-Ouche % Pont-de-Vaux % Saint-Valery-en-Caux % Mont-Louis % Montélimar % Brive-la-Gaillarde % Revin % Castres % Chevreuse % Dax % Beaujeu % Montsaugeon % Céret % Conflans-en-Jarnisy % Aigueperse % Aix-en-Provence % Bar-sur-Seine % Clairac % Torigni-sur-Vire % Chârost % Aigremont % Aiguillon % Chênehutte-Trèves-Cunault % La Ferté-Bernard % Aire-sur-l'Adour % Renty % Agen % Noyon % Duras % Ainay-le-Château % Aignay-le-Duc % Arnay-le-Duc % Auvillar % Villé % Aigurande % Saint-Affrique % Cessenon-sur-Orb % Saint-James % La Rochefoucauld % Pontorson % Aigues-Mortes % Rambervillers % Aix % Chinon % Rohan % Rosheim % Lourdes % Pacy-sur-Eure % La Réole % Reims % L'Aigle % Toucy % Brionne % Chantelle % Châtellera

### BOW etc

In [2]:
# Build the OpenTapioca pipeline from the project `settings` module.
# Each component is loaded only when its setting is truthy, and the setting
# value is printed after a successful load.
bow = BOWLanguageModel()
if settings.LANGUAGE_MODEL_PATH:
    bow.load(settings.LANGUAGE_MODEL_PATH)
    print(settings.LANGUAGE_MODEL_PATH)

graph = WikidataGraph()
if settings.PAGERANK_PATH:
    graph.load_pagerank(settings.PAGERANK_PATH)
    print(settings.PAGERANK_PATH)
    
# Both stay None when no Solr collection is configured; the cells below check
# `classifier` but call `tagger` unconditionally in that case.
tagger = None
classifier = None

if settings.SOLR_COLLECTION:
    tagger = Tagger(settings.SOLR_COLLECTION, bow, graph)
    print(settings.SOLR_COLLECTION)
    classifier = SimpleTagClassifier(tagger)
    # NOTE(review): the recorded output shows a .pkl classifier path and links to
    # scikit-learn's model-persistence warning — only load trusted files.
    if settings.CLASSIFIER_PATH:
        classifier.load(settings.CLASSIFIER_PATH)
        print(settings.CLASSIFIER_PATH)


data/all-french.bow.pkl
data/wikidata/wikidata-graph.pgrank.npy
frenchtapioca4
data/latest_classifier.pkl


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
# Tag one sample sentence and print the best-scored candidate of each mention.
text = 'Villeurbanne, Lyon et Miribel sont 3 agglomérations lyonnaises.'
# Without a classifier, fall back to the tagger's own ranking.
if not classifier:
    mentions = tagger.tag_and_rank(text)
else:
    mentions = classifier.create_mentions(text)
    classifier.classify_mentions(mentions)


for m in mentions:
    # Ascending sort on 'score', so the last element is the highest-scored tag.
    besttag = sorted(m.json()['tags'], key=lambda x: x['score'])[-1]
    print(text[m.start:m.end],
          ' tagged as ',
          besttag['label'],
          besttag['id'],
          ' with log likelihood ', 
          # rounded to 3 decimal places:
          np.round(m.json()['log_likelihood'], 3))

Villeurbanne  tagged as  Villeurbanne Q582  with log likelihood  12.833
Lyon  tagged as  Lyon Q456  with log likelihood  9.863
Miribel  tagged as  Miribel Q363100  with log likelihood  13.151


### Once

In [3]:
import pandas as pd
# Load the two source JSON files into DataFrames.
# NOTE(review): absolute, machine-specific paths — not portable to other machines.
# VILLESFR_collec is not referenced again in the visible part of the notebook.
VILLESFR = pd.read_json('/home/antoine/Documents/GitHub/ner-spancat-edda/VILLESFR_with_OpenTapiocaDoc.json')
VILLESFR_collec = pd.read_json('/home/antoine/Documents/GitHub/ner-spancat-edda/VILLESFR_collec.json')

In [7]:
# Manual override of the ground-truth QID for row label 254 (in-place edit of
# the loaded frame; the reason for the correction is not recorded here).
VILLESFR.at[254, 'truthyV'] = 'Q331232'

In [8]:
# Keep only the listed columns, in this order, and rebind VILLESFR to the result.
VILLESFR = VILLESFR[['key', 'head', 'head_root', 'SAINT', 'truthyV', 'truthyV_coords',
       'truthyV_JSON', 'truthyV_spellings', 'truthyV_population',
       'fullcontent', 'OpenTapiocaDoc']]

In [24]:
# Persist the prepared frame; the "Testing" section reloads this exact file.
# NOTE(review): absolute, machine-specific path.
VILLESFR.to_json('/home/antoine/Documents/GitHub/toponym-disambiguation/data/VILLESFR_010324.json')

# Testing

In [5]:
import pandas as pd
# Reload the prepared frame saved by the one-off preparation cells.
# NOTE(review): absolute, machine-specific path.
VILLESFR = pd.read_json('/home/antoine/Documents/GitHub/toponym-disambiguation/data/VILLESFR_010324.json')

In [25]:
# For every row: run OpenTapioca on the row's document, keep the first mention
# when its text occurs in the document head (the part before the first '%'),
# store that mention and the distance (km) between its best-scored candidate
# and the ground-truth entity, and print the ranked candidates.
for idx, row in VILLESFR.iterrows():
    print(str(idx) + "  ######################################")
    document = row['OpenTapiocaDoc']
    print(document)
    if not classifier:
        mentions = tagger.tag_and_rank(document)
    else:
        mentions = classifier.create_mentions(document)
        classifier.classify_mentions(mentions)

    # Robustness: a document may yield no mention at all; indexing mentions[0]
    # would then raise IndexError and abort the whole loop.
    if not mentions:
        print()
        print()
        continue

    firstmention = mentions[0]
    firstmention_text = document[firstmention.start:firstmention.end]
    # Candidates sorted by descending score: index 0 is the best prediction.
    firstmention_tags = sorted(firstmention.json()['tags'],
                               key=lambda x: x['score'],
                               reverse=True)

    doc_head = document.split('%')[0]

    # Also require at least one candidate, since the best one is read below.
    if firstmention_text in doc_head and firstmention_tags:

        VILLESFR.loc[idx, 'TapiocaPredictions'] = firstmention
        BestPrediction = WikidataObject(uri=firstmention_tags[0]['id'],
                                        label=firstmention_tags[0]['label'])
        print(BestPrediction)
        distance_to_truth = BestPrediction.distance_to(row['truthyV'])
        VILLESFR.loc[idx, 'TapiocaDist2Truth'] = distance_to_truth

        print('Head ', firstmention_text, ' tagged successfully : ', )
        print('Distance to truth : ', distance_to_truth)
        for rank, tag in enumerate(firstmention_tags):
            # Underscore marker flags the candidate that equals the ground truth.
            to_print = ''

            if tag['id'] == row['truthyV']:
                to_print += '____________'

            print(to_print, rank, ' : ', tag['label'], tag['id'], tag['score'])

    print()
    print()

0  ######################################
AFRIQUE % France % Gascogne % Montauban
https://www.wikidata.org/wiki/Q15
Head  AFRIQUE  tagged successfully : 
Distance to truth :  2566.790159142138
 0  :  Afrique Q15 1.1875239766646337
 1  :  Afrique de l'Est Q27407 -0.19199924028322224
 2  :  plaque africaine Q203548 -0.8301643724007213
 3  :  L'Afrique Q112871204 -1.5541575604908873


1  ######################################
AGDE % France % Languedoc % Paris % d'Agadez
https://www.wikidata.org/wiki/Q191396
Head  AGDE  tagged successfully : 
Distance to truth :  0.0
____________ 0  :  Agde Q191396 -0.0009736785317753238
 1  :  canton d'Agde Q187517 -0.2181590004845937
 2  :  gare d'Agde Q2028453 -0.6182421433834153


2  ######################################
AGEN % France % l'Agénois % la Guienne % la Garonne
https://www.wikidata.org/wiki/Q6625
Head  AGEN  tagged successfully : 
Distance to truth :  0.0
____________ 0  :  Agen Q6625 0.7336394672830173
 1  :  arrondissement d'Agen Q700351 

In [79]:
# Share of all rows whose 'TapiocaDist2Truth' is below 50 km, then below 20 km.
# Rows with a missing distance never satisfy the comparison but still count in
# the denominator.
for threshold_km in (50, 20):
    close_enough = VILLESFR[VILLESFR['TapiocaDist2Truth'] < threshold_km]
    print(len(close_enough) / len(VILLESFR))


0.5883777239709443
0.5799031476997578


In [97]:
# Flag, row by row, whether the ground-truth QID appears among the candidate
# tags of the stored prediction (False when there is no prediction).
for idx, row in VILLESFR.iterrows():
    prediction = row['TapiocaPredictions']
    candidate_ids = [tag['id'] for tag in prediction['tags']] if prediction else []
    VILLESFR.loc[idx, 'TapiocaTruthInPred'] = row['truthyV'] in candidate_ids



In [98]:
# Proportion of rows with / without the ground truth among the candidates.
VILLESFR['TapiocaTruthInPred'].value_counts(normalize=True)

TapiocaTruthInPred
True     0.639225
False    0.360775
Name: proportion, dtype: float64

In [99]:
# Coverage figures relative to the total number of rows:
# rows that received a prediction, then rows whose candidates include the truth.
size = len(VILLESFR)
onecandidate = VILLESFR['TapiocaPredictions'].notna().sum()
truthinpred = VILLESFR['TapiocaTruthInPred'].sum()
for count in (onecandidate, truthinpred):
    print(count / size)


0.847457627118644
0.639225181598063


In [32]:
# Persist the frame with the Tapioca prediction columns; the next section reloads this file.
# NOTE(review): absolute, machine-specific path.
VILLESFR.to_json('/home/antoine/Documents/GitHub/toponym-disambiguation/data/VILLESFR_040324_frenchtapiocapreds.json')

# Fame baseline (most popular candidate, ranked by `nb_statements`)

In [4]:
import pandas as pd
# Reload the frame that already carries the Tapioca prediction columns.
# NOTE(review): absolute, machine-specific path.
VILLESFR = pd.read_json('/home/antoine/Documents/GitHub/toponym-disambiguation/data/VILLESFR_040324_frenchtapiocapreds.json')

In [6]:
# Inspect the stored prediction of one randomly drawn row.
# NOTE(review): no random_state is passed, so the row differs between runs,
# and the value may be empty for rows without a prediction.
sample = VILLESFR.sample(1)
m = sample['TapiocaPredictions'].values[0]
m

{'best_qid': None,
 'best_tag_label': None,
 'end': 7,
 'log_likelihood': 16.8399475151,
 'phrase': 'ALENCON',
 'start': 0,
 'tags': [{'aliases': ['Arrondissement Alençon',
    'Arrondissement de Alencon',
    '阿朗松区',
    'Alanson (rayon)',
    'arrondissement Alencon',
    'Alençon Kūn',
    'Arondismento Alençon',
    "arrondissement d'Alencon",
    'Qarku Alençon',
    'Алансон (округ)',
    'Distrito de Alençon',
    'Alençoneko barrutia',
    "Districte d'Alençon",
    'arondisamant Alençon',
    'διαμέρισμα του Αλανσόν',
    'Alençon (arrondissement)',
    'Okręg Alençon',
    'Алансон',
    'arrondissement Alençon',
    "Arrondissement d'Alençon",
    'Distrito de Alencon',
    'Alençon',
    'arrondissement di Alençon',
    'بخش الانسون',
    'Arrondissement de Alençon',
    'Arondismentul Alençon',
    'arrondissement of Alençon'],
   'desc': 'arrondissement français',
   'edges': [142,
    12679,
    181269,
    969511,
    1469266,
    1724770,
    1725080,
    1290284,
    

In [60]:
# Display the frame (notebook rich output).
VILLESFR

Unnamed: 0,key,head,head_root,SAINT,truthyV,truthyV_coords,truthyV_JSON,truthyV_spellings,truthyV_population,fullcontent,OpenTapiocaDoc,TapiocaPredictions,TapiocaDist2Truth,TapiocaTruthInPred
0,volume01-1065,Afrique,[afrique],False,Q818608,"[43.9583333333, 2.8863888889]","{'pageid': 771473, 'ns': 0, 'title': 'Q818608'...","[Saint-Affrique, Vendeloves, Bournac, Saint-Ét...",7992.0,"* Afrique , ( Géog . mod . ) petite ville de F...",AFRIQUE % France % Gascogne % Montauban,"{'best_qid': 'Q15', 'best_tag_label': 'Afrique...",2566.790159,False
1,volume01-1087,AGDE,[agde],False,Q191396,"[43.31, 3.4752777778]","{'pageid': 189694, 'ns': 0, 'title': 'Q191396'...",[Agde],29103.0,"* AGDE , ( Géog . ) ville de France en Langued...",AGDE % France % Languedoc % Paris % d'Agadez,"{'best_qid': None, 'best_tag_label': None, 'en...",0.000000,True
2,volume01-1103,AGEN,[agen],False,Q6625,"[44.2030555556, 0.6186111111]","{'pageid': 7752, 'ns': 0, 'title': 'Q6625', 'l...",[Agen],32485.0,"* AGEN , ( Géog . ) ancienne ville de France ,...",AGEN % France % l'Agénois % la Guienne % la Ga...,"{'best_qid': 'Q6625', 'best_tag_label': 'Agen'...",0.000000,True
3,volume01-1210,AGRERE,[agrere],False,Q137514,"[45.01, 4.3961111111]","{'pageid': 139255, 'ns': 0, 'title': 'Q137514'...","[Saint-Agrève, Le Pouzat, Mont-Chiniac]",2325.0,* AGRERE ( Géog . ) petite ville de France dan...,AGRERE % France % Monts % le haut-Vivarez,,,False
4,volume01-1279,Aigle,[aigle],False,Q500588,"[48.765, 0.6275000000000001]","{'pageid': 470872, 'ns': 0, 'title': 'Q500588'...","[L'Aigle, Laigle]",7824.0,"* Aigle , ( Géog . ) petite ville de France da...",AIGLE % France % Rouen % d'Evreux % la haute N...,"{'best_qid': None, 'best_tag_label': None, 'en...",0.000000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,volume17-2393,WORDT,[wordt],False,Q22762,"[48.9388888889, 7.7458333333]","{'pageid': 26158, 'ns': 0, 'title': 'Q22762', ...","[Wœrth, Woerth, Wörth, Wörth an der Sauer]",1696.0,"WORDT , ( Géog . mod . ) petite ville , ou plu...",WORDT % France % Hanau % Wasgaw % la basse-Als...,,,False
872,volume17-2434,XAINTES,[xaintes],False,Q191126,"[45.7452777778, -0.6344444444]","{'pageid': 189455, 'ns': 0, 'title': 'Q191126'...","[Saintes, Mediolanum Santonum, Xantes, Sentas,...",25518.0,"XAINTES , ( Géog . mod . ) ville de France , c...",XAINTES % France % la Saintonge,"{'best_qid': 'Q191126', 'best_tag_label': 'Sai...",0.000000,True
873,volume17-2619,"YRIER de la perche, saint","[yrier, perche]",True,Q523015,"[45.5144444444, 1.2033333333]","{'pageid': 492059, 'ns': 0, 'title': 'Q523015'...","[Saint-Yrieix-la-Perche, Yrieix-la-Montagne, S...",6825.0,"YRIER de la perche , saint , ( Géog . mod . ) ...","SAINT YRIER DE LA PERCHE, SAINT % France % le ...","{'best_qid': None, 'best_tag_label': None, 'en...",462.756460,False
874,volume17-2632,YVETOT,[yvetot],False,Q691278,"[49.6169444444, 0.7530555556]","{'pageid': 651247, 'ns': 0, 'title': 'Q691278'...","[Yvetot, Yvetot-la-Montagne]",11385.0,"YVETOT , ( Géog . mod . ) bourg de France en N...",YVETOT % France % Normandie % Caux % Caudebec ...,"{'best_qid': None, 'best_tag_label': None, 'en...",0.000000,True


In [67]:
# Popularity baseline: rank the candidates of each stored prediction by their
# number of statements, print the distance (km) of every candidate to the
# ground truth, and store the distance of the most popular one.
for idx, row in VILLESFR.iterrows():
    print(str(idx) + "  ######################################")
    truth = WikidataObject(uri=row['truthyV'], coordinates=row['truthyV_coords'])

    # Rows without a prediction are an expected case: handle them explicitly
    # instead of relying on a blanket exception handler around the whole row,
    # which also hid unrelated errors.
    mention = row['TapiocaPredictions']
    if mention is None:
        print('No Tapioca prediction')
        continue

    try:
        popularity = sorted(mention['tags'],
                            key=lambda x: x['nb_statements'],
                            reverse=True)
    except (KeyError, TypeError) as e:
        # Malformed or missing prediction payload: report it and move on.
        print(f'No Tapioca prediction: {e}')
        continue

    for rank, p in enumerate(popularity):
        # Best effort per candidate: a failed lookup must not stop the loop.
        try:
            pwd = WikidataObject(uri=p['id'])
            pwd.get_coordinates()
            distance = truth.distance_to(pwd)
            print('\t', p['label'], p['nb_statements'], ':', distance)

            # Only the most popular candidate defines the baseline.
            if rank == 0:
                VILLESFR.loc[idx, 'TapiocaBaseline'] = distance

        except Exception as e:
            print(f'Error in distance calculation: {e}')


0  ######################################
	 Afrique 282 : 2566.79015913837
	 Afrique de l'Est 87 : 6043.492051392051
	 plaque africaine 18 : None
	 L'Afrique 5 : 5569.133483572722
1  ######################################
	 Agde 98 : 1.8010840583367065e-09
	 gare d'Agde 30 : 1.1629213791128556
	 canton d'Agde 19 : 6.490387809662085e-08
2  ######################################
	 Agen 131 : 4.969216059507183e-09
	 arrondissement d'Agen 36 : 4.930037192285652
	 gare d'Agen 31 : 0.6127116861752916
	 arrêt Agen 10 : 523.9962893779266
	 Agen 10 : 1957.1717201253582
	 Agen 6 : 1984.3044062988113
	 Agen (bukid) 6 : 4327.524168200131
	 Agen 5 : 1953.4562939356347
3  ######################################
No Tapioca prediction: 'NoneType' object is not subscriptable
4  ######################################
	 L'Aigle 76 : 8.16185084879003e-15
	 Aigle 72 : 548.986103004263
	 gare d'Aigle 44 : 548.9524479800652
	 District d'Aigle 41 : 555.4177640595938
	 Edzell 30 : 921.9188646096449
	 L'Aigle 23

In [69]:
# Share of all rows whose popularity-baseline distance is below 20 km.
VILLESFR[VILLESFR['TapiocaBaseline'] < 20].shape[0]/VILLESFR.shape[0]

0.5883777239709443

In [92]:
# Rows that are both within 50 km for the Tapioca prediction and within 20 km
# for the popularity baseline, matched on 'key'. Every other column appears
# twice in the result, with _x / _y suffixes.
VILLESFR[VILLESFR['TapiocaDist2Truth'] < 50].merge(VILLESFR[VILLESFR['TapiocaBaseline'] < 20],
                                                  on='key')

Unnamed: 0,key,head_x,head_root_x,SAINT_x,truthyV_x,truthyV_coords_x,truthyV_JSON_x,truthyV_spellings_x,truthyV_population_x,fullcontent_x,...,truthyV_coords_y,truthyV_JSON_y,truthyV_spellings_y,truthyV_population_y,fullcontent_y,OpenTapiocaDoc_y,TapiocaPredictions_y,TapiocaDist2Truth_y,TapiocaTruthInPred_y,TapiocaBaseline_y
0,volume01-1087,AGDE,[agde],False,Q191396,"[43.31, 3.4752777778]","{'pageid': 189694, 'ns': 0, 'title': 'Q191396'...",[Agde],29103.0,"* AGDE , ( Géog . ) ville de France en Langued...",...,"[43.31, 3.4752777778]","{'pageid': 189694, 'ns': 0, 'title': 'Q191396'...",[Agde],29103.0,"* AGDE , ( Géog . ) ville de France en Langued...",AGDE % France % Languedoc % Paris % d'Agadez,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,1.801084e-09
1,volume01-1103,AGEN,[agen],False,Q6625,"[44.2030555556, 0.6186111111]","{'pageid': 7752, 'ns': 0, 'title': 'Q6625', 'l...",[Agen],32485.0,"* AGEN , ( Géog . ) ancienne ville de France ,...",...,"[44.2030555556, 0.6186111111]","{'pageid': 7752, 'ns': 0, 'title': 'Q6625', 'l...",[Agen],32485.0,"* AGEN , ( Géog . ) ancienne ville de France ,...",AGEN % France % l'Agénois % la Guienne % la Ga...,"{'best_qid': 'Q6625', 'best_tag_label': 'Agen'...",0.0,True,4.969216e-09
2,volume01-1279,Aigle,[aigle],False,Q500588,"[48.765, 0.6275000000000001]","{'pageid': 470872, 'ns': 0, 'title': 'Q500588'...","[L'Aigle, Laigle]",7824.0,"* Aigle , ( Géog . ) petite ville de France da...",...,"[48.765, 0.6275000000000001]","{'pageid': 470872, 'ns': 0, 'title': 'Q500588'...","[L'Aigle, Laigle]",7824.0,"* Aigle , ( Géog . ) petite ville de France da...",AIGLE % France % Rouen % d'Evreux % la haute N...,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,8.161851e-15
3,volume01-1312,AIGUES-MORTES,[aigues-mortes],False,Q193809,"[43.5666666667, 4.1925]","{'pageid': 191785, 'ns': 0, 'title': 'Q193809'...","[Aigues-Mortes, Aigues Morte, Fort-Peletier]",8685.0,"* AIGUES-MORTES , ( Géog . ) ville de France ,...",...,"[43.5666666667, 4.1925]","{'pageid': 191785, 'ns': 0, 'title': 'Q193809'...","[Aigues-Mortes, Aigues Morte, Fort-Peletier]",8685.0,"* AIGUES-MORTES , ( Géog . ) ville de France ,...",AIGUES-MORTES % France % le bas Languedoc,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,3.667221e-09
4,volume01-1369,AIGURANDE,[aigurande],False,Q211643,"[46.4341666667, 1.8288888889]","{'pageid': 207300, 'ns': 0, 'title': 'Q211643'...",[Aigurande],1359.0,"AIGURANDE , ( Géog . ) ville de France dans la...",...,"[46.4341666667, 1.8288888889]","{'pageid': 207300, 'ns': 0, 'title': 'Q211643'...",[Aigurande],1359.0,"AIGURANDE , ( Géog . ) ville de France dans la...",AIGURANDE % France % Berry % la Marche,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,3.766769e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,volume17-2104,WASSELONNE,[wasselonne],False,Q22413,"[48.6372222222, 7.4480555556]","{'pageid': 25764, 'ns': 0, 'title': 'Q22413', ...","[Wasselonne, Wasselnheim]",5774.0,"WASSELONNE , ( Géog . mod . ) bourg ou petite ...",...,"[48.6372222222, 7.4480555556]","{'pageid': 25764, 'ns': 0, 'title': 'Q22413', ...","[Wasselonne, Wasselnheim]",5774.0,"WASSELONNE , ( Géog . mod . ) bourg ou petite ...",WASSELONNE % France % Alsace % Wasselenheim,"{'best_qid': 'Q22413', 'best_tag_label': 'Wass...",0.0,True,4.085994e-09
457,volume17-2116,WATTEN,[watten],False,Q742424,"[50.8330555556, 2.2125]","{'pageid': 698388, 'ns': 0, 'title': 'Q742424'...","[Watten, Waten, Woatn]",2577.0,"WATTEN , ( Géog . mod . ) petite ville de Fran...",...,"[50.8330555556, 2.2125]","{'pageid': 698388, 'ns': 0, 'title': 'Q742424'...","[Watten, Waten, Woatn]",2577.0,"WATTEN , ( Géog . mod . ) petite ville de Fran...",WATTEN % France % Bourbourg % la Flandre % l'An,"{'best_qid': 'Q742424', 'best_tag_label': 'Wat...",0.0,True,4.895353e-09
458,volume17-2434,XAINTES,[xaintes],False,Q191126,"[45.7452777778, -0.6344444444]","{'pageid': 189455, 'ns': 0, 'title': 'Q191126'...","[Saintes, Mediolanum Santonum, Xantes, Sentas,...",25518.0,"XAINTES , ( Géog . mod . ) ville de France , c...",...,"[45.7452777778, -0.6344444444]","{'pageid': 189455, 'ns': 0, 'title': 'Q191126'...","[Saintes, Mediolanum Santonum, Xantes, Sentas,...",25518.0,"XAINTES , ( Géog . mod . ) ville de France , c...",XAINTES % France % la Saintonge,"{'best_qid': 'Q191126', 'best_tag_label': 'Sai...",0.0,True,4.234649e-09
459,volume17-2632,YVETOT,[yvetot],False,Q691278,"[49.6169444444, 0.7530555556]","{'pageid': 651247, 'ns': 0, 'title': 'Q691278'...","[Yvetot, Yvetot-la-Montagne]",11385.0,"YVETOT , ( Géog . mod . ) bourg de France en N...",...,"[49.6169444444, 0.7530555556]","{'pageid': 651247, 'ns': 0, 'title': 'Q691278'...","[Yvetot, Yvetot-la-Montagne]",11385.0,"YVETOT , ( Géog . mod . ) bourg de France en N...",YVETOT % France % Normandie % Caux % Caudebec ...,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,5.854420e-09


In [103]:
# Rows within 50 km for the Tapioca prediction, limited to index labels up to 10
# (.loc slices by label, so this is not "the first 10 rows").
VILLESFR[VILLESFR['TapiocaDist2Truth'] < 50].loc[:10]

Unnamed: 0,key,head,head_root,SAINT,truthyV,truthyV_coords,truthyV_JSON,truthyV_spellings,truthyV_population,fullcontent,OpenTapiocaDoc,TapiocaPredictions,TapiocaDist2Truth,TapiocaTruthInPred,TapiocaBaseline
1,volume01-1087,AGDE,[agde],False,Q191396,"[43.31, 3.4752777778]","{'pageid': 189694, 'ns': 0, 'title': 'Q191396'...",[Agde],29103.0,"* AGDE , ( Géog . ) ville de France en Langued...",AGDE % France % Languedoc % Paris % d'Agadez,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,1.801084e-09
2,volume01-1103,AGEN,[agen],False,Q6625,"[44.2030555556, 0.6186111111]","{'pageid': 7752, 'ns': 0, 'title': 'Q6625', 'l...",[Agen],32485.0,"* AGEN , ( Géog . ) ancienne ville de France ,...",AGEN % France % l'Agénois % la Guienne % la Ga...,"{'best_qid': 'Q6625', 'best_tag_label': 'Agen'...",0.0,True,4.969216e-09
4,volume01-1279,Aigle,[aigle],False,Q500588,"[48.765, 0.6275000000000001]","{'pageid': 470872, 'ns': 0, 'title': 'Q500588'...","[L'Aigle, Laigle]",7824.0,"* Aigle , ( Géog . ) petite ville de France da...",AIGLE % France % Rouen % d'Evreux % la haute N...,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,8.161851e-15
8,volume01-1312,AIGUES-MORTES,[aigues-mortes],False,Q193809,"[43.5666666667, 4.1925]","{'pageid': 191785, 'ns': 0, 'title': 'Q193809'...","[Aigues-Mortes, Aigues Morte, Fort-Peletier]",8685.0,"* AIGUES-MORTES , ( Géog . ) ville de France ,...",AIGUES-MORTES % France % le bas Languedoc,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,3.667221e-09


In [104]:
# Rows within 20 km for the popularity baseline, limited to index labels up to 10
# (.loc slices by label, so this is not "the first 10 rows").
VILLESFR[VILLESFR['TapiocaBaseline'] < 20].loc[:10]

Unnamed: 0,key,head,head_root,SAINT,truthyV,truthyV_coords,truthyV_JSON,truthyV_spellings,truthyV_population,fullcontent,OpenTapiocaDoc,TapiocaPredictions,TapiocaDist2Truth,TapiocaTruthInPred,TapiocaBaseline
1,volume01-1087,AGDE,[agde],False,Q191396,"[43.31, 3.4752777778]","{'pageid': 189694, 'ns': 0, 'title': 'Q191396'...",[Agde],29103.0,"* AGDE , ( Géog . ) ville de France en Langued...",AGDE % France % Languedoc % Paris % d'Agadez,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,1.801084e-09
2,volume01-1103,AGEN,[agen],False,Q6625,"[44.2030555556, 0.6186111111]","{'pageid': 7752, 'ns': 0, 'title': 'Q6625', 'l...",[Agen],32485.0,"* AGEN , ( Géog . ) ancienne ville de France ,...",AGEN % France % l'Agénois % la Guienne % la Ga...,"{'best_qid': 'Q6625', 'best_tag_label': 'Agen'...",0.0,True,4.969216e-09
4,volume01-1279,Aigle,[aigle],False,Q500588,"[48.765, 0.6275000000000001]","{'pageid': 470872, 'ns': 0, 'title': 'Q500588'...","[L'Aigle, Laigle]",7824.0,"* Aigle , ( Géog . ) petite ville de France da...",AIGLE % France % Rouen % d'Evreux % la haute N...,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,8.161851e-15
6,volume01-1286,AIGNAN (Saint),[aignan],True,Q832394,"[47.2686111111, 1.3758333333000001]","{'pageid': 785092, 'ns': 0, 'title': 'Q832394'...","[Saint-Aignan, Saint-Aignan-sur-Cher, Saint-Ai...",2821.0,AIGNAN ( Saint ) ( Géog . ) ville de France da...,SAINT AIGNAN (SAINT) % France % le Berry % le ...,"{'best_qid': 'Q1055925', 'best_tag_label': 'Mo...",244.889739,True,2.801093e-09
7,volume01-1292,AIGREMONT-LE-DUC,[aigremont-le-duc],False,Q1102347,"[48.0169444444, 5.7172222222]","{'pageid': 1049688, 'ns': 0, 'title': 'Q110234...",[Aigremont],26.0,"* AIGREMONT-LE-DUC , ( Géogr . ) Ville de Fran...",AIGREMONT-LE-DUC % France % Bourgogne % Dijon,"{'best_qid': None, 'best_tag_label': None, 'en...",290.66732,True,5.165144e-09
8,volume01-1312,AIGUES-MORTES,[aigues-mortes],False,Q193809,"[43.5666666667, 4.1925]","{'pageid': 191785, 'ns': 0, 'title': 'Q193809'...","[Aigues-Mortes, Aigues Morte, Fort-Peletier]",8685.0,"* AIGUES-MORTES , ( Géog . ) ville de France ,...",AIGUES-MORTES % France % le bas Languedoc,"{'best_qid': None, 'best_tag_label': None, 'en...",0.0,True,3.667221e-09
10,volume01-1365,Aiguillon,[aiguillon],False,Q404783,"[44.3005555556, 0.3375]","{'pageid': 383812, 'ns': 0, 'title': 'Q404783'...","[Aiguillon, Gulhon]",4118.0,"Aiguillon , ( Géog . ) ville de France en Guye...",AIGUILLON % France % Guyenne % l'Agenois,"{'best_qid': None, 'best_tag_label': None, 'en...",198.880044,True,4.889955e-09


In [105]:
# Row counts for the 'TapiocaBaseline' column: below 20 km, at or above 20 km,
# and not missing (the first two add up to the third).
baseline_km = VILLESFR['TapiocaBaseline']
for row_filter in (baseline_km < 20, baseline_km >= 20, baseline_km.notna()):
    print(len(VILLESFR[row_filter]))


486
205
691


In [73]:
# Persist the frame including the 'TapiocaBaseline' column.
# NOTE(review): absolute, machine-specific path.
VILLESFR.to_json('/home/antoine/Documents/GitHub/toponym-disambiguation/data/VILLESFR_050324_frenchtapiocapreds.json')

In [107]:
# Summary statistics of the number of candidate tags per row
# (rows without a prediction count as 0).
VILLESFR['TapiocaPredictions'].apply(lambda preds: len(preds['tags']) if preds else 0).describe()

count    826.000000
mean       4.351090
std        3.411244
min        0.000000
25%        2.000000
50%        4.000000
75%        7.000000
max       10.000000
Name: TapiocaPredictions, dtype: float64

In [133]:

# For each row with a prediction, record at which rank of the score-based
# ranking the popularity baseline (candidate with the most statements) sits;
# 0 means both rankings agree on the top candidate.
verbose = False
for idx, row in VILLESFR.iterrows():

    if verbose:
        print(str(idx) + "  ######################################")

    mention = row['TapiocaPredictions']
    # Robustness: a prediction with an empty tag list has no baseline either;
    # indexing popularity_ranking[0] would raise IndexError.
    if not mention or not mention['tags']:
        if verbose:
            print('no mention')
        continue

    popularity_ranking = sorted(mention['tags'],
                                key=lambda x: x['nb_statements'],
                                reverse=True)
    baseline = popularity_ranking[0]['id']

    score_ranking = sorted(mention['tags'],
                           key=lambda x: x['score'],
                           reverse=True)

    # Position of the baseline in score_ranking. It is always found because
    # both rankings are built from the same tag list.
    position = next(i for i, tag in enumerate(score_ranking) if tag['id'] == baseline)
    VILLESFR.at[idx, 'baseline_position'] = position
    if verbose:
        print(position)

In [135]:
# NOTE(review): only the last expression of a cell is displayed, so the
# describe() result on the next line is computed and then discarded.
VILLESFR['baseline_position'].describe()
# Share of rows per baseline rank within the score-based ranking.
VILLESFR['baseline_position'].value_counts(normalize=True)

baseline_position
0.0    0.755714
1.0    0.165714
2.0    0.035714
4.0    0.014286
3.0    0.014286
6.0    0.005714
7.0    0.002857
8.0    0.002857
5.0    0.001429
9.0    0.001429
Name: proportion, dtype: float64

# Back to Lesk method

In [110]:
import pandas as pd
import json
import os
from tqdm import tqdm
import requests
import seaborn as sns
import pysolr 
from unidecode import unidecode

In [None]:
def get_linked_resources_count(qid):
    """Unimplemented placeholder.

    The original cell contained only the signature, which is a syntax error.
    The intended behaviour is not visible in the notebook (presumably a count
    of resources linked to the entity `qid` — to be confirmed by the author).

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError('get_linked_resources_count is not implemented yet.')

In [114]:
def uri2json(uri, json_cache = {}):
    """
    Return the JSON document of a Wikidata resource, using and filling a cache.

    Args:
        uri: QID of the Wikidata resource (e.g. 'Q90').
        json_cache: dict {uri: JSON} of already downloaded documents. It is
            updated in place. NOTE: the default dict is created once and
            shared by every call that omits this argument, so it acts as a
            cache for the whole session.

    Returns:
        A 3-tuple (uri, json_cache, data) where data is the JSON document, or
        the string 'ERROR_REQUEST' when the download or decoding failed. The
        same sentinel is stored in the cache, so later calls for that uri
        return it without retrying.
    """
    # Serve from the cache when possible (this includes cached failures).
    if uri in json_cache:
        return uri, json_cache, json_cache[uri]

    url = f'https://www.wikidata.org/wiki/Special:EntityData/{uri}.json'
    try:
        # The timeout prevents a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
    except Exception as error:
        # Best effort: report the failure and hand back the sentinel. Before,
        # `data` was never bound on this path and the return statement raised
        # UnboundLocalError.
        print('Error requesting json at url ', url)
        print(error)
        data = 'ERROR_REQUEST'

    json_cache[uri] = data
    return uri, json_cache, data

def json2labels(uri, data):
    """
    Read the French and English labels of a Wikidata resource from its JSON.

    Args:
        uri: QID used as key under data['entities'].
        data: JSON document of the resource (anything else, e.g. None or an
            error sentinel string, is treated as "no entity").

    Returns:
        A 2-tuple (label_fr, label_en). A label is None when the entity exists
        but has no label in that language, and 'NO_ENTITIES' when the entity
        could not be read at all.
    """
    LABEL_fr = 'NO_ENTITIES'
    LABEL_en = 'NO_ENTITIES'
    try:
        entity = data.get('entities', {}).get(uri)
        labels = entity.get('labels', {})
        LABEL_fr = labels.get('fr', {}).get('value')
        LABEL_en = labels.get('en', {}).get('value')
    except (AttributeError, TypeError):
        # Raised when data/entity/labels is not a dict (missing entity, None,
        # error sentinel). Narrower than a bare except so that interrupts and
        # real programming errors are no longer swallowed.
        print('while reading the JSON, no entities found at uri ', uri)

    return LABEL_fr, LABEL_en

def _linked_uris_and_labels(entity, property_code, json_cache, uri):
    """
    Collect, for one property, the resources an entity points to and their labels.

    Returns ``(uris, labels, json_cache)``. ``uris`` and ``labels`` keep the
    placeholders ``'NO_<code>_URIS'`` / ``'NO_<code>_LAB'`` when the claims
    cannot be read; ``uri`` is only used in the printed message.
    """
    uris = 'NO_{}_URIS'.format(property_code)
    labels = 'NO_{}_LAB'.format(property_code)
    try:
        claims = entity.get('claims', {}).get(property_code, [])
        # Claims without a 'datavalue' (novalue / somevalue snaks) are skipped.
        uris = [claim['mainsnak']['datavalue']['value']['id']
                for claim in claims if 'datavalue' in claim['mainsnak']]
        labels = []
        for neighbour in uris:
            neighbour, json_cache, neighbour_json = uri2json(neighbour, json_cache)
            labels.append(json2labels(neighbour, neighbour_json))
    except Exception:
        print('no {} at URI '.format(property_code.lower()), uri)
    return uris, labels, json_cache


def uri2values(URI, json_cache = {}):
    """
    Describe a Wikidata resource: labels, descriptions and P31/P131/P361 neighbours.

    Parameters
    ----------
    URI : str
        Identifier of the resource.
    json_cache : dict
        Mapping ``{uri: JSON}`` passed to ``uri2json``; it is enriched with
        the JSON of the resource and of its P31/P131/P361 neighbours.

    Returns
    -------
    tuple
        ``(URI, json_cache, label_fr, label_en, description_fr, description_en,
        p31_uris, p31_labels, p131_uris, p131_labels, p361_uris, p361_labels)``.
        ``*_uris`` are lists of identifiers and ``*_labels`` lists of
        ``(label_fr, label_en)`` pairs. When something cannot be read the
        placeholders ``'NO_DESC'``, ``'NO_P31_URIS'``, ``'NO_P31_LAB'``, ...
        are returned instead.
    """
    # uri is the same as URI; JSON_CACHE is the updated cache; data is the JSON.
    uri, JSON_CACHE, data = uri2json(URI, json_cache)
    label_fr, label_en = json2labels(URI, data)

    description_fr = description_en = 'NO_DESC'

    # Bound up front so the property lookups below fail on a plain
    # AttributeError instead of relying on a swallowed NameError.
    entity = None
    try:
        entity = data.get('entities', {}).get(URI)
        descriptions = entity.get('descriptions', {})
        description_fr = descriptions.get('fr', {}).get('value')
        description_en = descriptions.get('en', {}).get('value')
    except Exception:
        print('no descriptions at URI ', uri)

    p31_uris, p31_labels, JSON_CACHE = _linked_uris_and_labels(entity, 'P31', JSON_CACHE, uri)
    p131_uris, p131_labels, JSON_CACHE = _linked_uris_and_labels(entity, 'P131', JSON_CACHE, uri)
    p361_uris, p361_labels, JSON_CACHE = _linked_uris_and_labels(entity, 'P361', JSON_CACHE, uri)

    return URI, JSON_CACHE, label_fr, label_en, description_fr, description_en, p31_uris, p31_labels, p131_uris, p131_labels, p361_uris, p361_labels

def process_uri(uri, cache):
    """
    Run ``uri2values`` on a resource and package the result as a flat record.

    Returns ``(updated_cache, record)`` where ``record`` is a dict with the
    keys 'uri', 'label_fr', 'label_en', 'desc_fr', 'desc_en', 'p31_uris',
    'p31_labels', 'p131_uris', 'p131_labels', 'p361_uris' and 'p361_labels'.
    """
    (resolved_uri, updated_cache,
     lab_fr, lab_en,
     desc_fr, desc_en,
     uris_31, labels_31,
     uris_131, labels_131,
     uris_361, labels_361) = uri2values(uri, cache)

    record = dict(
        uri=resolved_uri,
        label_fr=lab_fr,
        label_en=lab_en,
        desc_fr=desc_fr,
        desc_en=desc_en,
        p31_uris=uris_31,
        p31_labels=labels_31,
        p131_uris=uris_131,
        p131_labels=labels_131,
        p361_uris=uris_361,
        p361_labels=labels_361,
    )
    return updated_cache, record

def RETRIEVE_TOPONYM_ECOSYSTEM(URI, properties='', json_cache={}):
    """
    Describe a Wikidata resource and the neighbours reached through given properties.

    Parameters
    ----------
    URI : str
        Identifier of the resource.
    properties : str
        Comma-separated property codes, e.g. ``'P31,P131'``. Surrounding
        whitespace and empty items are ignored.
    json_cache : dict
        Mapping ``{uri: JSON}`` passed to ``uri2json`` and updated in place.
        NOTE: the default dict is created once and shared by every call that
        omits this argument; the signature is kept for backward compatibility.

    Returns
    -------
    tuple
        ``(URI, json_cache, label_fr, label_en, description_fr, description_en,
        properties_data)`` where ``properties_data`` maps each readable
        property code to ``(uris, labels)``, for instance::

            {'P31': (['Q484170'],
                     [('commune française', 'commune of France')]),
             'P131': (['Q12779', 'Q700845'],
                      [('Tarn-et-Garonne', 'Tarn-et-Garonne'),
                       ('arrondissement de Castelsarrasin',
                        'arrondissement of Castelsarrasin')])}

        ``labels`` is the list of ``(label_fr, label_en)`` pairs returned by
        ``json2labels``. A property whose claims cannot be read as item
        references is reported and left out; the other properties are still
        processed. Descriptions default to ``'NO_DESC'``.
    """
    # uri is the same as URI; JSON_CACHE is the updated cache; data is the JSON.
    uri, JSON_CACHE, data = uri2json(URI, json_cache)
    label_fr, label_en = json2labels(URI, data)

    description_fr = 'NO_DESC'
    description_en = 'NO_DESC'
    properties_data = {}

    entity = None
    try:
        entity = data.get('entities', {}).get(URI)
        descriptions = entity.get('descriptions', {})
        description_fr = descriptions.get('fr', {}).get('value')
        description_en = descriptions.get('en', {}).get('value')
    except Exception:
        print('no descriptions at URI ', uri)

    if not isinstance(entity, dict):
        # No entity to read claims from: report once and return what we have.
        print(f'Error processing properties at URI {uri}')
        return URI, JSON_CACHE, label_fr, label_en, description_fr, description_en, properties_data

    requested_codes = [code.strip() for code in properties.split(',')]
    for property_code in filter(None, requested_codes):
        try:
            property_values = entity.get('claims', {}).get(property_code, [])
            # Claims without a 'datavalue' (novalue / somevalue snaks) are skipped.
            property_uris = [claim['mainsnak']['datavalue']['value']['id']
                             for claim in property_values
                             if 'datavalue' in claim['mainsnak']]
            property_labels = []
            for property_uri in property_uris:
                property_uri, JSON_CACHE, property_data = uri2json(property_uri, JSON_CACHE)
                property_labels.append(json2labels(property_uri, property_data))
        except Exception:
            # One unreadable property must not discard the remaining ones.
            print(f'Error processing properties at URI {uri}')
            continue
        properties_data[property_code] = (property_uris, property_labels)

    return URI, JSON_CACHE, label_fr, label_en, description_fr, description_en, properties_data


In [116]:
def phrase2candidates(motforme, collection = 'frenchtapioca4', query = '', parameters = '', ):
    """
    Query a local Solr collection for candidates matching a phrase.

    Parameters
    ----------
    motforme : str
        Phrase to look up; only used when ``query`` is empty.
    collection : str
        Name of the Solr collection under ``http://localhost:8983/solr/``.
    query : str
        Solr query string. When empty, the fuzzy query
        ``name_tag:<motforme>~`` is sent with the parameters
        ``{"indent": "true", "fl": "id,label", "q.op": "OR", "useParams": ""}``.
    parameters : dict
        Extra search parameters sent with a custom ``query`` (for instance
        ``rows`` to bound the number of results). An empty value means
        "no extra parameter".

    Returns
    -------
    The object returned by ``pysolr.Solr.search``, or ``None`` when the Solr
    client could not be created.
    """
    url_collec = 'http://localhost:8983/solr/' + collection +'/'
    try :
        solr = pysolr.Solr(url_collec, always_commit=True,)
    except Exception :
        print('Error connecting to ', url_collec)
        return None

    if query == '' :
        pattern = 'name_tag:{}~'.format(motforme)
        args = { "indent":"true", "fl":"id,label", "q.op":"OR", "useParams":""}
    else :
        pattern = query
        # The default '' cannot be expanded with ** (TypeError): fall back to
        # an empty mapping when no parameters were supplied.
        args = parameters if parameters else {}

    return solr.search(pattern, ** args)

In [120]:
import pandas as pd
from pyymatcher import PyyMatcher
import difflib
import Levenshtein
import textdistance as td
import metaphone
import pylcs
import jellyfish

def JW(a, b):
    """Return the Jaro-Winkler similarity of ``a`` and ``b`` as computed by jellyfish."""
    similarity = jellyfish.jaro_winkler_similarity(a, b)
    return similarity

def gpm(a, b):
    """Return ``PyyMatcher(...).ratio()`` computed on the lower-cased inputs."""
    return PyyMatcher(a.lower(), b.lower()).ratio()

def inter(s1, s2):
    """
    Return the length of the longest contiguous block shared by two strings.

    The comparison is case-insensitive. The search bounds are taken from the
    lower-cased strings, because lower-casing can change a string's length
    (e.g. 'İ' becomes two code points).
    """
    lowered_1, lowered_2 = s1.lower(), s2.lower()
    matcher = difflib.SequenceMatcher(None, lowered_1, lowered_2)
    match = matcher.find_longest_match(0, len(lowered_1), 0, len(lowered_2))
    return match.size

def overlap(s1, s2):
    """
    Return the longest shared contiguous block as a fraction of the shorter string.

    The comparison is case-insensitive and all lengths are those of the
    lower-cased strings. Returns 0.0 when either string is empty, since there
    is nothing to share and the ratio would otherwise divide by zero.
    """
    lowered_1, lowered_2 = s1.lower(), s2.lower()
    shortest = min(len(lowered_1), len(lowered_2))
    if shortest == 0:
        return 0.0
    matcher = difflib.SequenceMatcher(None, lowered_1, lowered_2)
    match = matcher.find_longest_match(0, len(lowered_1), 0, len(lowered_2))
    return match.size / shortest

def levenshtein(a, b):
    """Return ``Levenshtein.distance(a, b)``."""
    edit_distance = Levenshtein.distance(a, b)
    return edit_distance

def cosinus(a, b):
    """Return ``textdistance.cosine(a, b)``."""
    score = td.cosine(a, b)
    return score

def lcsseq(a ,b):
    """
    Return the longest-common-subsequence length over the shorter input's length.

    Returns 0.0 when either input is empty: nothing can be shared, and the
    ratio would otherwise divide by zero.
    """
    shortest = min(len(a), len(b))
    if shortest == 0:
        return 0.0
    return pylcs.lcs_sequence_length(a, b) / shortest

def ratio_double_metaphone(a,b, method = lcsseq):
    """
    Compare two strings through the first code returned by ``metaphone.doublemetaphone``.

    ``method`` is called with the two codes (that of ``a`` first) and its
    result is returned unchanged.
    """
    primary_codes = [metaphone.doublemetaphone(word)[0] for word in (a, b)]
    return method(*primary_codes)

# Count the context labels that approximately match one of a candidate's labels
def COMPUTE_OVERLAP(V, Ci, 
                    method1 = ratio_double_metaphone, alpha1=0.9, 
                    method2 = lcsseq, alpha2 = 0.8,
                    method3 = overlap, alpha3 = 0.5):
    """
    Count the labels of ``V`` that have an approximate match in ``Ci``.

    Parameters
    ----------
    V : iterable of str
        Labels to look for.
    Ci : iterable of str
        Labels to search in.
    method1, method2, method3 : callables ``(str, str) -> number``
        String-similarity functions, applied to the ``unidecode``-d labels.
    alpha1, alpha2, alpha3 : numbers
        Thresholds; a pair matches only when every method returns a value
        strictly greater than its threshold.

    Returns
    -------
    tuple
        ``(intersection, approx_matches)``: the number of labels of ``V``
        that matched, and the list of ``(v_label, ci_label)`` pairs, keeping
        for each ``v_label`` the first matching ``ci_label`` in ``Ci`` order.
    """
    # Transliterate each candidate label once instead of once per comparison;
    # materialising Ci also lets a one-shot iterator serve every label of V.
    candidates = [(ci_label, unidecode(ci_label)) for ci_label in Ci]

    approx_matches = []
    for v_label in V:
        v_ascii = unidecode(v_label)
        for ci_label, ci_ascii in candidates:
            # `and` short-circuits, so later methods only run on pairs that
            # passed the earlier ones.
            if (method1(v_ascii, ci_ascii) > alpha1
                    and method2(v_ascii, ci_ascii) > alpha2
                    and method3(v_ascii, ci_ascii) > alpha3):
                approx_matches.append((v_label, ci_label))
                break  # one match per label of V is enough

    return len(approx_matches), approx_matches

In [None]:
def SIMPLE_LESK(Head,
                SetContext,
                sainteté = False,
                prop = 'P17,P31,P131,P361,P206,P47',
                imprimer = False,
                quantité = 20,
                
               ):
    """
    Pick the best Wikidata candidate for a toponym from its textual context.

    Candidates are retrieved from the Solr index 'testvillesfr'. Each one is
    scored by the overlap (``COMPUTE_OVERLAP``) between ``SetContext`` and its
    "ecosystem": the lower-cased labels of the resources it is linked to
    through the properties in ``prop``. The highest overlap wins; ties are
    broken by the value of ``get_linked_resources_count``. With no positive
    overlap, the candidate with the highest such count is returned.

    Parameters
    ----------
    Head : str or list of str
        The toponym, or a list of its aliases.
    SetContext : list of str
        Toponyms found in the context.
    sainteté : bool
        True if 'saint' must be part of the query.
    prop : str
        Comma-separated properties used to build the ecosystem. Defaults:
            wd:P17   country
            wd:P31   instance of
            wd:P47   shares border with
            wd:P361  part of
            wd:P206  located in or next to body of water
            wd:P131  located in the administrative territorial entity
    imprimer : bool
        True to print the query and the scoring details.
    quantité : int
        Maximum number of candidates requested from Solr ("rows").

    Returns
    -------
    tuple
        1) the Solr response holding the candidates (``None`` if
           ``phrase2candidates`` returned ``None``),
        2) the best candidate's identifier ('Q0' when there is no candidate),
        3) a dict ``{qid: {'label', 'inlink', 'ecosystem', 'overlap', 'matchs'}}``.

    The query has the form, for one alias::

        (label:rambert^2 OR (label:rambert~1)^1) AND name_tag:*saint*

    With several aliases the clauses are OR-ed and, when ``sainteté`` is set,
    grouped in parentheses so the 'saint' filter applies to all of them.
    """
    # A bare string is ONE alias, not a sequence of characters.
    aliases = [Head] if isinstance(Head, str) else list(Head)

    # Build the query
    Qi = ' OR '.join('(label:{}^2 OR (label:{}~1)^1)'.format(alias, alias)
                     for alias in aliases)
    if sainteté == True :
        if len(aliases) > 1 :
            # Without grouping, the AND would only bind to the last alias.
            Qi = '({})'.format(Qi)
        Qi += " AND name_tag:*saint*"
    # 'uri' is requested because candidate['uri'] is read below.
    Pi = {"indent":"true",
          "fl":"id, uri, label, score",
          "rows": str(quantité),
          "q.op":"OR",
          "useParams":""}
    INDEX = 'testvillesfr'
    if imprimer :
        print('Query : ',Qi)
        print('Parameters :')
        print(Pi, '\n')

    # Retrieve candidates
    SetofCandidates = phrase2candidates(motforme=Head, collection= INDEX, query=Qi, parameters = Pi)

    # Initialization: 'Q0' is the placeholder answer when nothing is found
    BestCandidate = 'Q0'
    maxoverlap = 0
    maxinlink = 0
    CandidatesPerformances = {BestCandidate: {'label': 'None',
                              'inlink': 0 ,
                              'ecosystem': [],
                              'overlap': maxoverlap,
                              'matchs': [],}
             }

    if SetofCandidates is None :
        # phrase2candidates could not reach Solr: nothing to rank.
        return SetofCandidates, BestCandidate, CandidatesPerformances

    # First pass: default to the candidate with the highest inlink count
    for candidate in SetofCandidates.docs :
        qid = candidate['uri'][0]
        inlink = get_linked_resources_count(qid)
        CandidatesPerformances[qid] = {'label': candidate['label'][0]}
        CandidatesPerformances[qid]['inlink'] = inlink
        if inlink > maxinlink :
            BestCandidate = qid
            maxinlink = inlink

    # MAIN ITERATION
    for candidate in SetofCandidates.docs :
        qid = candidate['uri'][0]

        # Building the ecosystem
        Ecosystem_i = []
        URI, _, _,_,_,_, properties_data = RETRIEVE_TOPONYM_ECOSYSTEM(qid, properties=prop)

        for _uris, labels in properties_data.values():
            for neighbour_fr, neighbour_en in labels :
                # French label first, English as a fallback
                prefered_label = neighbour_fr if neighbour_fr is not None else neighbour_en
                if prefered_label is None :
                    continue  # neighbour without a label in either language
                Ecosystem_i.append(prefered_label.lower())

        # Compute the overlap (the local name avoids shadowing overlap())
        CandidatesPerformances[qid]['ecosystem'] = Ecosystem_i
        overlap_i, matchs = COMPUTE_OVERLAP(SetContext, Ecosystem_i)

        CandidatesPerformances[qid]['overlap'] = overlap_i
        CandidatesPerformances[qid]['matchs'] = matchs
        if imprimer :
            print("building '''{}''' {}'s ecosystem (inlinks : {}) : ".format(candidate['label'][0], qid, CandidatesPerformances[qid]['inlink']))
            print('Overlap of {} with {}'.format(overlap_i, Ecosystem_i))
            print(matchs)

        # Better overlap wins; on equal overlap the higher inlink count wins.
        more_linked = CandidatesPerformances[qid]['inlink'] > CandidatesPerformances[BestCandidate]['inlink']
        if (overlap_i > maxoverlap) or (overlap_i == maxoverlap and more_linked) :
            if imprimer :
                print('\033[97;40m' + '######################################################OJO \n' + '\033[0m')
            maxoverlap = overlap_i
            BestCandidate = qid

    if imprimer :
        print('------------------')
        print('------------------')
        print('------------------')
        print(maxoverlap, 'with', BestCandidate,)
        print('------------------')
        print('------------------')
        print('------------------')
    return SetofCandidates, BestCandidate, CandidatesPerformances