In [1]:
import json
import os
import unicodedata

import pandas as pd

import WikidataObject as wdo

# Root of the T-Res checkout. Set the T_RES_PATH environment variable to run the
# notebook on another machine; the historical location is kept as the default.
# os.path.join(..., '') guarantees the trailing separator that the
# `path + 'resources/...'` concatenations in later cells rely on.
path = os.path.join(
    os.environ.get('T_RES_PATH', '/home/antoine/Documents/GitHub/T-Res/'), ''
)

In [3]:
# Read the Wikidata gazetteer and report how many distinct identifiers it contains.
gazetteer_csv = path + 'resources/wikidata/wikidata_gazetteer.csv'
gaz = pd.read_csv(gazetteer_csv, low_memory=False)
gazetteer_ids = set(gaz["wikidata_id"])
print(len(gazetteer_ids))

987945


In [2]:
def load_resources(method="mostpopular", resources_path="../resources/"):
    """
    Load the ranker's mention/Wikidata mappings and filter out noisy mentions.

    Only ``wikidata/wikidata_to_mentions_normalized.json`` (under
    ``resources_path``) is read: the mention -> Wikidata mapping is rebuilt by
    inverting the filtered Wikidata -> mention mapping, so the on-disk
    ``mentions_to_wikidata_normalized.json`` file is not needed here.

    Filtering rule, applied per Wikidata ID: mentions containing ``", "`` or
    ``" ("`` are dropped, unless that would leave the entity with no mention at
    all, in which case its original mentions are kept untouched.

    Args:
        method (str): Ranking method. For ``"partialmatch"`` and
            ``"levenshtein"`` pandarallel is initialised with 10 workers and
            ``TOKENIZERS_PARALLELISM`` is set to ``"true"``.
        resources_path (str): Folder containing the ``wikidata/`` resources.

    Returns:
        tuple(dict, dict): ``(mentions_to_wikidata, wikidata_to_mentions)``,
        each a two-level dict whose inner values are the values stored in the
        source JSON.

    Raises:
        FileNotFoundError: If the Wikidata -> mention JSON file is missing.
        ImportError: If a parallel method is requested and pandarallel is not
            installed.
    """
    print("*** Loading the ranker resources.")

    wikidata_to_mentions_file = os.path.join(
        resources_path, "wikidata/wikidata_to_mentions_normalized.json"
    )
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(wikidata_to_mentions_file, "r", encoding="utf-8") as f:
        wikidata_to_mentions_raw = json.load(f)

    wikidata_to_mentions = {}
    mentions_to_wikidata = {}
    for wk, mentions in wikidata_to_mentions_raw.items():
        # Filter mentions to remove noise
        stripped = {
            mention: value
            for mention, value in mentions.items()
            if ", " not in mention and " (" not in mention
        }
        # Fall back to the unfiltered mentions rather than emptying the entity.
        kept = stripped if stripped else dict(mentions)
        wikidata_to_mentions[wk] = kept

        # Build the inverse mapping from the very same filtered data.
        for mention, value in kept.items():
            mentions_to_wikidata.setdefault(mention, {})[wk] = value

    # Parallelize if ranking method is one of the following:
    if method in ["partialmatch", "levenshtein"]:
        # Imported lazily: the name was never brought into scope before, and
        # the dependency is only needed for the parallel ranking methods.
        from pandarallel import pandarallel

        pandarallel.initialize(nb_workers=10)
        os.environ["TOKENIZERS_PARALLELISM"] = "true"

    return mentions_to_wikidata, wikidata_to_mentions

mentions_to_wikidata, wikidata_to_mentions = load_resources()

*** Loading the ranker resources.


# Text edit-distance tests

In [4]:
from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance as damlev
from t_res.geoparser import geode_pipe,ranking,linking

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
filepath = 'VILLESFR.json'
VILLESFR = pd.read_json(filepath, orient='records', lines=True)

In [7]:
myranker = ranking.Ranker(
    method="levenshtein",
    resources_path="../resources/",
)
mylinker = linking.Linker(
    method="mostpopular",
    resources_path="../resources/",
)

In [9]:
# Build the geoparsing pipeline from the ranker and linker instantiated in the previous cell.
# NOTE(review): NER_path is not assigned in any cell visible in this notebook — it must be
# defined before this cell can run on a fresh kernel.
geoparser = geode_pipe.Pipeline(geodeNERpath=NER_path,
                                myranker=myranker,
                                mylinker=mylinker,
                                )

*** Loading the ranker resources.
INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
*** Load linking resources.
  > Loading mentions to wikidata mapping.
  > Loading gazetteer.
*** Linking resources loaded!



In [13]:
damlev('ROQUEMADOUR'.lower(),
       'Rocamadour'.lower())

0.27272728085517883

# How many candidates?

In [29]:
df = pd.read_json(path + 'examples/t_res_damlev+mostpop_170424.json',
                   orient='records', lines=True)

In [30]:
df.sample(1)

Unnamed: 0,key,head,fullcontent,gold,resolved,skyline,bestPred,acc10
120,volume02-1574,BELLOC,"* BELLOC , ( Géog . ) petite ville de France e...",Q369126,"{'mention': 'BELLOC', 'ner_score': 1, 'pos': 2...",False,Q1017147,False


# Test: adding aliases

In [5]:
# establish connection with Solr
from pysolr import Solr
solr = Solr('http://localhost:8983/solr/frenchtapioca5')

In [6]:
#query solr
results = solr.search('id:Q90')
'la ciutat de la llum' in results.docs[0]['mention']

True

In [12]:
# Reload the Wikidata gazetteer and count its distinct identifiers.
gazetteer_csv = path + 'resources/wikidata/wikidata_gazetteer.csv'
gaz = pd.read_csv(gazetteer_csv, low_memory=False)
gazetteer_ids = set(gaz["wikidata_id"])
print(len(gazetteer_ids))

987945


In [13]:
gaz.columns

Index(['wikidata_id', 'english_label', 'instance_of', 'alias_dict',
       'nativelabel', 'hcounties', 'countries', 'latitude', 'longitude'],
      dtype='object')

In [14]:
def get_list_labels_aliases(wd):
    """
    Collect every distinct label and alias of a Wikidata object.

    Args:
        wd: Object exposing ``_get_label(lang='all')`` returning a mapping whose
            values are labels, and ``_get_aliases(lang='all')`` returning a
            mapping whose values are iterables of aliases (presumably keyed by
            language — confirm against ``WikidataObject``).

    Returns:
        list: The de-duplicated labels and aliases, in arbitrary order.
    """
    all_labels = wd._get_label(lang='all')
    all_aliases = wd._get_aliases(lang='all')

    names = set()
    # Labels first, then each alias group, all merged into one set.
    names.update(all_labels.values(), *all_aliases.values())
    return list(names)

In [89]:
# Function to load the last saved JSON file if it exists
def load_progress(filename):
    """
    Return the checkpoint stored in ``resources/wikidata/<filename>.json``.

    The file is looked up under the notebook-level ``path``. When it does not
    exist, an empty dict is returned.
    """
    json_file = path + 'resources/wikidata/{}.json'.format(filename)
    if not os.path.exists(json_file):
        return {}
    with open(json_file, 'r') as f:
        return json.load(f)

# Function to dump the dict as a JSON file
def save_progress(gaz_dict, filename):
    """
    Write ``gaz_dict`` to ``resources/wikidata/<filename>.json`` under ``path``.

    The data is first written to a sibling ``.tmp`` file and then moved over
    the target with ``os.replace``, so an interruption in the middle of a
    checkpoint cannot leave a truncated JSON file that ``load_progress`` would
    fail to parse on the next run.

    Args:
        gaz_dict (dict): JSON-serialisable checkpoint content.
        filename (str): Checkpoint name, without directory or extension.
    """
    json_file = path + 'resources/wikidata/{}.json'.format(filename)
    tmp_file = json_file + '.tmp'
    with open(tmp_file, 'w') as f:
        json.dump(gaz_dict, f)
    # Replace the previous checkpoint only once the new one is complete.
    os.replace(tmp_file, json_file)


In [17]:
import tqdm

In [90]:
filename = 'gaz3'

# Resume from the checkpoint written by a previous run, if there is one.
gaz_dict = load_progress(filename)

# Per-run counters: entities missing from / present in the Solr index.
not_in_solr = 0
in_solr = 0

# Walk the gazetteer and fetch the mentions of every entity not yet processed.
for i, row in tqdm.tqdm(gaz.iterrows(), total=len(gaz)):
    qid = row["wikidata_id"]

    if qid not in gaz_dict:
        doc = solr.search('id:'+qid).docs
        if doc:
            in_solr += 1
            aka_list = doc[0]['mention']
        else:
            # No Solr document: store an empty list (the Wikidata lookup via
            # get_list_labels_aliases is disabled in this loop).
            not_in_solr += 1
            aka_list = []

        gaz_dict[qid] = aka_list

        # Checkpoint whenever a newly processed row lands on a multiple of 1000.
        if (i + 1) % 1000 == 0:
            save_progress(gaz_dict, filename)

# Report the counters, then write the final checkpoint.
print("\n Request on Wikidata : ", not_in_solr, " \n Found in Solr : ", in_solr, end='', flush=True)
save_progress(gaz_dict, filename)

100%|██████████| 987945/987945 [53:51<00:00, 305.70it/s]  


 Request on Wikidata :  127510  
 Found in Solr :  860435




In [93]:
# Build the entity -> alias and alias -> entity mappings from gaz_dict,
# giving every (entity, alias) pair a count of 1.
# NOTE(review): the *_normalized dicts receive exactly the same content as the
# plain ones — no normalisation is applied here; confirm this is intended.
wikidata_to_aka = dict()
wikidata_to_aka_normalized = dict()
aka_to_wikidata = dict()
aka_to_wikidata_normalized = dict()

for qid, akas in gaz_dict.items():

    # Entities without any alias are left out of all four mappings.
    if not akas:
        continue

    wikidata_to_aka[qid] = {aka: 1 for aka in akas}
    wikidata_to_aka_normalized[qid] = {aka: 1 for aka in akas}

    for aka in akas:
        aka_to_wikidata.setdefault(aka, {})[qid] = 1
        aka_to_wikidata_normalized.setdefault(aka, {})[qid] = 1

        

In [97]:
len(wikidata_to_aka)

860435

In [99]:
len(aka_to_wikidata)

6141338

In [102]:
# Average number of aliases stored per Wikidata entity.
total_akas = sum(len(akas) for akas in wikidata_to_aka.values())
mean_ambig = total_akas / len(wikidata_to_aka)
mean_ambig

7.7133996176352655

In [94]:
# Write the four mappings to resources/wikidata2/, one JSON file each.
exports = [
    ('resources/wikidata2/wikidata_to_mentions.json', wikidata_to_aka),
    ('resources/wikidata2/wikidata_to_mentions_normalized.json', wikidata_to_aka_normalized),
    ('resources/wikidata2/mentions_to_wikidata.json', aka_to_wikidata),
    ('resources/wikidata2/mentions_to_wikidata_normalized.json', aka_to_wikidata_normalized),
]
for relative_path, mapping in exports:
    with open(path + relative_path, 'w') as f:
        json.dump(mapping, f)

### Merging the gazetteer with the new aka (alias) lists

In [2]:
def _read_json(relative_path):
    """Parse the JSON file located at ``path + relative_path``."""
    with open(path + relative_path, 'r') as f:
        return json.load(f)


# Read the four mappings back from resources/wikidata2/.
wikidata_to_mentions = _read_json('resources/wikidata2/wikidata_to_mentions.json')
wikidata_to_mentions_normalized = _read_json('resources/wikidata2/wikidata_to_mentions_normalized.json')
mentions_to_wikidata = _read_json('resources/wikidata2/mentions_to_wikidata.json')
mentions_to_wikidata_normalized = _read_json('resources/wikidata2/mentions_to_wikidata_normalized.json')

In [8]:
# Number of aka entries for Marseille
len(wikidata_to_mentions['Q23482'])

133

In [4]:
gaz.columns

Index(['wikidata_id', 'english_label', 'instance_of', 'alias_dict',
       'nativelabel', 'hcounties', 'countries', 'latitude', 'longitude'],
      dtype='object')

In [20]:
import tqdm
# Add an 'aka' column holding each entity's mention dict; rows whose ID is
# absent from wikidata_to_mentions keep None.
gaz['aka'] = None
for idx, qid in tqdm.tqdm(gaz['wikidata_id'].items(), total=len(gaz)):
    if qid in wikidata_to_mentions:
        gaz.at[idx, 'aka'] = wikidata_to_mentions[qid]

100%|██████████| 987945/987945 [00:31<00:00, 31367.04it/s]


In [None]:
gaz.

# Unidecoding the aka

In [11]:
import re
def unicodeToAscii(s):
    """
    Strip combining diacritical marks from a Unicode string.

    The string is decomposed with NFD and every character whose Unicode
    category is 'Mn' (nonspacing mark) is dropped, so accented letters lose
    their accents. Characters that carry no combining mark after decomposition
    (e.g. 'ß' or CJK ideographs) are kept as they are, so despite its name the
    function does not guarantee a pure-ASCII result. Whitespace is left
    untouched. Adapted from https://stackoverflow.com/a/518232/2809427

    >>> unicodeToAscii('viertgrößte ')
    'viertgroßte '

    Args:
        s (str): Text to process.

    Returns:
        str: ``s`` in NFD form without nonspacing marks.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """
    Lowercase and trim a string, strip its accents with ``unicodeToAscii``,
    then collapse every run of characters other than ASCII letters, '!' and
    '?' into a single space.

    >>> normalizeString('Francaise1 et Санкт-Петербург viertgroßte Europas 圣彼得堡 ')
    'francaise et viertgro te europas'
    """
    unaccented = unicodeToAscii(s.lower().strip())
    # Put a space in front of sentence punctuation so it becomes its own token.
    spaced = re.sub(r"([.!?])", r" \1", unaccented)
    # Anything that is not a letter, '!' or '?' turns into one space per run.
    collapsed = re.sub(r"[^a-zA-Z!?]+", r" ", spaced)
    return collapsed.strip()

from unidecode import unidecode
# 'Francaise1 et Санкт-Петербург viertgroßte Europas 圣彼得堡 '
# 'Francaise1 et Sankt-Peterburg viertgrosste Europas Sheng Bi De Bao  '

In [16]:
# Compare the number of Marseille aka entries before and after unidecode
# transliteration (entries that transliterate to the same string are merged).
marseille_akas = wikidata_to_mentions['Q23482']
unidecode_akas = dict.fromkeys((unidecode(aka) for aka in marseille_akas), 1)
len(marseille_akas), len(unidecode_akas)

(133, 114)