In [6]:
import os
import spacy
nlp = spacy.load('fr_core_news_md')
from spacy.matcher import Matcher
import re
import pandas as pd

In [7]:
def import_text(path_txt, encoding='utf-8'):
    """Read a text file and return its full content as one string.

    Parameters
    ----------
    path_txt : str or path-like
        Location of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so that
        accented characters and the '¬' hyphenation marker are decoded the
        same way on every platform instead of depending on the locale.

    Returns
    -------
    str
        The complete content of the file.
    """
    with open(path_txt, encoding=encoding) as t:
        texte = t.read()
    return texte

In [8]:
def nettoyage(txt):
    """Flatten a multi-line text into a single line.

    A '¬' immediately followed by a line break is removed together with the
    break (the two halves of the word are glued back); every remaining line
    break becomes a single space.
    """
    sans_cesure = txt.replace('¬\n', '')
    lignes = sans_cesure.split('\n')
    return ' '.join(lignes)

In [9]:
# Approximate search used to check the occurrences of a given term.
def fuzzy_verif(terme, doc_txt):
    """Print every fuzzy match of `terme` found in a spaCy Doc.

    Parameters
    ----------
    terme : str
        Term to look for. Each whitespace-separated word becomes one fuzzy
        token pattern, so multi-word terms are matched token by token.
    doc_txt : spacy.tokens.Doc
        Document to search (the result of calling `nlp` on a text).

    Returns
    -------
    None
        Matches are only printed, with their token start/end positions.
    """
    # FUZZY is an operator that must be nested under a token attribute
    # (here TEXT); used as a top-level key it is rejected by the Matcher.
    pattern = [{"TEXT": {"FUZZY": mot}} for mot in terme.split()]
    if not pattern:
        # Empty or blank term: nothing to search for.
        return

    matcher = Matcher(nlp.vocab)
    matcher.add('match', [pattern])

    for match_id, start, end in matcher(doc_txt):
        span = doc_txt[start:end]
        print(f"Match found: {span.text}, at positions: {start} - {end}")
    

In [10]:

def dict2df(date_expo, dict_preteurs):
    """Flatten a nested lender mapping into a DataFrame, one row per object.

    Parameters
    ----------
    date_expo : str
        Value copied into the 'date_expo' column of every row.
    dict_preteurs : dict
        Mapping lender name -> {object id: description}.

    Returns
    -------
    pandas.DataFrame
        Columns 'date_expo', 'id_object', 'description', 'nom_preteur', in
        the iteration order of the input dictionaries.
    """
    colonnes = ['date_expo', 'id_object', 'description', 'nom_preteur']
    lignes = [
        (date_expo, numero, texte_objet, preteur)
        for preteur, objets in dict_preteurs.items()
        for numero, texte_objet in objets.items()
    ]
    return pd.DataFrame(lignes, columns=colonnes)

In [11]:

def df_NER(df, col_name):
    """Return a copy of `df` with a 'NER' column of detected place names.

    Each value of `df[col_name]` is run through the spaCy pipeline `nlp`
    and the entities labelled "LOC" are collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    col_name : str
        Name of the column holding the texts to analyse.

    Returns
    -------
    pandas.DataFrame
        Deep copy of `df` with an extra 'NER' column. A row holds None when
        no LOC entity was found (or when the cell is not a string);
        otherwise it holds the distinct LOC entities in order of appearance,
        joined with "; ".
    """
    ner_values = []
    for description in df[col_name]:
        # Missing values (NaN/None) cannot be passed to the pipeline.
        if not isinstance(description, str):
            ner_values.append(None)
            continue

        locations = [ent.text for ent in nlp(description).ents if ent.label_ == "LOC"]
        # Keep every location instead of letting the last one overwrite the
        # others; dict.fromkeys removes repeats while preserving order.
        locations = list(dict.fromkeys(locations))
        ner_values.append("; ".join(locations) if locations else None)

    nv_df = df.copy(deep=True)
    # Positional assignment: stays correct whatever the index of `df` is
    # (filtered frame, non-default or duplicated labels).
    nv_df['NER'] = ner_values
    return nv_df

# Catalogue de l'exposition de 1893

In [12]:
# Relative path of the text file holding the 1893 catalogue.
path_txt_1893 = './catExp_txt_nett/1893_cat_collections.txt'

In [13]:
# Load the whole 1893 catalogue file into a single string.
texte=import_text(path_txt_1893)

In [15]:
# Cut the text at every 'COLLECTION DE' marker; the first element is
# whatever precedes the first marker (possibly an empty string).
liste_collections=texte.split('COLLECTION DE')

In [None]:

# Build the mapping lender name -> {catalogue number: object description}
# from the chunks obtained by splitting the text on 'COLLECTION DE'.
dict_preteurs = {}

for coll in liste_collections:
    txt_coll = nettoyage(coll)
    # A blank chunk (e.g. the text starts with the marker) is not a
    # collection; skipping it avoids registering a lender named ''.
    if not txt_coll.strip():
        continue

    # Split before each "<digits>." preceded by whitespace: the first piece
    # is the heading (lender name), the following ones are numbered items.
    liste_items_coll = re.split(r'\s(?=\d+\.)', txt_coll)
    nom_preteur = liste_items_coll[0].strip().lower()
    nom_preteur = nom_preteur.replace('collection de', '')
    nom_preteur = " ".join(w.capitalize() for w in nom_preteur.split())

    # setdefault merges the items when the same lender heading occurs more
    # than once, instead of discarding the earlier section.
    dict_coll = dict_preteurs.setdefault(nom_preteur, {})

    for item in liste_items_coll[1:]:
        # "<id>.<description>": everything before the first '.' is the id.
        id_object, _, objet = item.partition('.')
        dict_coll[id_object.strip()] = objet.strip()

# Short summary rather than dumping the whole dictionary in the output.
nb_objets = sum(len(objets) for objets in dict_preteurs.values())
print(f"{len(dict_preteurs)} lenders, {nb_objets} objects")


In [17]:
# One row per object of the 1893 exhibition, tagged with the year '1893'.
df_expo_1893 = dict2df('1893', dict_preteurs)

In [18]:
# Export the 1893 table to the working directory (the DataFrame index is
# written as the first, unnamed column).
df_expo_1893.to_csv('expo_1893.csv')

In [19]:
# Copy of the 1893 table with a 'NER' column filled from the LOC entities
# that spaCy detects in the 'description' column.
df_expo_1893_NER = df_NER(df_expo_1893, 'description')

In [20]:
# Export the 1893 table including the 'NER' column.
df_expo_1893_NER.to_csv('expo_1893_NER.csv')

# Catalogue de l'exposition de 1903

In [21]:
# Relative path of the text file holding the 1903 catalogue.
path_txt_1903 = './catExp_txt_nett/1903_OCR_bnf_coll.txt'

In [22]:
# Load the whole 1903 catalogue file into a single string.
texte_1903 = import_text(path_txt_1903)

In [23]:
# Split on the whitespace character that precedes each "<digits>." so that
# every piece after the first starts with a numbered entry.
list_items = re.split(r'\s(?=\d+\.)', texte_1903)

In [24]:
# Parse the numbered entries of the 1903 catalogue into rows
# (date_expo, id_object, description, nom_preteur).
liste_tuples_items = []
date_expo = '1903'
# Defined before the loop so that a first entry without an "Appartenant"
# line neither raises NameError nor reuses a value left over from a
# previous run of this cell.
nom_preteur = None

for item in list_items:
    # item[:1] is '' for an empty chunk, where item[0] would raise IndexError.
    if not item[:1].isdigit():
        continue

    id_object = item.split('.')[0]

    # parts[0] is the entry up to the owner mention; parts[1], when present,
    # starts with "Appartenant".
    parts = re.split(r"\b(?=Appartenant)", item)

    if len(parts) > 1:
        # The owner is read from the first line of the "Appartenant ..." text.
        appartenant = parts[1].split('\n')[0]
        # Drop only the leading "Appartenant à"; a blanket replace of "à"
        # would also remove that letter inside the owner's name or address.
        nom_preteur = re.sub(r"^Appartenant\s*(?:à\s+)?", "", appartenant)
        nom_preteur = " ".join(w.capitalize() for w in nom_preteur.split())
    # NOTE(review): entries without an "Appartenant" line keep the lender of
    # the previous entry — confirm this matches the catalogue layout.

    # Remove the leading "<id>." and apply the same cleaning to every entry.
    description = parts[0][len(id_object) + 1:]
    description = nettoyage(description).strip()

    liste_tuples_items.append((date_expo, id_object, description, nom_preteur))


df_expo_1903 = pd.DataFrame(liste_tuples_items, columns=['date_expo', 'id_object', 'description', 'nom_preteur'])
     

In [25]:
# Export the 1903 table to the working directory (the DataFrame index is
# written as the first, unnamed column).
df_expo_1903.to_csv('expo_1903.csv')

In [26]:
# Copy of the 1903 table with a 'NER' column filled from the LOC entities
# that spaCy detects in the 'description' column.
df_expo_1903_NER = df_NER(df_expo_1903, 'description')

In [28]:
# Export the 1903 table including the 'NER' column.
df_expo_1903_NER.to_csv('expo_1903_NER.csv')

# CSV final

In [32]:
# CSV without NER: stack the 1893 and 1903 tables row-wise, renumbering the
# index from 0, then export the combined table.
df_expos = pd.concat([df_expo_1893, df_expo_1903], axis=0, ignore_index=True)
df_expos.to_csv('expositions.csv')

In [33]:
# CSV with NER: same row-wise concatenation for the two tables that carry
# the 'NER' column, then export.
df_expos_NER = pd.concat([df_expo_1893_NER, df_expo_1903_NER], axis=0, ignore_index=True)
df_expos_NER.to_csv('expositions_NER.csv')