In [1]:
#!/bin/bash -e

# Use of ANNIF library

Ce notebook contient toutes les étapes de l'utilisation de la librairie annif (doc d'installation de toutes les librairies à prévoir).
- Formatage des données pour utilisation dans ANNIF
- Entrainement d'un modèle 
- Utilisation de pipelines pour tester plusieurs modèles
- Recherche des meilleurs paramètres  

## Setup 

### Packages

In [2]:
# Import librairies
import os
import re
import csv
import pandas as pd

from itertools import chain

### Graphical parameters

### Paths

In [3]:
# Set path
abes_path = "/home/aurelie/ABES/labo-indexation-ai/"
os.chdir(abes_path)

In [4]:
# Create folders if needed
list_folder = [
    "ANNIF", 
    "ANNIF/data", "ANNIF/reports",
    "ANNIF/data/train", "ANNIF/data/test", "ANNIF/data/valid"]

for folder in list_folder:
    if not os.path.exists(folder):
        os.makedirs(folder)
    else:
        print(f"Folder {folder} already exists")



Folder ANNIF already exists
Folder ANNIF/data already exists
Folder ANNIF/reports already exists
Folder ANNIF/data/train already exists
Folder ANNIF/data/test already exists
Folder ANNIF/data/valid already exists


In [5]:
# Set current directory
annif_path = os.getcwd() + "/ANNIF"
os.chdir(annif_path)

In [6]:
# Set paths
data_path = "./../data"
fig_path = "./../figs"
annif_data_path = annif_path + "/data"
annif_report_path = annif_path + "/reports"

### Files

In [7]:
# Select data to use
data = "working_data_sans_dewey.pkl"
rameau_file_td = "extraction/liste_concepts_rameau_sans_vedetteConstruites.csv"
rameau_file_tf = "extraction/Tf8_Forme-genre_Rameau_juin_2023.csv"
rameau_file = "rameau_Tf_Td_withURI.csv"

In [8]:
# Merge predictions to existing predictions?
save_predictions = True
optimization = True

## Create datasets

### Import data

In [9]:
# Import working data
df = pd.read_pickle(os.path.join(data_path, data))
print(df.shape)

(154508, 10)


In [10]:
df.head()

Unnamed: 0,PPN,TITRE,RESUME,RAMEAU,DEWEY,DESCR,RAMEAU_CHECKED,presence_chaine_indexation,rameau_chaines_index,rameau_concepts
0,000002364,La culture pour vivre,Mort de la culture populaire en France. Mutati...,Culture populaire;Diffusion de la culture;Poli...,840.0,La culture pour vivre Mort de la culture popul...,Culture populaire;Diffusion de la culture;Poli...,False,"[Culture populaire, Diffusion de la culture, P...","[Culture populaire, Diffusion de la culture, P..."
1,000014877,"La nuit, le jour : essai psychanalytique sur l...","Discontinuité, latence, rétablissement d’une c...",Complexe de castration;Psychanalyse;Rêves,154.63,"La nuit, le jour : essai psychanalytique sur l...",Complexe de castration;Psychanalyse;Rêves,False,"[Complexe de castration, Psychanalyse, Rêves]","[Complexe de castration, Psychanalyse, Rêves]"
2,000021857,"Ruptures, cultures","Il faut imaginer Robinson sur son île, au mome...",Culture,840.0,"Ruptures, cultures Il faut imaginer Robinson s...",Culture,False,[Culture],[Culture]
3,00002564X,La révolution structurale,"Mutations ou crises, les brusques accès de fiè...",Structuralisme,100.0,"La révolution structurale Mutations ou crises,...",Structuralisme,False,[Structuralisme],[Structuralisme]
4,000026352,La Destruction du temple,"Oswald tire sur Kennedy. Jusque-là, c'est bon,...",Science-fiction américaine -- Traductions fran...,830.0,La Destruction du temple Oswald tire sur Kenne...,Science-fiction américaine -- Traductions fran...,True,[Science-fiction américaine -- Traductions fra...,"[Science-fiction américaine, Traductions franç..."


### Create list of concepts with URIs (needed once)

In [11]:
# Manage concepts
ram_td = pd.read_csv(os.path.join(data_path, rameau_file_td), encoding="latin-1")
ram_tf = pd.read_csv(os.path.join(data_path, rameau_file_tf), encoding="utf-8", sep=";")
ram_tf = ram_tf.rename(columns={"A001_AS":"PPN", "A008_AS":'type', 'A003_AS': "URI", "A280.A280Sa_AS":"NOM"})
ram_tf = ram_tf.dropna()

# Correction de tous les problèmes liés aux @ 
ram_td["NOM"] = ram_td["NOM"].apply(lambda x: re.sub(r" [?]", "", re.sub(r"^[?]", "", x)))
ram_tf["NOM"] = ram_tf["NOM"].apply(lambda x: re.sub(r" [?]", "", re.sub(r"^[?]", "", x)))
ram_td["NOM"] = ram_td["NOM"].apply(lambda x: re.sub(r"[\\],", ",", x))
ram_tf["NOM"] = ram_tf["NOM"].apply(lambda x: re.sub(r"[\\],", ",", x))

In [12]:
# Creation des URIS
BASE_URI = 'https://www.idref.fr/'
ram_td["URI"] = ram_td["PPN"].apply(lambda x: BASE_URI + x)
ram_td.head(20)

Unnamed: 0,PPN,NOM,URI
0,157992527,Kirp?n,https://www.idref.fr/157992527
1,110140494,Militaires artistes,https://www.idref.fr/110140494
2,028492161,Militaires romains,https://www.idref.fr/028492161
3,028521757,Militaires prussiens,https://www.idref.fr/028521757
4,029895561,Sa-skya-pa,https://www.idref.fr/029895561
5,031875459,Militaires réunionnais,https://www.idref.fr/031875459
6,032370083,Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
7,032878117,Missionnaires suisses,https://www.idref.fr/032878117
8,034423982,Militaires ivoiriens,https://www.idref.fr/034423982
9,034686940,Outils à métaux,https://www.idref.fr/034686940


In [13]:
# Concatenate concepts
authorized_concepts = pd.concat([ram_td[["PPN", "NOM", "URI"]],  ram_tf[["PPN", "NOM", "URI"]]]).dropna()
print(f"{len(authorized_concepts)} authorized concepts among Td and tf")
# Save URI to csv file
authorized_concepts.to_csv(os.path.join(data_path, str("rameau_Tf_Td_withURI.csv")))

103628 authorized concepts among Td and tf


### Create vocabulary file for ANNIF

In [14]:
# Import file of RAMEAU concepts
rameau = pd.read_csv(os.path.join(data_path, rameau_file), encoding="utf-8", index_col=0)
print(rameau.shape)
rameau.head(20)

(103628, 3)


Unnamed: 0,PPN,NOM,URI
0,157992527,Kirp?n,https://www.idref.fr/157992527
1,110140494,Militaires artistes,https://www.idref.fr/110140494
2,028492161,Militaires romains,https://www.idref.fr/028492161
3,028521757,Militaires prussiens,https://www.idref.fr/028521757
4,029895561,Sa-skya-pa,https://www.idref.fr/029895561
5,031875459,Militaires réunionnais,https://www.idref.fr/031875459
6,032370083,Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
7,032878117,Missionnaires suisses,https://www.idref.fr/032878117
8,034423982,Militaires ivoiriens,https://www.idref.fr/034423982
9,034686940,Outils à métaux,https://www.idref.fr/034686940


In [15]:
# Create dictionnary of URI
label2uri = {k:v for k,v in zip(rameau["NOM"], rameau["URI"].astype(str))}

#### Create vocabulary from concepts

In [16]:
# Create vocabulary file
vocab_filename = os.path.join(annif_data_path,'vocabs/rameau/subjects.csv')
vocab = pd.DataFrame(rameau[["NOM", "URI"]])
vocab.columns = ["label_fr", "uri"]
vocab.to_csv(vocab_filename, encoding='utf-8', index=None)
vocab.head(10)

Unnamed: 0,label_fr,uri
0,Kirp?n,https://www.idref.fr/157992527
1,Militaires artistes,https://www.idref.fr/110140494
2,Militaires romains,https://www.idref.fr/028492161
3,Militaires prussiens,https://www.idref.fr/028521757
4,Sa-skya-pa,https://www.idref.fr/029895561
5,Militaires réunionnais,https://www.idref.fr/031875459
6,Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
7,Missionnaires suisses,https://www.idref.fr/032878117
8,Militaires ivoiriens,https://www.idref.fr/034423982
9,Outils à métaux,https://www.idref.fr/034686940


In [17]:
# Check import with bash:
! head {vocab_filename}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
label_fr,uri
Kirp?n,https://www.idref.fr/157992527
Militaires artistes,https://www.idref.fr/110140494
Militaires romains,https://www.idref.fr/028492161
Militaires prussiens,https://www.idref.fr/028521757
Sa-skya-pa,https://www.idref.fr/029895561
Militaires réunionnais,https://www.idref.fr/031875459
Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
Missionnaires suisses,https://www.idref.fr/032878117
Militaires ivoiriens,https://www.idref.fr/034423982


In [18]:
# Check number of concepts in the vocabulary file:
! wc -l < {vocab_filename}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
103629


In [19]:
# Load vocabulary
! annif load-vocab rameau {vocab_filename}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Loading vocabulary from CSV file /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/vocabs/rameau/subjects.csv...
updating existing subject index
saving vocabulary into SKOS file data/vocabs/rameau/subjects.ttl


#### Create vocabulary from indexation chains (train_dataset)

In [20]:
# Create URIS
def create_vocabulary_from_chains(chains, label2uri):
    chain_df = pd.DataFrame()
    for label in chains:
        i = 0
        if ' -- ' in label:
            chain_df.loc[i, "label_fr"] = label
            items = label.split(" -- ")
            chained_uri = []
            for item in items:
                chained_uri.append(label2uri[item])
            chain_df.loc[i, "uri"] = "--".join(chained_uri)
            i += 1 
            
    return chain_df

In [30]:
# Append vocabulary with indexation chains in training set (quite long to run ~ 5min)
vocab_chain = vocab.copy()
df_train = pd.read_csv(os.path.join(data_path, "train_dataset.csv"))
print("dimension of train set: ", df_train.shape)
for indexation in df_train["rameau_chaines_index"]:
    new_uri_chain = create_vocabulary_from_chains(eval(indexation), label2uri)
    vocab_chain = pd.concat([vocab_chain, new_uri_chain])

dimension of train set:  (125264, 10)


In [32]:
# Check
print("Vocabulary with indexation chains :", vocab_chain.shape)
vocab_chain.tail()

Vocabulary with indexation chains : (151048, 2)


Unnamed: 0,label_fr,uri
0,Population -- Statistiques -- Vingtième siècle,https://www.idref.fr/027546071--http://www.idr...
0,Esclavage -- Dans l'art,https://www.idref.fr/027224295--https://www.id...
0,Physique -- Dans l'art,https://www.idref.fr/027247015--https://www.id...
0,Violences sexuelles -- Prévention,https://www.idref.fr/027343758--https://www.id...
0,Transition écologique -- Finances,https://www.idref.fr/189110813--https://www.id...


In [34]:
# Save vocabulary file including indexaton chains
vocab_filename_chain = os.path.join(annif_data_path,'vocabs/rameau-chains/subjects.csv')
vocab_chain.to_csv(vocab_filename_chain, encoding='utf-8', index=None)
vocab_chain.head()

Unnamed: 0,label_fr,uri
0,Kirp?n,https://www.idref.fr/157992527
1,Militaires artistes,https://www.idref.fr/110140494
2,Militaires romains,https://www.idref.fr/028492161
3,Militaires prussiens,https://www.idref.fr/028521757
4,Sa-skya-pa,https://www.idref.fr/029895561


In [35]:
# Check import with bash:
! head {vocab_filename_chain}
print()
! tail {vocab_filename_chain}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
label_fr,uri
Kirp?n,https://www.idref.fr/157992527
Militaires artistes,https://www.idref.fr/110140494
Militaires romains,https://www.idref.fr/028492161
Militaires prussiens,https://www.idref.fr/028521757
Sa-skya-pa,https://www.idref.fr/029895561
Militaires réunionnais,https://www.idref.fr/031875459
Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
Missionnaires suisses,https://www.idref.fr/032878117
Militaires ivoiriens,https://www.idref.fr/034423982

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Coutumes alimentaires -- Aspect social,https://www.idref.fr/027233464--https://www.idref.fr/027790088
Sociologie -- Recherche,https://www.idref.fr/049647490--https://www.idref.fr/027315754
Isolation thermique -- Aspect environnemental,https://www.idref.fr/027235394--https://www.idref.fr/027587886
"

In [36]:
# Check number of concepts in the vocabulary file:
! wc -l < {vocab_filename_chain}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
151049


In [41]:
# Load vocabulary file in ANNIF
print("Dimension du vocabulaire: ", vocab_chain.shape)
! annif load-vocab rameau {vocab_filename_chain}

Dimension du vocabulaire:  (151048, 2)
/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Loading vocabulary from CSV file /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/vocabs/rameau-chains/subjects.csv...
updating existing subject index
saving vocabulary into SKOS file data/vocabs/rameau/subjects.ttl


In [42]:
! annif list-vocabs

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Vocabulary ID       Languages                 Size  Loaded
----------------------------------------------------------
rameau              fr                      151048  True  


### Datasets for training and evaluation

#### Define train and test sets (using split performed by Jean Luc Prieto)

In [43]:
# Define train and test sets 
df_train = pd.read_csv(os.path.join(data_path, "train_dataset.csv"))
print("dimension of train set: ", df_train.shape)
df_test = pd.read_csv(os.path.join(data_path, "test_dataset.csv"))
print("dimension of test set: ", df_test.shape)
df_valid = pd.read_csv(os.path.join(data_path, "valid100_dataset.csv"))
print("dimension of validation set: ", df_valid.shape)

dimension of train set:  (125264, 10)
dimension of test set:  (29244, 10)
dimension of validation set:  (100, 10)


#### Create TSV files (short-text-document)
see: https://github.com/NatLibFi/Annif/wiki/Document-corpus-formats#short-text-document-corpus-tsv-file

In [44]:
# define paths
train_tsv_path = os.path.join(annif_data_path, "rameau-train.tsv")
test_tsv_path = os.path.join(annif_data_path, "rameau-test.tsv")
valid_tsv_path = os.path.join(annif_data_path, "rameau-valid.tsv")

train_tsv_path_chains = os.path.join(annif_data_path, "rameau-train_chains.tsv")
test_tsv_path_chains = os.path.join(annif_data_path, "rameau-test_chains.tsv")
valid_tsv_path_chains = os.path.join(annif_data_path, "rameau-valid_chains.tsv")


annif_train_folder_path = os.path.join(annif_data_path, "train/")
annif_test_folder_path = os.path.join(annif_data_path, "test/")
annif_valid_folder_path = os.path.join(annif_data_path, "valid/")

In [45]:
# Format URIS_chains for ANNIF
def format_for_annif(labels):
    uris = []
    for label in labels:
        if ' -- ' in label:
            items = label.split(" -- ")
            l = []
            for item in items:
                l.append(label2uri[item])
            uris.append("--".join(l))
            
        elif label in label2uri.keys():
            uris.append(label2uri[label])
        
        else:
            pass

    uris_formated = " ".join(uris)
    return uris_formated

In [46]:
# Apply on train data
df_train["rameau_concept_formatted_for_annif"] = df_train["rameau_concepts"].apply(lambda x: format_for_annif(eval(x)))
df_train["rameau_chains_formatted_for_annif"] = df_train["rameau_chaines_index"].apply(lambda x: format_for_annif(eval(x)))
df_train["DESCR_cleaned"] = df_train["DESCR"].apply(lambda x: " ".join(x.split()))
df_train[["DESCR_cleaned", "rameau_chains_formatted_for_annif"]].head(5)

Unnamed: 0,DESCR_cleaned,rameau_chains_formatted_for_annif
0,La culture pour vivre Mort de la culture popul...,https://www.idref.fr/027348237 https://www.idr...
1,"La nuit, le jour : essai psychanalytique sur l...",https://www.idref.fr/027823393 https://www.idr...
2,"Ruptures, cultures Il faut imaginer Robinson s...",https://www.idref.fr/027231429
3,La Destruction du temple Oswald tire sur Kenne...,http://www.idref.fr/028242955--https://www.idr...
4,"Mon père Jové, pèlerin de l'image Naturalisé f...",http://www.idref.fr/028224248


In [47]:
# Save TSV file for ANNIF
train_tsv_path = os.path.join(annif_data_path, "rameau-train.tsv")
train_tsv_path_chains = os.path.join(annif_data_path, "rameau-train_chains.tsv")

with open(train_tsv_path, 'w', encoding='utf-8') as output_train_file:
        for index, row in df_train.iterrows():
                print(row['DESCR_cleaned'] + '\t' + row['rameau_concept_formatted_for_annif'], file=output_train_file)


with open(train_tsv_path_chains, 'w', encoding='utf-8') as output_train_file:
        for index, row in df_train.iterrows():
                print(row['DESCR_cleaned'] + '\t' + row['rameau_chains_formatted_for_annif'], file=output_train_file)


In [48]:
# Check import with bash: 
! head -n 5 {train_tsv_path_chains}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
La culture pour vivre Mort de la culture populaire en France. Mutation des institutions culturelles grâce à une technique de mise en relation des oeuvres et d'un public, et qui tend à créer un comportement culturel adapté aux caractéristiques de l'époque	https://www.idref.fr/027348237 https://www.idref.fr/027224929 https://www.idref.fr/027416593
La nuit, le jour : essai psychanalytique sur le fonctionnement mental Discontinuité, latence, rétablissement d’une continuité organisent la vie psychique. Réparatrice est dite la nuit... Les auteurs ont voulu montrer la complexité sous-jacente à cette qualité dès lors que Freud met au jour dans l’étude du rêve, au-delà d’une certaine réalisation de désir inconscient lié à l’histoire individuelle d’un sujet donné, l’existence de « veinures » qui résultent de la préhistoire de tous les humains et qui, imprimant la matière où s’inscrit le

In [49]:
# Apply on test dataset
df_test["rameau_concept_formatted_for_annif"] = df_test["rameau_concepts"].apply(lambda x: format_for_annif(eval(x)))
df_test["rameau_chains_formatted_for_annif"] = df_test["rameau_chaines_index"].apply(lambda x: format_for_annif(eval(x)))
df_test["DESCR_cleaned"] = df_test["DESCR"].apply(lambda x: " ".join(x.split()))
df_test[["DESCR_cleaned", "rameau_concept_formatted_for_annif", "rameau_chains_formatted_for_annif"]].head()

Unnamed: 0,DESCR_cleaned,rameau_concept_formatted_for_annif,rameau_chains_formatted_for_annif
0,"La révolution structurale Mutations ou crises,...",https://www.idref.fr/027249581,https://www.idref.fr/027249581
1,La théorie des jeux et ses applications à l'éc...,https://www.idref.fr/02733743X https://www.idr...,https://www.idref.fr/02733743X https://www.idr...
2,Pétrole : le vrai dossier Que dissimulent les ...,https://www.idref.fr/027428958,https://www.idref.fr/027428958
3,Magie : aspects de la tradition occidentale La...,https://www.idref.fr/027238210,https://www.idref.fr/027238210
4,Mathématiques de base pour les linguistes La f...,https://www.idref.fr/027236005 https://www.idr...,https://www.idref.fr/027236005--https://www.id...


In [50]:
# Save TSV file for ANNIF
test_tsv_path = os.path.join(annif_data_path, "rameau-test.tsv")
test_tsv_path_chains = os.path.join(annif_data_path, "rameau-test_chains.tsv")

with open(test_tsv_path, 'w', encoding='utf-8') as output_test_file:
        for index, row in df_test.iterrows():
                print(row['DESCR_cleaned'] + '\t' + row['rameau_concept_formatted_for_annif'], file=output_test_file)

with open(test_tsv_path_chains, 'w', encoding='utf-8') as output_test_file:
        for index, row in df_test.iterrows():
                print(row['DESCR_cleaned'] + '\t' + row['rameau_chains_formatted_for_annif'], file=output_test_file)

In [51]:
# Apply on test dataset
df_valid["rameau_concept_formatted_for_annif"] = df_valid["rameau_concepts"].apply(lambda x: format_for_annif(eval(x)))
df_valid["rameau_chains_formatted_for_annif"] = df_valid["rameau_chaines_index"].apply(lambda x: format_for_annif(eval(x)))
df_valid["DESCR_cleaned"] = df_valid["DESCR"].apply(lambda x: " ".join(x.split()))
df_valid[["DESCR_cleaned", "rameau_concept_formatted_for_annif", "rameau_chains_formatted_for_annif"]].head()

# Save TSV file for ANNIF
valid_tsv_path = os.path.join(annif_data_path, "rameau-valid.tsv")
valid_tsv_path_chains = os.path.join(annif_data_path, "rameau-valid_chains.tsv")

with open(valid_tsv_path, 'w', encoding='utf-8') as output_valid_file:
        for index, row in df_test.iterrows():
                print(row['DESCR_cleaned'] + '\t' + row['rameau_concept_formatted_for_annif'], file=output_valid_file)

with open(valid_tsv_path_chains, 'w', encoding='utf-8') as output_valid_file:
        for index, row in df_test.iterrows():
                print(row['DESCR_cleaned'] + '\t' + row['rameau_chains_formatted_for_annif'], file=output_valid_file)

### Datasets to get predictions

ANNIF ne sait pas faire la prédiction de chaque notice d'un fichier short-text document (TSV) d'un coup. Il fait une prédiction pour tout le texte du TSV, ce qui n'a pas de sens.

Il faut donc créer un fichier .txt pour chaque notice et lancer la prédiction pour tous les .txt d'un dossier en utilisant annif index

In [138]:
# Train dataset
for index, row in df_train.iterrows():
    if not os.path.exists(annif_train_folder_path):
        os.makedirs(annif_train_folder_path)
    filename = os.path.join(annif_train_folder_path, str(row['PPN'] + '.txt'))
    with open(filename, 'a') as f:
        f.write(row['DESCR_cleaned'])

In [139]:
# Check number of files in the train folder:
nb_file = !ls {annif_train_folder_path} | wc -l
print(f"Number of files in {annif_train_folder_path} folder: \n{nb_file}")

Number of files in /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/train/ folder: 
['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)', '125264']


In [140]:
# Test dataset
annif_test_folder_path = os.path.join(annif_data_path, "test/")
for index, row in df_test.iterrows():
    if not os.path.exists(annif_test_folder_path):
        os.makedirs(annif_test_folder_path)
    filename = os.path.join(annif_test_folder_path, str(row['PPN'] + '.txt'))
    with open(filename, 'a') as f:
        f.write(row['DESCR_cleaned'])

In [141]:
# Check number of files in the train folder:
nb_file = !ls {annif_test_folder_path} | wc -l
print(f"Number of files in {annif_test_folder_path} folder: \n{nb_file}")

Number of files in /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/test/ folder: 
['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)', '29244']


In [142]:
# Validation dataset
annif_valid_folder_path = os.path.join(annif_data_path, "valid/")
for index, row in df_valid.iterrows():
    if not os.path.exists(annif_valid_folder_path):
        os.makedirs(annif_valid_folder_path)
    filename = os.path.join(annif_valid_folder_path, str(row['PPN'] + '.txt'))
    with open(filename, 'a') as f:
        f.write(row['DESCR_cleaned'])

# Check number of files in the train folder:
nb_file = !ls {annif_valid_folder_path} | wc -l
print(f"Number of files in {annif_valid_folder_path} folder: \n{nb_file}")

Number of files in /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/valid/ folder: 
['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)', '100']


## Test ANNIF

### List all available projects 

In [53]:
# list of projects
project_list = ! annif list-projects
project_list

['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)',
 'Project ID               Project Name                                 Language  Trained',
 '---------------------------------------------------------------------------------------',
 'rameau-tfidf-snowball-fr TF-IDF French RAMEAU with snowball lemma     fr        True   ',
 'rameau-fasttext-snowball-frFastText French RAMEAU                       fr        False  ',
 'rameau-yake-snowball-fr  Yake French RAMEAU                           fr        True   ',
 'rameau-mllm-snowball-fr  RAMEAU MLLM project                          fr        True   ',
 'rameau-omikuji-snowball-frOmikuji Parabel French                       fr        False  ',
 'rameau-tfidf-fr          TF-IDF French RAMEAU with spacy lemma        fr        False  ',
 'rameau-fasttext-fr       FastText French RAMEAU                       fr        True   ',
 'rameau-yake-fr           Yake French RAMEAU         

### Select project to test

In [57]:
# Select project and parameters
project = "rameau-tfidf-snowball-fr"
njobs = 0
input_file = train_tsv_path
max_nb_concepts = 10
threshold = 0.2 
trials = 100
metric_file_path = os.path.join(annif_report_path, str(project + '_chains.json'))
result_file_path = os.path.join(annif_report_path, str(project + '_chains.csv'))

test_file = train_tsv_path

In [55]:
metric_file_path

'/home/aurelie/ABES/labo-indexation-ai/ANNIF/reports/rameau-tfidf-snowball-fr_chains.json'

### Train model

In [58]:
# Train project
! annif train {project} --jobs {njobs} {input_file}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Backend tfidf: transforming subject corpus
Backend tfidf: creating vectorizer


### Evaluate model

In [None]:
# Evaluate project
! annif eval --limit {max_nb_concepts} --threshold {threshold} --metrics-file {metric_file_path} --results-file {result_file_path} --jobs {njobs} {project} {test_file}

## Hyperoptimization

In [36]:
if optimization:
    metric_file_path = os.path.join(annif_report_path, str(project + '_opt.json'))
    result_file_path = os.path.join(annif_report_path, str(project + '_opt.csv'))

    ! annif hyperopt {project} --trials {trials} --results-file {result_file_path} --jobs 4  {test_file}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Looking for optimal hyperparameters using 100 trials
[32m[I 2023-06-27 18:23:21,067][0m A new study created in memory with name: no-name-94504bae-bbd1-4175-bc8c-53c5381601b5[0m
[32m[I 2023-06-27 18:48:55,432][0m Trial 3 finished with value: 0.2826690971851349 and parameters: {'min_samples_leaf': 17, 'max_leaf_nodes': 289, 'max_samples': 0.8931221902724289}. Best is trial 3 with value: 0.2826690971851349.[0m
[32m[I 2023-06-27 18:48:58,214][0m Trial 1 finished with value: 0.28504353761672974 and parameters: {'min_samples_leaf': 25, 'max_leaf_nodes': 1956, 'max_samples': 0.5292036641359308}. Best is trial 1 with value: 0.28504353761672974.[0m
[32m[I 2023-06-27 18:48:59,033][0m Trial 0 finished with value: 0.2849653959274292 and parameters: {'min_samples_leaf': 7, 'max_leaf_nodes': 1196, 'max_samples': 0.5972697775526512}. Best is trial 1 with value: 0.28504353761672974

In [40]:
# Select best model
opt = pd.read_csv(result_file_path, sep='\t')
opt

Unnamed: 0,URI,Label,Support,True_positives,False_positives,False_negatives,Precision,Recall,F1_score
0,https://www.idref.fr/157992527,Kirp?n,0,0,0,0,0.000000,0.000000,0.000000
1,https://www.idref.fr/110140494,Militaires artistes,0,0,0,0,0.000000,0.000000,0.000000
2,https://www.idref.fr/028492161,Militaires romains,2,0,0,2,0.000000,0.000000,0.000000
3,https://www.idref.fr/028521757,Militaires prussiens,0,0,0,0,0.000000,0.000000,0.000000
4,https://www.idref.fr/029895561,Sa-skya-pa,0,0,0,0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
85871,https://www.idref.fr/191405949,Relations artistes-collectivité,0,0,0,0,0.000000,0.000000,0.000000
85872,https://www.idref.fr/234777605,Postcolonialisme et arts,2,0,0,2,0.000000,0.000000,0.000000
85873,https://www.idref.fr/193907445,Ascèse,0,0,0,0,0.000000,0.000000,0.000000
85874,https://www.idref.fr/027415074,Vie chrétienne,3,0,0,3,0.000000,0.000000,0.000000


In [38]:
import numpy as np
best_model = opt.iloc[np.argmax(opt["value"])]
best_model

KeyError: 'value'

In [None]:
# Retrain model
! annif train {project} --jobs {njobs} {input_file}

In [None]:
metric_file_path

In [None]:
# Evaluate project
! annif eval --limit {max_nb_concepts} --threshold {threshold} --metrics-file {metric_file_path} --results-file {result_file_path} --jobs {njobs} {project} {test_file}

In [1]:
# Plot radar from a evaluation file
from utils_visualization import metrics_radar_plot

annif_eval = pd.read_csv("annif_evaluation.csv", index_col=0)
metrics = [
    "Precision_doc_avg",
    "Recall_doc_avg",
    "F1_score_doc_avg",
    "Precision_weighted_subj_avg",
    "Recall_weighted_subj_avg",
    "F1_score_weighted_subj_avg",
    "F1@5"
]

metrics_radar_plot(
    annif_eval,
    metrics=metrics,
    remove_identity=False,
    scale=False,
    title="ANNIF evaluation",
    savefig=None,
    width=1000,
    height=550)


NameError: name 'pd' is not defined

In [None]:
annif_eval.loc["rameau-pav-MLLM-fr.json"]

### Prediction on all notices from test folder

In [None]:
# Prediction sur toutes les notices du dossier "test"
suffix = str('_' + project + 'opt.csv')
! annif index -s {suffix} {project} {annif_test_folder_path}

## Format predictions for future use

In [None]:
csv_files = [f for f in os.listdir(annif_test_folder_path) if f.endswith(suffix)]
print(f"There are {len(csv_files)} files to compile")

In [None]:
# Build dataframe
predictions = pd.DataFrame(columns=["PPN", "predictions", "scores"])
for i, file in enumerate(csv_files):
    ppn = file.split('_')[0]
    pred = pd.read_csv(os.path.join(annif_test_folder_path, file), sep='\t', header=None, names=["URI", "pred_concept", "score"])
    predictions.loc[i,"PPN"] = ppn
    predictions.loc[i,"predictions"] = pred["pred_concept"].to_list()
    predictions.loc[i,"scores"] = pred["score"].to_list()

In [None]:
# Show predictions
predictions.head()

In [None]:
# Save dataframe
if optimization:
    predictions.to_csv(os.path.join(annif_report_path, str(project + "_opt" + "_predictions.csv")))
else: 
    predictions.to_csv(os.path.join(annif_report_path, str(project + "_predictions.csv")))

## Merge predictions with existing predictions (including reindexation and indexing if available)

In [None]:
os.getcwd()

In [None]:
 # Set files
if save_predictions:
    input_file = "./../data/data_with_reindexation_and_embeddings_withANNIF.csv"
    output_file = "./../data/data_with_reindexation_and_embeddings_withANNIF.csv"
else: 
    input_file = None
    output_file = None

In [None]:
# merge predictions with reindexation file
indexation_file = pd.read_csv(input_file, index_col=0)
print(indexation_file.shape)
indexation_file.head(3)

In [None]:
# Merge predictions
output = indexation_file.merge(predictions, on="PPN", how="inner")
if optimization:
    output.rename(columns={"predictions" : str("predictions_opt_" + project), "scores":str("scores_opt_" + project)}, inplace=True)
else:
    output.rename(columns={"predictions" : str("predictions_" + project), "scores":str("scores_" + project)}, inplace=True)

In [None]:
output.head(3)

In [None]:
# Save output
output.to_csv(output_file)

## Multilabel classification - Metrics

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from utils_metrics import *

In [None]:
# Variables to use
## Chains
field = [
    ("ANNIF_mllm", "rameau_concepts", "predictions_opt_rameau-mllm-fr"),
    ("ANNIF_tfidf",  "rameau_concepts", "predictions_rameau-tfidf-snowball-fr")
]
results = dict()


In [None]:
def flatten(list):
    flat_list = [item for sublist in list for item in sublist]
    return flat_list

In [None]:
# Binarization
for var in field: 
    print("Working on ", var[0])
    mlb = MultiLabelBinarizer(sparse_output=False)
    mlb.fit(flatten(output["rameau_concepts"]))
    sudoc = mlb.transform(output[var[1]])
    embed = mlb.transform(output[var[2]])
    results[str("ANNIF" + var[0])] = label_metrics_report("Embeddings", sudoc, embed, zero_division=0)

In [None]:
mlb.inverse_transform(embed)

### Dataframe of results

In [None]:
result_df = pd.DataFrame(results).T
result_df

### Plot

In [None]:
# Plot results
metrics_radar_plot(
    result_df,
    remove_identity=False,
    title="Quantitative comparisons",
    savefig="metrics_ANNIF-sudoc.html",)