In [2]:
#!/bin/bash -e

# Use of ANNIF library

Ce notebook contient toutes les étapes de l'utilisation de la librairie annif (doc d'installation de toutes les librairies à prévoir).
- Formatage des données pour utilisation dans ANNIF
- Entrainement d'un modèle 
- Utilisation de pipelines pour tester plusieurs modèles
- Recherche des meilleurs paramètres  

## Setup 

### Packages

In [3]:
# Import librairies
import os
import csv
import pandas as pd

### Graphical parameters

### Paths

In [4]:
# Set path
abes_path = "/home/aurelie/ABES/labo-indexation-ai/"
os.chdir(abes_path)

In [5]:
# Create folders if needed
list_folder = [
    "ANNIF", 
    "ANNIF/data", "ANNIF/reports",
    "ANNIF/data/train", "ANNIF/data/test", "ANNIF/data/valid"]

for folder in list_folder:
    if not os.path.exists(folder):
        os.makedirs(folder)
    else:
        print(f"Folder {folder} already exists")



Folder ANNIF already exists
Folder ANNIF/data already exists
Folder ANNIF/reports already exists
Folder ANNIF/data/train already exists
Folder ANNIF/data/test already exists
Folder ANNIF/data/valid already exists


In [6]:
# Set current directory
annif_path = os.getcwd() + "/ANNIF"
os.chdir(annif_path)

In [7]:
# Set paths
data_path = "./../data"
fig_path = "./../figs"
annif_data_path = annif_path + "/data"
annif_report_path = annif_path + "/reports"

### Files

In [8]:
# Select data to use
data = "working_data_sans_dewey.pkl"
rameau_file = "rameau_Tf_Td_withURI.csv"

In [9]:
# Merge predictions to existing predictions?
save_predictions = True
optimization = True

## Create datasets

### Import data

In [10]:
# Import working data
df = pd.read_pickle(os.path.join(data_path, data))
print(df.shape)

(154508, 10)


In [11]:
df.head()

Unnamed: 0,PPN,TITRE,RESUME,RAMEAU,DEWEY,DESCR,RAMEAU_CHECKED,presence_chaine_indexation,rameau_chaines_index,rameau_concepts
0,000002364,La culture pour vivre,Mort de la culture populaire en France. Mutati...,Culture populaire;Diffusion de la culture;Poli...,840.0,La culture pour vivre Mort de la culture popul...,Culture populaire;Diffusion de la culture;Poli...,False,"[Culture populaire, Diffusion de la culture, P...","[Culture populaire, Diffusion de la culture, P..."
1,000014877,"La nuit, le jour : essai psychanalytique sur l...","Discontinuité, latence, rétablissement d’une c...",Complexe de castration;Psychanalyse;Rêves,154.63,"La nuit, le jour : essai psychanalytique sur l...",Complexe de castration;Psychanalyse;Rêves,False,"[Complexe de castration, Psychanalyse, Rêves]","[Complexe de castration, Psychanalyse, Rêves]"
2,000021857,"Ruptures, cultures","Il faut imaginer Robinson sur son île, au mome...",Culture,840.0,"Ruptures, cultures Il faut imaginer Robinson s...",Culture,False,[Culture],[Culture]
3,00002564X,La révolution structurale,"Mutations ou crises, les brusques accès de fiè...",Structuralisme,100.0,"La révolution structurale Mutations ou crises,...",Structuralisme,False,[Structuralisme],[Structuralisme]
4,000026352,La Destruction du temple,"Oswald tire sur Kennedy. Jusque-là, c'est bon,...",Science-fiction américaine -- Traductions fran...,830.0,La Destruction du temple Oswald tire sur Kenne...,Science-fiction américaine -- Traductions fran...,True,[Science-fiction américaine -- Traductions fra...,"[Science-fiction américaine, Traductions franç..."


In [12]:
# Import file of RAMEAU concepts
rameau = pd.read_csv(os.path.join(data_path, rameau_file), encoding="utf-8", index_col=0)
print(rameau.shape)
rameau.head(20)

(103628, 3)


Unnamed: 0,PPN,NOM,URI
0,157992527,Kirp?n,https://www.idref.fr/157992527
1,110140494,Militaires artistes,https://www.idref.fr/110140494
2,028492161,Militaires romains,https://www.idref.fr/028492161
3,028521757,Militaires prussiens,https://www.idref.fr/028521757
4,029895561,Sa-skya-pa,https://www.idref.fr/029895561
5,031875459,Militaires réunionnais,https://www.idref.fr/031875459
6,032370083,Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
7,032878117,Missionnaires suisses,https://www.idref.fr/032878117
8,034423982,Militaires ivoiriens,https://www.idref.fr/034423982
9,034686940,Outils à métaux,https://www.idref.fr/034686940


### Create vocabulary file

In [13]:
# # Useful once only
# # Associate concepts with URI
# BASE_URI = 'https://www.idref.fr/'
# rameau["URI"] = rameau["PPN"].apply(lambda x: BASE_URI + x)
# rameau.head(20)

# # Save URI to csv file
# rameau.to_csv(os.path.join(data_path, str(rameau_file[:-4] + '_withURI.csv')))

In [14]:
# Create dictionnary of URI
label2uri = {k:v for k,v in zip(rameau["NOM"], rameau["URI"].astype(str))}

In [15]:
# Create vocabulary file
vocab_filename = os.path.join(annif_data_path,'subjects.csv')
vocab = pd.DataFrame(rameau[["NOM", "URI"]])
vocab.columns = ["label_fr", "uri"]
vocab.to_csv(vocab_filename, encoding='utf-8', index=None)
vocab.head(10)

Unnamed: 0,label_fr,uri
0,Kirp?n,https://www.idref.fr/157992527
1,Militaires artistes,https://www.idref.fr/110140494
2,Militaires romains,https://www.idref.fr/028492161
3,Militaires prussiens,https://www.idref.fr/028521757
4,Sa-skya-pa,https://www.idref.fr/029895561
5,Militaires réunionnais,https://www.idref.fr/031875459
6,Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
7,Missionnaires suisses,https://www.idref.fr/032878117
8,Militaires ivoiriens,https://www.idref.fr/034423982
9,Outils à métaux,https://www.idref.fr/034686940


In [16]:
# Check import with bash:
! head {vocab_filename}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
label_fr,uri
Kirp?n,https://www.idref.fr/157992527
Militaires artistes,https://www.idref.fr/110140494
Militaires romains,https://www.idref.fr/028492161
Militaires prussiens,https://www.idref.fr/028521757
Sa-skya-pa,https://www.idref.fr/029895561
Militaires réunionnais,https://www.idref.fr/031875459
Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
Missionnaires suisses,https://www.idref.fr/032878117
Militaires ivoiriens,https://www.idref.fr/034423982


In [17]:
# Check number of concepts in the vocabulary file:
! wc -l < {vocab_filename}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
103629


In [18]:
# Load vocabulary file in ANNIF
! annif load-vocab rameau {vocab_filename}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Loading vocabulary from CSV file /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/subjects.csv...
updating existing subject index
saving vocabulary into SKOS file data/vocabs/rameau/subjects.ttl


### Datasets for training and evaluation

#### Define train and test sets (using split performed by Jean Luc Prieto)

In [19]:
# Define train and test sets 
df_train = pd.read_csv(os.path.join(data_path, "train_dataset.csv"))
print("dimension of train set: ", df_train.shape)
df_test = pd.read_csv(os.path.join(data_path, "test_dataset.csv"))
print("dimension of test set: ", df_test.shape)
df_valid = pd.read_csv(os.path.join(data_path, "valid100_dataset.csv"))
print("dimension of validation set: ", df_valid.shape)

dimension of train set:  (125264, 10)
dimension of test set:  (29244, 10)
dimension of validation set:  (100, 10)


#### Create TSV files (short-text-document)
see: https://github.com/NatLibFi/Annif/wiki/Document-corpus-formats#short-text-document-corpus-tsv-file

In [20]:
# define paths
train_tsv_path = os.path.join(annif_data_path, "rameau-train.tsv")
test_tsv_path = os.path.join(annif_data_path, "rameau-test.tsv")
valid_tsv_path = os.path.join(annif_data_path, "rameau-valid.tsv")

annif_train_folder_path = os.path.join(annif_data_path, "train/")
annif_test_folder_path = os.path.join(annif_data_path, "test/")
annif_valid_folder_path = os.path.join(annif_data_path, "valid/")

In [22]:
# Format URIS for ANNIF
def format_for_annif(labels):
    uris = []
    for label in labels:
        if label in label2uri.keys():
            uris.append(label2uri[label])
    uris_formated = " ".join(uris)
    return uris_formated

In [23]:
# Apply on train data
df_train["rameau_concept_formatted_for_annif"] = df_train["rameau_concepts"].apply(lambda x: format_for_annif(eval(x)))
df_train["DESCR_cleaned"] = df_train["DESCR"].apply(lambda x: " ".join(x.split()))
df_train[["DESCR_cleaned", "rameau_concept_formatted_for_annif"]].head()

Unnamed: 0,DESCR_cleaned,rameau_concept_formatted_for_annif
0,La culture pour vivre Mort de la culture popul...,https://www.idref.fr/027348237 https://www.idr...
1,"La nuit, le jour : essai psychanalytique sur l...",https://www.idref.fr/027823393 https://www.idr...
2,"Ruptures, cultures Il faut imaginer Robinson s...",https://www.idref.fr/027231429
3,La Destruction du temple Oswald tire sur Kenne...,http://www.idref.fr/028242955 https://www.idre...
4,"Mon père Jové, pèlerin de l'image Naturalisé f...",http://www.idref.fr/028224248


In [24]:
# Save TSV file for ANNIF
with open(train_tsv_path, 'w', encoding='utf-8') as output_train_file:
        for index, row in df_train.iterrows():
                print(row['DESCR_cleaned'] + '\t' + row['rameau_concept_formatted_for_annif'], file=output_train_file)


In [25]:
# Check import with bash: 
! head -n 5 {train_tsv_path}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
La culture pour vivre Mort de la culture populaire en France. Mutation des institutions culturelles grâce à une technique de mise en relation des oeuvres et d'un public, et qui tend à créer un comportement culturel adapté aux caractéristiques de l'époque	https://www.idref.fr/027348237 https://www.idref.fr/027224929 https://www.idref.fr/027416593
La nuit, le jour : essai psychanalytique sur le fonctionnement mental Discontinuité, latence, rétablissement d’une continuité organisent la vie psychique. Réparatrice est dite la nuit... Les auteurs ont voulu montrer la complexité sous-jacente à cette qualité dès lors que Freud met au jour dans l’étude du rêve, au-delà d’une certaine réalisation de désir inconscient lié à l’histoire individuelle d’un sujet donné, l’existence de « veinures » qui résultent de la préhistoire de tous les humains et qui, imprimant la matière où s’inscrit le

In [26]:
# Apply on test dataset
df_test["rameau_concept_formatted_for_annif"] = df_test["rameau_concepts"].apply(lambda x: format_for_annif(eval(x)))
df_test["DESCR_cleaned"] = df_test["DESCR"].apply(lambda x: " ".join(x.split()))
df_test[["DESCR_cleaned", "rameau_concept_formatted_for_annif"]].head()

Unnamed: 0,DESCR_cleaned,rameau_concept_formatted_for_annif
0,"La révolution structurale Mutations ou crises,...",https://www.idref.fr/027249581
1,La théorie des jeux et ses applications à l'éc...,https://www.idref.fr/02733743X https://www.idr...
2,Pétrole : le vrai dossier Que dissimulent les ...,https://www.idref.fr/027428958
3,Magie : aspects de la tradition occidentale La...,https://www.idref.fr/027238210
4,Mathématiques de base pour les linguistes La f...,https://www.idref.fr/027236005 https://www.idr...


In [27]:
# Save TSV file for ANNIF
test_tsv_path = os.path.join(annif_data_path, "rameau-test.tsv")
with open(test_tsv_path, 'w', encoding='utf-8') as output_test_file:
        for index, row in df_test.iterrows():
                print(row['DESCR_cleaned'] + '\t' + row['rameau_concept_formatted_for_annif'], file=output_test_file)

In [28]:
# Apply on test dataset
df_valid["rameau_concept_formatted_for_annif"] = df_valid["rameau_concepts"].apply(lambda x: format_for_annif(eval(x)))
df_valid["DESCR_cleaned"] = df_valid["DESCR"].apply(lambda x: " ".join(x.split()))
df_valid[["DESCR_cleaned", "rameau_concept_formatted_for_annif"]].head()

# Save TSV file for ANNIF
valid_tsv_path = os.path.join(annif_data_path, "rameau-valid.tsv")
with open(valid_tsv_path, 'w', encoding='utf-8') as output_valid_file:
        for index, row in df_test.iterrows():
                print(row['DESCR_cleaned'] + '\t' + row['rameau_concept_formatted_for_annif'], file=output_valid_file)

### Datasets to get predictions

ANNIF ne sait pas faire la prédiction de chaque notice d'un fichier short-text document (TSV) d'un coup. Il fait une prédiction pour tout le texte du TSV, ce qui n'a pas de sens.

Il faut donc créer un fichier .txt pour chaque notice et lancer la prédiction pour tous les .txt d'un dossier en utilisant annif index

In [29]:
# Train dataset
for index, row in df_train.iterrows():
    if not os.path.exists(annif_train_folder_path):
        os.makedirs(annif_train_folder_path)
    filename = os.path.join(annif_train_folder_path, str(row['PPN'] + '.txt'))
    with open(filename, 'a') as f:
        f.write(row['DESCR_cleaned'])

In [30]:
# Check number of files in the train folder:
nb_file = !ls {annif_train_folder_path} | wc -l
print(f"Number of files in {annif_train_folder_path} folder: \n{nb_file}")

Number of files in /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/train/ folder: 
['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)', '125264']


In [31]:
# Test dataset
annif_test_folder_path = os.path.join(annif_data_path, "test/")
for index, row in df_test.iterrows():
    if not os.path.exists(annif_test_folder_path):
        os.makedirs(annif_test_folder_path)
    filename = os.path.join(annif_test_folder_path, str(row['PPN'] + '.txt'))
    with open(filename, 'a') as f:
        f.write(row['DESCR_cleaned'])

In [32]:
# Check number of files in the train folder:
nb_file = !ls {annif_test_folder_path} | wc -l
print(f"Number of files in {annif_test_folder_path} folder: \n{nb_file}")

Number of files in /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/test/ folder: 
['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)', '29244']


In [33]:
# Validation dataset
annif_valid_folder_path = os.path.join(annif_data_path, "valid/")
for index, row in df_valid.iterrows():
    if not os.path.exists(annif_valid_folder_path):
        os.makedirs(annif_valid_folder_path)
    filename = os.path.join(annif_valid_folder_path, str(row['PPN'] + '.txt'))
    with open(filename, 'a') as f:
        f.write(row['DESCR_cleaned'])

# Check number of files in the train folder:
nb_file = !ls {annif_valid_folder_path} | wc -l
print(f"Number of files in {annif_valid_folder_path} folder: \n{nb_file}")

Number of files in /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/valid/ folder: 
['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)', '100']


## Test ANNIF

### List all available projects 

In [34]:
# list of projects
project_list = ! annif list-projects
project_list

['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)',
 'Project ID               Project Name                                 Language  Trained',
 '---------------------------------------------------------------------------------------',
 'rameau-tfidf-snowball-fr TF-IDF French RAMEAU with snowball lemma     fr        True   ',
 'rameau-tfidf-spacy-fr    TF-IDF French RAMEAU with spacy lemma        fr        True   ',
 'rameau-fasttext-fr       FastText French RAMEAU                       fr        True   ',
 'rameau-fasttext-snowball-frFastText French RAMEAU                       fr        False  ',
 'rameau-svc-fr            SVC French RAMEAU                            fr        False  ',
 'rameau-yake-fr           Yake French RAMEAU                           fr        True   ',
 'rameau-omikuji-parabel-frOmikuji Parabel French                       fr        True   ',
 'rameau-mllm-fr           RAMEAU MLLM project         

### Select project to test

In [50]:
# Select project and parameters
project = "rameau-mllm-snowball-fr"
njobs = 0
input_file = train_tsv_path
max_nb_concepts = 10
threshold = 0.2 
trials = 10
metric_file_path = os.path.join(annif_report_path, str(project + '.json'))
result_file_path = os.path.join(annif_report_path, str(project + '.csv'))

test_file = test_tsv_path

In [43]:
metric_file_path

'/home/aurelie/ABES/labo-indexation-ai/ANNIF/reports/rameau-mllm-snowball-fr.json'

### Train model

In [44]:
# Train project
! annif train {project} --jobs {njobs} {input_file}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Backend mllm: starting train
Backend mllm: preparing training data
Backend mllm: training model
^C


### Evaluate model

In [None]:
# Evaluate project
! annif eval --limit {max_nb_concepts} --threshold {threshold} --metrics-file {metric_file_path} --results-file {result_file_path} --jobs {njobs} {project} {test_file}

## Hyperoptimization

In [51]:
if optimization:
    metric_file_path = os.path.join(annif_report_path, str(project + '_opt.json'))
    result_file_path_opt = os.path.join(annif_report_path, str(project + '_opt.csv'))

    ! annif hyperopt {project} --trials {trials} --results-file {result_file_path_opt} --metric "f1score" {test_file}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Looking for optimal hyperparameters using 10 trials
[32m[I 2023-06-28 18:50:59,779][0m A new study created in memory with name: no-name-4a72b918-184a-49fa-95d8-b838fe00b0e0[0m
[33m[W 2023-06-28 19:01:22,162][0m Trial 0 failed because of the following error: KeyError('f1score')[0m
Traceback (most recent call last):
  File "/home/aurelie/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/home/aurelie/anaconda3/envs/abes_index/lib/python3.10/site-packages/annif/backend/mllm.py", line 52, in _objective
    results = batch.results(metrics=[self._metric])
  File "/home/aurelie/anaconda3/envs/abes_index/lib/python3.10/site-packages/annif/eval.py", line 223, in results
    results = self._evaluate_samples(y_true, y_pred, metrics)
  File "/home/aurelie/anaconda3/envs/abes_index/lib/python3.10/site-pac

In [49]:
# Select best model
opt = pd.read_csv(result_file_path_opt, sep='\t')
opt

EmptyDataError: No columns to parse from file

In [38]:
import numpy as np
best_model = opt.iloc[np.argmax(opt["value"])]
best_model

KeyError: 'value'

In [None]:
# Retrain model
! annif train {project} --jobs {njobs} {input_file}

In [None]:
# Evaluate project
! annif eval --limit {max_nb_concepts} --threshold {threshold} --metrics-file {metric_file_path} --results-file {result_file_path} --jobs {njobs} {project} {test_file}

In [1]:
# Plot radar from a evaluation file
from utils_visualization import metrics_radar_plot

annif_eval = pd.read_csv("annif_evaluation.csv", index_col=0)
metrics = [
    "Precision_doc_avg",
    "Recall_doc_avg",
    "F1_score_doc_avg",
    "Precision_weighted_subj_avg",
    "Recall_weighted_subj_avg",
    "F1_score_weighted_subj_avg",
    "F1@5"
]

metrics_radar_plot(
    annif_eval,
    metrics=metrics,
    remove_identity=False,
    scale=False,
    title="ANNIF evaluation",
    savefig=None,
    width=1000,
    height=550)


NameError: name 'pd' is not defined

In [None]:
annif_eval.loc["rameau-pav-MLLM-fr.json"]

### Prediction on all notices from test folder

In [None]:
# Prediction sur toutes les notices du dossier "test"
suffix = str('_' + project + 'opt.csv')
! annif index -s {suffix} {project} {annif_test_folder_path}

## Format predictions for future use

In [None]:
csv_files = [f for f in os.listdir(annif_test_folder_path) if f.endswith(suffix)]
print(f"There are {len(csv_files)} files to compile")

In [None]:
# Build dataframe
predictions = pd.DataFrame(columns=["PPN", "predictions", "scores"])
for i, file in enumerate(csv_files):
    ppn = file.split('_')[0]
    pred = pd.read_csv(os.path.join(annif_test_folder_path, file), sep='\t', header=None, names=["URI", "pred_concept", "score"])
    predictions.loc[i,"PPN"] = ppn
    predictions.loc[i,"predictions"] = pred["pred_concept"].to_list()
    predictions.loc[i,"scores"] = pred["score"].to_list()

In [None]:
# Show predictions
predictions.head()

In [None]:
# Save dataframe
if optimization:
    predictions.to_csv(os.path.join(annif_report_path, str(project + "_opt" + "_predictions.csv")))
else: 
    predictions.to_csv(os.path.join(annif_report_path, str(project + "_predictions.csv")))

## Merge predictions with existing predictions (including reindexation and indexing if available)

In [None]:
os.getcwd()

In [None]:
 # Set files
if save_predictions:
    input_file = "./../data/data_with_reindexation_and_embeddings_withANNIF.csv"
    output_file = "./../data/data_with_reindexation_and_embeddings_withANNIF.csv"
else: 
    input_file = None
    output_file = None

In [None]:
# merge predictions with reindexation file
indexation_file = pd.read_csv(input_file, index_col=0)
print(indexation_file.shape)
indexation_file.head(3)

In [None]:
# Merge predictions
output = indexation_file.merge(predictions, on="PPN", how="inner")
if optimization:
    output.rename(columns={"predictions" : str("predictions_opt_" + project), "scores":str("scores_opt_" + project)}, inplace=True)
else:
    output.rename(columns={"predictions" : str("predictions_" + project), "scores":str("scores_" + project)}, inplace=True)

In [None]:
output.head(3)

In [None]:
# Save output
output.to_csv(output_file)

## Multilabel classification - Metrics

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from utils_metrics import *

In [None]:
# Variables to use
## Chains
field = [
    ("ANNIF_mllm", "rameau_concepts", "predictions_opt_rameau-mllm-fr"),
    ("ANNIF_tfidf",  "rameau_concepts", "predictions_rameau-tfidf-snowball-fr")
]
results = dict()


In [None]:
def flatten(list):
    flat_list = [item for sublist in list for item in sublist]
    return flat_list

In [None]:
# Binarization
for var in field: 
    print("Working on ", var[0])
    mlb = MultiLabelBinarizer(sparse_output=False)
    mlb.fit(flatten(output["rameau_concepts"]))
    sudoc = mlb.transform(output[var[1]])
    embed = mlb.transform(output[var[2]])
    results[str("ANNIF" + var[0])] = label_metrics_report("Embeddings", sudoc, embed, zero_division=0)

In [None]:
mlb.inverse_transform(embed)

### Dataframe of results

In [None]:
result_df = pd.DataFrame(results).T
result_df

### Plot

In [None]:
# Plot results
metrics_radar_plot(
    result_df,
    remove_identity=False,
    title="Quantitative comparisons",
    savefig="metrics_ANNIF-sudoc.html",)