In [1]:
#!/bin/bash -e

# Use of ANNIF library

Ce notebook contient toutes les étapes de l'utilisation de la librairie annif (doc d'installation de toutes les librairies à prévoir).
- Formatage des données pour utilisation dans ANNIF
- Entrainement d'un modèle 
- Utilisation de pipelines pour tester plusieurs modèles
- Recherche des meilleurs paramètres  

## Setup 

### Packages

In [2]:
# Import librairies
import os
import csv
import pandas as pd

### Graphical parameters

### Paths

In [3]:
# Set path
abes_path = "/home/aurelie/ABES/labo-indexation-ai/"
os.chdir(abes_path)

In [4]:
# Create folders if needed
list_folder = [
    "ANNIF", 
    "ANNIF/data", "ANNIF/reports",
    "ANNIF/data/train", "ANNIF/data/test", "ANNIF/data/valid"]

for folder in list_folder:
    if not os.path.exists(folder):
        os.makedirs(folder)
    else:
        print(f"Folder {folder} already exists")



Folder ANNIF already exists
Folder ANNIF/data already exists
Folder ANNIF/reports already exists
Folder ANNIF/data/train already exists
Folder ANNIF/data/test already exists
Folder ANNIF/data/valid already exists


In [5]:
# Set current directory
annif_path = os.getcwd() + "/ANNIF"
os.chdir(annif_path)

In [6]:
# Set paths
data_path = "./../data"
fig_path = "./../figs"
annif_data_path = annif_path + "/data"
annif_report_path = annif_path + "/reports"

train_tsv_path = os.path.join(annif_data_path, "rameau-train.tsv")
test_tsv_path = os.path.join(annif_data_path, "rameau-test.tsv")
valid_tsv_path = os.path.join(annif_data_path, "rameau-valid.tsv")

annif_train_folder_path = os.path.join(annif_data_path, "train/")
annif_test_folder_path = os.path.join(annif_data_path, "test/")
annif_valid_folder_path = os.path.join(annif_data_path, "valid/")

### Files

In [7]:
# Select data to use
data = "working_data_sans_dewey.pkl"
rameau_file = "rameau_Tf_Td.csv"

In [8]:
# Merge predictions to existing predictions?
save_predictions = True
optimization = True

## Prepare data

### Import data

In [9]:
# Import working data
df = pd.read_pickle(os.path.join(data_path, data))
print(df.shape)

(154447, 10)


In [10]:
df.head()

Unnamed: 0,PPN,TITRE,RESUME,RAMEAU,DEWEY,DESCR,RAMEAU_CHECKED,presence_chaine_indexation,rameau_chaines_index,rameau_concepts
0,000002364,La culture pour vivre,Mort de la culture populaire en France. Mutati...,Culture populaire;Diffusion de la culture;Poli...,840.0,La culture pour vivre Mort de la culture popul...,Culture populaire;Diffusion de la culture;Poli...,False,"[Culture populaire, Diffusion de la culture, P...","[Culture populaire, Diffusion de la culture, P..."
1,000014877,"La nuit, le jour : essai psychanalytique sur l...","Discontinuité, latence, rétablissement d’une c...",Complexe de castration;Psychanalyse;Rêves,154.63,"La nuit, le jour : essai psychanalytique sur l...",Complexe de castration;Psychanalyse;Rêves,False,"[Complexe de castration, Psychanalyse, Rêves]","[Complexe de castration, Psychanalyse, Rêves]"
2,000021857,"Ruptures, cultures","Il faut imaginer Robinson sur son île, au mome...",Culture,840.0,"Ruptures, cultures Il faut imaginer Robinson s...",Culture,False,[Culture],[Culture]
3,00002564X,La révolution structurale,"Mutations ou crises, les brusques accès de fiè...",Structuralisme,100.0,"La révolution structurale Mutations ou crises,...",Structuralisme,False,[Structuralisme],[Structuralisme]
4,000026352,La Destruction du temple,"Oswald tire sur Kennedy. Jusque-là, c'est bon,...",Science-fiction américaine -- Traductions fran...,830.0,La Destruction du temple Oswald tire sur Kenne...,Science-fiction américaine -- Traductions fran...,True,[Science-fiction américaine -- Traductions fra...,"[Science-fiction américaine, Traductions franç..."


In [11]:
# Import file of RAMEAU concepts
rameau = pd.read_csv(os.path.join(data_path, rameau_file), encoding="utf-8", index_col=0)
print(rameau.shape)
rameau.head(20)

(103628, 3)


Unnamed: 0,PPN,NOM,URI
0,157992527,Kirp?n,https://www.idref.fr/157992527
1,110140494,Militaires artistes,https://www.idref.fr/110140494
2,028492161,Militaires romains,https://www.idref.fr/028492161
3,028521757,Militaires prussiens,https://www.idref.fr/028521757
4,029895561,Sa-skya-pa,https://www.idref.fr/029895561
5,031875459,Militaires réunionnais,https://www.idref.fr/031875459
6,032370083,Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
7,032878117,Missionnaires suisses,https://www.idref.fr/032878117
8,034423982,Militaires ivoiriens,https://www.idref.fr/034423982
9,034686940,Outils à métaux,https://www.idref.fr/034686940


### Create vocabulary file

In [12]:
# Create dictionnary of URI
label2uri = {k:v for k,v in zip(rameau["NOM"], rameau["URI"].astype(str))}

In [13]:
# Create vocabulary file
vocab_filename = os.path.join(annif_data_path,'subjects.csv')
vocab = pd.DataFrame(rameau[["NOM", "URI"]])
vocab.columns = ["label_fr", "uri"]
vocab.to_csv(vocab_filename, encoding='utf-8', index=None)
vocab.head(10)

Unnamed: 0,label_fr,uri
0,Kirp?n,https://www.idref.fr/157992527
1,Militaires artistes,https://www.idref.fr/110140494
2,Militaires romains,https://www.idref.fr/028492161
3,Militaires prussiens,https://www.idref.fr/028521757
4,Sa-skya-pa,https://www.idref.fr/029895561
5,Militaires réunionnais,https://www.idref.fr/031875459
6,Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
7,Missionnaires suisses,https://www.idref.fr/032878117
8,Militaires ivoiriens,https://www.idref.fr/034423982
9,Outils à métaux,https://www.idref.fr/034686940


In [14]:
# Check import with bash:
! head {vocab_filename}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
label_fr,uri
Kirp?n,https://www.idref.fr/157992527
Militaires artistes,https://www.idref.fr/110140494
Militaires romains,https://www.idref.fr/028492161
Militaires prussiens,https://www.idref.fr/028521757
Sa-skya-pa,https://www.idref.fr/029895561
Militaires réunionnais,https://www.idref.fr/031875459
Construction à l'épreuve de la sécheresse,https://www.idref.fr/032370083
Missionnaires suisses,https://www.idref.fr/032878117
Militaires ivoiriens,https://www.idref.fr/034423982


In [15]:
# Check number of concepts in the vocabulary file:
! wc -l < {vocab_filename}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
103629


In [16]:
# Load vocabulary file in ANNIF
! annif load-vocab rameau {vocab_filename} --force

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Loading vocabulary from CSV file /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/subjects.csv...
creating subject index
saving vocabulary into SKOS file data/vocabs/rameau/subjects.ttl


In [17]:
# Check import with bash: 
! head -n 5 {train_tsv_path}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
La culture pour vivre Mort de la culture populaire en France. Mutation des institutions culturelles grâce à une technique de mise en relation des oeuvres et d'un public, et qui tend à créer un comportement culturel adapté aux caractéristiques de l'époque	https://www.idref.fr/027348237 https://www.idref.fr/027224929 https://www.idref.fr/027416593
La nuit, le jour : essai psychanalytique sur le fonctionnement mental Discontinuité, latence, rétablissement d’une continuité organisent la vie psychique. Réparatrice est dite la nuit... Les auteurs ont voulu montrer la complexité sous-jacente à cette qualité dès lors que Freud met au jour dans l’étude du rêve, au-delà d’une certaine réalisation de désir inconscient lié à l’histoire individuelle d’un sujet donné, l’existence de « veinures » qui résultent de la préhistoire de tous les humains et qui, imprimant la matière où s’inscrit le

In [18]:
# Check number of files in the train folder:
nb_file = !ls {annif_train_folder_path} | wc -l
print(f"Number of files in {annif_train_folder_path} folder: \n{nb_file}")

Number of files in /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/train/ folder: 
['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)', '125220']


In [19]:
# Check number of files in the test folder:
nb_file = !ls {annif_test_folder_path} | wc -l
print(f"Number of files in {annif_test_folder_path} folder: \n{nb_file}")

Number of files in /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/test/ folder: 
['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)', '46795']


In [20]:
# Check number of files in the valid folder:
nb_file = !ls {annif_valid_folder_path} | wc -l
print(f"Number of files in {annif_valid_folder_path} folder: \n{nb_file}")

Number of files in /home/aurelie/ABES/labo-indexation-ai/ANNIF/data/valid/ folder: 
['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)', '100']


## Test ANNIF

### List all available projects 

In [21]:
# list of projects
project_list = !annif list-projects
project_list

['/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)',
 'Project ID               Project Name                                 Language  Trained',
 '---------------------------------------------------------------------------------------',
 'rameau-tfidf-snowball-fr TF-IDF French RAMEAU with snowball lemma     fr        True   ',
 'rameau-fasttext-snowball-frFastText French RAMEAU                       fr        False  ',
 'rameau-yake-snowball-fr  Yake French RAMEAU                           fr        True   ',
 'rameau-mllm-snowball-fr  RAMEAU MLLM project                          fr        True   ',
 'rameau-omikuji-snowball-frOmikuji Parabel French                       fr        True   ',
 'rameau-tfidf-fr          TF-IDF French RAMEAU with spacy lemma        fr        False  ',
 'rameau-fasttext-fr       FastText French RAMEAU                       fr        True   ',
 'rameau-yake-fr           Yake French RAMEAU         

### Select project to test

In [22]:
# Select project and parameters
project = "rameau-ensemble-mllmSpacy-allButFastext-fr"
njobs = 0
input_file = train_tsv_path
max_nb_concepts = 10
threshold = 0.2 
trials = 10
metric_file_path = os.path.join(annif_report_path, str(project + '.json'))
result_file_path = os.path.join(annif_report_path, str(project + '.csv'))

test_file = test_tsv_path

### Train model

In [57]:
# Train project
! annif train {project} {input_file}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
2023-07-10 11:35:29.400008: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Backend nn_ensemble: creating NN ensemble model
2023-07-10 11:35:31.134464: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Backend nn_ensemble: Initializing source projects: rameau-tfidf-snowball-fr, rameau-mllm-fr, rameau-omikuji-snowball-fr
2023-07-10T11:35:44.716Z [36mINFO[0

### Evaluate model

In [23]:
# Evaluate project
! annif eval --limit {max_nb_concepts}  --metrics-file {metric_file_path} --results-file {result_file_path} --jobs {njobs} -v "DEBUG" {project} {test_file}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)


[34mdebug: [0mcreating app with configuration annif.default_config.Config
[34mdebug: [0mReading configuration file projects.cfg in CFG format
[34mdebug: [0mloading subjects from data/vocabs/rameau/subjects.csv
Writing per subject evaluation results to /home/aurelie/ABES/labo-indexation-ai/ANNIF/reports/rameau-ensemble-mllmSpacy-allButFastext-fr.csv
[34mdebug: [0mInitializing project 'rameau-ensemble-mllmSpacy-allButFastext-fr'
[34mdebug: [0mProject 'rameau-ensemble-mllmSpacy-allButFastext-fr': initialized subjects: <annif.corpus.subject.SubjectIndex object at 0x7f22031986d0>
[34mdebug: [0mProject 'rameau-ensemble-mllmSpacy-allButFastext-fr': initializing backend
2023-07-11 05:41:37.439476: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the app

## Hyperoptimization

In [23]:
optimization = True

In [24]:
if optimization:
    metric_file_path = os.path.join(annif_report_path, str(project + '_opt.json'))
    result_file_path_opt = os.path.join(annif_report_path, str(project + '_opt.csv'))

    ! annif hyperopt {project} --trials {trials} --results-file {result_file_path_opt} --metric "F1 score (doc avg)" {test_file}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Looking for optimal hyperparameters using 10 trials
2023-07-09 21:36:42.915774: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-09T21:36:55.361Z [36mINFO[0m [omikuji::model] Loading model from data/projects/rameau-omikuji-snowball-fr/omikuji-model...
2023-07-09T21:36:55.361Z [36mINFO[0m [omikuji::model] Loading model settings from data/projects/rameau-omikuji-snowball-fr/omikuji-model/settings.json...
2023-07-09T21:36:55.361Z [36mINFO[0m [omikuji::model] Loaded model settings Settings { n_features: 207673, classifier_loss_type: Hinge }...
2023-07-09T21:36:55.361Z [36mINFO[0m [omikuji::model] Loading 

In [None]:
# Select best model
opt = pd.read_csv(result_file_path_opt, sep='\t')
opt

In [27]:
import numpy as np
best_model = opt.iloc[np.argmax(opt["value"])]
best_model

trial                  9.000000
value                  0.043603
min_samples_leaf      13.000000
max_leaf_nodes      1015.000000
max_samples            0.662318
Name: 9, dtype: float64

In [28]:
# Retrain model
! annif train {project} --jobs {njobs} {input_file}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Backend mllm: starting train
Backend mllm: preparing training data


In [None]:
# Evaluate project
! annif eval --limit {max_nb_concepts} --threshold {threshold} --metrics-file {metric_file_path} --results-file {result_file_path} --jobs {njobs} {project} {test_file}

In [24]:
# Plot radar from a evaluation file
from utils_visualization import metrics_radar_plot

annif_eval = pd.read_csv("reports/annif_evaluation_concepts_juin2023.csv", index_col=0)
metrics = [
    "Precision_doc_avg",
    "Recall_doc_avg",
    "F1_score_doc_avg",
    "Precision_weighted_subj_avg",
    "Recall_weighted_subj_avg",
    "F1_score_weighted_subj_avg",
    "F1@5"
]

metrics_radar_plot(
    annif_eval,
    metrics=metrics,
    remove_identity=False,
    scale=False,
    title="ANNIF evaluation",
    savefig=None,
    width=1000,
    height=550)


In [25]:
annif_eval.loc["rameau-ensemble-mllmSpacy-allButFastext-fr.json"]

Precision_doc_avg                   0.200160
Recall_doc_avg                      0.589381
F1_score_doc_avg                    0.278750
Precision_subj_avg                  0.027758
Recall_subj_avg                     0.054848
F1_score_subj_avg                   0.033449
Precision_weighted_subj_avg         0.235924
Recall_weighted_subj_avg            0.549798
F1_score_weighted_subj_avg          0.315128
Precision_microavg                  0.188802
Recall_microavg                     0.549798
F1_score_microavg                   0.281080
F1@5                                0.331667
NDCG                                0.524107
NDCG@5                              0.498630
NDCG@10                             0.524194
Precision@1                         0.511445
Precision@3                         0.348833
Precision@5                         0.268844
True_positives                  42760.000000
False_positives                183721.000000
False_negatives                 35014.000000
Documents_

### Prediction on all notices from test folder

In [32]:
# Prediction sur toutes les notices du dossier "test"
suffix = str('_' + project + 'pred.csv')
! annif index -s {suffix} {project} {annif_valid_folder_path}

/bin/bash: /home/aurelie/anaconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
2023-07-11 08:09:48.945160: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-11T08:10:03.838Z [36mINFO[0m [omikuji::model] Loading model from data/projects/rameau-omikuji-snowball-fr/omikuji-model...
2023-07-11T08:10:03.838Z [36mINFO[0m [omikuji::model] Loading model settings from data/projects/rameau-omikuji-snowball-fr/omikuji-model/settings.json...
2023-07-11T08:10:03.838Z [36mINFO[0m [omikuji::model] Loaded model settings Settings { n_features: 207673, classifier_loss_type: Hinge }...
2023-07-11T08:10:03.838Z [36mINFO[0m [omikuji::model] Loading tree from data/projects/rameau-omikuji-snowball-fr/o

## Format predictions for future use

In [34]:
csv_files = [f for f in os.listdir(annif_valid_folder_path) if f.endswith(suffix)]
print(f"There are {len(csv_files)} files to compile")

There are 100 files to compile


In [35]:
# Build dataframe
predictions = pd.DataFrame(columns=["PPN", "predictions", "scores"])
for i, file in enumerate(csv_files):
    ppn = file.split('_')[0]
    pred = pd.read_csv(os.path.join(annif_valid_folder_path, file), sep='\t', header=None, names=["URI", "pred_concept", "score"])
    predictions.loc[i,"PPN"] = ppn
    predictions.loc[i,"predictions"] = pred["pred_concept"].to_list()
    predictions.loc[i,"scores"] = pred["score"].to_list()

In [36]:
# Show predictions
predictions.head()

Unnamed: 0,PPN,predictions,scores
0,198388810,"[Ressources humaines, Direction du personnel]","[1.0, 0.7973206639289856]"
1,235755265,"[Jardinage, Hosta, Plantes cultivées, Cultures...","[0.1623816043138504, 0.1281401813030243, 0.073..."
2,248590413,"[Comptabilité, Problèmes et exercices, Manuels...","[0.8501826524734497, 0.3784928917884826, 0.329..."
3,252816528,"[Amour, Relations amoureuses, Couples, Sociolo...","[0.7278058528900146, 0.3301266133785248, 0.126..."
4,257349006,"[Changement social, Réalisation de soi, Modes ...","[0.3092773556709289, 0.2749579548835754, 0.129..."


In [37]:
# Save dataframe
predictions.to_csv(os.path.join(annif_report_path, str("valid100_" + project + "_predictions.csv")))

## Merge predictions with existing predictions (including reindexation and indexing if available)

In [38]:
os.getcwd()

'/home/aurelie/ABES/labo-indexation-ai/ANNIF'

In [39]:
 # Set files
input_file = "./../data/reindexation_final_with_concepts_juin2023.csv"
output_file = "./../data/reindexation_final_with_concepts_juin2023_withANNIF.csv"

In [40]:
# merge predictions with reindexation file
indexation_file = pd.read_csv(input_file, index_col=0)
print(indexation_file.shape)
indexation_file.head(3)

(100, 29)


Unnamed: 0_level_0,DESCR,RAMEAU_CHECKED,rameau_concepts,rameau_chaines_index,INDEX_AFE,INDEX_JMF,INDEX_LJZ,INDEX_LPL,INDEX_MCR,INDEX_MPD,...,rameau_concept_JMF,rameau_concept_LPL,rameau_concept_LJZ,rameau_concept_MPD,rameau_index_chain_AFE,rameau_index_chain_MCR,rameau_index_chain_JMF,rameau_index_chain_LPL,rameau_index_chain_LJZ,rameau_index_chain_MPD
PPN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000308838,Les sommets de l'État : essai sur l'élite du p...,Bureaucratie;Classes dirigeantes;Classes dirig...,"['Bureaucratie', 'Classes dirigeantes', 'Class...","['Bureaucratie', 'Classes dirigeantes', 'Class...",Classes dirigeantes -- France -- Histoire;;Pou...,Classes dirigeantes -- Relations avec l'État -...,Classes dirigeantes -- France;;Hauts-fonctionn...,Hauts-fonctionnairesss -- France;;Classes diri...,Pouvoir (sciences sociales) -- Classes dirigea...,Classes dirigeantes -- France -- Histoire;;Éli...,...,"['Classes dirigeantes', ""Relations avec l'État...","['Hauts-fonctionnairesss', 'France', 'Classes ...","['Classes dirigeantes', 'France', 'Hauts-fonct...","['Classes dirigeantes', 'France', 'Histoire', ...","['Classes dirigeantes -- France -- Histoire', ...",['Pouvoir (sciences sociales) -- Classes dirig...,"[""Classes dirigeantes -- Relations avec l'État...","['Hauts-fonctionnairesss -- France', 'Classes ...","['Classes dirigeantes -- France', 'Hauts-fonct...","['Classes dirigeantes -- France -- Histoire', ..."
00094758X,Le dollar La quatrième de couverture indique :...,Dollar américain;Finances internationales;Poli...,"['Dollar américain', 'Finances internationales...","['Dollar américain', 'Finances internationales...","Dollar américain;;Eurodollar, Marché de l';;Po...",Dollar américain ;;Politique économique -- Éta...,"Dollar américain;;Eurodollar, Marché de l';;Fi...",Dollar américain -- Influence -- 20e siècle;;F...,Dollar américain -- Mondialisation;;Dollar amé...,"Dollar américain;;Eurodollar, Marché de l';;Fi...",...,"['Dollar américain ', 'Politique économique', ...","['Dollar américain', 'Influence', '20e siècle'...","['Dollar américain', ""Eurodollar, Marché de l'...","['Dollar américain', ""Eurodollar, Marché de l'...","['Dollar américain', ""Eurodollar, Marché de l'...","['Dollar américain -- Mondialisation', 'Dollar...","['Dollar américain ', 'Politique économique --...",['Dollar américain -- Influence -- 20e siècle'...,"['Dollar américain', ""Eurodollar, Marché de l'...","['Dollar américain', ""Eurodollar, Marché de l'..."
003632806,Les intellectuels sous la Ve République : 1958...,Intellectuels;Intellectuels français,"['Intellectuels', 'Intellectuels français']","['Intellectuels', 'Intellectuels français']",Intellectuels -- France -- 1958-.... (5e Répub...,Intellectuels français -- Sociologie ;;Intelle...,Intellectuels -- France;;Vie intellectuelle --...,Intellectuels -- France -- 1958 (5e République...,Intellectuels -- France -- 1945,Intellectuels -- France -- 1958-.... (5e Répub...,...,"['Intellectuels français', 'Sociologie ', 'Int...","['Intellectuels', 'France', '1958 (5e Républiq...","['Intellectuels', 'France', 'Vie intellectuell...","['Intellectuels', 'France', '1958-.... (5e Rép...",['Intellectuels -- France -- 1958-.... (5e Rép...,['Intellectuels -- France -- 1945'],"['Intellectuels français -- Sociologie ', 'Int...",['Intellectuels -- France -- 1958 (5e Républiq...,"['Intellectuels -- France', 'Vie intellectuell...",['Intellectuels -- France -- 1958-.... (5e Rép...


In [41]:
# Merge predictions
output = indexation_file.merge(predictions, on="PPN", how="inner")
output.rename(columns={"predictions" : str("predictions_" + project), "scores":str("scores_" + project)}, inplace=True)

In [42]:
output.head(3)

Unnamed: 0,PPN,DESCR,RAMEAU_CHECKED,rameau_concepts,rameau_chaines_index,INDEX_AFE,INDEX_JMF,INDEX_LJZ,INDEX_LPL,INDEX_MCR,...,rameau_concept_LJZ,rameau_concept_MPD,rameau_index_chain_AFE,rameau_index_chain_MCR,rameau_index_chain_JMF,rameau_index_chain_LPL,rameau_index_chain_LJZ,rameau_index_chain_MPD,predictions_rameau-ensemble-mllmSpacy-allButFastext-fr,scores_rameau-ensemble-mllmSpacy-allButFastext-fr
0,000308838,Les sommets de l'État : essai sur l'élite du p...,Bureaucratie;Classes dirigeantes;Classes dirig...,"['Bureaucratie', 'Classes dirigeantes', 'Class...","['Bureaucratie', 'Classes dirigeantes', 'Class...",Classes dirigeantes -- France -- Histoire;;Pou...,Classes dirigeantes -- Relations avec l'État -...,Classes dirigeantes -- France;;Hauts-fonctionn...,Hauts-fonctionnairesss -- France;;Classes diri...,Pouvoir (sciences sociales) -- Classes dirigea...,...,"['Classes dirigeantes', 'France', 'Hauts-fonct...","['Classes dirigeantes', 'France', 'Histoire', ...","['Classes dirigeantes -- France -- Histoire', ...",['Pouvoir (sciences sociales) -- Classes dirig...,"[""Classes dirigeantes -- Relations avec l'État...","['Hauts-fonctionnairesss -- France', 'Classes ...","['Classes dirigeantes -- France', 'Hauts-fonct...","['Classes dirigeantes -- France -- Histoire', ...","[Histoire, Élite (sciences sociales), État, Li...","[0.5610998868942261, 0.4483318328857422, 0.435..."
1,00094758X,Le dollar La quatrième de couverture indique :...,Dollar américain;Finances internationales;Poli...,"['Dollar américain', 'Finances internationales...","['Dollar américain', 'Finances internationales...","Dollar américain;;Eurodollar, Marché de l';;Po...",Dollar américain ;;Politique économique -- Éta...,"Dollar américain;;Eurodollar, Marché de l';;Fi...",Dollar américain -- Influence -- 20e siècle;;F...,Dollar américain -- Mondialisation;;Dollar amé...,...,"['Dollar américain', ""Eurodollar, Marché de l'...","['Dollar américain', ""Eurodollar, Marché de l'...","['Dollar américain', ""Eurodollar, Marché de l'...","['Dollar américain -- Mondialisation', 'Dollar...","['Dollar américain ', 'Politique économique --...",['Dollar américain -- Influence -- 20e siècle'...,"['Dollar américain', ""Eurodollar, Marché de l'...","['Dollar américain', ""Eurodollar, Marché de l'...","[Système monétaire international, Monnaie, Tau...","[0.2470552176237106, 0.1972707509994506, 0.164..."
2,003632806,Les intellectuels sous la Ve République : 1958...,Intellectuels;Intellectuels français,"['Intellectuels', 'Intellectuels français']","['Intellectuels', 'Intellectuels français']",Intellectuels -- France -- 1958-.... (5e Répub...,Intellectuels français -- Sociologie ;;Intelle...,Intellectuels -- France;;Vie intellectuelle --...,Intellectuels -- France -- 1958 (5e République...,Intellectuels -- France -- 1945,...,"['Intellectuels', 'France', 'Vie intellectuell...","['Intellectuels', 'France', '1958-.... (5e Rép...",['Intellectuels -- France -- 1958-.... (5e Rép...,['Intellectuels -- France -- 1945'],"['Intellectuels français -- Sociologie ', 'Int...",['Intellectuels -- France -- 1958 (5e Républiq...,"['Intellectuels -- France', 'Vie intellectuell...",['Intellectuels -- France -- 1958-.... (5e Rép...,"[Intellectuels, Sociologie, Politique et gouve...","[0.2859456539154053, 0.1352124512195587, 0.077..."


In [43]:
# Save output
output.to_csv(output_file)