### Entraînement de modèles

##### 1. Récupérer les embeddings depuis ChromaDB

In [7]:
def get_collection(client, name):
    """Fetch the train and test collections of one dataset variant.

    The collections are looked up as ``train_collection_<name>`` and
    ``test_collection_<name>`` through ``client.get_collection``. The number
    of items in each one is printed, followed by an empty line.

    Args:
        client: Object exposing ``get_collection(name=...)``.
        name: Suffix identifying the dataset variant (e.g. ``'clean'``).

    Returns:
        A two-element list ``[train_collection, test_collection]``.
    """
    # Train is requested before test, matching the order of the returned list.
    collections = [
        client.get_collection(name=f'{split}_collection_{name}')
        for split in ('train', 'test')
    ]

    title = name.capitalize()
    # 'Test ' carries a trailing space so both report lines stay aligned.
    for tag, collection in zip(('Train', 'Test '), collections):
        print(f'{tag} - {title:15s} ->     ', collection.count())
    print('')

    return collections


In [8]:
import chromadb

# Open the on-disk ChromaDB store located at ../data/chroma_db.
client = chromadb.PersistentClient(path='../data/chroma_db')

# Load the (train, test) collection pair of every dataset variant.
# get_collection prints the size of both collections on each call.
(train_collection_clean,
 test_collection_clean) = get_collection(client, 'clean')

(train_collection_mentions,
 test_collection_mentions) = get_collection(client, 'mentions')

(train_collection_undersampling,
 test_collection_undersampling) = get_collection(client, 'undersampling')

Train - Clean           ->      12410
Test  - Clean           ->      2191

Train - Mentions        ->      12410
Test  - Mentions        ->      2191

Train - Undersampling   ->      7254
Test  - Undersampling   ->      1281



##### 2. Extraction de labels et d'embeddings

In [9]:
import numpy as np

def extract_train_test_dataframes(train_collection, test_collection):
    """Pull embeddings and labels out of a train/test collection pair.

    Despite its name, this function returns NumPy arrays, not DataFrames.
    Each collection is read once with ``get(include=["metadatas",
    "embeddings"])``; the label of every item is taken from the ``'label'``
    key of its metadata. The length of each resulting array is printed.

    Args:
        train_collection: Collection holding the training items.
        test_collection: Collection holding the test items.

    Returns:
        A four-element list ``[X_train, y_train, X_test, y_test]`` of
        ``np.ndarray`` objects (X = embeddings, y = labels).
    """
    # Query train first, then test, so the arrays below come out in that order.
    fetched = [
        collection.get(include=["metadatas", "embeddings"])
        for collection in (train_collection, test_collection)
    ]

    arrays = []
    for payload in fetched:
        features = np.array(list(payload['embeddings']))
        targets = np.array([meta['label'] for meta in payload['metadatas']])
        arrays.extend((features, targets))

    # The 'X_Test ' / 'Y_Test ' tags carry a trailing space to keep the report aligned.
    for tag, values in zip(('X_Train', 'Y_Train', 'X_Test ', 'Y_Test '), arrays):
        print(f'{tag} ->', len(values))

    return arrays

- Data 1: data_clean

In [14]:
# Arrays for the "clean" variant, as [X_train, y_train, X_test, y_test].
clean_sets = extract_train_test_dataframes(
    train_collection_clean,
    test_collection_clean,
)

X_Train -> 12410
Y_Train -> 12410
X_Test  -> 2191
Y_Test  -> 2191


- Data 2: data_with_mentions

In [15]:
# Arrays for the "mentions" variant, as [X_train, y_train, X_test, y_test].
mentions_sets = extract_train_test_dataframes(
    train_collection_mentions,
    test_collection_mentions,
)

X_Train -> 12410
Y_Train -> 12410
X_Test  -> 2191
Y_Test  -> 2191


- Data 3: data_balanced_undersampling

In [16]:
# Arrays for the "undersampling" variant, as [X_train, y_train, X_test, y_test].
undersampling_sets = extract_train_test_dataframes(
    train_collection_undersampling,
    test_collection_undersampling,
)

X_Train -> 7254
Y_Train -> 7254
X_Test  -> 1281
Y_Test  -> 1281


##### 3. Créer des fonctions d'entraînement de différents modèles ML sur les embeddings.

In [51]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score


- Fonction pour entraîner un modèle

In [53]:
def train_model(model, X_train, y_train, X_test, y_test, name, models, average="micro"):
    """Fit one classifier, print its test-set report and return its scores.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Held-out features used for prediction.
        y_test: Held-out labels every metric is computed against.
        name: Label printed in the banner and used as the key in ``models``.
        models: Dict mutated in place; the fitted estimator is stored
            under ``name``.
        average: Averaging strategy forwarded to ``precision_score``,
            ``recall_score`` and ``f1_score``. Defaults to ``"micro"``, the
            value that used to be hard-coded. With single-label targets,
            micro-averaged precision, recall and F1 are all equal to the
            accuracy, so pass ``"macro"`` or ``"weighted"`` to obtain scores
            that actually differ from it (useful on imbalanced classes).

    Returns:
        The list ``[name, accuracy, precision, recall, f1]``.
    """
    print(f"--- Entrainemnt du modèle --------> {name}", '\n')

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Same averaging strategy for the three aggregate scores.
    pr, rc, f1 = (
        metric(y_test, y_pred, average=average)
        for metric in (precision_score, recall_score, f1_score)
    )

    print("Accuracy --> ", acc, '\n')

    print("Rapport de classification:\n")
    print(classification_report(y_test, y_pred))

    print('-----------------------------------------------\n\n')

    # Keep the fitted estimator so the caller can reuse it after training.
    models[name] = model

    return [name, acc, pr, rc, f1]

- Fonction pour entraîner plusieurs modèles

In [54]:
def train_multiple_models(X_train, y_train, X_test, y_test, balanced = None, random_state = 42):
    """Train the five reference classifiers on one dataset and collect their scores.

    The estimators are trained in this order: Logistic Regression, Random
    Forest, Linear SVC, XGBoost, SVC. Each one is fitted and evaluated by
    ``train_model``, which also prints its report.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.
        balanced: Value passed as ``class_weight`` to LogisticRegression,
            RandomForestClassifier, LinearSVC and SVC (``None`` or
            ``"balanced"``). It is not forwarded to XGBoost.
        random_state: Seed handed to every estimator so that repeated runs
            give the same models. Pass ``None`` to get unseeded behaviour.

    Returns:
        The list ``[scores_df, models]`` where ``scores_df`` is a DataFrame
        with columns Name / Accuracy / Precision / Recall / F1 (one row per
        estimator, in training order) and ``models`` maps each estimator
        name to its fitted instance.
    """
    # Insertion order defines both the training order and the row order of scores_df.
    estimators = {
        "Logistic Regression": LogisticRegression(
            class_weight=balanced, random_state=random_state
        ),
        "Random Forest": RandomForestClassifier(
            class_weight=balanced, random_state=random_state
        ),
        "Linear SVC": LinearSVC(
            class_weight=balanced, random_state=random_state
        ),
        # NOTE(review): `balanced` is not applied to XGBoost, so in a
        # "balanced" run this model is still trained without re-weighting.
        "XGBoost": XGBClassifier(random_state=random_state),
        "SVC": SVC(
            probability=True, class_weight=balanced, random_state=random_state
        ),
    }

    models = {}
    # train_model stores each fitted estimator in `models` as a side effect.
    scores = [
        train_model(estimator, X_train, y_train, X_test, y_test, name, models)
        for name, estimator in estimators.items()
    ]

    scores_df = pd.DataFrame(scores, columns=["Name", "Accuracy", "Precision", "Recall", "F1"])

    return [scores_df, models]

##### 4. Entraîner les modèles de machine learning sur des données différentes.

##### 4.1. Data 1: data_clean

In [None]:
# Clean variant, no class re-weighting (balanced=None is the default).
scores_df_clean_1, models_clean_1 = train_multiple_models(
    *clean_sets,
    balanced=None,
)

##### 4.2. Data 2: data_clean (Balanced)

In [None]:
# Clean variant again, this time with class_weight="balanced".
scores_df_clean_2, models_clean_2 = train_multiple_models(
    *clean_sets,
    balanced="balanced",
)

##### 4.3. Data 3: data_with_mentions (Balanced)

In [None]:
# Mentions variant, trained with class_weight="balanced".
scores_df_mentions, models_mentions = train_multiple_models(
    *mentions_sets,
    balanced="balanced",
)

##### 4.4. Data 4: data_balanced_undersampling

In [None]:
# Undersampled variant, no class re-weighting (balanced=None is the default).
scores_df_undersampling, models_undersampling = train_multiple_models(
    *undersampling_sets,
    balanced=None,
)