### Entrainement de modéles

##### 1. Récupérer les embeddings depuis ChromaDB.

In [10]:
import chromadb

client = chromadb.PersistentClient(path='../data/chromaDB')

train_collection = client.get_collection(name='train_collection')
test_collection = client.get_collection(name='test_collection')

print(train_collection.count())
print(test_collection.count())

120000
7600


##### 2. Extraction de labels et d'embeddings

In [11]:
all_metadatas = train_collection.get(include=["metadatas", "embeddings"])

labels = [metadata['label'] for metadata in all_metadatas['metadatas']]

embeddings = [emb for emb in all_metadatas['embeddings']]

print(len(labels))
print(len(embeddings))

120000
120000


##### 3. Entraîner différents modèles ML sur les embeddings.

- Préparer les données (features & target)

In [6]:
import numpy as np

X_train = np.array(embeddings)
y_train = np.array(labels)

- Importer des modèles ML

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

- Logistic Regression

In [5]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


- Random Forest

In [6]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


- XGBoost

In [8]:
xgb_model = XGBClassifier()

xgb_model.fit(embeddings, y_train)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


- SVM

In [None]:
svm_model = SVC()
svm_model.fit(X_train, y_train)

- K-Nearest Neighbors (KNN)

In [10]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


##### 4. Sauvegarder les modéles

In [None]:
import joblib as hafid

hafid.dump(lr_model, '../models/lr_model.pkl')
hafid.dump(rf_model, '../models/rf_model.pkl')
hafid.dump(xgb_model, '../models/xgb_model.pkl')
hafid.dump(svm_model, '../models/svm_model.pkl')
hafid.dump(knn_model, '../models/knn_model.pkl')

['../models/xgb_model.pkl']