In [3]:
!pip install catboost



In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score
from catboost import CatBoostClassifier

from load_dataset import load_data

In [5]:
# load drive in Colab environment
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Utils functions and global variables

In [6]:
# unused here but provides a non-optimized ready to use dataset
def load_data_nonOptimized():
  data = pd.read_csv("drive/MyDrive/data_ml_project_GIF4101/dataset.csv")

  # remove "Enrolled" target as it's not relevant to the project
  data = data[data['Target'] != "Enrolled"]

  # separating X from Y
  dataY = data.iloc[:,-1:]
  dataX = data.drop("Target", axis=1)

  # numpyification
  arrayX = pd.DataFrame.to_numpy(dataX)
  arrayY = pd.DataFrame.to_numpy(dataY)

  X_train, X_test, y_train, y_test = train_test_split(arrayX, arrayY, test_size=0.2, random_state=42)

  # transforming target labels to either 1 or 0
  le = LabelEncoder()
  y_train = le.fit_transform(y_train)
  y_test = le.fit_transform(y_test)

  return X_train, X_test, y_train, y_test


In [7]:
def get_max_from_dict(dict):
  max_value = 0
  for key, val in dict.items():
    if val > max_value:
      max_key = key
      max_value = val

  return (max_key, max_value)

In [8]:
dict_accuracy = {}
dict_f1 = {}

# Instancier jeux de données

Depuis la classe utilitaire load_dataset, instancier différentes versions du jeux de données. Les modèles seront testés sur la base des 4 jeux de données, ce qui permettra d'avoir une meilleure visibilité sur la pertinence des modifications de chaque versions en rapport à leurs résultats respectifs.

Cela facilitera aussi la sélection des jeux de données à pousser plus loin en analyse des features.

La variables cat_features_catb est surtout utile pour l'entraînement du CatBoost qui ne peux seulement consommer que la version Simplified du dataset.

In [9]:
X, y, cat_features = load_data().get_data_X_y(data='simplify', OneHot=True)
X_train_simplified_oneHot, X_test_simplified_oneHot, y_train_simplified_oneHot, y_test_simplified_oneHot = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
X, y, cat_features = load_data().get_data_X_y(data='original', OneHot=True)
X_train_original_oneHot, X_test_original_oneHot, y_train_original_oneHot, y_test_original_oneHot = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
X, y, cat_features_catb = load_data().get_data_X_y(data='simplify')
X_train_simplified, X_test_simplified, y_train_simplified, y_test_simplified = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
X, y, cat_features = load_data().get_data_X_y(data='original')
X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(X, y, test_size=0.3, random_state=42)

# XGboost

## Recherche en grille

Le booster est, ici, nécessairement Gblinear. Les deux autre choix, en l'occurence Gbtree et Dart, prenaient un temps immense à être entrainés dans la recherche en grille. De plus, des tests hors recherche en grille ont prouvés à plusieurs reprises que Gblinear offre des meilleurs résultats d'accuracy et de score f1 sur nos données.

Par mesure de simplicité et d'efficacité, le paramètre booster est donc défini par défaut sur Gblinear et la recherche en grille s'effectue sur la base des paramètres relatifs à un booster de type Gblinear.

In [13]:
# init xgb classifier
xgb_clf = xgb.XGBClassifier(verbosity=3, random_state=42)

param_grid = {
    'objective':["binary:logistic", "binary:hinge"],
    'booster':["gblinear"],
    'enable_categorical':[True],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
}


# init grid search
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3, scoring='f1')

grid_search.fit(X_train_simplified_oneHot, y_train_simplified_oneHot)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print("Meilleurs hyperparamèetres:")
best_parameters_df = {key: [value] for key, value in best_parameters.items()}
df = pd.DataFrame(best_parameters_df)
print(df)
print()
print("Meilleur score:", best_score)

[00:11:43] Configure: 0.017187s, 1 calls @ 17187us

[00:11:43] EvalOneIter: 0.000534s, 50 calls @ 534us

[00:11:43] GetGradient: 0.031376s, 50 calls @ 31376us

[00:11:43] PredictRaw: 0.075698s, 50 calls @ 75698us

[00:11:43] UpdateOneIter: 0.208365s, 50 calls @ 208365us

[00:11:43] DoBoost: 0.083332s, 50 calls @ 83332us

[00:11:43] PredictBatch: 0.075565s, 50 calls @ 75565us

[00:11:43] PredictBatchInternal: 0.075495s, 50 calls @ 75495us

[00:11:44] Configure: 0.000787s, 1 calls @ 787us

[00:11:44] EvalOneIter: 0.009432s, 50 calls @ 9432us

[00:11:44] GetGradient: 0.00238s, 50 calls @ 2380us

[00:11:44] PredictRaw: 0.066878s, 50 calls @ 66878us

[00:11:44] UpdateOneIter: 0.180126s, 50 calls @ 180126us

[00:11:44] DoBoost: 0.109665s, 50 calls @ 109665us

[00:11:44] PredictBatch: 0.066774s, 50 calls @ 66774us

[00:11:44] PredictBatchInternal: 0.066717s, 50 calls @ 66717us

[00:11:44] Configure: 0.000901s, 1 calls @ 901us

[00:11:44] EvalOneIter: 0.00031s, 50 calls @ 310us

[00:11:44] Get

In [14]:
xgb_acc = {}
xgb_f1 = {}

Meilleur modèle XGBoost:

In [15]:
xgbClassifier_optimized = xgb.XGBClassifier(**best_parameters, random_state=42)

## Simplified + One hot

In [16]:
xgbClassifier_optimized.fit(X_train_simplified_oneHot, y_train_simplified_oneHot)
xgbClassifier_optimized_acc = xgbClassifier_optimized.score(X_test_simplified_oneHot, y_test_simplified_oneHot)
xgbClassifier_optimized_predz = xgbClassifier_optimized.predict(X_test_simplified_oneHot)
xgbClassifier_optimized_f1 = f1_score(y_test_simplified_oneHot, xgbClassifier_optimized_predz, average='binary')

xgb_acc["Simplified + OneHot"] = xgbClassifier_optimized_acc
xgb_f1["Simplified + OneHot"] = xgbClassifier_optimized_f1

## Original + One hot

In [17]:
xgbClassifier_optimized.fit(X_train_original_oneHot, y_train_original_oneHot)
xgbClassifier_optimized_acc = xgbClassifier_optimized.score(X_test_original_oneHot, y_test_original_oneHot)
xgbClassifier_optimized_predz = xgbClassifier_optimized.predict(X_test_original_oneHot)
xgbClassifier_optimized_f1 = f1_score(y_test_original_oneHot, xgbClassifier_optimized_predz, average='binary')

xgb_acc["Original + OneHot"] = xgbClassifier_optimized_acc
xgb_f1["Original + OneHot"] = xgbClassifier_optimized_f1

## Simplified

In [18]:
xgbClassifier_optimized.fit(X_train_simplified, y_train_simplified)
xgbClassifier_optimized_acc = xgbClassifier_optimized.score(X_test_simplified, y_test_simplified)
xgbClassifier_optimized_predz = xgbClassifier_optimized.predict(X_test_simplified)
xgbClassifier_optimized_f1 = f1_score(y_test_simplified, xgbClassifier_optimized_predz, average='binary')

xgb_acc["Simplified"] = xgbClassifier_optimized_acc
xgb_f1["Simplified"] = xgbClassifier_optimized_f1

## Original

In [19]:
xgbClassifier_optimized.fit(X_train_original, y_train_original)
xgbClassifier_optimized_acc = xgbClassifier_optimized.score(X_test_original, y_test_original)
xgbClassifier_optimized_predz = xgbClassifier_optimized.predict(X_test_original)
xgbClassifier_optimized_f1 = f1_score(y_test_original, xgbClassifier_optimized_predz, average='binary')

xgb_acc["Original"] = xgbClassifier_optimized_acc
xgb_f1["Original"] = xgbClassifier_optimized_f1

## Aggrégation

In [20]:
dict_accuracy["XGboost"] = xgb_acc
dict_f1["XGboost"] = xgb_f1

# SVC

## Recherche en grille

Pour le SVC, le choix du kernel doit obligatoirement être linéaire, puisque cela empêche au modèle d'augmenter la dimensionnalité des données, ce qui nous empêcherait plus tard de faire une analyse claire des features les plus prenants pour la classification.

L'utilisation d'un noyeau linéaire permet au SVM de rester dans l'espace dimensionnel de base et ne projète pas les features dans un espace à plus haute dimensionnalité, comme le ferait un noyeau Gaussien ou RBF, par exemple. C'est pourquoi nous pourrons directement récupérer l'importance des features plus tard dans l'analyse.

In [21]:
# by default SVC
svcClassifier = SVC(kernel="rbf", C=1.0, degree=3, gamma='scale', random_state=42)

param_grid = {
    'kernel': ['linear'],
    'C': [0.1, 1, 10, 50],
    'gamma': ['scale', 0.001, 0.1, 1],
    'class_weight': [None, 'balanced']
}

# init grid search
grid_search = GridSearchCV(estimator=svcClassifier, param_grid=param_grid, cv=3, scoring='f1', verbose=3)

grid_search.fit(X_train_simplified_oneHot, y_train_simplified_oneHot)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print("Meilleurs hyperparamèetres:")
best_parameters_df = {key: [value] for key, value in best_parameters.items()}
df = pd.DataFrame(best_parameters_df)
print(df)
print()
print("Meilleur score:", best_score)

Fitting 3 folds for each of 32 candidates, totalling 96 fits
[CV 1/3] END C=0.1, class_weight=None, gamma=scale, kernel=linear;, score=0.929 total time=   0.1s
[CV 2/3] END C=0.1, class_weight=None, gamma=scale, kernel=linear;, score=0.932 total time=   0.2s
[CV 3/3] END C=0.1, class_weight=None, gamma=scale, kernel=linear;, score=0.923 total time=   0.2s
[CV 1/3] END C=0.1, class_weight=None, gamma=0.001, kernel=linear;, score=0.929 total time=   0.1s
[CV 2/3] END C=0.1, class_weight=None, gamma=0.001, kernel=linear;, score=0.932 total time=   0.2s
[CV 3/3] END C=0.1, class_weight=None, gamma=0.001, kernel=linear;, score=0.923 total time=   0.2s
[CV 1/3] END C=0.1, class_weight=None, gamma=0.1, kernel=linear;, score=0.929 total time=   0.2s
[CV 2/3] END C=0.1, class_weight=None, gamma=0.1, kernel=linear;, score=0.932 total time=   0.2s
[CV 3/3] END C=0.1, class_weight=None, gamma=0.1, kernel=linear;, score=0.923 total time=   0.2s
[CV 1/3] END C=0.1, class_weight=None, gamma=1, kernel

In [22]:
svc_acc = {}
svc_f1 = {}

Meilleur modèle SVC:

In [23]:
svcClassifier_optimized = SVC(**best_parameters, random_state=42)

## Simplified + One hot

In [24]:
svcClassifier_optimized.fit(X_train_simplified_oneHot, y_train_simplified_oneHot)
svcClassifier_optimized_acc = svcClassifier_optimized.score(X_test_simplified_oneHot, y_test_simplified_oneHot)
svcClassifier_optimized_predz = svcClassifier_optimized.predict(X_test_simplified_oneHot)
svcClassifier_optimized_f1 = f1_score(y_test_simplified_oneHot, svcClassifier_optimized_predz, average='binary')

svc_acc["Simplified + OneHot"] = svcClassifier_optimized_acc
svc_f1["Simplified + OneHot"] = svcClassifier_optimized_f1

## Original + One hot

In [25]:
svcClassifier_optimized.fit(X_train_original_oneHot, y_train_original_oneHot)
svcClassifier_optimized_acc = svcClassifier_optimized.score(X_test_original_oneHot, y_test_original_oneHot)
svcClassifier_optimized_predz = svcClassifier_optimized.predict(X_test_original_oneHot)
svcClassifier_optimized_f1 = f1_score(y_test_original_oneHot, svcClassifier_optimized_predz, average='binary')

svc_acc["Original + OneHot"] = svcClassifier_optimized_acc
svc_f1["Original + OneHot"] = svcClassifier_optimized_f1

## Simplified

In [26]:
svcClassifier_optimized.fit(X_train_simplified, y_train_simplified)
svcClassifier_optimized_acc = svcClassifier_optimized.score(X_test_simplified, y_test_simplified)
svcClassifier_optimized_predz = svcClassifier_optimized.predict(X_test_simplified)
svcClassifier_optimized_f1 = f1_score(y_test_simplified, svcClassifier_optimized_predz, average='binary')

svc_acc["Simplified"] = svcClassifier_optimized_acc
svc_f1["Simplified"] = svcClassifier_optimized_f1

## Original

In [27]:
svcClassifier_optimized.fit(X_train_original, y_train_original)
svcClassifier_optimized_acc = svcClassifier_optimized.score(X_test_original, y_test_original)
svcClassifier_optimized_predz = svcClassifier_optimized.predict(X_test_original)
svcClassifier_optimized_f1 = f1_score(y_test_original, svcClassifier_optimized_predz, average='binary')

svc_acc["Original"] = svcClassifier_optimized_acc
svc_f1["Original"] = svcClassifier_optimized_f1

## Aggrégation

In [28]:
dict_accuracy["SVC"] = svc_acc
dict_f1["SVC"] = svc_f1

# KNN

## Recherche en grille

In [30]:
n_range = 30
mean_acc = np.zeros(n_range)
for i in range(1,n_range + 1):
    #Train Model and Predict
    knn = KNeighborsClassifier(n_neighbors = i).fit(X_train_simplified_oneHot, y_train_simplified_oneHot)
    yhat= knn.predict(X_test_simplified_oneHot)
    mean_acc[i-1] = metrics.accuracy_score(y_test_simplified_oneHot, yhat)
k_range_cnt = 10
k_range = np.argpartition(mean_acc, -k_range_cnt)[-k_range_cnt:]

# by default knn
knnClassifier = KNeighborsClassifier()

param_grid = {
               'n_neighbors' : k_range,
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan'],
               'algorithm' : ['ball_tree', 'kd_tree', 'brute'],
               'p' : [1,2],
}

# init grid search
grid_search = GridSearchCV(estimator=knnClassifier, param_grid=param_grid, cv=3, scoring='f1', verbose=3)

grid_search.fit(X_train_simplified_oneHot, y_train_simplified_oneHot)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print("Meilleurs hyperparamèetres:")
best_parameters_df = {key: [value] for key, value in best_parameters.items()}
df = pd.DataFrame(best_parameters_df)
print(df)
print()
print("Meilleur score:", best_score)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
[CV 1/3] END algorithm=ball_tree, metric=minkowski, n_neighbors=21, p=1, weights=uniform;, score=0.885 total time=   0.4s
[CV 2/3] END algorithm=ball_tree, metric=minkowski, n_neighbors=21, p=1, weights=uniform;, score=0.898 total time=   0.4s
[CV 3/3] END algorithm=ball_tree, metric=minkowski, n_neighbors=21, p=1, weights=uniform;, score=0.889 total time=   0.4s
[CV 1/3] END algorithm=ball_tree, metric=minkowski, n_neighbors=21, p=1, weights=distance;, score=0.888 total time=   0.3s
[CV 2/3] END algorithm=ball_tree, metric=minkowski, n_neighbors=21, p=1, weights=distance;, score=0.902 total time=   0.3s
[CV 3/3] END algorithm=ball_tree, metric=minkowski, n_neighbors=21, p=1, weights=distance;, score=0.891 total time=   0.3s
[CV 1/3] END algorithm=ball_tree, metric=minkowski, n_neighbors=21, p=2, weights=uniform;, score=0.893 total time=   0.3s
[CV 2/3] END algorithm=ball_tree, metric=minkowski, n_neighbors=21, p=2, weight

In [31]:
knn_acc = {}
knn_f1 = {}

Meilleur modèle KNN:

In [33]:
knnClassifier_optimized = KNeighborsClassifier(**best_parameters)

## Simplified + One hot

In [34]:
knnClassifier_optimized.fit(X_train_simplified_oneHot, y_train_simplified_oneHot)
knnClassifier_optimized_acc = knnClassifier_optimized.score(X_test_simplified_oneHot, y_test_simplified_oneHot)
knnClassifier_optimized_predz = knnClassifier_optimized.predict(X_test_simplified_oneHot)
knnClassifier_optimized_f1 = f1_score(y_test_simplified_oneHot, knnClassifier_optimized_predz, average='binary')

knn_acc["Simplified + OneHot"] = knnClassifier_optimized_acc
knn_f1["Simplified + OneHot"] = knnClassifier_optimized_f1

## Original + One hot

In [35]:
knnClassifier_optimized.fit(X_train_original_oneHot, y_train_original_oneHot)
knnClassifier_optimized_acc = knnClassifier_optimized.score(X_test_original_oneHot, y_test_original_oneHot)
knnClassifier_optimized_predz = knnClassifier_optimized.predict(X_test_original_oneHot)
knnClassifier_optimized_f1 = f1_score(y_test_original_oneHot, knnClassifier_optimized_predz, average='binary')

knn_acc["Original + OneHot"] = knnClassifier_optimized_acc
knn_f1["Original + OneHot"] = knnClassifier_optimized_f1

## Simplified

In [36]:
knnClassifier_optimized.fit(X_train_simplified, y_train_simplified)
knnClassifier_optimized_acc = knnClassifier_optimized.score(X_test_simplified, y_test_simplified)
knnClassifier_optimized_predz = knnClassifier_optimized.predict(X_test_simplified)
knnClassifier_optimized_f1 = f1_score(y_test_simplified, knnClassifier_optimized_predz, average='binary')

knn_acc["Simplified"] = knnClassifier_optimized_acc
knn_f1["Simplified"] = knnClassifier_optimized_f1

## Original

In [37]:
knnClassifier_optimized.fit(X_train_original, y_train_original)
knnClassifier_optimized_acc = knnClassifier_optimized.score(X_test_original, y_test_original)
knnClassifier_optimized_predz = knnClassifier_optimized.predict(X_test_original)
knnClassifier_optimized_f1 = f1_score(y_test_original, knnClassifier_optimized_predz, average='binary')

knn_acc["Original"] = knnClassifier_optimized_acc
knn_f1["Original"] = knnClassifier_optimized_f1

## Aggrégation

In [38]:
dict_accuracy["KNN"] = knn_acc
dict_f1["KNN"] = knn_f1

# CatBoost

## Recherche en grille

In [39]:
# Create a CatBoostClassifier
catboost_model = CatBoostClassifier(cat_features=cat_features_catb, verbose=0, random_state=42)

# Define the parameter grid to search
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
}

# init grid search
grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, scoring='f1', cv=3, verbose=3)

grid_search.fit(X_train_simplified, y_train_simplified)

best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print("Meilleurs hyperparamèetres:")
best_parameters_df = {key: [value] for key, value in best_parameters.items()}
df = pd.DataFrame(best_parameters_df)
print(df)
print()
print("Meilleur score:", best_score)


Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV 1/3] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.01;, score=0.919 total time=   0.9s
[CV 2/3] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.01;, score=0.917 total time=   0.7s
[CV 3/3] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.01;, score=0.897 total time=   0.7s
[CV 1/3] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.05;, score=0.924 total time=   0.8s
[CV 2/3] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.05;, score=0.930 total time=   0.8s
[CV 3/3] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.05;, score=0.914 total time=   0.8s
[CV 1/3] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.1;, score=0.928 total time=   0.8s
[CV 2/3] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.1;, score=0.927 total time=   0.6s
[CV 3/3] END depth=4, iterations=100, l2_leaf_reg=1, learning_rate=0.1;, score=0.922 total time=   0

In [40]:
catb_acc = {}
catb_f1 = {}

Meilleur modèle CatB:

In [41]:
catbClassifier_optimized = grid_search.best_estimator_

## Simplified + One hot

In [42]:
catb_acc["Simplified + OneHot"] =  None
catb_f1["Simplified + OneHot"] = None

## Original + One hot

In [43]:
catb_acc["Original + OneHot"] = None
catb_f1["Original + OneHot"] = None

## Simplified

In [44]:
catbClassifier_optimized.fit(X_train_simplified, y_train_simplified)
catbClassifier_optimized_acc = catbClassifier_optimized.score(X_test_simplified, y_test_simplified)
catbClassifier_optimized_predz = catbClassifier_optimized.predict(X_test_simplified)
catbClassifier_optimized_f1 = f1_score(y_test_simplified, catbClassifier_optimized_predz, average='binary')

catb_acc["Simplified"] = catbClassifier_optimized_acc
catb_f1["Simplified"] = catbClassifier_optimized_f1

## Original

In [45]:
catb_acc["Original"] = None
catb_f1["Original"] = None

## Aggrégation

In [46]:
dict_accuracy["CatBoost"] = catb_acc
dict_f1["CatBoost"] = catb_f1

# Résultats

In [47]:
df_acc = pd.DataFrame(dict_accuracy)
df_acc

Unnamed: 0,XGboost,SVC,KNN,CatBoost
Simplified + OneHot,0.906336,0.916437,0.878788,
Original + OneHot,0.899908,0.912764,0.87787,
Simplified,0.910927,0.915519,0.864096,0.905418
Original,0.913682,0.918274,0.84573,


In [48]:
df_f1 = pd.DataFrame(dict_f1)
df_f1

Unnamed: 0,XGboost,SVC,KNN,CatBoost
Simplified + OneHot,0.927143,0.934954,0.908966,
Original + OneHot,0.92242,0.932288,0.908591,
Simplified,0.930764,0.934566,0.899183,0.926059
Original,0.932665,0.936745,0.886792,
