In [14]:
from load_dataset import load_data
import utils
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd

## Instancier le jeu de données

In [2]:
X, y, cat_features = load_data().get_data_X_y(data='simplify', OneHot=False)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Classification par SVC (Avec One-Hot Encoding):

In [4]:
svcClassifier = SVC(kernel="rbf", C=1.0, degree=3, gamma='scale', random_state=42)
svcClassifier.fit(X_train, y_train)

print(f"Train Accuracy : {svcClassifier.score(X_train, y_train) * 100:.3f} %")

Train Accuracy : 89.847 %


In [5]:
print(f"Test Accuracy : {svcClassifier.score(X_test, y_test) * 100:.3f} %")

Test Accuracy : 90.083 %


In [6]:
preds = svcClassifier.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.93      0.80      0.86       414
           1       0.89      0.96      0.92       675

    accuracy                           0.90      1089
   macro avg       0.91      0.88      0.89      1089
weighted avg       0.90      0.90      0.90      1089



## Optimisation d'hyperparamètres

In [7]:
""" parameters_to_tune = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.0001],
    'degree': [2, 3, 4]
}

grid_search = GridSearchCV(SVC(), parameters_to_tune, refit=True, verbose=3, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_) """

" parameters_to_tune = {\n    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],\n    'C': [0.1, 1, 10, 100],\n    'gamma': ['scale', 'auto', 0.001, 0.0001],\n    'degree': [2, 3, 4]\n}\n\ngrid_search = GridSearchCV(SVC(), parameters_to_tune, refit=True, verbose=3, cv=5, n_jobs=-1)\ngrid_search.fit(X_train, y_train)\n\nprint(grid_search.best_params_) "

In [8]:
parameters_to_tune = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.001],
}

grid_search = GridSearchCV(SVC(), parameters_to_tune, refit=True, verbose=10, cv=5, n_jobs=-1, scoring='f1')
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


In [9]:
svcClassifier_optimized = SVC(kernel="linear", C=0.1, degree=3, gamma='scale', random_state=42)
svcClassifier_optimized.fit(X_train, y_train)

print(f"Train Accuracy : {svcClassifier_optimized.score(X_train, y_train) * 100:.3f} %")

Train Accuracy : 91.106 %


In [10]:
print(f"Test Accuracy : {svcClassifier_optimized.score(X_test, y_test) * 100:.3f} %")

Test Accuracy : 91.552 %


In [11]:
preds = svcClassifier.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.93      0.80      0.86       414
           1       0.89      0.96      0.92       675

    accuracy                           0.90      1089
   macro avg       0.91      0.88      0.89      1089
weighted avg       0.90      0.90      0.90      1089



### Le *svcClassifier_optimized* possède un noyeau linéaire donc il est possible de faire cette manipulation pour obtenir les features importants pour la classification:

In [12]:
coefs = svcClassifier_optimized.coef_
print(coefs)

[[ 0.10421376 -0.02093673 -0.02381913 -0.04101897 -0.2268206   0.07405449
  -0.0468593   0.01153824 -0.0029237   0.02335793 -0.05918744  0.11526675
  -0.39158429  1.39763712 -0.26823118  0.41267636 -0.01285592  0.05825528
  -0.12994642 -0.16229555  0.02659416  0.36372397 -0.04391105 -0.05855946
  -0.04529809 -0.42742584 -0.01733669  0.50496143  0.02814424  0.13858297
  -0.02841713 -0.03227027 -0.00345831]]


### L'utilisation d'un noyeau linéaire permet au SVM de rester dans l'espace dimensionnel de base et ne projète pas les features dans un espace à plus haute dimensionnalité, comme le ferait un noyeau Gaussien ou RBF, par exemple. C'est pourquoi nous pouvons directement récupérer l'importance des features.

In [16]:
feature_importances = pd.Series(coefs[0], index=X_train.columns).sort_values(ascending=False)

# feeding it to the compiler
def log_feature_importances(feature_importances):
    s = feature_importances.reset_index()
    line = []
    for index, row in s.iterrows():
        ind, imp = row
        line.append([ind, imp])
    utils.add_model_feature_importances(svcClassifier_optimized, line)
    
#log_feature_importances(feature_importances) # uncomment to load the importances into the compiled csv


File features_importances.csv has been updated and saved.
