In [27]:
from load_dataset import load_data
import utils
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd

## Instancier le jeu de données

In [28]:
X, y, cat_features = load_data().get_data_X_y(data='simplify', OneHot=True)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Classification par SVC (Avec One-Hot Encoding):

In [30]:
svcClassifier = SVC(kernel="rbf", C=1.0, degree=3, gamma='scale', random_state=42)
svcClassifier.fit(X_train, y_train)

print(f"Train Accuracy : {svcClassifier.score(X_train, y_train) * 100:.3f} %")

Train Accuracy : 89.414 %


In [31]:
print(f"Test Accuracy : {svcClassifier.score(X_test, y_test) * 100:.3f} %")

Test Accuracy : 90.174 %


In [32]:
preds = svcClassifier.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.93      0.80      0.86       414
           1       0.89      0.96      0.92       675

    accuracy                           0.90      1089
   macro avg       0.91      0.88      0.89      1089
weighted avg       0.90      0.90      0.90      1089



## Optimisation d'hyperparamètres

In [33]:
""" parameters_to_tune = {
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.0001],
    'degree': [2, 3, 4]
}

grid_search = GridSearchCV(SVC(), parameters_to_tune, refit=True, verbose=3, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_) """

" parameters_to_tune = {\n    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],\n    'C': [0.1, 1, 10, 100],\n    'gamma': ['scale', 'auto', 0.001, 0.0001],\n    'degree': [2, 3, 4]\n}\n\ngrid_search = GridSearchCV(SVC(), parameters_to_tune, refit=True, verbose=3, cv=5, n_jobs=-1)\ngrid_search.fit(X_train, y_train)\n\nprint(grid_search.best_params_) "

In [34]:
parameters_to_tune = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.001],
}

grid_search = GridSearchCV(SVC(), parameters_to_tune, refit=True, verbose=10, cv=5, n_jobs=-1, scoring='f1')
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 5/5; 1/12] START C=0.1, gamma=scale, kernel=linear..........................
[CV 4/5; 1/12] START C=0.1, gamma=scale, kernel=linear..........................
[CV 1/5; 1/12] START C=0.1, gamma=scale, kernel=linear..........................
[CV 2/5; 1/12] START C=0.1, gamma=scale, kernel=linear..........................
[CV 3/5; 1/12] START C=0.1, gamma=scale, kernel=linear..........................
[CV 3/5; 2/12] START C=0.1, gamma=scale, kernel=rbf.............................
[CV 1/5; 2/12] START C=0.1, gamma=scale, kernel=rbf.............................
[CV 2/5; 2/12] START C=0.1, gamma=scale, kernel=rbf.............................
[CV 4/5; 1/12] END C=0.1, gamma=scale, kernel=linear;, score=0.922 total time=   0.4s
[CV 5/5; 1/12] END C=0.1, gamma=scale, kernel=linear;, score=0.929 total time=   0.4s
[CV 1/5; 1/12] END C=0.1, gamma=scale, kernel=linear;, score=0.939 total time=   0.4s
[CV 4/5; 2/12] START C=0.1, gamma

In [35]:
svcClassifier_optimized = SVC(kernel="linear", C=0.1, degree=3, gamma='scale', random_state=42)
svcClassifier_optimized.fit(X_train, y_train)

print(f"Train Accuracy : {svcClassifier_optimized.score(X_train, y_train) * 100:.3f} %")

Train Accuracy : 91.972 %


In [36]:
print(f"Test Accuracy : {svcClassifier_optimized.score(X_test, y_test) * 100:.3f} %")

Test Accuracy : 91.644 %


In [37]:
preds = svcClassifier.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.93      0.80      0.86       414
           1       0.89      0.96      0.92       675

    accuracy                           0.90      1089
   macro avg       0.91      0.88      0.89      1089
weighted avg       0.90      0.90      0.90      1089



### Le *svcClassifier_optimized* possède un noyeau linéaire donc il est possible de faire cette manipulation pour obtenir les features importants pour la classification:

In [38]:
coefs = svcClassifier_optimized.coef_
print(coefs)

[[-1.02325535e-02 -1.49964432e-02 -9.48399855e-02 -1.89379727e-01
   3.84769921e-02  3.70320358e-01 -6.24696889e-02 -2.79251958e-02
  -2.69944835e-02 -4.41699303e-01 -3.62974082e-02  4.98724778e-01
   5.56880992e-02  2.09841792e-01 -4.21152370e-02 -1.04367197e-02
  -1.26835197e-02 -1.54601939e-01 -1.39512431e-01  0.00000000e+00
   9.10796858e-02  2.03034685e-01  0.00000000e+00  7.23087592e-02
  -9.27431892e-02 -1.87595279e-01  1.36460318e-01  5.54716096e-02
   5.64462378e-02 -2.69504994e-01  2.70744172e-02 -2.29824209e-01
   4.24483205e-01  7.42312521e-03  0.00000000e+00  1.18740699e-01
   2.94181526e-01  6.63572087e-02  4.18547405e-01 -6.71831932e-01
  -1.82879632e-01  6.59950222e-03  3.84299008e-01  2.04310882e-01
   5.03427460e-02 -1.66237842e-01  1.52329804e-01 -7.49915692e-02
  -5.99767805e-01  7.02737482e-02 -7.02737482e-02  0.00000000e+00
  -1.00000000e-01 -1.88413919e-01 -6.64211899e-02 -7.61166874e-02
   1.38985384e-01  1.76518711e-01  1.15447701e-01 -6.18217581e-02
   2.67304

### L'utilisation d'un noyeau linéaire permet au SVM de rester dans l'espace dimensionnel de base et ne projète pas les features dans un espace à plus haute dimensionnalité, comme le ferait un noyeau Gaussien ou RBF, par exemple. C'est pourquoi nous pouvons directement récupérer l'importance des features.

In [39]:
pd.Series(coefs[0], index=X_train.columns).sort_values(ascending=False)

Frais de scolarité à jour_1                      0.740113
Unités curriculaires 2e semestre (approuvées)    0.498725
Mode d'application_10                            0.424483
Cours_5                                          0.418547
Cours_9                                          0.384299
                                                   ...   
Occupation mère_9                               -0.275714
Unités curriculaires 2e semestre (inscrits)     -0.441699
Cours_15                                        -0.599768
Cours_6                                         -0.671832
Frais de scolarité à jour_0                     -0.740113
Length: 135, dtype: float64