# Préparation et tests du modèle SVM (SVR)

Objectif :
- Préparer les données pour l'entraînement
- Appliquer encodage et normalisation
- Entraîner un modèle SVR pour prédire `depenses`
- Évaluer les performances avec MSE et R²
- Tester l'optimisation des hyperparamètres


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score


In [4]:
# Charger le dataset nettoyé
df = pd.read_csv("../data/dataset_financier.csv")

# Vérification rapide
df.head()


Unnamed: 0,bilan_financier,actifs,revenu,depenses,taux_interet,flux_tresorerie,capital,agence,banque,lieu
0,57450.712295,29795.488056,2459.85738,546.096221,4.85018,9549.649146,36581.956747,Agence_Centre,Société Générale,Bafoussam
1,47926.035482,26472.43578,2884.385063,1069.807495,0.85403,6207.981496,38725.67092,Agence_Centre,UBA,Douala
2,59715.328072,20417.412589,2366.064063,1293.197233,9.724614,11458.554375,77179.593209,Agence_Sud,UBA,Bafoussam
3,72845.447846,15471.442556,2753.630776,2443.843829,5.180104,4151.884734,36901.898344,Agence_Centre,UBA,Douala
4,46487.699379,24887.563195,1485.108266,1778.276562,6.141862,3848.927628,46121.492335,Agence_Centre,Ecobank,Bafoussam


In [5]:
# Séparer features et target
X = df.drop("depenses", axis=1)
y = df["depenses"]

# Encodage des variables catégorielles
X = pd.get_dummies(X, columns=["agence", "banque", "lieu"])

# Normalisation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


In [6]:
# Modèle SVR avec paramètres de base
model = SVR(kernel="rbf", C=100, gamma="scale", epsilon=0.1)
model.fit(X_train, y_train)

# Prédictions
y_pred = model.predict(X_test)

# Évaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE :", mse)
print("R² :", r2)


MSE : 271838.16114807175
R² : -0.04922824752642452


In [7]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "C": [1, 10, 100],
    "gamma": ["scale", "auto", 0.01, 0.1],
    "epsilon": [0.01, 0.1, 1]
}

grid = GridSearchCV(SVR(kernel="rbf"), param_grid, cv=3, scoring="r2", verbose=2)
grid.fit(X_train, y_train)

print("Meilleurs paramètres :", grid.best_params_)
print("Meilleur score :", grid.best_score_)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END .....................C=1, epsilon=0.01, gamma=scale; total time=   0.0s
[CV] END .....................C=1, epsilon=0.01, gamma=scale; total time=   0.0s
[CV] END .....................C=1, epsilon=0.01, gamma=scale; total time=   0.0s
[CV] END ......................C=1, epsilon=0.01, gamma=auto; total time=   0.0s
[CV] END ......................C=1, epsilon=0.01, gamma=auto; total time=   0.0s
[CV] END ......................C=1, epsilon=0.01, gamma=auto; total time=   0.0s
[CV] END ......................C=1, epsilon=0.01, gamma=0.01; total time=   0.1s
[CV] END ......................C=1, epsilon=0.01, gamma=0.01; total time=   0.0s
[CV] END ......................C=1, epsilon=0.01, gamma=0.01; total time=   0.0s
[CV] END .......................C=1, epsilon=0.01, gamma=0.1; total time=   0.0s
[CV] END .......................C=1, epsilon=0.01, gamma=0.1; total time=   0.1s
[CV] END .......................C=1, epsilon=0.

In [8]:
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test)

print("MSE (optimisé):", mean_squared_error(y_test, y_pred_best))
print("R² (optimisé):", r2_score(y_test, y_pred_best))


MSE (optimisé): 262806.2018527628
R² (optimisé): -0.014367112566108675


In [9]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("MSE RF:", mean_squared_error(y_test, y_pred_rf))
print("R² RF:", r2_score(y_test, y_pred_rf))


MSE RF: 272528.94889999373
R² RF: -0.05189451785175625


In [10]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Modèle SVR
svr = SVR(kernel="rbf", C=100, gamma="scale", epsilon=0.1)
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)

# Modèle Random Forest
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Modèle Gradient Boosting
gb = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

# Évaluation
results = pd.DataFrame({
    "Modèle": ["SVR", "Random Forest", "Gradient Boosting"],
    "MSE": [
        mean_squared_error(y_test, y_pred_svr),
        mean_squared_error(y_test, y_pred_rf),
        mean_squared_error(y_test, y_pred_gb)
    ],
    "R²": [
        r2_score(y_test, y_pred_svr),
        r2_score(y_test, y_pred_rf),
        r2_score(y_test, y_pred_gb)
    ]
})

print(results)


              Modèle            MSE        R²
0                SVR  271838.161148 -0.049228
1      Random Forest  272528.948900 -0.051895
2  Gradient Boosting  291183.250800 -0.123896
