In [249]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import GridSearchCV, KFold
    
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [250]:
#chargees le dataset
dataset = pd.read_csv('penguins_size.csv')

#verifiez le taille et structure 
print(dataset.shape)
dataset.head()

(344, 7)


Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [251]:
dataset.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [252]:
#drop le colonne sex et drop les row avec valeurs null
dataset = dataset.drop('sex', axis=1)
dataset = dataset.dropna(axis=0)
print(dataset.shape)
dataset.head()


(342, 6)


Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0


In [253]:
dataset.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
dtype: int64

In [254]:
X = dataset.drop('body_mass_g', axis=1)
y = dataset['body_mass_g']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [255]:
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': ['scale','auto'], 
    'kernel': ['rbf', 'linear']
    }

In [256]:
grid_search = GridSearchCV(SVR(), param_grid, cv=5, verbose=1)

In [257]:
numerical_cols = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm']

# Define categorical columns (example list based on your CSV structure, adjust as necessary)
categorical_cols = ['species', 'island']


In [258]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Full preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_cols),
        ('cat', cat_pipeline, categorical_cols),
    ])

In [259]:
full_pipeline = make_pipeline(preprocessor, grid_search)

In [260]:
full_pipeline.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [261]:
y = y.reset_index(drop=True)

# Now proceed with the K-Fold cross-validation as before
kf = KFold(n_splits=10, shuffle=True, random_state=42)

mse_scores = []
rmse_scores = []
r2_scores = []

for train_index, test_index in kf.split(X):
    X_train_kf, X_test_kf = X.iloc[train_index], X.iloc[test_index]
    y_train_kf, y_test_kf = y.iloc[train_index], y.iloc[test_index]  # Use iloc here as well
    
    # Fit the pipeline to the training data
    full_pipeline.fit(X_train_kf, y_train_kf)
    
    # Predict on the testing set
    y_pred_kf = full_pipeline.predict(X_test_kf)
    
    # Calculate metrics
    mse = mean_squared_error(y_test_kf, y_pred_kf)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_kf, y_pred_kf)
    
    # Append to lists
    mse_scores.append(mse)
    rmse_scores.append(rmse)
    r2_scores.append(r2)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [262]:

print(f"Average R2 score across folds: {np.mean(r2_scores)}")
print(f"Average Root Mean Squared Error across folds: {np.mean(rmse_scores)}")
print(f"Average Mean Squared Error across folds: {np.mean(mse_scores)}")

Average R2 score across folds: 0.8308858871306084
Average Root Mean Squared Error across folds: 320.72075727377216
Average Mean Squared Error across folds: 105634.78382260985
