In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.metrics import accuracy_score

import pandas as pd

In [25]:
# Load the wine dataset
wine = load_wine(as_frame=True)
X = wine.data
y = wine.target

In [26]:
X

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [21]:
y.unique()

array([0, 1, 2])

In [22]:
# Pipeline??

In [27]:
# Create a pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=13)),  # feature selection
    ('svc', SVC())
])

In [28]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# Define a parameter grid for the SVC and SelectKBest
param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['rbf', 'linear'],
    'feature_selection__k': [5, 9, 10, 11, 12, 13],
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()]
}

In [36]:
# Apply k-fold cross-validation using GridSearchCV on the pipeline
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [37]:
grid_search = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=4)

In [38]:
# Fit the GridSearchCV object on the training data
grid_search.fit(X_train, y_train)

In [42]:

# Convert cv_results_ into a DataFrame
cv_results_df = pd.DataFrame(grid_search.cv_results_)

# Select the columns related to the parameters and the mean test score (accuracy)
results_df = cv_results_df.loc[:, ['params', 'mean_test_score']]

# Convert the 'params' column into separate columns
params_df = pd.json_normalize(results_df['params'])

# Concatenate the parameters DataFrame and the accuracy column
results_df = pd.concat([params_df, results_df['mean_test_score']], axis=1)

# Rename the 'mean_test_score' column to 'Accuracy'
results_df.rename(columns={'mean_test_score': 'Accuracy'}, inplace=True)

# Print the DataFrame
results_df.sort_values(by="Accuracy", ascending=False).head(20)

Unnamed: 0,feature_selection__k,scaler,svc__C,svc__gamma,svc__kernel,Accuracy
551,13,RobustScaler(),0.1,0.001,linear,0.986207
547,13,RobustScaler(),0.1,0.1,linear,0.986207
502,13,StandardScaler(),10.0,0.001,rbf,0.986207
545,13,RobustScaler(),0.1,1.0,linear,0.986207
549,13,RobustScaler(),0.1,0.01,linear,0.986207
410,12,StandardScaler(),100.0,0.1,rbf,0.985961
242,10,MinMaxScaler(),10.0,0.1,rbf,0.985961
406,12,StandardScaler(),10.0,0.001,rbf,0.985961
232,10,MinMaxScaler(),1.0,1.0,rbf,0.985961
396,12,StandardScaler(),1.0,0.01,rbf,0.985961


In [43]:
# Print the best parameters found by GridSearchCV
print("Best parameters: ", grid_search.best_params_)

Best parameters:  {'feature_selection__k': 13, 'scaler': StandardScaler(), 'svc__C': 10, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
