In [8]:

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score

import pandas as pd

# Fetch the wine dataset and unpack its feature matrix and class labels
wine = load_wine()
X, y = wine.data, wine.target


# Create a pipeline: scaling -> univariate feature selection -> random forest.
# The 'feature_selection' step has to be part of the pipeline because the
# parameter grid tunes 'feature_selection__k'; with the step commented out,
# GridSearchCV.fit fails with an invalid-parameter error on a fresh kernel.
pipe = Pipeline([
    # Initial scaler; the grid search replaces this step via its 'scaler' entry
    ('scaler', StandardScaler()),
    # Initial k; the grid search overrides it via 'feature_selection__k'
    ('feature_selection', SelectKBest(score_func=f_classif, k=13)),
    # Seeded like the train/test split and KFold so reruns give the same scores
    ('rf', RandomForestClassifier(random_state=42)),
])


# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameter search space for the pipeline, grouped by pipeline step
param_grid = {
    # Candidate scaler objects for the 'scaler' step
    'scaler': [StandardScaler(), RobustScaler()],
    # Candidate values of k for the 'feature_selection' (SelectKBest) step
    'feature_selection__k': [11, 12, 13],
    # Candidate settings for the 'rf' (RandomForestClassifier) step
    'rf__n_estimators': [10, 50, 100],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [2, 4, 6],
}

# Evaluate every grid candidate with shuffled, seeded 5-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)
#%%

#%%

# Load the raw cross-validation results into a DataFrame
cv_results_df = pd.DataFrame(grid_search.cv_results_)

# Expand each candidate's parameter dict into one column per hyperparameter
params_df = pd.json_normalize(cv_results_df['params'])

# Put the mean test score next to the parameter columns, labelled 'Accuracy'
results_df = pd.concat(
    [params_df, cv_results_df['mean_test_score'].rename('Accuracy')],
    axis=1,
)

# Display the candidates ordered from highest to lowest accuracy
results_df.sort_values(by="Accuracy", ascending=False)

Unnamed: 0,feature_selection__k,rf__max_depth,rf__min_samples_leaf,rf__min_samples_split,rf__n_estimators,scaler,Accuracy
392,13,10.0,2,10,50,StandardScaler(),0.986207
314,12,20.0,6,5,50,StandardScaler(),0.986207
411,13,10.0,4,10,50,RobustScaler(),0.986207
194,12,,4,10,50,StandardScaler(),0.986207
279,12,20.0,2,5,50,RobustScaler(),0.986207
...,...,...,...,...,...,...,...
265,12,10.0,6,10,10,RobustScaler(),0.922414
37,11,,6,2,10,RobustScaler(),0.922414
48,11,,6,10,10,StandardScaler(),0.908374
367,13,,6,5,10,RobustScaler(),0.908128


In [9]:
# Report the parameter combination that GridSearchCV selected as best
best_params = grid_search.best_params_
print("Best parameters: ", best_params)

Best parameters:  {'feature_selection__k': 11, 'rf__max_depth': None, 'rf__min_samples_leaf': 6, 'rf__min_samples_split': 2, 'rf__n_estimators': 50, 'scaler': RobustScaler()}
