In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.pipeline import Pipeline, make_pipeline

# Convert the iris dataset to a pandas dataframe
# https://ocw.mit.edu/courses/15-097-prediction-machine-learning-and-statistics-spring-2012/resources/iris/
iris_df = pd.read_csv('./iris.csv', header = None, names = ['1','2','3', '4', 'Species'])

# How to make a dataframe into a numpy array, which sklearn needs
X = iris_df.loc[:, iris_df.columns != 'Species'].to_numpy()
y = iris_df['Species'].to_numpy()

# Split the data into training and test sets, could run twice for a validation set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/4, random_state=2030, stratify = y)

# Define the models
models = {
    'Random Forest':     RandomForestClassifier(),
    'SVM':               SVC(),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Define the scalers
scalers = {
    'StandardScaler': StandardScaler(), # Scales to have mean 0 and stdev 1
    'MinMaxScaler':   MinMaxScaler() # Scales into fixed range of (0,1)
}

# Define hyperparameter grids for each model
param_grids = {
    'Random Forest':     {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 10, 20]},
    'SVM':               {'model__C': [0.1, 1, 10], 'model__kernel': ['linear', 'rbf']},
    'Gradient Boosting': {'model__n_estimators': [50, 100, 200], 'model__learning_rate': [0.01, 0.1, 0.2]}
}

best_score = 0
best_model = None
for model_name, model in models.items():
    for scaler_name, scaler in scalers.items():
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])
        grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grids[model_name], cv=10, scoring=make_scorer(accuracy_score))
        grid_search.fit(X_train, y_train)
        
        # Evaluate the model on the train set
        y_pred = grid_search.predict(X_test)
        test_score = accuracy_score(y_test, y_pred)

        # Print results of grid parameter search
        print(f"Best parameters for {model_name} with {scaler_name}: {grid_search.best_params_}")
        print(f"Train set score for {model_name} with {scaler_name}: {grid_search.best_score_:.4f}")
        print(f"Test set score for {model_name} with {scaler_name}: {test_score:.4f}\n")
        
        # Update best model if necessary
        if test_score > best_score:
            best_score = test_score
            best_model = grid_search.best_estimator_

print(f"Best performing model: {best_model.named_steps['model'].__class__.__name__}")
print(f"Best Model's Score on the test set: {best_score:.4f}")

Best parameters for Random Forest with StandardScaler: {'model__max_depth': None, 'model__n_estimators': 50}
Train set score for Random Forest with StandardScaler: 0.9561
Test set score for Random Forest with StandardScaler: 0.9474

Best parameters for Random Forest with MinMaxScaler: {'model__max_depth': None, 'model__n_estimators': 50}
Train set score for Random Forest with MinMaxScaler: 0.9561
Test set score for Random Forest with MinMaxScaler: 0.9474

Best parameters for SVM with StandardScaler: {'model__C': 1, 'model__kernel': 'linear'}
Train set score for SVM with StandardScaler: 0.9826
Test set score for SVM with StandardScaler: 0.9474

Best parameters for SVM with MinMaxScaler: {'model__C': 1, 'model__kernel': 'linear'}
Train set score for SVM with MinMaxScaler: 0.9735
Test set score for SVM with MinMaxScaler: 0.9737

Best parameters for Gradient Boosting with StandardScaler: {'model__learning_rate': 0.01, 'model__n_estimators': 200}
Train set score for Gradient Boosting with S

In [17]:
# The best model
best_model

In [16]:
# Examine classificaiton
df = pd.DataFrame(
    {
        'Predictions': best_model.predict(X_test),
        'Actual': y_test
    }
)
print(df)

        Predictions           Actual
0   Iris-versicolor  Iris-versicolor
1    Iris-virginica   Iris-virginica
2    Iris-virginica   Iris-virginica
3       Iris-setosa      Iris-setosa
4    Iris-virginica   Iris-virginica
5   Iris-versicolor  Iris-versicolor
6       Iris-setosa      Iris-setosa
7   Iris-versicolor  Iris-versicolor
8    Iris-virginica  Iris-versicolor
9       Iris-setosa      Iris-setosa
10   Iris-virginica   Iris-virginica
11   Iris-virginica   Iris-virginica
12      Iris-setosa      Iris-setosa
13  Iris-versicolor  Iris-versicolor
14      Iris-setosa      Iris-setosa
15      Iris-setosa      Iris-setosa
16   Iris-virginica   Iris-virginica
17  Iris-versicolor  Iris-versicolor
18      Iris-setosa      Iris-setosa
19   Iris-virginica   Iris-virginica
20  Iris-versicolor  Iris-versicolor
21      Iris-setosa      Iris-setosa
22   Iris-virginica   Iris-virginica
23   Iris-virginica   Iris-virginica
24      Iris-setosa      Iris-setosa
25   Iris-virginica   Iris-virginica
2

In [13]:
# Wrong classification
df[df['Predictions'] != df['Actual']]

Unnamed: 0,Predictions,Actual
8,Iris-virginica,Iris-versicolor


In [18]:
iris_df.shape

(150, 5)