CS 6375 Project 2: Decision Trees and Ensembles - Benjamin Walmer

Imports

In [1]:
import os
import zipfile
import pandas as pd
import requests
import pandas as pd
import warnings
from itertools import product
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

Reading in the 45 datasets:

In [2]:
url = "https://github.com/benwalmer/Machine-Learning-Project-2/raw/refs/heads/main/project2_data.zip"

# Downloading the zip file
zip_path = "project2_data.zip"
if not os.path.exists(zip_path):
    print("Downloading zip file from GitHub...")
    r = requests.get(url)
    with open(zip_path, "wb") as f:
        f.write(r.content)
    print("Download complete.")
else:
    print("Zip file already exists, skipping download.")

# Unzipping the file
extract_path = "project2_data"
if not os.path.exists(extract_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print("Unzipping complete.")
else:
    print("Data already unzipped, skipping extraction.")

# Read all CSVs inside 'all_data' folder
data_dir = os.path.join(extract_path, "all_data")
all_files = [f for f in os.listdir(data_dir) if f.endswith(".csv")]

# Load each file into its own DataFrame
for filename in all_files:
    df_name = filename.replace(".csv", "")
    file_path = os.path.join(data_dir, filename)
    df = pd.read_csv(file_path, header=None)
    globals()[df_name] = df

    print(f"Created DataFrame: {df_name} (shape={df.shape})")

Downloading zip file from GitHub...
Download complete.
Unzipping complete.
Created DataFrame: valid_c1500_d100 (shape=(200, 501))
Created DataFrame: valid_c1800_d5000 (shape=(10000, 501))
Created DataFrame: train_c300_d5000 (shape=(10000, 501))
Created DataFrame: train_c1000_d100 (shape=(200, 501))
Created DataFrame: valid_c300_d1000 (shape=(2000, 501))
Created DataFrame: test_c1800_d1000 (shape=(2000, 501))
Created DataFrame: valid_c1500_d1000 (shape=(2000, 501))
Created DataFrame: train_c500_d5000 (shape=(10000, 501))
Created DataFrame: train_c1800_d100 (shape=(200, 501))
Created DataFrame: train_c1500_d1000 (shape=(2000, 501))
Created DataFrame: train_c1000_d5000 (shape=(10000, 501))
Created DataFrame: test_c300_d100 (shape=(200, 501))
Created DataFrame: valid_c500_d100 (shape=(200, 501))
Created DataFrame: test_c1800_d100 (shape=(200, 501))
Created DataFrame: train_c1800_d5000 (shape=(10000, 501))
Created DataFrame: test_c300_d1000 (shape=(2000, 501))
Created DataFrame: valid_c500_

Making accuracy and f1 tables:

In [3]:
c_values = [300, 500, 1000, 1500, 1800]
d_values = [100, 1000, 5000]
datasets = []

for c in c_values:
    for d in d_values:
        datasets.append(f"c_{c}_d_{d}")

acc = pd.DataFrame({'Dataset': datasets})
f1 = pd.DataFrame({'Dataset': datasets})

Decision Tree Classifiers:

In [4]:
# DT Parameters
max_depth_values_dt = [3, 4, 5, 6, 7, 8, 9, 10]
criterion_values_dt = ['gini', 'entropy', 'log_loss']
splitter_values_dt = ['best', 'random']
acc_list_dt = []
f1_list_dt = []

In [5]:
for c in c_values:
    for d in d_values:
        train_df = globals()[f"train_c{c}_d{d}"]
        valid_df = globals()[f"valid_c{c}_d{d}"]
        test_df  = globals()[f"test_c{c}_d{d}"]

        X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
        X_valid, y_valid = valid_df.iloc[:, :-1], valid_df.iloc[:, -1]
        X_test,  y_test  = test_df.iloc[:, :-1],  test_df.iloc[:, -1]

        # --- Manual grid search on the validation set ---
        best_acc = 0
        best_params = None

        for max_depth, criterion, splitter in product(max_depth_values_dt, criterion_values_dt, splitter_values_dt):
            dt = DecisionTreeClassifier(
                max_depth=max_depth,
                criterion=criterion,
                splitter=splitter,
                random_state=42
            )
            dt.fit(X_train, y_train)
            val_pred = dt.predict(X_valid)
            val_acc = accuracy_score(y_valid, val_pred)

            if val_acc > best_acc:
                best_acc = val_acc
                best_params = {
                    'max_depth': max_depth,
                    'criterion': criterion,
                    'splitter': splitter
                }

        # --- Retrain best model on train + valid ---
        X_combined = pd.concat([X_train, X_valid])
        y_combined = pd.concat([y_train, y_valid])
        best_dt = DecisionTreeClassifier(**best_params, random_state=42)
        best_dt.fit(X_combined, y_combined)

        # --- Evaluate on test set ---
        y_pred = best_dt.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)

        acc_list_dt.append(test_acc)
        f1_list_dt.append(test_f1)

        print(f"[c={c}, d={d}] Best Params: {best_params}, Test Accuracy: {test_acc:.4f}, F1 Score: {test_f1:.4f}")

[c=300, d=100] Best Params: {'max_depth': 4, 'criterion': 'gini', 'splitter': 'best'}, Test Accuracy: 0.6850, F1 Score: 0.7123
[c=300, d=1000] Best Params: {'max_depth': 5, 'criterion': 'gini', 'splitter': 'best'}, Test Accuracy: 0.6725, F1 Score: 0.7095
[c=300, d=5000] Best Params: {'max_depth': 8, 'criterion': 'gini', 'splitter': 'random'}, Test Accuracy: 0.7796, F1 Score: 0.7999
[c=500, d=100] Best Params: {'max_depth': 6, 'criterion': 'entropy', 'splitter': 'best'}, Test Accuracy: 0.6850, F1 Score: 0.6897
[c=500, d=1000] Best Params: {'max_depth': 6, 'criterion': 'entropy', 'splitter': 'best'}, Test Accuracy: 0.7080, F1 Score: 0.7203
[c=500, d=5000] Best Params: {'max_depth': 8, 'criterion': 'entropy', 'splitter': 'best'}, Test Accuracy: 0.7812, F1 Score: 0.8020
[c=1000, d=100] Best Params: {'max_depth': 4, 'criterion': 'gini', 'splitter': 'random'}, Test Accuracy: 0.7300, F1 Score: 0.7477
[c=1000, d=1000] Best Params: {'max_depth': 7, 'criterion': 'entropy', 'splitter': 'random'},

Bagging Classifiers:

In [6]:
# Bagging Parameters
n_estimators_values_bg = [100,150,200]
max_samples_values_bg = [0.5, 0.75, 1]
acc_list_bg = []
f1_list_bg = []

In [7]:
for c in c_values:
    for d in d_values:
        train_df = globals()[f"train_c{c}_d{d}"]
        valid_df = globals()[f"valid_c{c}_d{d}"]
        test_df  = globals()[f"test_c{c}_d{d}"]

        X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
        X_valid, y_valid = valid_df.iloc[:, :-1], valid_df.iloc[:, -1]
        X_test,  y_test  = test_df.iloc[:, :-1],  test_df.iloc[:, -1]

        # --- Manual grid search on the validation set ---
        best_acc = 0
        best_params = None

        for n_estimators, max_samples in product(n_estimators_values_bg, max_samples_values_bg):
            bg = BaggingClassifier(
                n_estimators=n_estimators,
                max_samples = max_samples,
                random_state=42,
                n_jobs = -1
            )
            bg.fit(X_train, y_train)
            val_pred = bg.predict(X_valid)
            val_acc = accuracy_score(y_valid, val_pred)

            if val_acc > best_acc:
                best_acc = val_acc
                best_params = {
                    'n_estimators': n_estimators,
                    'max_samples': max_samples
                }

        # --- Retrain best model on train + valid ---
        X_combined = pd.concat([X_train, X_valid])
        y_combined = pd.concat([y_train, y_valid])
        best_bg = BaggingClassifier(**best_params, random_state=42)
        best_bg.fit(X_combined, y_combined)

        # --- Evaluate on test set ---
        y_pred = best_bg.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)

        acc_list_bg.append(test_acc)
        f1_list_bg.append(test_f1)

        print(f"[c={c}, d={d}] Best Params: {best_params}, Test Accuracy: {test_acc:.4f}, F1 Score: {test_f1:.4f}")

[c=300, d=100] Best Params: {'n_estimators': 150, 'max_samples': 0.75}, Test Accuracy: 0.7700, F1 Score: 0.7700
[c=300, d=1000] Best Params: {'n_estimators': 200, 'max_samples': 0.75}, Test Accuracy: 0.8975, F1 Score: 0.9000
[c=300, d=5000] Best Params: {'n_estimators': 200, 'max_samples': 0.5}, Test Accuracy: 0.9208, F1 Score: 0.9253
[c=500, d=100] Best Params: {'n_estimators': 150, 'max_samples': 0.5}, Test Accuracy: 0.8650, F1 Score: 0.8670
[c=500, d=1000] Best Params: {'n_estimators': 200, 'max_samples': 0.5}, Test Accuracy: 0.9055, F1 Score: 0.9049
[c=500, d=5000] Best Params: {'n_estimators': 200, 'max_samples': 0.5}, Test Accuracy: 0.9427, F1 Score: 0.9433
[c=1000, d=100] Best Params: {'n_estimators': 100, 'max_samples': 0.5}, Test Accuracy: 0.9100, F1 Score: 0.9100
[c=1000, d=1000] Best Params: {'n_estimators': 200, 'max_samples': 0.5}, Test Accuracy: 0.9505, F1 Score: 0.9512
[c=1000, d=5000] Best Params: {'n_estimators': 150, 'max_samples': 0.75}, Test Accuracy: 0.9659, F1 Sco

Random Forest Classifier:

In [8]:
# Random Forest Parameters
max_depth_values_rf = [5, 7, 9]
n_estimators_values_rf = [100, 150, 200]
max_features_values_rf = ['sqrt', 'log2']
acc_list_rf = []
f1_list_rf = []

In [9]:
for c in c_values:
    for d in d_values:
        train_df = globals()[f"train_c{c}_d{d}"]
        valid_df = globals()[f"valid_c{c}_d{d}"]
        test_df  = globals()[f"test_c{c}_d{d}"]

        X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
        X_valid, y_valid = valid_df.iloc[:, :-1], valid_df.iloc[:, -1]
        X_test,  y_test  = test_df.iloc[:, :-1],  test_df.iloc[:, -1]

        # --- Manual grid search on the validation set ---
        best_acc = 0
        best_params = None

        for max_depth, n_estimators, max_features in product(max_depth_values_rf, n_estimators_values_rf, max_features_values_rf):
            rf = RandomForestClassifier(
                max_depth=max_depth,
                n_estimators=n_estimators,
                max_features=max_features,
                random_state=42,
                n_jobs = -1
            )
            rf.fit(X_train, y_train)
            val_pred = rf.predict(X_valid)
            val_acc = accuracy_score(y_valid, val_pred)

            if val_acc > best_acc:
                best_acc = val_acc
                best_params = {
                    'max_depth': max_depth,
                    'n_estimators': n_estimators,
                    'max_features': max_features
                }

        # --- Retrain best model on train + valid ---
        X_combined = pd.concat([X_train, X_valid])
        y_combined = pd.concat([y_train, y_valid])
        best_rf = RandomForestClassifier(**best_params, random_state=42)
        best_rf.fit(X_combined, y_combined)

        # --- Evaluate on test set ---
        y_pred = best_rf.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)

        acc_list_rf.append(test_acc)
        f1_list_rf.append(test_f1)

        print(f"[c={c}, d={d}] Best Params: {best_params}, Test Accuracy: {test_acc:.4f}, F1 Score: {test_f1:.4f}")

[c=300, d=100] Best Params: {'max_depth': 5, 'n_estimators': 200, 'max_features': 'sqrt'}, Test Accuracy: 0.8450, F1 Score: 0.8488
[c=300, d=1000] Best Params: {'max_depth': 7, 'n_estimators': 200, 'max_features': 'sqrt'}, Test Accuracy: 0.8860, F1 Score: 0.8875
[c=300, d=5000] Best Params: {'max_depth': 9, 'n_estimators': 200, 'max_features': 'log2'}, Test Accuracy: 0.9219, F1 Score: 0.9238
[c=500, d=100] Best Params: {'max_depth': 5, 'n_estimators': 200, 'max_features': 'log2'}, Test Accuracy: 0.8800, F1 Score: 0.8800
[c=500, d=1000] Best Params: {'max_depth': 5, 'n_estimators': 200, 'max_features': 'log2'}, Test Accuracy: 0.9460, F1 Score: 0.9465
[c=500, d=5000] Best Params: {'max_depth': 7, 'n_estimators': 200, 'max_features': 'log2'}, Test Accuracy: 0.9572, F1 Score: 0.9576
[c=1000, d=100] Best Params: {'max_depth': 5, 'n_estimators': 200, 'max_features': 'log2'}, Test Accuracy: 0.9850, F1 Score: 0.9852
[c=1000, d=1000] Best Params: {'max_depth': 7, 'n_estimators': 200, 'max_featu

Gradient Boosting Classifier:

In [10]:
# Gradient Boosting Parameters
learning_rate_gb = [0.01, 0.05, 0.1]
n_estimators_gb = [100, 150, 200]
max_depth_gb = [2, 3, 4]
acc_list_gb = []
f1_list_gb = []

In [11]:
for c in c_values:
    for d in d_values:
        train_df = globals()[f"train_c{c}_d{d}"]
        valid_df = globals()[f"valid_c{c}_d{d}"]
        test_df  = globals()[f"test_c{c}_d{d}"]

        X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
        X_valid, y_valid = valid_df.iloc[:, :-1], valid_df.iloc[:, -1]
        X_test,  y_test  = test_df.iloc[:, :-1],  test_df.iloc[:, -1]

        # --- Manual grid search on the validation set ---
        best_acc = 0
        best_params = None

        for learning_rate, n_estimators, max_depth in product(learning_rate_gb, n_estimators_gb, max_depth_gb):
            gb = GradientBoostingClassifier(
                learning_rate=learning_rate,
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42
            )
            gb.fit(X_train, y_train)
            val_pred = gb.predict(X_valid)
            val_acc = accuracy_score(y_valid, val_pred)

            if val_acc > best_acc:
                best_acc = val_acc
                best_params = {
                    'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'n_estimators': n_estimators,
                }

        # --- Retrain best model on train + valid ---
        X_combined = pd.concat([X_train, X_valid])
        y_combined = pd.concat([y_train, y_valid])
        best_gb = GradientBoostingClassifier(**best_params, random_state=42)
        best_gb.fit(X_combined, y_combined)

        # --- Evaluate on test set ---
        y_pred = best_gb.predict(X_test)
        test_acc = accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)

        acc_list_gb.append(test_acc)
        f1_list_gb.append(test_f1)

        print(f"[c={c}, d={d}] Best Params: {best_params}, Test Accuracy: {test_acc:.4f}, F1 Score: {test_f1:.4f}")

[c=300, d=100] Best Params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}, Test Accuracy: 0.8050, F1 Score: 0.8020
[c=300, d=1000] Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}, Test Accuracy: 0.9945, F1 Score: 0.9945
[c=300, d=5000] Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}, Test Accuracy: 0.9989, F1 Score: 0.9989
[c=500, d=100] Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}, Test Accuracy: 0.8700, F1 Score: 0.8762
[c=500, d=1000] Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}, Test Accuracy: 0.9960, F1 Score: 0.9960
[c=500, d=5000] Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}, Test Accuracy: 0.9993, F1 Score: 0.9993
[c=1000, d=100] Best Params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150}, Test Accuracy: 0.9750, F1 Score: 0.9754
[c=1000, d=1000] Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}, T

Accuracy and F1 Score Comparisons:

In [12]:
acc['DecisionTree'] = acc_list_dt
f1['DecisionTree'] = f1_list_dt
acc['Bagging'] = acc_list_bg
f1['Bagging'] = f1_list_bg
acc['RandomForest'] = acc_list_rf
f1['RandomForest'] = f1_list_rf
acc['GradientBoosting'] = acc_list_gb
f1['GradientBoosting'] = f1_list_gb

In [13]:
acc

Unnamed: 0,Dataset,DecisionTree,Bagging,RandomForest,GradientBoosting
0,c_300_d_100,0.685,0.77,0.845,0.805
1,c_300_d_1000,0.6725,0.8975,0.886,0.9945
2,c_300_d_5000,0.7796,0.9208,0.9219,0.9989
3,c_500_d_100,0.685,0.865,0.88,0.87
4,c_500_d_1000,0.708,0.9055,0.946,0.996
5,c_500_d_5000,0.7812,0.9427,0.9572,0.9993
6,c_1000_d_100,0.73,0.91,0.985,0.975
7,c_1000_d_1000,0.8185,0.9505,0.996,0.996
8,c_1000_d_5000,0.8535,0.9659,0.9972,0.9999
9,c_1500_d_100,0.855,0.99,1.0,1.0


In [14]:
f1

Unnamed: 0,Dataset,DecisionTree,Bagging,RandomForest,GradientBoosting
0,c_300_d_100,0.712329,0.77,0.84878,0.80203
1,c_300_d_1000,0.709534,0.900049,0.887463,0.99453
2,c_300_d_5000,0.799855,0.925297,0.923812,0.998901
3,c_500_d_100,0.689655,0.866995,0.88,0.87619
4,c_500_d_1000,0.720307,0.904882,0.946482,0.996016
5,c_500_d_5000,0.802027,0.943262,0.957641,0.9993
6,c_1000_d_100,0.747664,0.91,0.985222,0.975369
7,c_1000_d_1000,0.830927,0.951207,0.996,0.996012
8,c_1000_d_5000,0.859661,0.965951,0.997203,0.9999
9,c_1500_d_100,0.858537,0.990099,1.0,1.0


Loading in MNIST Dataset

In [15]:
# Loading in MNIST dataset (may not work first try due to openML connection)
warnings.filterwarnings("ignore", message="Invalid cache, redownloading file")
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame = False)

In [16]:
X = X / 255.0 # Normalize pixel values to [0,1]
# Split into training (60K) and test (10K) sets
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# Split into training and validation sets
X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

Decision Tree Classifier

In [17]:
# Decision Tree Parameters
max_depth_values_dt2 = [11,12,13,14,15]
criterion_values_dt2 = ['gini', 'entropy', 'log_loss']
splitter_values_dt2 = ['best', 'random']
best_acc_dt2 = 0
best_params_dt2 = None

In [18]:
# Manual Grid Search to find best parameters based on validation accuracy
for max_depth, criterion, splitter in product(max_depth_values_dt2, criterion_values_dt2, splitter_values_dt2):
  dt = DecisionTreeClassifier(
                max_depth=max_depth,
                criterion=criterion,
                splitter=splitter,
                random_state=42
            )
  dt.fit(X_train_sub, y_train_sub)
  val_pred = dt.predict(X_valid)
  val_acc = accuracy_score(y_valid, val_pred)
  print(f"Max Depth: {max_depth}, Criterion: {criterion}, Splitter: {splitter}, Validation Accuracy: {val_acc:.4f}")

  if val_acc > best_acc_dt2:
    best_acc_dt2 = val_acc
    best_params_dt2 = (max_depth, criterion, splitter)



Max Depth: 11, Criterion: gini, Splitter: best, Validation Accuracy: 0.8630
Max Depth: 11, Criterion: gini, Splitter: random, Validation Accuracy: 0.8529
Max Depth: 11, Criterion: entropy, Splitter: best, Validation Accuracy: 0.8707
Max Depth: 11, Criterion: entropy, Splitter: random, Validation Accuracy: 0.8555
Max Depth: 11, Criterion: log_loss, Splitter: best, Validation Accuracy: 0.8707
Max Depth: 11, Criterion: log_loss, Splitter: random, Validation Accuracy: 0.8555
Max Depth: 12, Criterion: gini, Splitter: best, Validation Accuracy: 0.8684
Max Depth: 12, Criterion: gini, Splitter: random, Validation Accuracy: 0.8590
Max Depth: 12, Criterion: entropy, Splitter: best, Validation Accuracy: 0.8772
Max Depth: 12, Criterion: entropy, Splitter: random, Validation Accuracy: 0.8668
Max Depth: 12, Criterion: log_loss, Splitter: best, Validation Accuracy: 0.8772
Max Depth: 12, Criterion: log_loss, Splitter: random, Validation Accuracy: 0.8668
Max Depth: 13, Criterion: gini, Splitter: best, 

In [19]:
  # --- Retrain best model on train + valid ---
best_dt_2 = DecisionTreeClassifier(max_depth = best_params_dt2[0],
                                   criterion = best_params_dt2[1],
                                   splitter = best_params_dt2[2],
                                   random_state=42)
best_dt_2.fit(X_train, y_train)

        # --- Evaluate on test set ---
y_pred_dt2 = best_dt_2.predict(X_test)
test_acc_dt2 = accuracy_score(y_test, y_pred_dt2)

print(f"Best Params: {best_params_dt2}, Test Accuracy: {test_acc_dt2:.4f}")

Best Params: (12, 'entropy', 'best'), Test Accuracy: 0.8833


Bagging

Made decision to hold n_estimators constant at 100 for remaining models to control runtime (data has 60k samples, runtime becomes very long with a larger number of estimators)

In [20]:
# Bagging Parameters
n_estimators_values_bg2 = [100]
max_samples_values_bg2 = [0.4, 0.5, 0.6]
best_acc_bg2 = 0
best_params_bg2 = None

In [21]:
# Manual Grid Search to find best parameters based on validation accuracy
for n_estimators, max_samples in product(n_estimators_values_bg2, max_samples_values_bg2):
  bg = BaggingClassifier(
                n_estimators=n_estimators,
                max_samples = max_samples,
                random_state=42,
                verbose = 3,
                n_jobs = -1
            )
  bg.fit(X_train_sub, y_train_sub)
  val_pred = bg.predict(X_valid)
  val_acc = accuracy_score(y_valid, val_pred)
  print(f"N Estimators: {n_estimators}, Max Samples: {max_samples}, Validation Accuracy: {val_acc:.4f}")
  if val_acc > best_acc_bg2:
    best_acc_bg2 = val_acc
    best_params_bg2 = (n_estimators, max_samples)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  7.7min finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


N Estimators: 100, Max Samples: 0.4, Validation Accuracy: 0.9517


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:  9.9min finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   19.7s finished


N Estimators: 100, Max Samples: 0.5, Validation Accuracy: 0.9518


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed: 11.3min finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


N Estimators: 100, Max Samples: 0.6, Validation Accuracy: 0.9509


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   18.7s finished


In [22]:
# --- Retrain best model on train + valid ---
best_bg2 = BaggingClassifier(n_estimators = best_params_bg2[0],
                              max_samples = best_params_bg2[1],
                              random_state=42)
best_bg2.fit(X_train, y_train)

        # --- Evaluate on test set ---
y_pred_bg2 = best_bg2.predict(X_test)
test_acc_bg2 = accuracy_score(y_test, y_pred_bg2)
print(f"Best Params: {best_params_bg2}, Test Accuracy: {test_acc_bg2:.4f}")

Best Params: (100, 0.5), Test Accuracy: 0.9571


Random Forest

In [23]:
# Random Forest Parameters
max_depth_rf2 = [18,19,20,21,22,23,24]
n_estimators_rf2 = [100]
max_features_rf2 = ['sqrt', 'log2']
best_acc_rf2 = 0
best_params_rf2 = None

In [24]:
# Manual Grid Search to find best parameters based on validation accuracy
for max_depth, n_estimators, max_features in product(max_depth_rf2, n_estimators_rf2, max_features_rf2):
  rf = RandomForestClassifier(
                max_depth=max_depth,
                n_estimators=n_estimators,
                max_features=max_features,
                random_state=42,
                n_jobs = -1
            )
  rf.fit(X_train_sub, y_train_sub)
  val_pred = rf.predict(X_valid)
  val_acc = accuracy_score(y_valid, val_pred)
  print (f"Max Depth: {max_depth}, N Estimators: {n_estimators}, Max Features: {max_features}, Validation Accuracy: {val_acc:.4f}")
  if val_acc > best_acc_rf2:
    best_acc_rf2 = val_acc
    best_params_rf2 = (max_depth, n_estimators, max_features)

Max Depth: 18, N Estimators: 100, Max Features: sqrt, Validation Accuracy: 0.9643
Max Depth: 18, N Estimators: 100, Max Features: log2, Validation Accuracy: 0.9618
Max Depth: 19, N Estimators: 100, Max Features: sqrt, Validation Accuracy: 0.9653
Max Depth: 19, N Estimators: 100, Max Features: log2, Validation Accuracy: 0.9624
Max Depth: 20, N Estimators: 100, Max Features: sqrt, Validation Accuracy: 0.9657
Max Depth: 20, N Estimators: 100, Max Features: log2, Validation Accuracy: 0.9634
Max Depth: 21, N Estimators: 100, Max Features: sqrt, Validation Accuracy: 0.9663
Max Depth: 21, N Estimators: 100, Max Features: log2, Validation Accuracy: 0.9628
Max Depth: 22, N Estimators: 100, Max Features: sqrt, Validation Accuracy: 0.9665
Max Depth: 22, N Estimators: 100, Max Features: log2, Validation Accuracy: 0.9634
Max Depth: 23, N Estimators: 100, Max Features: sqrt, Validation Accuracy: 0.9663
Max Depth: 23, N Estimators: 100, Max Features: log2, Validation Accuracy: 0.9623
Max Depth: 24, N

In [25]:
# --- Retrain best model on train + valid ---
best_rf2 = RandomForestClassifier(
                              max_depth = best_params_rf2[0],
                              n_estimators = best_params_rf2[1],
                              max_features = best_params_rf2[2],
                              random_state=42)
best_rf2.fit(X_train, y_train)

        # --- Evaluate on test set ---
y_pred_rf2 = best_rf2.predict(X_test)
test_acc_rf2 = accuracy_score(y_test, y_pred_rf2)
print(f"Best Params: {best_params_rf2}, Test Accuracy: {test_acc_rf2:.4f}")

Best Params: (22, 100, 'sqrt'), Test Accuracy: 0.9695


Gradient Boosting

In [26]:
# Gradient Boosting Parameters
gb_learning_rate = [0.1]
gb_n_estimators = [100]
gb_max_depth = [3, 4]
best_acc_gb = 0
best_params_gb = None

In [27]:
# Manual Grid Search to find best parameters based on validation accuracy
for learning_rate, n_estimators, max_depth in product(gb_learning_rate, gb_n_estimators, gb_max_depth):
  gb = GradientBoostingClassifier(
      learning_rate=learning_rate,
      n_estimators=n_estimators,
      max_depth=max_depth,
      verbose = 3,
      random_state=42)
  gb.fit(X_train_sub, y_train_sub)
  val_pred = gb.predict(X_valid)
  val_acc = accuracy_score(y_valid, val_pred)
  print(f"Learning Rate: {learning_rate}, N Estimators: {n_estimators}, Max Depth: {max_depth}, Validation Accuracy: {val_acc:.4f}")
  if val_acc > best_acc_gb:
    best_acc_gb = val_acc
    best_params_gb = (learning_rate, n_estimators, max_depth)


      Iter       Train Loss   Remaining Time 
         1           1.9147           55.26m
         2           1.6786           54.41m
         3           1.5134           53.72m
         4           1.3727           53.21m
         5           1.2628           52.69m
         6           1.1655           51.96m
         7           1.0824           51.52m
         8           1.0072           51.02m
         9           0.9428           50.58m
        10           0.8831           50.09m
        11           0.8316           49.71m
        12           0.7851           49.03m
        13           0.7438           48.48m
        14           0.7044           47.87m
        15           0.6733           47.32m
        16           0.6431           46.78m
        17           0.6156           46.17m
        18           0.5906           45.63m
        19           0.5675           45.08m
        20           0.5480           44.48m
        21           0.5279           43.90m
        2

In [28]:
# --- Retrain best model on train + valid ---
best_gb_2 = GradientBoostingClassifier(
                              learning_rate = best_params_gb[0],
                              n_estimators = best_params_gb[1],
                              max_depth = best_params_gb[2],
                              verbose = 3,
                              random_state=42)
best_gb_2.fit(X_train, y_train)

        # --- Evaluate on test set ---
y_pred_gb = best_gb_2.predict(X_test)
test_acc_gb = accuracy_score(y_test, y_pred_gb)
print(f"Best Params: {best_params_gb}, Test Accuracy: {test_acc_gb:.4f}")

      Iter       Train Loss   Remaining Time 
         1           1.8220           62.53m
         2           1.5648           62.47m
         3           1.3878           61.89m
         4           1.2416           61.20m
         5           1.1237           60.61m
         6           1.0223           60.02m
         7           0.9300           59.47m
         8           0.8549           59.03m
         9           0.7905           58.38m
        10           0.7334           57.77m
        11           0.6803           57.19m
        12           0.6334           56.65m
        13           0.5920           56.20m
        14           0.5549           55.65m
        15           0.5228           55.02m
        16           0.4932           54.36m
        17           0.4665           53.70m
        18           0.4421           53.16m
        19           0.4213           52.77m
        20           0.4027           52.20m
        21           0.3854           51.58m
        2

Comparison of test accuracy between models on MNIST Dataset

In [29]:
acc_mnist = pd.DataFrame({'DecisionTree': [test_acc_dt2],
                          'Bagging': [test_acc_bg2],
                          'RandomForest': [test_acc_rf2],
                          'GradientBoosting': [test_acc_gb]})
acc_mnist

Unnamed: 0,DecisionTree,Bagging,RandomForest,GradientBoosting
0,0.8833,0.9571,0.9695,0.9614
