# Project 1 â€“ Decision Trees and Random Forests

In [178]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from decision_tree import DecisionTree
from random_forest import RandomForest

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset

Do data loading, exploration and preprocessing as you see fit.

Here is some code to load the dataset to get you started.

In [182]:
data = np.genfromtxt("../wine_dataset_small.csv", delimiter=",", dtype=float, names=True)

feature_names = list(data.dtype.names[:-1])
target_name = data.dtype.names[-1]

X = np.array([data[feature] for feature in feature_names]).T
y = data[target_name].astype(int)

# Create seed
seed = 0
np.random.seed(seed)

print(f"Feature columns names: {feature_names}")
print(f"Target column name: {target_name}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Feature columns names: ['citric_acid', 'residual_sugar', 'pH', 'sulphates', 'alcohol']
Target column name: type
X shape: (500, 5)
y shape: (500,)


Split the data into train and val, and test

In [183]:
from sklearn.model_selection import train_test_split

# Split the data (use 70% for training and 30% for validation)
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True)

Use Kfold with 5 splits

In [184]:
from sklearn.model_selection import KFold

# Init KFold with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

In [185]:
## Hyperparameters
criterion_values = ["gini", "entropy"]

# Desicion Tree
dt_max_depth_values = [3, 5, 8, 10, 12, 15, None]

# Random Forest
rf_n_estimators_values = [5]
rf_max_depth_values = [3, 5, 10, None]
rf_max_features_values = ["sqrt", "log2", None]

Tune the self made DecisionTree to get optimal hyperparameters

In [186]:
def tune_hyperparameters(model_class: DecisionTree | DecisionTreeClassifier | RandomForest | RandomForestClassifier, parameter_grid):
    # Seed for deterministic output
    np.random.seed(seed)

    best_accuracy = 0
    best_hyperparameters = {}

    for params in parameter_grid:
        accuracies = []
        
        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_train_and_val):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            model = model_class(**params)

            # Train the decision tree
            if model_class is DecisionTree:
                model.root = model.fit(X_train, y_train)
            else:
                # Train the random forest
                model.fit(X_train, y_train)
            
            # Make predictions on the validation fold
            y_pred = model.predict(X_val)
            
            # Compute accuracy
            accuracy = accuracy_score(y_val, y_pred)
            accuracies.append(accuracy)
        
        # Calculate the mean accuracy across all folds
        mean_accuracy = np.mean(accuracies)
        
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_hyperparameters = params
    
    return best_hyperparameters, best_accuracy


In [192]:
# Tune Decision Tree
dt_param_grid = [{'max_depth': md, 'criterion': c} for md in dt_max_depth_values for c in criterion_values]

best_hyperparameters_dt_self, best_accuracy_dt_self = tune_hyperparameters(DecisionTree, dt_param_grid)
best_hyperparameters_dt_sklearn, best_accuracy_dt_sklearn = tune_hyperparameters(DecisionTreeClassifier, dt_param_grid)


print(f"Best Desicion Tree Accuracy (self): {best_accuracy_dt_self}")
print(f"Best Desicion Tree Parameters (self): {best_hyperparameters_dt_self}")
print()
print(f"Best Desicion Tree Accuracy (sklearn): {best_accuracy_dt_sklearn}")
print(f"Best Desicion Tree Parameters (sklearn): {best_hyperparameters_dt_sklearn}")

Best Desicion Tree Accuracy (self): 0.7742857142857142
Best Desicion Tree Parameters (self): {'max_depth': 5, 'criterion': 'gini'}

Best Desicion Tree Accuracy (sklearn): 0.8428571428571429
Best Desicion Tree Parameters (sklearn): {'max_depth': 12, 'criterion': 'gini'}


Tune the self made RandomForest to get optimal hyperparameters

In [193]:
# Tune Random Forest
rf_param_grid = [
    {'n_estimators': ne, 'max_depth': md, 'max_features': mf, 'criterion': c} 
    for ne in rf_n_estimators_values 
    for md in rf_max_depth_values 
    for mf in rf_max_features_values 
    for c in criterion_values
]

best_hyperparameters_rf_self, best_accuracy_rf_self = tune_hyperparameters(RandomForest, rf_param_grid)
best_hyperparameters_rf_sklearn, best_accuracy_rf_sklearn = tune_hyperparameters(RandomForestClassifier, rf_param_grid)

print(f"Best Random Forest Accuracy (self): {best_accuracy_rf_self}")
print(f"Best Random Forest Parameters (self): {best_hyperparameters_rf_self}")
print()
print(f"Best Random Forest Accuracy (sklearn): {best_accuracy_rf_sklearn}")
print(f"Best Random Forest Parameters (sklearn): {best_hyperparameters_rf_sklearn}")

Best Random Forest Accuracy (self): 0.8400000000000001
Best Random Forest Parameters (self): {'n_estimators': 5, 'max_depth': None, 'max_features': None, 'criterion': 'entropy'}

Best Random Forest Accuracy (sklearn): 0.8514285714285714
Best Random Forest Parameters (sklearn): {'n_estimators': 5, 'max_depth': None, 'max_features': 'log2', 'criterion': 'gini'}


Best Random Forest Accuracy (self): 0.8400000000000001

Best Random Forest Parameters (self): {'n_estimators': 5, 'max_depth': None, 'max_features': None, 'criterion': 'entropy'}







Evaluate function

In [198]:
# Train and test the best models
def evaluate_model(model_class, params):
    model = model_class(**params)
    if model_class is DecisionTree:
        model.root = model.fit(X_train_and_val, y_train_and_val)
    else:
        model.fit(X_train_and_val, y_train_and_val)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

Train RandomForest and DecisionTree to the best hyperparameters, and check the accuracy of the models to the test set

In [199]:
print(f"Test accuracy DT (self): {evaluate_model(DecisionTree, best_hyperparameters_dt_self)}")
print(f"Test accuracy DT (sklearn): {evaluate_model(DecisionTreeClassifier, best_hyperparameters_dt_sklearn)}")
print(f"Test accuracy RF (self): {evaluate_model(RandomForest, best_hyperparameters_rf_self)}")
print(f"Test accuracy RF (sklearn): {evaluate_model(RandomForestClassifier, best_hyperparameters_rf_sklearn)}")


Test accuracy DT (self): 0.8
Test accuracy DT (sklearn): 0.8466666666666667
Test accuracy RF (self): 0.84
Test accuracy RF (sklearn): 0.9
