# Project 1 – Decision Trees and Random Forests

In [121]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset

Do data loading, exploration and preprocessing as you see fit.

Here is some code to load the dataset to get you started.

In [122]:
data = np.genfromtxt("../wine_dataset_small.csv", delimiter=",", dtype=float, names=True)

feature_names = list(data.dtype.names[:-1])
target_name = data.dtype.names[-1]

X = np.array([data[feature] for feature in feature_names]).T
y = data[target_name].astype(int)

print(f"Feature columns names: {feature_names}")
print(f"Target column name: {target_name}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Feature columns names: ['citric_acid', 'residual_sugar', 'pH', 'sulphates', 'alcohol']
Target column name: type
X shape: (500, 5)
y shape: (500,)


Create the seed

In [123]:
seed = 0
np.random.seed(seed)

Split the data into train and val, and test

In [124]:
from sklearn.model_selection import train_test_split

# Split the data (use 70% for training and 30% for validation)
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True)

Use Kfold with 5 splits

In [125]:
from sklearn.model_selection import KFold

# Init KFold with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

Tune the self made DecisionTree to get optimal hyperparameters

In [126]:
from decision_tree import DecisionTree
from sklearn.metrics import accuracy_score

# Hyperparameter grid
max_depth_values = [3, 5, 8, 10, 12, 15, None]
criterion_values = ["gini", "entropy"]

best_accuracy = 0
best_hyperparameters_dt = {}

# Loop through different hyperparameter combinations
for max_depth in max_depth_values:
    for criterion in criterion_values:
        accuracies = []
        
        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_train_and_val):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]
            
            # Train the decision tree
            dt = DecisionTree(max_depth=max_depth, criterion=criterion)
            dt.root = dt.fit(X_train, y_train)
            
            # Make predictions on the validation fold
            y_pred = dt.predict(X_val)
            
            # Compute accuracy
            accuracy = accuracy_score(y_val, y_pred)
            accuracies.append(accuracy)
        
        # Calculate the mean accuracy across all folds
        mean_accuracy = np.mean(accuracies)
        
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_hyperparameters_dt = {"max_depth": max_depth, "criterion": criterion}

print(f"Best Accuracy: {best_accuracy}")
print(f"Best Parameters: {best_hyperparameters_dt}")


Best Accuracy: 0.7742857142857142
Best Parameters: {'max_depth': 5, 'criterion': 'gini'}


Tune the self made RandomForest to get optimal hyperparameters

Hyperparameters to consider:

In [None]:
# Random Forest hyperparameters
n_estimators_values = [5, 10]
max_depth_values = [3, 5, 10, 15, 20, 30, None]
max_features_values = ["sqrt", "log2", None]
criterion_values = ["gini", "entropy"]

In [127]:
from random_forest import RandomForest

def tune_random_forest_hyperparameters(n_estimators_values, max_depth_values, max_features_values, criterion_values):
    best_accuracy_rf = 0
    best_hyperparameters_rf = {}

    # Loop through hyperparameter combinations
    for n_estimators in n_estimators_values:
        for max_depth in max_depth_values:
            for max_features in max_features_values:
                for criterion in criterion_values:  # Iterate over criterion
                    accuracies = []
                    
                    # Perform the k-fold cross-validation
                    for train_index, val_index in kf.split(X_train_and_val):
                        X_train, X_val = X[train_index], X[val_index]
                        y_train, y_val = y[train_index], y[val_index]
                        
                        # Train the random forest
                        rf = RandomForest(
                            n_estimators=n_estimators, max_depth=max_depth, max_features=max_features, criterion=criterion
                        )
                        rf.fit(X_train, y_train)
                        
                        # Make predictions on the validation fold
                        y_pred = rf.predict(X_val)
                        
                        # Compute accuracy
                        accuracy = accuracy_score(y_val, y_pred)
                        accuracies.append(accuracy)
                    
                    # Calculate the mean accuracy across all folds
                    mean_accuracy_rf = np.mean(accuracies)
                    
                    if mean_accuracy_rf > best_accuracy_rf:
                        best_accuracy_rf = mean_accuracy_rf
                        # Track the best hyperparameters
                        best_hyperparameters = {
                            "n_estimators": n_estimators,
                            "max_depth": max_depth,
                            "max_features": max_features,
                            "criterion": criterion,
                        }
    return best_hyperparameters

best_hyperparameters_rf = tune_random_forest_hyperparameters(n_estimators_values, max_depth_values, max_features_values, criterion_values)
print(f"Best Random Forest Accuracy: {best_accuracy_rf}")
print(f"Best Random Forest Parameters: {best_hyperparameters_rf}")


Best Random Forest Accuracy: 0.86
Best Random Forest Parameters: {'n_estimators': 10, 'max_depth': 5, 'max_features': 'log2', 'criterion': 'entropy'}


1.
Best Random Forest Accuracy: 0.884
Best Random Forest Parameters: {'n_estimators': 25, 'max_depth': 15, 'max_features': 'log2', 'criterion': 'gini'}





Train RandomForest and DecisionTree to the best hyperparameters

In [128]:
dt = DecisionTree(
    max_depth=best_hyperparameters_dt['max_depth'],
    criterion=best_hyperparameters_dt['criterion'],
)
dt.root = dt.fit(X_train_and_val, y_train_and_val)

rf = RandomForest(
    n_estimators=best_hyperparameters_rf['n_estimators'],
    max_depth=best_hyperparameters_rf['max_depth'],
    max_features=best_hyperparameters_rf['max_features'],
    criterion=best_hyperparameters_rf['criterion'],
)
rf.fit(X_train_and_val, y_train_and_val)

Check the accuracy of the models to the test set

In [132]:
y_pred_rf = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Test set accuracy RF: {accuracy}")

y_pred_dt = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Test set accuracy DT: {accuracy}")

Test set accuracy RF: 0.8733333333333333
Test set accuracy DT: 0.8


Tune the sklearn DecisionTreeClassifier and RandomForestClassifier to compare models

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

