# Project 1 â€“ Decision Trees and Random Forests

In [275]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from decision_tree import DecisionTree
from random_forest import RandomForest

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset (Wine)

Do data loading, exploration and preprocessing as you see fit.

Here is some code to load the dataset to get you started.

In [276]:
data = np.genfromtxt("../wine_dataset_small.csv", delimiter=",", dtype=float, names=True)

feature_names = list(data.dtype.names[:-1])
target_name = data.dtype.names[-1]

X = np.array([data[feature] for feature in feature_names]).T
y = data[target_name].astype(int)

# Create seed for reproducibility
seed = 0
np.random.seed(seed)

# Print sizes
print(f"Feature columns names: {feature_names}")
print(f"Target column name: {target_name}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Feature columns names: ['citric_acid', 'residual_sugar', 'pH', 'sulphates', 'alcohol']
Target column name: type
X shape: (500, 5)
y shape: (500,)


Split the data into train and val, and test

In [277]:
from sklearn.model_selection import train_test_split

# Split the data (use 70% for training and 30% for validation)
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True)

### Kfold
Use Kfold with 5 splits

In [278]:
from sklearn.model_selection import KFold

# Init KFold with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

### Hyperparameters

Define hyperparameter values to test for the model tuning

In [279]:
## Hyperparameters
criterion_values = ["gini", "entropy"]

# Desicion Tree
dt_max_depth_values = [3, 5, 8, 10, 12, 15, None]

# Random Forest
rf_n_estimators_values = [5]
rf_max_depth_values = [3, 5, 10, None]
rf_max_features_values = ["sqrt", "log2", None]

### Tuning the models

Defining a generic function for tuning both Decision Tree and Random Forest (self & sklearn)

In [280]:
def tune_hyperparameters(model_class: DecisionTree | DecisionTreeClassifier | RandomForest | RandomForestClassifier, parameter_grid):
    # Seed for deterministic output
    np.random.seed(seed)

    best_accuracy = 0
    best_hyperparameters = {}

    for params in parameter_grid:
        accuracies = []
        
        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_train_and_val):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            model = model_class(**params)

            # Train the decision tree self (train on root)
            if isinstance(model, DecisionTree):
                model.root = model.fit(X_train, y_train)
            else:
                # Train like normal
                model.fit(X_train, y_train)
                        
            # Make predictions on the validation fold
            y_pred = model.predict(X_val)
            
            # Compute accuracy
            accuracy = accuracy_score(y_val, y_pred)
            accuracies.append(accuracy)
        
        # Calculate the mean accuracy across all folds
        mean_accuracy = np.mean(accuracies)
        
        # Only keep the best accuracy and hyperparameters
        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_hyperparameters = params
    
    return best_hyperparameters, best_accuracy


Tuning the self made Decision Tree on the validation set to get the optimal hyperparameters

In [321]:
# Tune the Decision Tree
dt_param_grid = [{'max_depth': md, 'criterion': c} for md in dt_max_depth_values for c in criterion_values]

best_hyperparameters_dt_self, best_accuracy_dt_self = tune_hyperparameters(DecisionTree, dt_param_grid)
best_hyperparameters_dt_sklearn, best_accuracy_dt_sklearn = tune_hyperparameters(DecisionTreeClassifier, dt_param_grid)


print(f"Best Desicion Tree Accuracy (self): {best_accuracy_dt_self}")
print(f"Best Desicion Tree Parameters (self): {best_hyperparameters_dt_self}")
print()
print(f"Best Desicion Tree Accuracy (sklearn): {best_accuracy_dt_sklearn}")
print(f"Best Desicion Tree Parameters (sklearn): {best_hyperparameters_dt_sklearn}")

Best Desicion Tree Accuracy (self): 0.7473991817650497
Best Desicion Tree Parameters (self): {'max_depth': 3, 'criterion': 'gini'}

Best Desicion Tree Accuracy (sklearn): 0.7682641729982466
Best Desicion Tree Parameters (sklearn): {'max_depth': 5, 'criterion': 'entropy'}


Tuning the self made Random Forest on the validation set to get the optimal hyperparameters

In [282]:
# Tune the Random Forest
rf_param_grid = [
    {'n_estimators': ne, 'max_depth': md, 'max_features': mf, 'criterion': c} 
    for ne in rf_n_estimators_values 
    for md in rf_max_depth_values 
    for mf in rf_max_features_values 
    for c in criterion_values
]

best_hyperparameters_rf_self, best_accuracy_rf_self = tune_hyperparameters(RandomForest, rf_param_grid)
best_hyperparameters_rf_sklearn, best_accuracy_rf_sklearn = tune_hyperparameters(RandomForestClassifier, rf_param_grid)

print(f"Best Random Forest Accuracy (self): {best_accuracy_rf_self}")
print(f"Best Random Forest Parameters (self): {best_hyperparameters_rf_self}")
print()
print(f"Best Random Forest Accuracy (sklearn): {best_accuracy_rf_sklearn}")
print(f"Best Random Forest Parameters (sklearn): {best_hyperparameters_rf_sklearn}")

Best Random Forest Accuracy (self): 0.8400000000000001
Best Random Forest Parameters (self): {'n_estimators': 5, 'max_depth': None, 'max_features': None, 'criterion': 'entropy'}

Best Random Forest Accuracy (sklearn): 0.8514285714285714
Best Random Forest Parameters (sklearn): {'n_estimators': 5, 'max_depth': None, 'max_features': 'log2', 'criterion': 'gini'}


Best Random Forest Accuracy (self): 0.8400000000000001

Best Random Forest Parameters (self): {'n_estimators': 5, 'max_depth': None, 'max_features': None, 'criterion': 'entropy'}







### Evaluating the results

Defining a generic function for evaluating both Decision Tree and Random Forest (self & sklearn)

In [322]:
# Evaluate model function => Train and test the best models
def evaluate_model(model_class, params):
    model = model_class(**params)
    if isinstance(model, DecisionTree):
        model.root = model.fit(X_train_and_val, y_train_and_val)
    else:
        model.fit(X_train_and_val, y_train_and_val)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

Train RandomForest and DecisionTree to the best hyperparameters, and check the accuracy of the models to the test set

In [339]:
print(f"Test accuracy DT (self): {evaluate_model(DecisionTree, best_hyperparameters_dt_self)}")
print(f"Test accuracy DT (sklearn): {evaluate_model(DecisionTreeClassifier, best_hyperparameters_dt_sklearn)}")
print(f"Test accuracy RF (self): {evaluate_model(RandomForest, best_hyperparameters_rf_self)}")
print(f"Test accuracy RF (sklearn): {evaluate_model(RandomForestClassifier, best_hyperparameters_rf_sklearn)}")


Test accuracy DT (self): 0.7698412698412699
Test accuracy DT (sklearn): 0.8253968253968254
Test accuracy RF (self): 0.873015873015873
Test accuracy RF (sklearn): 0.8333333333333334


#### Compare the results of the models

In [287]:
def print_comparison_results(best_hyperparameters_dt_self, best_accuracy_dt_self, best_hyperparameters_dt_sklearn, best_accuracy_dt_sklearn, 
                             best_hyperparameters_rf_self, best_accuracy_rf_self, best_hyperparameters_rf_sklearn, best_accuracy_rf_sklearn,
                             ):
    
    num_equality_signs = 150
    
    print("="*num_equality_signs)
    print(f"{'Model':<40}{'Best Hyperparameters':<40}{'CV Accuracy':<15}{'Test Accuracy':<15}")
    print("="*num_equality_signs)
    
    # Decision Tree (self)
    dt_self_test_acc = evaluate_model(DecisionTree, best_hyperparameters_dt_self)
    print(f"{'Decision Tree (self)':<40}{str(best_hyperparameters_dt_self):<40}{best_accuracy_dt_self:<15.4f}{dt_self_test_acc:<15.4f}")
    print()
    
    # Decision Tree (sklearn)
    dt_sklearn_test_acc = evaluate_model(DecisionTreeClassifier, best_hyperparameters_dt_sklearn)
    print(f"{'Decision Tree (sklearn)':<40}{str(best_hyperparameters_dt_sklearn):<40}{best_accuracy_dt_sklearn:<15.4f}{dt_sklearn_test_acc:<15.4f}")
    print()

    # Random Forest (self)
    rf_self_test_acc = evaluate_model(RandomForest, best_hyperparameters_rf_self)
    print(f"{'Random Forest (self)':<40}{str(best_hyperparameters_rf_self):<40}{best_accuracy_rf_self:<15.4f}{rf_self_test_acc:<15.4f}")
    print()

    # Random Forest (sklearn)
    rf_sklearn_test_acc = evaluate_model(RandomForestClassifier, best_hyperparameters_rf_sklearn)
    print(f"{'Random Forest (sklearn)':<40}{str(best_hyperparameters_rf_sklearn):<40}{best_accuracy_rf_sklearn:<15.4f}{rf_sklearn_test_acc:<15.4f}")
    
    print("="*num_equality_signs)

In [288]:
# Call the print function to compare results
print_comparison_results(best_hyperparameters_dt_self, best_accuracy_dt_self, best_hyperparameters_dt_sklearn, best_accuracy_dt_sklearn, 
                         best_hyperparameters_rf_self, best_accuracy_rf_self, best_hyperparameters_rf_sklearn, best_accuracy_rf_sklearn,
                         )

Model                                   Best Hyperparameters                    CV Accuracy    Test Accuracy  
Decision Tree (self)                    {'max_depth': 5, 'criterion': 'gini'}   0.7743         0.8000         

Decision Tree (sklearn)                 {'max_depth': 12, 'criterion': 'gini'}  0.8429         0.8467         

Random Forest (self)                    {'n_estimators': 5, 'max_depth': None, 'max_features': None, 'criterion': 'entropy'}0.8400         0.8267         

Random Forest (sklearn)                 {'n_estimators': 5, 'max_depth': None, 'max_features': 'log2', 'criterion': 'gini'}0.8514         0.8733         


### 3.3

## Dataset (Coffee)

In [289]:
data = np.genfromtxt("../coffee_data.csv", delimiter=",", dtype=float, names=True)

feature_names = list(data.dtype.names[:-1])
target_name = data.dtype.names[-1]

X = np.array([data[feature] for feature in feature_names]).T
y = data[target_name].astype(int)

# Create seed for reproducibility
seed = 0
np.random.seed(seed)

# Print sizes
print(f"Feature columns names: {feature_names}")
print(f"Target column name: {target_name}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Feature columns names: ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Uniformity', 'Sweetness']
Target column name: CountryofOrigin
X shape: (419, 8)
y shape: (419,)


Split the data into train and val, and test

In [290]:
from sklearn.model_selection import train_test_split

# Split the data (use 70% for training and 30% for validation)
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(X, y, test_size=0.3, random_state=seed, shuffle=True)

### Tuning the models

In [291]:
# Tune the Decision Tree
best_hyperparameters_dt_self, best_accuracy_dt_self = tune_hyperparameters(DecisionTree, dt_param_grid)
best_hyperparameters_dt_sklearn, best_accuracy_dt_sklearn = tune_hyperparameters(DecisionTreeClassifier, dt_param_grid)


print(f"Best Desicion Tree Accuracy (self): {best_accuracy_dt_self}")
print(f"Best Desicion Tree Parameters (self): {best_hyperparameters_dt_self}")
print()
print(f"Best Desicion Tree Accuracy (sklearn): {best_accuracy_dt_sklearn}")
print(f"Best Desicion Tree Parameters (sklearn): {best_hyperparameters_dt_sklearn}")

Best Desicion Tree Accuracy (self): 0.7473991817650497
Best Desicion Tree Parameters (self): {'max_depth': 3, 'criterion': 'gini'}

Best Desicion Tree Accuracy (sklearn): 0.7682641729982466
Best Desicion Tree Parameters (sklearn): {'max_depth': 5, 'criterion': 'entropy'}


In [292]:
# Tune the Random Forest
best_hyperparameters_rf_self, best_accuracy_rf_self = tune_hyperparameters(RandomForest, rf_param_grid)
best_hyperparameters_rf_sklearn, best_accuracy_rf_sklearn = tune_hyperparameters(RandomForestClassifier, rf_param_grid)

print(f"Best Random Forest Accuracy (self): {best_accuracy_rf_self}")
print(f"Best Random Forest Parameters (self): {best_hyperparameters_rf_self}")
print()
print(f"Best Random Forest Accuracy (sklearn): {best_accuracy_rf_sklearn}")
print(f"Best Random Forest Parameters (sklearn): {best_hyperparameters_rf_sklearn}")

Best Random Forest Accuracy (self): 0.7646405610753946
Best Random Forest Parameters (self): {'n_estimators': 5, 'max_depth': 3, 'max_features': None, 'criterion': 'entropy'}

Best Random Forest Accuracy (sklearn): 0.7954412624196376
Best Random Forest Parameters (sklearn): {'n_estimators': 5, 'max_depth': 3, 'max_features': None, 'criterion': 'entropy'}


### Evaluating the results

In [305]:
print(f"Test accuracy DT (self): {evaluate_model(DecisionTree, best_hyperparameters_dt_self)}")
print(f"Test accuracy DT (sklearn): {evaluate_model(DecisionTreeClassifier, best_hyperparameters_dt_sklearn)}")
print(f"Test accuracy RF (self): {evaluate_model(RandomForest, best_hyperparameters_rf_self)}")
print(f"Test accuracy RF (sklearn): {evaluate_model(RandomForestClassifier, best_hyperparameters_rf_sklearn)}")

Test accuracy DT (self): 0.7698412698412699
Test accuracy DT (sklearn): 0.8253968253968254
Test accuracy RF (self): 0.873015873015873
Test accuracy RF (sklearn): 0.8571428571428571


In [302]:
# Call the print function to compare results
print_comparison_results(best_hyperparameters_dt_self, best_accuracy_dt_self, best_hyperparameters_dt_sklearn, best_accuracy_dt_sklearn, 
                         best_hyperparameters_rf_self, best_accuracy_rf_self, best_hyperparameters_rf_sklearn, best_accuracy_rf_sklearn,
                         )

Model                                   Best Hyperparameters                    CV Accuracy    Test Accuracy  
Decision Tree (self)                    {'max_depth': 3, 'criterion': 'gini'}   0.7474         0.7698         

Decision Tree (sklearn)                 {'max_depth': 5, 'criterion': 'entropy'}0.7683         0.8016         

Random Forest (self)                    {'n_estimators': 5, 'max_depth': 3, 'max_features': None, 'criterion': 'entropy'}0.7646         0.8333         

Random Forest (sklearn)                 {'n_estimators': 5, 'max_depth': 3, 'max_features': None, 'criterion': 'entropy'}0.7954         0.8413         
