## Supplement 6: Decision Trees and Random Forest

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from scipy.stats import mode


### 6.3 Programming Task: Song popularity prediction using Random Forest
The goal of this task is to train a random forest model that predicts song popularity, using the datasets already provided in task 4.3.
 

In [2]:
# Load the training and test song tables from CSV
train_data = pd.read_csv("train-songs.csv")
test_data = pd.read_csv("test-songs.csv")

# In both tables every column except the last goes into X and the last
# column goes into y; `.values` yields plain NumPy arrays.
X_train, y_train = train_data.iloc[:, :-1].values, train_data.iloc[:, -1].values
X_test, y_test = test_data.iloc[:, :-1].values, test_data.iloc[:, -1].values

   i\. Implement a function that draws a bootstrap sample of size N from the train dataset, where N can be specified by the user.




In [3]:
def generate_bootstrap(train_X, train_y, N, rng=None):
    """Draw a bootstrap sample of ``N`` rows (with replacement) from a dataset.

    Parameters
    ----------
    train_X : array-like
        Feature rows; the first axis indexes samples.
    train_y : array-like
        Targets, one per row of ``train_X``.
    N : int
        Number of rows to draw. May exceed ``len(train_X)`` because rows are
        drawn with replacement.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. ``None`` (default) uses NumPy's global generator,
        so ``np.random.seed`` keeps working as before.

    Returns
    -------
    bootstrap_X, bootstrap_y : numpy.ndarray
        The sampled rows and their matching targets, in the same order.

    Raises
    ------
    ValueError
        If ``train_X`` and ``train_y`` differ in length.
    """
    # Converting up front lets lists and DataFrames be indexed by row position;
    # indexing them directly with an index array fails or selects columns.
    features = np.asarray(train_X)
    targets = np.asarray(train_y)

    if len(features) != len(targets):
        raise ValueError(
            f"train_X and train_y must have the same length, "
            f"got {len(features)} and {len(targets)}"
        )

    # Row positions drawn uniformly with replacement
    sampler = np.random if rng is None else rng
    indices = sampler.choice(len(features), size=N, replace=True)

    # The same indices are applied to both arrays so rows stay paired
    return features[indices], targets[indices]

   ii\. Complete the implementation of the random forest algorithm. For this task you may use the DecisionTreeClassifier from the scikit-learn library. The other parts of the random forest algorithm must be implemented using only Scipy/Numpy.

In [18]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from scipy.stats import mode

class RandomForest:
    """Random forest classifier built from bootstrapped decision trees.

    Each weak learner is a ``DecisionTreeClassifier`` fitted on its own
    bootstrap sample of the training data; predictions are combined by
    majority vote.

    Parameters
    ----------
    n_trees : int
        Number of decision trees in the forest.
    max_features : int, float, str or None
        Forwarded to ``DecisionTreeClassifier(max_features=...)``.
    max_samples : int or None
        Size of the bootstrap sample drawn for every tree. ``None`` means
        "as many rows as the training set".
    min_node_size : int
        Forwarded to ``DecisionTreeClassifier(min_samples_leaf=...)``.
    max_depth : int or None
        Forwarded to ``DecisionTreeClassifier(max_depth=...)``.
    random_state : int or None, optional
        Seed for the bootstrap draws and for the individual trees. ``None``
        (default) keeps using NumPy's global generator.
    """

    def __init__(self, n_trees, max_features, max_samples, min_node_size, max_depth,
                 random_state=None):
        self.n_trees = n_trees
        self.max_features = max_features
        self.max_samples = max_samples
        self.min_node_size = min_node_size
        self.max_depth = max_depth
        self.random_state = random_state

        # A private generator exists only when a seed was given; otherwise
        # sampling falls back to the global NumPy generator at call time.
        self._rng = None if random_state is None else np.random.RandomState(random_state)

        self.trees = []
        for _ in range(n_trees):
            # Give every tree its own seed so seeded trees do not all share one.
            if self._rng is None:
                tree_seed = None
            else:
                tree_seed = int(self._rng.randint(np.iinfo(np.int32).max))
            self.trees.append(
                DecisionTreeClassifier(max_features=self.max_features,
                                       min_samples_leaf=self.min_node_size,
                                       max_depth=self.max_depth,
                                       random_state=tree_seed)
            )

    def train(self, train_X, train_y):
        """Fit every tree on its own bootstrap sample of the training data.

        Parameters
        ----------
        train_X : array-like
            Feature rows; the first axis indexes samples.
        train_y : array-like
            Class labels, one per row of ``train_X``.

        Raises
        ------
        ValueError
            If ``train_X`` and ``train_y`` differ in length.
        """
        # Positional row indexing below needs ndarrays (not lists/DataFrames)
        features = np.asarray(train_X)
        targets = np.asarray(train_y)
        n_rows = len(features)
        if n_rows != len(targets):
            raise ValueError(
                f"train_X and train_y must have the same length, "
                f"got {n_rows} and {len(targets)}"
            )

        sample_size = n_rows if self.max_samples is None else self.max_samples
        sampler = np.random if self._rng is None else self._rng

        for estimator in self.trees:
            # Bootstrap: draw row positions uniformly with replacement
            picked = sampler.choice(n_rows, size=sample_size, replace=True)
            estimator.fit(features[picked], targets[picked])

    def predict(self, test_X):
        """Predict a class label for every row of ``test_X`` by majority vote.

        Ties are resolved in favour of the smallest label (in sorted order).

        Parameters
        ----------
        test_X : array-like
            Feature rows to classify.

        Returns
        -------
        numpy.ndarray
            1-D array with one predicted label per row of ``test_X``.

        Raises
        ------
        ValueError
            If the forest contains no trees.
        """
        if not self.trees:
            raise ValueError("The forest contains no trees; nothing to vote with.")

        # votes[t, i] is the label tree t assigns to test row i
        votes = np.array([estimator.predict(test_X) for estimator in self.trees])

        # Count votes per distinct label. np.unique returns the labels sorted,
        # so argmax (first maximum) picks the smallest label on a tie.
        labels = np.unique(votes)
        counts = np.array([(votes == label).sum(axis=0) for label in labels])
        return labels[counts.argmax(axis=0)]
    

iii\. Train the model for the dataset from train-songs.csv using the parameters given below.

| Parameter| Value|
|----------|------|
| Number of trees|100|
| Maximum features per tree|2|
| Bootstrap sample size|20000|
| Minimum node size|1|
| Maximum tree depth|10|


Note: In this task, the bootstrap sample size is the same as the size of the training dataset.


In [19]:
# Forest hyper-parameters (values from the task table above)
n_trees, max_features = 100, 2
min_node_size, max_depth = 1, 10
max_samples = len(X_train)  # bootstrap sample is as large as the training set

# Aliases under the names the next cell expects
train_X, train_y = X_train, y_train

# Build the forest, then fit it on the training split
random_forest = RandomForest(
    n_trees=n_trees,
    max_features=max_features,
    max_samples=max_samples,
    min_node_size=min_node_size,
    max_depth=max_depth,
)
random_forest.train(X_train, y_train)

In [20]:
# Note: Run this cell without any changes. The model will train if the implementation of subtask (ii) is correct.

# Builds a second forest (separate from `random_forest`) with the parameter
# values passed as literals, then fits it on train_X / train_y.
random_forest_model = RandomForest(n_trees=100, max_samples=20000,max_depth=10,min_node_size=1, max_features=2 )

random_forest_model.train(train_X, train_y)




   iv\. Calculate the accuracy of the model using the test dataset and compare your results with the
RandomForestClassifier from the scikit-learn library using the following parameters.

In [21]:
# TODO Run predict for test data and calculate accuracy
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# --- Custom forest: predict on the test split and score it ---
y_pred_custom = random_forest.predict(X_test)
accuracy_custom = accuracy_score(y_test, y_pred_custom)

# --- scikit-learn reference forest with matching settings ---
sklearn_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=1,
    max_features=2,
)
sklearn_rf.fit(X_train, y_train)
y_pred_sklearn = sklearn_rf.predict(X_test)
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)

# Report both accuracies
print(f"Custom Random Forest Accuracy: {accuracy_custom}")
print(f"Scikit-learn Random Forest Accuracy: {accuracy_sklearn}")



Custom Random Forest Accuracy: 0.804
Scikit-learn Random Forest Accuracy: 0.8055


In [22]:
# TODO: Train and predict using scikit-learn library
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Reference forest configured with the task's parameter values
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_features=2,
    max_depth=10,
    min_samples_leaf=1,
)

# Fit on the training split
rf_classifier.fit(X_train, y_train)

# Predict on the test split and score the predictions
y_pred_sklearn = rf_classifier.predict(X_test)
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)

print(f"Accuracy of scikit-learn's RandomForestClassifier: {accuracy_sklearn}")


Accuracy of scikit-learn's RandomForestClassifier: 0.807
