## Supplement 6: Decision Trees and Random Forest

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from scipy.stats import mode

### 6.3 Programming Task: Song popularity prediction using Random Forest
The goal of this task is to train a random forest model that predicts the song popularity using the datasets already provided in task 4.3
 

In [2]:
# Load data
train_data = pd.read_csv('train-songs.csv')
test_data = pd.read_csv('test-songs.csv')

print(train_data.head())
print(test_data.head())

label='popular'

train_X = train_data.drop(columns=[label])
train_y = train_data[label]

test_X = test_data.drop(columns=[label])
test_y = test_data[label]

   danceability  key  loudness  acousticness  instrumentalness  liveness  \
0         0.391    8    -9.532         0.478          0.000006    0.1160   
1         0.628    1   -13.834         0.156          0.010400    0.0836   
2         0.613    3   -22.789         0.864          0.000000    0.2690   
3         0.504    2    -5.931         0.414          0.000000    0.0845   
4         0.698    9    -3.840         0.101          0.000000    0.1070   

   valence    tempo  popular  
0    0.138  105.593      0.0  
1    0.761  102.974      0.0  
2    0.371   75.104      0.0  
3    0.163  135.927      1.0  
4    0.931  124.042      1.0  
   danceability  key  loudness  acousticness  instrumentalness  liveness  \
0         0.652    9    -7.319        0.7250          0.000002     0.189   
1         0.500   11    -7.996        0.0024          0.000000     0.133   
2         0.422   10    -7.215        0.1090          0.000000     0.722   
3         0.708    5    -5.426        0.0136         

   i\. Implement a function that draws a bootstrap sample of size N from the train dataset, where N can be specified by the user.




In [3]:
def generate_bootstrap(train_X, train_y, N):
    
    # Combine train_X and train_y to ensure sampling consistency
    combined_data = train_X.copy()
    combined_data['label'] = train_y

    # Draw bootstrap sample with replacement
    bootstrap_sample = combined_data.sample(n=N, replace=True, random_state=np.random.randint(0, 10000))

    # Split back into features and target
    bootstrap_X = bootstrap_sample.drop(columns=['label'])
    bootstrap_y = bootstrap_sample['label']

    return bootstrap_X, bootstrap_y

   ii\. Complete the implementation of the random forest algorithm. For this task you may use the DecisionTreeClassifier from the scikit-learn library. The other parts of the random forest algorithm must be implemented using only Scipy/Numpy.

In [4]:
class RandomForest:
    
    def __init__(self, n_trees, max_features, max_samples, min_node_size, max_depth):
        
        self.n_trees = n_trees
        self.max_features = max_features
        self.max_samples = max_samples
        self.min_node_size = min_node_size
        self.max_depth = max_depth
        self.trees = []  

    def train(self, train_X, train_y):
        
        n_samples, n_features = train_X.shape

        for i in range(self.n_trees):
            # Bootstrap sampling
            bootstrap_indices = np.random.choice(n_samples, size=self.max_samples, replace=True)
            bootstrap_X = train_X.iloc[bootstrap_indices]
            bootstrap_y = train_y.iloc[bootstrap_indices]


            # Random feature selection
            feature_indices = np.random.choice(n_features, size=self.max_features, replace=False)
            sampled_X = bootstrap_X.iloc[:, feature_indices]

            # Train a decision tree on the bootstrap sample
            # Train a decision tree on the bootstrap sample
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=max(2, self.min_node_size)  # Ensure a valid minimum value
            )
            tree.fit(sampled_X, bootstrap_y)

            tree.fit(sampled_X, bootstrap_y)

            # Save the tree and the selected features
            self.trees.append((tree, feature_indices))

    def predict(self, test_X):
        
        predictions = []

        for tree, feature_indices in self.trees:
            # Predict using the weak classifier on the subset of features
            predictions.append(tree.predict(test_X.iloc[:, feature_indices]))

        # Majority voting
        predictions = np.array(predictions)
        final_predictions = mode(predictions, axis=0).mode[0]

        return final_predictions

iii\. Train the model for the dataset from train-songs.csv using the parameters given below.
| Parameter| Value|
|----------|------|
Number of trees|100|
Maximum features per tree|2|
Bootstrap sample size|20000|
Minimum node size|1|
Maximum tree depth|10|


Note: The bootstrap sample size is the same as train dataset size in this task.


In [5]:
# Note: Run this cell without any changes. The model will train if the implementation of subtask (ii) is correct.
random_forest_model = RandomForest(n_trees=100, max_samples=20000,max_depth=10,min_node_size=1, max_features=2 )
random_forest_model.train(train_X, train_y)

   iv\. Calculate the accuracy of the model using the test dataset and compare your results with the
RandomForestClassifier from the scikit-learn library using the following parameters.

In [6]:
# TODO Run predict for test data and calculate accuracy
from sklearn.metrics import accuracy_score
custom_rf_predictions = random_forest_model.predict(test_X)
custom_rf_accuracy = accuracy_score(test_y, custom_rf_predictions)
print(f"Custom Random Forest Accuracy: {custom_rf_accuracy:.2f}")

Custom Random Forest Accuracy: 0.78


  final_predictions = mode(predictions, axis=0).mode[0]


In [7]:
# TODO: Train and predict using scikit-learn library
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Parameters for the Random Forest
n_trees = 100
max_features = 2
max_depth = 10
min_samples_split = 2

# Train scikit-learn's Random Forest
sklearn_rf = RandomForestClassifier(
    n_estimators=n_trees,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    random_state=42
)

# Train the model
sklearn_rf.fit(train_X, train_y)

# Make predictions on the test set
sklearn_rf_predictions = sklearn_rf.predict(test_X)

# Calculate accuracy
sklearn_rf_accuracy = accuracy_score(test_y, sklearn_rf_predictions)
print(f"Scikit-learn Random Forest Accuracy: {sklearn_rf_accuracy:.2f}")

Scikit-learn Random Forest Accuracy: 0.81
