# Random Forest

## Libraries

In [38]:
# modules
from common import evaluate
# working with data
import numpy as np
import pandas as pd
# modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.utils import shuffle
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score
# saving model
import pickle

## Data Preparation

In [40]:
# Read in dataset from csv
songs = pd.read_csv("songs.csv", index_col=False)
df_no_genres = songs.drop(columns=['track_id', 'track_name', 'genres'])
df_genres = songs.drop(columns=['track_id', 'track_name'])

In [45]:
# Split dataset into train and test
y = df_no_genres['user_like']
X = df_no_genres.drop(columns=['user_like', 'artist_id'])
X, y = shuffle(X, y, random_state=1234)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

In [46]:
# Verify data shape after split
print(X_train.shape) # (384, 14)
print(y_train.shape) # (384,)
print(X_test.shape) # (97, 14)
print(y_test.shape) # (97,)

(832, 14)
(832,)
(208, 14)
(208,)


## Model Building

In [63]:
# Using default settings
base_model = RandomForestClassifier(n_estimators = 100, random_state = 123, class_weight = "balanced")

Starting from the simpliest random forest model using default parameters, I want to use cross validation to see the optimal number of features to use. The idea is to use cross validation to calculate an average error and see if we can reduce the error by removing the least important feature iteratively.

In [64]:
def get_most_important_features(X, y):
    model = RandomForestClassifier(n_estimators = 100, random_state = 123)
    # initialize cv score with base model
    base_cv_score = np.mean(cross_val_score(model, X, y, cv=10)) 
    max_cv_score = base_cv_score
    cur_features, new_features = X.columns, X.columns
    new_train = X
    while True:
        # remove feature that results in least impurity decrease
        sel = SelectFromModel(RandomForestClassifier(n_estimators = 100, random_state = 123), max_features=len(new_features)-1, threshold=-np.inf)
        sel.fit(new_train, y)
        new_features = cur_features[sel.get_support()]
        new_train = X[new_features]
        
        # cv score with reduced model
        new_cv_score = np.mean(cross_val_score(model, new_train, y, cv=10))
        
        # update max cv score if model improves
        if new_cv_score > max_cv_score:
            max_cv_score = new_cv_score
            cur_features = new_features
        else:
            break
    print(f"Base CV score: {base_cv_score}")
    print(f"Final CV score: {max_cv_score}")
    print(f"Features to use: {cur_features}")
    return cur_features

reduced_features = get_most_important_features(X_train, y_train)

Base CV score: 0.8161216293746415
Final CV score: 0.8197074010327023
Features to use: Index(['popularity', 'danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms'],
      dtype='object')


Using 10-fold cross validation it looks like we can omit `time_signature` and `mode`. 

In [65]:
X_train = X_train[reduced_features]
X_test = X_test[reduced_features]

In [67]:
base_model.fit(X_train, y_train)
evaluate(base_model, X_test, y_test)
y_pred = base_model.predict(X_test)
f1_score(y_test, y_pred)


Average Error: 21.634615384615387
Accuracy = 78.36538461538461


0.1176470588235294

Using the random forest model with default parameters and omitting `time_signature` and `mode`, we obtain an evaluation accuracy of 77.32%. There's definitely room for improvement. The next step will be to tune the model's hyperparameters.

In [68]:
base_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 123,
 'verbose': 0,
 'warm_start': False}

There are quite a few hyperparameters for our model... Judging from prior experience however, the most impactful ones tend to be:
- the number of decision trees in the forest (i.e. `n_estimators`)
- the maximum number of features to consider when splitting a node (i.e. `max_features`)
- the minimum number of data points allowed in a leaf node (i.e. `min_samples_leaf`)

As a starting point, we can perform random grid search on ideal hyperparameter values. With a general idea of the ideal hyperparameter values we can then refine it with a narrower grid search.

In [76]:
# grid of hyperparamter values to test
rand_grid_values = {'n_estimators':np.arange(100, 200, 400),
              'max_features':np.arange(0.1, 1, 0.1),
              'max_depth': [2, 3, 5, 7, 9],
              'max_samples': [0.3, 0.5, 0.8, 0.9]}

In [91]:
model_random = RandomizedSearchCV(estimator = base_model, 
                                  param_distributions = rand_grid_values,
                                  n_iter = 100,
                                  cv = 3,
                                  random_state = 123,
                                  n_jobs = -1,
                                  scoring='f1',
                                  class_weight='balanced')

TypeError: RandomizedSearchCV.__init__() got an unexpected keyword argument 'class_weight'

In [90]:
model_random.fit(X_train, y_train)

KeyboardInterrupt: 

In [79]:
model_random.best_params_

{'n_estimators': 100, 'max_samples': 0.9, 'max_features': 0.6, 'max_depth': 2}

In [80]:
evaluate(model_random, X_test, y_test)

Average Error: 37.5
Accuracy = 62.5


A decrease of ~ 2% in accuracy. And now for a more refined grid search.

In [82]:
grid_values = {'max_features': [0.6, 0.7, 0.8],
 'max_depth': [7, 8, 9],
 'max_samples': [0.3, 0.4, 0.5, 0.6],
 'n_estimators': [400]}

model_gridsearch = GridSearchCV(estimator = base_model, 
                                param_grid = grid_values,                                  
                                cv = 10,                                 
                                n_jobs = -1,
                                scoring='f1')

In [83]:
model_gridsearch.fit(X_train, y_train)

In [87]:
model_gridsearch.best_params_

{'max_depth': 7, 'max_features': 0.7, 'max_samples': 0.6, 'n_estimators': 400}

In [88]:
evaluate(model_gridsearch, X_test, y_test)

Average Error: 19.71153846153846
Accuracy = 80.28846153846155


It looks like we can't improve the hyperparameters any further with the narrower grid search. It's important to realize what these hyperparameter values suggest. If I allowed `min_sample_lead` to be 1, this would mean a leaf node can contain only 1 sample. This allows the possibility of really deep decision trees. Similarly, if I allowed `max_features` = 1, this would mean each tree in the random forest only considers one feature at a time when splitting a node. The complications of these two hyperparameter values (if allowed to be 1) is that we risk overfitting. 

In [29]:
final_model = RandomForestClassifier(max_features=0.6, 
                                    max_depth=9,
                                    n_estimators=400,
                                    max_samples=0.4,
                                    random_state=123)
final_model.fit(X_train, y_train)

In [30]:
evaluate(final_model, X_test, y_test)

Average Error: 22.68041237113402
Accuracy = 77.31958762886597


In [29]:
# fit on whole dataset
final_model.fit(X.drop(columns=['mode', 'time_signature']), y)
# write model to disk
pickle.dump(final_model, open('rf.sav', 'wb'))