This notebook trains a Random Forest classifier for the song popularity prediction challenge.

The hyperparameters were obtained through Bayesian search.
Missing values are imputed with scikit-learn's `IterativeImputer`.

To get more insight into the data, see the file *exploratory/first_exploratory_kevin.ipynb*

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 



def rf_feat_importance(m, df):
    """Tabulate a fitted model's feature importances, most important first.

    Parameters
    ----------
    m : object exposing a ``feature_importances_`` sequence
        (e.g. a fitted random forest).
    df : pandas.DataFrame
        Frame whose columns are paired, in order, with the importances
        (presumably the frame the model was fitted on -- confirm at call site).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``cols`` (feature name) and ``imp`` (importance),
        sorted by ``imp`` in descending order; the original positional
        index is kept.
    """
    importance_table = pd.DataFrame(
        {'cols': df.columns, 'imp': m.feature_importances_}
    )
    return importance_table.sort_values(by='imp', ascending=False)

# Load the competition data; the first CSV column is used as the row index.
train = pd.read_csv('../raw_data/train.csv', index_col=0)
test = pd.read_csv('../raw_data/test.csv', index_col=0)


# IterativeImputer is behind scikit-learn's experimental switch, so the
# enabling module is imported first (the `noqa` silences the unused-import lint).
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Feature columns: every training column except the identifier and the target.
# NOTE(review): with index_col=0, 'id' is presumably already the index rather
# than a column, in which case excluding it here is a harmless no-op -- confirm.
useful_cols = [name for name in train.columns if name not in {'id', 'song_popularity'}]

# Fit the imputer on the training features, then reuse the fitted imputer
# (transform only) on the test features.
# NOTE(review): the imputer is fitted on all training rows *before* the
# train/validation split below, so validation rows contribute to the
# imputation model.
it_imputer = IterativeImputer(max_iter=10)
train_iterimp = it_imputer.fit_transform(train.loc[:, useful_cols])
test_iterimp = it_imputer.transform(test.loc[:, useful_cols])

# fit_transform returns a plain array; wrap it to get the column names back.
df_train_iterimp = pd.DataFrame(data=train_iterimp, columns=useful_cols)

# 80/20 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train_iterimp,
    train['song_popularity'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Two thresholded variants of `instrumentalness`:
#   * v1 keeps values above 0.01 and replaces everything else with -1
#   * v2 keeps values above 0.11 and replaces everything else with 0
# Series.where keeps the value where the condition holds and substitutes
# `other` elsewhere; it is vectorized (no per-row Python lambda) and always
# yields a float column, even if no value passes the threshold.
df_train_iterimp['instrumentalness_v1'] = df_train_iterimp['instrumentalness'].where(
    df_train_iterimp['instrumentalness'] > 0.01, other=-1
)
df_train_iterimp['instrumentalness_v2'] = df_train_iterimp['instrumentalness'].where(
    df_train_iterimp['instrumentalness'] > 0.11, other=0
)

# Re-split so the train/validation frames include the two new columns
# (same seed and test size as the earlier split, hence the same rows).
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train_iterimp,
    train['song_popularity'],
    test_size=0.2,
    random_state=42,
)

In [None]:

# NOTE(review): this cell repeats the imputation from the first code cell and
# rebuilds `df_train_iterimp` from scratch, so the `instrumentalness_v1` /
# `instrumentalness_v2` columns added in the previous cell are gone afterwards
# and the split below no longer contains them. Confirm this reset is intended
# when the notebook is run top to bottom.
useful_cols = [name for name in train.columns if name not in {'id', 'song_popularity'}]

# Refit the imputer on the training features. The test matrix is not
# re-transformed in this cell (that call was commented out).
train_iterimp = it_imputer.fit_transform(train.loc[:, useful_cols])
df_train_iterimp = pd.DataFrame(data=train_iterimp, columns=useful_cols)

# 80/20 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train_iterimp,
    train['song_popularity'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# NOTE(review): this column list (which would drop the raw `instrumentalness`)
# is not used in this cell -- the model is fitted on every column of X_train.
# Either apply it to X_train / X_valid or remove it; kept so the name is still
# defined for any code that relies on it.
useful_cols = [col for col in train.columns if col not in ['id', 'song_popularity', 'instrumentalness']]

# Random forest with the tuned hyperparameters. `random_state` is pinned so the
# validation metrics and the resulting submission are reproducible under
# Restart & Run All (the forest's bootstrapping and feature sampling are
# otherwise seeded differently on every run).
clf_bayes = RandomForestClassifier(
    n_estimators=500,
    max_features='log2',
    max_depth=12,
    min_samples_split=20,
    min_samples_leaf=30,
    class_weight='balanced_subsample',
    random_state=42,
)
clf_bayes.fit(X_train, y_train)

# Mean accuracy on the hold-out split.
print(clf_bayes.score(X_valid, y_valid))

# ROC AUC from the predicted probability in column 1, i.e. the second entry of
# clf_bayes.classes_ (the positive class when the labels are 0/1).
# The `LR_` prefix is historical: this is the random forest's AUC.
LR_AUC = roc_auc_score(y_valid, clf_bayes.predict_proba(X_valid)[:, 1])
print("AUC: " + str(LR_AUC))

In [None]:
# Submission template; its first column is used as the row index.
submission = pd.read_csv('../raw_data/sample_submission.csv', index_col=0)

# Put column names back on the imputed test matrix. The engineered column
# names are filtered out defensively; the raw test frame is not expected to
# contain them.
test_feature_cols = [
    col for col in test.columns
    if col not in ['id', 'song_popularity', 'instrumentalness_v1', 'instrumentalness_v2']
]
df_test = pd.DataFrame(test_iterimp, columns=test_feature_cols)

# Same thresholded `instrumentalness` variants as on the training frame,
# computed with the vectorized Series.where instead of a per-row lambda.
df_test['instrumentalness_v1'] = df_test['instrumentalness'].where(
    df_test['instrumentalness'] > 0.01, other=-1
)
df_test['instrumentalness_v2'] = df_test['instrumentalness'].where(
    df_test['instrumentalness'] > 0.11, other=0
)

# Select exactly the columns (and order) of the current training frame, so the
# test matrix matches whatever `df_train_iterimp` looked like at fit time.
useful_cols = [col for col in df_train_iterimp.columns if col not in ['id', 'song_popularity']]

# Probability from column 1 of predict_proba. The array is assigned
# positionally, so this assumes the rows of the sample submission are in the
# same order as the rows of the test file -- TODO confirm.
submission['song_popularity'] = clf_bayes.predict_proba(df_test[useful_cols])[:, 1]
submission.to_csv('../submissions/submission_rf_balanced_subsample_new_instry.csv')

# Preview goes last so the notebook actually renders it (a bare expression in
# the middle of a cell is evaluated and discarded).
submission.head()