In [59]:
import statistics

import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
import warnings
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings('ignore')

In [60]:
class Config:
    RANDOM_SEED = 42
    NUM_FOLDS = 10
    TARGET_COL_NAME = "song_popularity"
    CATEGORICAL_COLS = ["audio_mode", "time_signature", "key"]

DATA_PATH = "./data/"

In [61]:
df_train = pd.read_csv(DATA_PATH + "train.csv")
df_test = pd.read_csv(DATA_PATH + "test.csv")

In [62]:
# split the training dataframe into kfolds for cross validation. We do this before any processing is done
# on the data. We use stratified kfold if the target distribution is unbalanced
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=Config.RANDOM_SEED).reset_index(drop=True)
    # get the target data
    y = df[target_col_name].values
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_index, "kfold"] = fold    
    return df     

df_train = strat_kfold_dataframe(df_train, target_col_name=Config.TARGET_COL_NAME, num_folds=Config.NUM_FOLDS)
df_train.head()

Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity,kfold
0,32823,144667.0,0.585599,0.691626,0.503891,0.22236,10.0,0.115524,-11.642316,0,0.049854,86.041825,3,0.539072,1,8
1,16298,203954.0,0.016664,0.667695,0.68382,0.001178,4.0,0.213299,,1,0.102933,97.073546,3,0.667166,0,0
2,28505,179054.0,0.069471,0.624358,0.891436,0.000637,2.0,,,1,0.173795,140.102334,4,0.824423,0,3
3,6689,246074.0,0.333662,0.645299,0.716589,0.003159,2.0,0.11481,-9.178056,0,0.05669,101.694474,4,0.532739,1,6
4,26893,282403.0,0.378221,0.258557,0.617,0.001366,7.0,0.172308,-6.721257,0,0.044904,123.402262,3,0.717549,0,2


In [63]:
cont_cols = ['song_duration_ms', 'acousticness', 'danceability', 'energy', 
            'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'audio_valence']

In [None]:
def add_missing_col(df, cols_with_nulls):
    for col_name in cols_with_nulls:        
        df[col_name + "_missing"] = [int(item) for item in df[col_name].isna().values]
    return df        

train_cols_withnulls = [col for col in df_train.columns if df_train[col].isnull().any()]
test_cols_withnulls = [col for col in df_test.columns if df_test[col].isnull().any()]
df_train = add_missing_col(df_train, train_cols_withnulls)
df_test = add_missing_col(df_test, test_cols_withnulls)

In [64]:
def impute_df_col(df, col_name, imputer):
    imputed_col = imputer.fit_transform(df[col_name].to_numpy().reshape(-1, 1))
    return pd.Series(imputed_col.reshape(-1))    

In [65]:
def impute_missing_values(df, cols, col_type="cont"):    
    if col_type == "cont":
        imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    elif col_type == "cat":
        imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")            
    for col in cols:
        df[col] = impute_df_col(df, col, imputer)
    return df

df_train = impute_missing_values(df_train, Config.CATEGORICAL_COLS, col_type="cat")
df_train = impute_missing_values(df_train, cont_cols, col_type="cont")
df_test = impute_missing_values(df_test, Config.CATEGORICAL_COLS, col_type="cat")
df_test = impute_missing_values(df_test, cont_cols, col_type="cont")

In [67]:
[col for col in df_train.columns if df_train[col].isnull().any()]

[]

In [68]:
[col for col in df_test.columns if df_test[col].isnull().any()]

[]

In [69]:
def col_one_hot_encode(df, cols):    
    one_hot_enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    one_hot_enc.fit(df[cols])
    return one_hot_enc.transform(df[cols])

In [70]:
def get_input_features(df):
    non_cont_cols = Config.CATEGORICAL_COLS + ["id", "kfold", "song_popularity_proba", Config.TARGET_COL_NAME]
    cont_col_names = [item for item in df.columns.values.tolist() if item not in non_cont_cols]     
    X_cont = df[cont_col_names].to_numpy()       
    X_cat_one_hot = col_one_hot_encode(df, Config.CATEGORICAL_COLS)    
    X = np.concatenate((X_cont, X_cat_one_hot), axis=1)    
    return X

def get_fold_data(fold, df):
    df_train = df[df.kfold != fold]
    df_val = df[df.kfold == fold]
    X_train = get_input_features(df_train)
    y_train = df_train[Config.TARGET_COL_NAME].to_numpy()
    X_val = get_input_features(df_val)
    y_val = df_val[Config.TARGET_COL_NAME].to_numpy()
    return X_train, y_train, X_val, y_val

In [71]:
def run_training(model, train_X, train_y, val_X, val_y):        
    scaler = StandardScaler()
    train_X_scaled = scaler.fit_transform(train_X)    
    val_X_scaled = scaler.fit_transform(val_X)    
    model.fit(train_X_scaled, train_y.ravel())
    val_y_pred_proba = model.predict_proba(val_X_scaled)
    auc = roc_auc_score(val_y, val_y_pred_proba[:, 1], average="weighted")
    return auc, model, val_y_pred_proba[:, 1]

In [72]:
# import optuna

# def rf_objective(trial):       
#     params = {        
#         "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
#         "max_depth": trial.suggest_int("max_depth", 4, 20),
#         "min_samples_leaf": trial.suggest_categorical("min_samples_leaf", [1, 2, 4]),
#         "min_samples_split": trial.suggest_categorical("min_samples_split", [2, 4, 8]),
#         "max_features": trial.suggest_categorical("max_features", ["auto", "log2"])
#     }
#     rf_model = RandomForestClassifier(
#                 n_estimators=params["n_estimators"],                 
#                 max_depth=params["max_depth"],
#                 min_samples_leaf=params["min_samples_leaf"],
#                 min_samples_split=params["min_samples_split"],
#                 max_features=params["max_features"],
#                 random_state=Config.RANDOM_SEED,
#                 n_jobs=-1
#             )   

#     fold_auc = []
#     for fold in range(Config.NUM_FOLDS):
#         train_X, train_y, val_X, val_y = get_fold_data(fold, df_train)
#         auc_score, _, _ = run_training(rf_model, train_X, train_y, val_X, val_y)
#         fold_auc.append(auc_score)
#     mean_auc = statistics.mean(fold_auc)                
#     return mean_auc

# study = optuna.create_study(direction="maximize", study_name="RFModelTuning")    
# study.optimize(rf_objective, n_trials=20)
# print("Best trial:")
# print(study.best_params)

In [73]:
# Best trial:
# {'n_estimators': 703, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 4, 'max_features': 'log2'}

In [74]:
rf_model_params = {'n_estimators': 703, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 4, 'max_features': 'log2'}
#rf_model_params = study.best_params
rf_model = RandomForestClassifier(
                n_estimators=rf_model_params["n_estimators"],                 
                max_depth=rf_model_params["max_depth"],
                min_samples_leaf=rf_model_params["min_samples_leaf"],
                min_samples_split=rf_model_params["min_samples_split"],
                max_features=rf_model_params["max_features"],
                random_state=Config.RANDOM_SEED,
                n_jobs=-1
            )     

In [75]:
fold_metrics_model = []
test_preds = {}
df_train["song_popularity_proba"] = 0.0
for fold in range(Config.NUM_FOLDS):
    X_train, y_train, X_val, y_val = get_fold_data(fold, df_train)
    fold_auc_score, model, fold_val_preds = run_training(rf_model, X_train, y_train, X_val, y_val)
    print(f"fold {fold } auc score = {fold_auc_score}")
    # add the validation probability predictions for the fold to a new column in train data
    df_train.loc[df_train.kfold == fold, "song_popularity_proba"] = fold_val_preds    
    X_test = get_input_features(df_test)
    scaler = StandardScaler()
    X_test_scaled = scaler.fit_transform(X_test)
    fold_test_preds = model.predict_proba(X_test_scaled)[:, 1]
    pred_col_name = f"fold_{fold}_test_preds"
    test_preds[pred_col_name] = fold_test_preds    
    fold_metrics_model.append((round(fold_auc_score, 4), model))

fold 0 auc score = 0.5469573443245384
fold 1 auc score = 0.5813191604636079
fold 2 auc score = 0.564593566086995
fold 3 auc score = 0.5719182480554394
fold 4 auc score = 0.5780865006977429
fold 5 auc score = 0.5684082179332347
fold 6 auc score = 0.572460339673066
fold 7 auc score = 0.5756631780998939
fold 8 auc score = 0.5708795673914504
fold 9 auc score = 0.5723040707382776


In [76]:
fold_metrics = [item[0] for item in fold_metrics_model]
print(f"auc scores = {fold_metrics}")    
cv_auc_mean = statistics.mean(fold_metrics)
cv_auc_stdev = statistics.stdev(fold_metrics)
print(f"mean auc across folds = {cv_auc_mean}, auc stdev across folds = {cv_auc_stdev}")

auc scores = [0.547, 0.5813, 0.5646, 0.5719, 0.5781, 0.5684, 0.5725, 0.5757, 0.5709, 0.5723]
mean auc across folds = 0.57027, auc stdev across folds = 0.009438343781264439


In [77]:
df_test_preds = pd.DataFrame(test_preds)
test_pred_cols = [f"fold_{fold}_test_preds" for fold in range(Config.NUM_FOLDS)]
df_test_preds["mean_test_pred"] = df_test_preds[test_pred_cols].mean(axis=1)
print(f"Completed prediction for {len(df_test)} test rows")
df_submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')
df_submission['song_popularity']= df_test_preds["mean_test_pred"]
df_submission.to_csv('submission_rf.csv',index=False)
df_submission.head()

Completed prediction for 10000 test rows


Unnamed: 0,id,song_popularity
0,0,0.372371
1,1,0.420812
2,2,0.326717
3,3,0.34808
4,4,0.361492


In [78]:
df_submission[df_submission.song_popularity > 0.5]

Unnamed: 0,id,song_popularity
1003,1003,0.524419
1558,1558,0.507786
2344,2344,0.520868
2755,2755,0.500204
2958,2958,0.534716
4028,4028,0.506777
4161,4161,0.505768
4629,4629,0.513453
5146,5146,0.54411
5777,5777,0.512072


In [79]:
rf_val_preds = df_train[["id", "song_popularity_proba", "song_popularity"]]
rf_val_preds.to_csv("rf_val_preds.csv")
print("Saved validation predictions for all folds to csv")

Saved validation predictions for all folds to csv
