In [128]:
import statistics

import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
import lightgbm as lgbm
from lightgbm import LGBMClassifier
import optuna.integration.lightgbm as lgb
from optuna.integration.lightgbm import LightGBMTunerCV, LightGBMTuner
import category_encoders
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import warnings

warnings.filterwarnings('ignore')

In [129]:
class Config:
    """Notebook-wide constants shared by the splitting, preprocessing and training cells."""

    # Name of the label column in the training data.
    TARGET_COL_NAME = "song_popularity"
    # Columns treated as categorical (most-frequent imputation, one-hot encoding).
    CATEGORICAL_COLS = ["key", "audio_mode", "time_signature"]
    # Seed used when shuffling the rows before the fold split.
    RANDOM_SEED = 42
    # Number of cross-validation folds.
    NUM_FOLDS = 5
    # NOTE(review): not referenced by any visible cell; the parameter dicts set
    # their own "early_stopping_round" values.
    EARLY_STOPPING = 500


# Directory prefix (including the trailing slash) of the input CSV files.
DATA_PATH = "./data/"

In [130]:
# Read the raw train and test splits from the data directory.
df_train = pd.read_csv(f"{DATA_PATH}train.csv")
df_test = pd.read_csv(f"{DATA_PATH}test.csv")

In [131]:
# Split the training dataframe into k folds for cross-validation. We do this before any
# processing is done on the data. We use stratified k-fold when the target distribution
# is unbalanced.
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    """Shuffle ``df`` and label every row with a stratified validation-fold number.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target_col_name``. It is not modified.
    target_col_name : str
        Column whose values drive the stratification.
    num_folds : int, default 5
        Number of folds to create.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``df`` with a fresh 0..n-1 index and a ``kfold`` column
        holding the fold in which each row is used for validation.
    """
    # Randomize (shuffle) the rows before splitting. ``sample`` returns a new frame,
    # so adding the ``kfold`` column afterwards leaves the caller's frame untouched.
    shuffled = df.sample(frac=1, random_state=Config.RANDOM_SEED).reset_index(drop=True)
    # Placeholder value; every row is overwritten with its fold number below.
    shuffled["kfold"] = -1
    # Target values used for stratification.
    y = shuffled[target_col_name].values
    # Use the notebook-wide seed instead of a second hard-coded constant.
    skf = model_selection.StratifiedKFold(
        n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_SEED
    )
    for fold, (_train_index, val_index) in enumerate(skf.split(X=shuffled, y=y)):
        # The index was reset above, so positional split indices are valid labels.
        shuffled.loc[val_index, "kfold"] = fold
    return shuffled

df_train = strat_kfold_dataframe(df_train, target_col_name=Config.TARGET_COL_NAME, num_folds=Config.NUM_FOLDS)
df_train.head()

Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity,kfold
0,32823,144667.0,0.585599,0.691626,0.503891,0.22236,10.0,0.115524,-11.642316,0,0.049854,86.041825,3,0.539072,1,4
1,16298,203954.0,0.016664,0.667695,0.68382,0.001178,4.0,0.213299,,1,0.102933,97.073546,3,0.667166,0,0
2,28505,179054.0,0.069471,0.624358,0.891436,0.000637,2.0,,,1,0.173795,140.102334,4,0.824423,0,1
3,6689,246074.0,0.333662,0.645299,0.716589,0.003159,2.0,0.11481,-9.178056,0,0.05669,101.694474,4,0.532739,1,3
4,26893,282403.0,0.378221,0.258557,0.617,0.001366,7.0,0.172308,-6.721257,0,0.044904,123.402262,3,0.717549,0,1


In [132]:
# Continuous feature columns (imputed with the column mean further down).
cont_cols = [
    "song_duration_ms",
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "audio_valence",
]

In [134]:
def add_missing_col(df, cols_with_nulls):
    """Add a 0/1 ``<col>_missing`` indicator column for each listed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.
    cols_with_nulls : iterable of str
        Names of the columns to create an indicator for.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one extra integer column per listed column
        (1 where the source value is null, 0 otherwise).
    """
    for col_name in cols_with_nulls:
        # Vectorised null mask cast to int64 instead of a Python-level loop over rows.
        df[col_name + "_missing"] = df[col_name].isna().astype("int64")
    return df

# Flag a column when it has nulls in EITHER split, so that train and test always end up
# with the same set of "<col>_missing" indicator features. Only columns present in both
# frames are considered, which also keeps the target and fold columns from getting an
# indicator.
train_cols_withnulls = [
    col
    for col in df_test.columns
    if col in df_train.columns
    and (df_train[col].isnull().any() or df_test[col].isnull().any())
]
test_cols_withnulls = list(train_cols_withnulls)
df_train = add_missing_col(df_train, train_cols_withnulls)
df_test = add_missing_col(df_test, test_cols_withnulls)

In [135]:
def impute_df_col(df, col_name, imputer):
    """Fit ``imputer`` on a single column of ``df`` and return the imputed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Column to impute.
    imputer : object
        Anything exposing ``fit_transform(array_2d)``; it is fitted on this column.
        It is expected to return one value per input row.

    Returns
    -------
    pandas.Series
        The imputed values, indexed like ``df``.
    """
    # The imputer receives the column as a 2-D array with a single feature.
    column_2d = df[col_name].to_numpy().reshape(-1, 1)
    imputed_col = imputer.fit_transform(column_2d)
    # Carry the frame's own index so that ``df[col] = result`` aligns row-for-row even
    # when the index is not the default 0..n-1 range (a default index would be aligned
    # by label and leave NaNs behind).
    return pd.Series(imputed_col.reshape(-1), index=df.index)

In [136]:
def impute_missing_values(df, cols, col_type="cont"):
    """Impute the missing values of ``cols`` in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    cols : iterable of str
        Columns to impute.
    col_type : {"cont", "cat"}, default "cont"
        ``"cont"`` fills with the column mean, ``"cat"`` with the most frequent value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the listed columns imputed.

    Raises
    ------
    ValueError
        If ``col_type`` is neither ``"cont"`` nor ``"cat"``.
    """
    strategies = {"cont": "mean", "cat": "most_frequent"}
    # Fail with a clear message instead of hitting an unbound ``imputer`` further down.
    if col_type not in strategies:
        raise ValueError(f'col_type must be "cont" or "cat", got {col_type!r}')
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategies[col_type])
    for col in cols:
        # The imputer is re-fitted on every column by impute_df_col.
        df[col] = impute_df_col(df, col, imputer)
    return df

# For each split: impute the categorical columns first, then the continuous ones.
df_train = impute_missing_values(
    impute_missing_values(df_train, Config.CATEGORICAL_COLS, col_type="cat"),
    cont_cols,
    col_type="cont",
)
df_test = impute_missing_values(
    impute_missing_values(df_test, Config.CATEGORICAL_COLS, col_type="cat"),
    cont_cols,
    col_type="cont",
)

In [137]:
[col for col in df_train.columns if df_train[col].isnull().any()]

[]

In [138]:
[col for col in df_test.columns if df_test[col].isnull().any()]

[]

In [139]:
# One-hot encode the categorical columns of both splits.
df_train = pd.get_dummies(df_train, prefix=Config.CATEGORICAL_COLS, columns=Config.CATEGORICAL_COLS)
df_test = pd.get_dummies(df_test, prefix=Config.CATEGORICAL_COLS, columns=Config.CATEGORICAL_COLS)

# The two frames are encoded independently, so a category that occurs in only one split
# would give them different dummy columns. Align the test frame to the training frame's
# non-label columns: same columns, same order, with dummies absent from test filled
# with 0. Test-only columns are dropped because no model is trained on them.
_non_feature_cols = {"kfold", "song_popularity_proba", Config.TARGET_COL_NAME}
df_test = df_test.reindex(
    columns=[col for col in df_train.columns if col not in _non_feature_cols],
    fill_value=0,
)

In [140]:
def col_int_encoding(df, col_name):
    """Ordinal-encode one column of ``df`` and return the encoded result as int64.

    A fresh ``category_encoders.OrdinalEncoder`` is fitted on the column each call.
    """
    encoder = category_encoders.OrdinalEncoder(cols=[col_name])
    return encoder.fit_transform(df[col_name]).astype(np.int64)

In [141]:
# df_train["key"] = col_int_encoding(df_train, "key")
# df_test["key"] = col_int_encoding(df_test, "key")

In [142]:
# print(df_train["key"].dtype)
# print(df_test["key"].dtype)

In [143]:
def get_input_features(df):
    """Return the columns of ``df`` that serve as model inputs.

    The row id, the fold label, the out-of-fold prediction column and the target
    are left out; every other column is kept in its original order.
    """
    excluded = ("id", "kfold", "song_popularity_proba", Config.TARGET_COL_NAME)
    feature_cols = [name for name in df.columns.tolist() if name not in excluded]
    return df[feature_cols]

def get_fold_data(fold, df):
    """Split ``df`` into training and validation parts for one fold.

    Rows whose ``kfold`` equals ``fold`` form the validation part; all other rows
    form the training part.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)``: input features and target values of
        the two parts.
    """
    is_val = df.kfold == fold
    val_part = df[is_val]
    train_part = df[~is_val]
    return (
        get_input_features(train_part),
        train_part[Config.TARGET_COL_NAME],
        get_input_features(val_part),
        val_part[Config.TARGET_COL_NAME],
    )

In [144]:
# Names of the model input columns: every training column except the id, the fold
# label, the out-of-fold prediction column and the target.
cols_to_leave = [
    "id",
    "kfold",
    "song_popularity_proba",
    Config.TARGET_COL_NAME,
]
col_names = [name for name in df_train.columns.tolist() if name not in cols_to_leave]

In [145]:
def run_training(train_df, train_y, val_df, val_y, params=None, callbacks=None):
    """Train a LightGBM booster on one fold and score it on the validation split.

    Parameters
    ----------
    train_df, val_df : pandas.DataFrame
        Training / validation features; only the module-level ``col_names`` columns
        are used.
    train_y, val_y : array-like
        Training / validation targets.
    params : dict, optional
        LightGBM parameters passed to ``lgbm.train``.
    callbacks : list, optional
        LightGBM callbacks passed to ``lgbm.train``.

    Returns
    -------
    tuple
        ``(auc, val_preds, model)``: the validation ROC AUC, the validation
        predictions and the trained booster.
    """
    # ``categorical_feature`` is deliberately not passed: the categorical columns are
    # one-hot encoded before training.
    train_data = lgbm.Dataset(
        data=train_df[col_names], label=train_y, feature_name=col_names
    )
    val_features = val_df[col_names]
    val_data = lgbm.Dataset(
        data=val_features, label=val_y, feature_name=col_names, reference=train_data
    )
    # ``callbacks=None`` is what ``lgbm.train`` uses by default, so a single call covers
    # both the "with callbacks" and the "without callbacks" case.
    # NOTE(review): ``verbose_eval`` is kept for the LightGBM version this notebook was
    # run with; newer releases drop it in favour of the ``log_evaluation`` callback —
    # confirm before upgrading.
    model = lgbm.train(
        params,
        train_set=train_data,
        valid_sets=val_data,
        verbose_eval=-1,
        callbacks=callbacks,
    )
    # Predict on exactly the columns (and column order) the booster was trained on.
    val_preds = model.predict(val_features, num_iteration=model.best_iteration)
    auc = roc_auc_score(val_y, val_preds)
    return auc, val_preds, model

In [146]:
def tune_params(train_df, train_y, params=None):
    """Run ``LightGBMTunerCV`` on the training data and print the best result.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; only the module-level ``col_names`` columns are used.
    train_y : array-like
        Training targets.
    params : dict, optional
        Base LightGBM parameters handed to the tuner.

    Returns
    -------
    LightGBMTunerCV
        The tuner after ``run()`` has completed.
    """
    # ``categorical_feature`` is not passed to the dataset.
    dataset = lgbm.Dataset(
        data=train_df[col_names], label=train_y, feature_name=col_names
    )
    tuner_options = dict(
        train_set=dataset,
        stratified=True,
        shuffle=True,
        nfold=Config.NUM_FOLDS,
        verbose_eval=-1,
    )
    lgbmtuner_cv = LightGBMTunerCV(params, **tuner_options)
    lgbmtuner_cv.run()
    print("Best Params: ", lgbmtuner_cv.best_params)
    print("Best score: ", lgbmtuner_cv.best_score)
    return lgbmtuner_cv

In [147]:
# params = {
#         "objective": "binary",
#         "metric": "auc",
#         "verbosity": -1,
#         "boosting_type": "gbdt",
#     }

# train_y = df_train[Config.TARGET_COL_NAME]
# tuned_model = tune_params(df_train, train_y, params)

In [148]:
# import optuna

# def rf_objective(trial):       
#     params_dynamic = {        
#         "num_iterations": int(trial.suggest_int("num_iterations", 500, 10000)),
#         "max_depth": trial.suggest_int("max_depth", 3, 100),        
#     }
#     params_static = {
#         'objective': 'binary', 
#         'metric': 'auc', 
#         'verbose': -1, 
#         'boosting_type': 'gbdt', 
#         'feature_pre_filter': False, 
#         'lambda_l1': 9.439044618205312, 
#         'lambda_l2': 0.615750336486198, 
#         'num_leaves': 3, 
#         'feature_fraction': 0.62, 
#         'bagging_fraction': 0.5286479709465361, 
#         'bagging_freq': 1, 
#         'min_child_samples': 20,    
#         "cat_smooth": 96,
#         "cat_l2": 17,        
#         "early_stopping_round": 500
#         }
#     params = {**params_dynamic, **params_static}
#     #params["early_stopping_rounds"] = int(params["n_estimators"] * 0.1)    
#     fold_auc = []
#     # Add a callback for pruning.
#     pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")    
#     train_y = df_train[Config.TARGET_COL_NAME]
#     train_data = lgbm.Dataset(data=df_train[col_names], label=train_y, feature_name=col_names) #, categorical_feature=Config.CATEGORICAL_COLS)
#     result = lgbm.cv(
#                 params,
#                 train_set = train_data,
#                 nfold = Config.NUM_FOLDS,
#                 stratified = True,
#                 shuffle = True,
#                 seed = Config.RANDOM_SEED,
#                 feature_name = col_names
#                 #categorical_feature = Config.CATEGORICAL_COLS
#             )
#     #print(result)            
#     mean_auc = np.mean(result["auc-mean"])
#     return mean_auc

# study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="maximize", study_name="LGBMModelTuning")    
# study.optimize(rf_objective, n_trials=20)
# print("Best trial:")
# print(study.best_params)

In [149]:
# Parameter set 1: 1000 iterations, no early stopping.
# NOTE(review): the positional "categorical_column" indices line up with key /
# audio_mode / time_signature in the raw column order, but those columns are one-hot
# encoded before training, so the indices would point at other features if this dict
# were used as-is — confirm before reuse.
model_params1 = dict(
    objective="binary",
    metric="auc",
    verbosity=-1,
    boosting_type="gbdt",
    feature_pre_filter=False,
    lambda_l1=3.5832756412408226e-05,
    lambda_l2=7.499744130226807,
    num_leaves=2,
    feature_fraction=0.8,
    bagging_fraction=0.9969384880158432,
    bagging_freq=2,
    min_child_samples=20,
    num_iterations=1000,
    early_stopping_round=None,
    categorical_column=[5, 8, 11],
)

In [151]:
# LightGBMTunerCV with 5 folds
# Parameter set 3: up to 10000 iterations with early stopping after 1000 rounds.
model_params3 = dict(
    objective="binary",
    metric="auc",
    verbose=-1,
    boosting_type="gbdt",
    feature_pre_filter=False,
    lambda_l1=7.994567596327698,
    lambda_l2=0.04882156030098934,
    num_leaves=2,
    feature_fraction=0.48000000000000004,
    bagging_fraction=0.957852828579762,
    bagging_freq=5,
    min_child_samples=100,
    cat_smooth=96,
    cat_l2=17,
    num_iterations=10000,
    early_stopping_round=1000,
)

In [None]:
# tuning with LightGBMTunerCV

# Parameter set 2: up to 10000 iterations with early stopping after 500 rounds.
model_params2 = dict(
    objective="binary",
    metric="auc",
    verbose=-1,
    boosting_type="gbdt",
    feature_pre_filter=False,
    lambda_l1=9.439044618205312,
    lambda_l2=0.615750336486198,
    num_leaves=3,
    feature_fraction=0.62,
    bagging_fraction=0.5286479709465361,
    bagging_freq=1,
    min_child_samples=20,
    cat_smooth=96,
    cat_l2=17,
    num_iterations=10000,
    early_stopping_round=500,
)

In [186]:
# Parameter set 4 (the one passed to run_training in the cross-validation loop):
# learning rate 0.01, up to 10000 iterations, early stopping after 800 rounds.
model_params4 = dict(
    objective="binary",
    metric="auc",
    verbose=-1,
    boosting_type="gbdt",
    feature_pre_filter=False,
    lambda_l1=1.8963776746935667e-06,
    lambda_l2=9.579973179083996,
    num_leaves=3,
    feature_fraction=0.7,
    bagging_fraction=0.7684172265445463,
    bagging_freq=2,
    min_child_samples=20,
    num_iterations=10000,
    early_stopping_round=800,
    learning_rate=0.01,
)

#Best score:  0.5740860150826701

In [187]:
# Train one booster per fold, keep its out-of-fold validation predictions and its
# predictions on the test set.
fold_metrics_model = []
test_preds = {}
df_train["song_popularity_proba"] = 0.0
# Loop-invariant, so selected once: the test features in exactly the columns and order
# the boosters are trained on (col_names). A missing column raises a KeyError here
# rather than producing predictions from misaligned features.
test_df = df_test[col_names]
for fold in range(Config.NUM_FOLDS):
    train_df, train_y, val_df, val_y = get_fold_data(fold, df_train)
    fold_auc_score, fold_val_preds, model = run_training(train_df, train_y, val_df, val_y, params=model_params4)
    print(f"fold {fold } auc score = {fold_auc_score}")
    # add the validation probability predictions for the fold to a new column in train data
    df_train.loc[df_train.kfold == fold, "song_popularity_proba"] = fold_val_preds
    fold_test_preds = model.predict(test_df, num_iteration=model.best_iteration)
    pred_col_name = f"fold_{fold}_test_preds"
    test_preds[pred_col_name] = fold_test_preds
    # Keep the rounded fold score together with the trained booster.
    fold_metrics_model.append((round(fold_auc_score, 6), model))

Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[5088]	valid_0's auc: 0.573621
fold 0 auc score = 0.5736209863592064
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[7914]	valid_0's auc: 0.571415
fold 1 auc score = 0.5714154738232213
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[2826]	valid_0's auc: 0.579844
fold 2 auc score = 0.5798443273948367
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[2038]	valid_0's auc: 0.580331
fold 3 auc score = 0.5803313482124637
Training until validation scores don't improve for 800 rounds
Early stopping, best iteration is:
[3684]	valid_0's auc: 0.576271
fold 4 auc score = 0.5762711772930507


In [182]:
# Pull the (rounded) per-fold AUC out of each (score, model) pair and summarise.
fold_metrics = [fold_auc for fold_auc, _fold_model in fold_metrics_model]
print(f"auc scores = {fold_metrics}")
cv_auc_mean, cv_auc_stdev = statistics.mean(fold_metrics), statistics.stdev(fold_metrics)
print(f"mean auc across folds = {cv_auc_mean}, auc stdev across folds = {cv_auc_stdev}")

auc scores = [0.573367, 0.571255, 0.581349, 0.580906, 0.576361]
mean auc across folds = 0.5766476, auc stdev across folds = 0.004476654532125554


In [176]:
# Average the per-fold test predictions into one probability per test row.
test_pred_cols = [f"fold_{fold}_test_preds" for fold in range(Config.NUM_FOLDS)]
df_test_preds = pd.DataFrame(test_preds)
df_test_preds["mean_test_pred"] = df_test_preds.loc[:, test_pred_cols].mean(axis="columns")
print(f"Completed prediction for {len(df_test)} test rows")

# Put the averaged prediction into the sample submission's target column (matched on
# the row index) and write the submission file without the index.
df_submission = pd.read_csv(DATA_PATH + 'sample_submission.csv').assign(
    song_popularity=df_test_preds["mean_test_pred"]
)
df_submission.to_csv('submission_lgbm.csv', index=False)
df_submission.head()

Completed prediction for 10000 test rows


Unnamed: 0,id,song_popularity
0,0,0.44362
1,1,0.462369
2,2,0.278161
3,3,0.292563
4,4,0.358876


In [178]:
# Persist the out-of-fold predictions together with the row id and the true label.
# NOTE(review): the frame's index is written as an extra leading column here, unlike
# the submission file — confirm the consumer of this CSV expects that.
lgbm_val_preds = df_train.loc[:, ["id", "song_popularity_proba", "song_popularity"]]
lgbm_val_preds.to_csv("lgbm_val_preds.csv")
print("Saved validation predictions for all folds to csv")

Saved validation predictions for all folds to csv


In [177]:
df_submission[df_submission.song_popularity > 0.5]

Unnamed: 0,id,song_popularity
39,39,0.508618
97,97,0.515870
130,130,0.504178
171,171,0.503326
254,254,0.555047
...,...,...
9839,9839,0.502612
9866,9866,0.540699
9972,9972,0.527604
9983,9983,0.507024
