In [101]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import warnings

warnings.filterwarnings('ignore')

In [102]:
class Config:
    RANDOM_SEED = 13
    NUM_FOLDS = 5
    TARGET_COL_NAME = "song_popularity"
    CATEGORICAL_COLS = ["audio_mode", "time_signature", "key"]

DATA_PATH = "./data/"

In [103]:
df_train = pd.read_csv(DATA_PATH + "train.csv")
df_test = pd.read_csv(DATA_PATH + "test.csv")

In [104]:
# split the training dataframe into kfolds for cross validation. We do this before any processing is done
# on the data. We use stratified kfold if the target distribution is unbalanced
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=Config.RANDOM_SEED).reset_index(drop=True)
    # get the target data
    y = df[target_col_name].values
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_index, "kfold"] = fold    
    return df     

df_train = strat_kfold_dataframe(df_train, target_col_name=Config.TARGET_COL_NAME)
df_train.head()

Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity,kfold
0,37020,241605.0,0.043564,0.668579,0.940644,0.001371,5.0,0.660077,-5.33475,1,0.098402,145.488755,3,0.752138,0,4
1,5531,,0.136844,0.857481,0.780121,0.003005,5.0,0.200062,-12.28902,0,0.040943,122.209006,3,0.747739,1,4
2,15625,181809.0,0.17635,0.722215,0.904362,0.000554,8.0,0.094481,-4.299832,0,0.061723,119.554958,4,0.648478,0,2
3,10671,167442.0,,0.35493,0.496588,0.000845,8.0,0.112777,-13.174818,0,0.0421,106.928615,3,0.531688,1,2
4,4393,164342.0,0.112951,0.633773,0.922505,0.004053,5.0,0.141997,-5.16945,0,0.036036,126.757199,4,0.661839,0,1


In [105]:
cont_cols = ['song_duration_ms', 'acousticness', 'danceability', 'energy', 
            'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'audio_valence']

In [106]:
def impute_df_col(df, col_name, imputer):
    imputed_col = imputer.fit_transform(df[col_name].to_numpy().reshape(-1, 1))
    return pd.Series(imputed_col.reshape(-1))    

In [107]:
def impute_missing_values(df, cols, col_type="cont"):    
    if col_type == "cont":
        imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    elif col_type == "cat":
        imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")            
    for col in cols:
        df[col] = impute_df_col(df, col, imputer)
    return df

df_train = impute_missing_values(df_train, Config.CATEGORICAL_COLS, col_type="cat")
df_train = impute_missing_values(df_train, cont_cols, col_type="cont")
df_test = impute_missing_values(df_test, Config.CATEGORICAL_COLS, col_type="cat")
df_test = impute_missing_values(df_test, cont_cols, col_type="cont")

In [108]:
[col for col in df_train.columns if df_train[col].isnull().any()]

[]

In [109]:
[col for col in df_test.columns if df_test[col].isnull().any()]

[]

In [110]:
def col_one_hot_encode(df, cols):    
    one_hot_enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    one_hot_enc.fit(df[cols])
    return one_hot_enc.transform(df[cols])

In [111]:
def get_input_features(df):
    non_cont_cols = Config.CATEGORICAL_COLS + ["id", "kfold", Config.TARGET_COL_NAME]
    cont_col_names = [item for item in df.columns.values.tolist() if item not in non_cont_cols]     
    X_cont = df[cont_col_names].to_numpy()       
    X_cat_one_hot = col_one_hot_encode(df, Config.CATEGORICAL_COLS)    
    X = np.concatenate((X_cont, X_cat_one_hot), axis=1)    
    return X

def get_fold_data(fold, df):
    df_train = df[df.kfold != fold]
    df_val = df[df.kfold == fold]
    X_train = get_input_features(df_train)
    y_train = df_train[Config.TARGET_COL_NAME].to_numpy()
    X_val = get_input_features(df_val)
    y_val = df_val[Config.TARGET_COL_NAME].to_numpy()
    return X_train, y_train, X_val, y_val

In [112]:
def run_training(train_X, train_y, val_X, val_y, params=None):
    # Create the SVC model
    model = LogisticRegression(solver="lbfgs", n_jobs=-1, max_iter=200, C=params["C"], penalty=params["penalty"])
    scaler = StandardScaler()
    train_X_scaled = scaler.fit_transform(train_X)    
    val_X_scaled = scaler.fit_transform(val_X)    
    model.fit(train_X_scaled, train_y.ravel())
    val_y_pred_proba = model.predict_proba(val_X_scaled)
    return roc_auc_score(val_y, val_y_pred_proba[:, 1], average="weighted"), model

In [113]:
# import optuna

# train_X, train_y, val_X, val_y = get_fold_data(0, df_train)

# def objective(trial):   
#     penalty = ['l1', 'l2']
#     params = {        
#         "C": trial.suggest_loguniform("C", 1e-3, 1),
#         "penalty": trial.suggest_categorical("penalty", penalty)
#     }
#     auc_score, _ = run_training(train_X, train_y, val_X, val_y, params)
#     trial_num = trial.number
#     print(f"auc score at end of trial {trial_num} execution = {auc_score}")
#     print(f"trial {trial_num} params = {trial.params}")
#     return auc_score

# study = optuna.create_study(direction="maximize", study_name="ModelTuning")    
# study.optimize(objective, n_trials=15)
# print("Best trial:")
# print(study.best_params)

In [114]:
fold_metrics_model = []
model_params = {'C': 1.0, 'penalty': 'l2'}
for fold in range(Config.NUM_FOLDS):
    X_train, y_train, X_val, y_val = get_fold_data(fold, df_train)
    fold_auc_score, model = run_training(X_train, y_train, X_val, y_val, params=model_params)
    fold_metrics_model.append((round(fold_auc_score, 4), model))

fold_metrics = [item[0] for item in fold_metrics_model]
print(fold_metrics)    

[0.5541, 0.5492, 0.5538, 0.5421, 0.5505]


In [115]:
fold_metrics_model_sorted = sorted(fold_metrics_model, key=lambda x:x[0], reverse=True) 

In [116]:
best_model = fold_metrics_model_sorted[0][1]
print(best_model)
X_test = get_input_features(df_test)
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)
predictions = best_model.predict(X_test_scaled)
print(f"Completed prediction for {len(predictions)} test rows")
df_submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')
df_submission['song_popularity']= predictions
df_submission.to_csv('submission_lr.csv',index=False)

LogisticRegression(max_iter=200, n_jobs=-1)
Completed prediction for 10000 test rows
