In [105]:
import numpy as np
import pandas as pd
from scipy.special import softmax
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import warnings

# Suppress every Python warning for the rest of the session (this also hides
# library warnings such as solver/convergence notices raised further down).
warnings.filterwarnings('ignore')

In [106]:
# Base models whose softmax outputs become meta-model features; each name is
# used as a column-name prefix (e.g. "resnext50_softmax_0").
MODELS_TO_USE = ["resnext50", "effnetb4"]
# Number of cross-validation folds looped over when training the meta-model.
NUM_FOLDS = 5
# Number of target classes; sizes the vote vector in combine_preds.
NUM_CLASSES = 19
# Folder prefix for CSV files; joined by plain string concatenation, so the
# trailing slash is required.
DATA_ROOT_FOLDER = "./data/"

In [107]:
# Load the test metadata, each base model's test-set scores, and each base
# model's out-of-fold (OOF) training predictions. All paths are built from
# DATA_ROOT_FOLDER so the data location is configured in one place.
df_test = pd.read_csv(DATA_ROOT_FOLDER + "test.csv")
df_test_resnext50 = pd.read_csv(DATA_ROOT_FOLDER + "test_preds_proba_resnext50.csv")
df_test_effnetb4 = pd.read_csv(DATA_ROOT_FOLDER + "test_preds_proba_effnetb4.csv")
# The OOF files carry a leftover "Unnamed: 0" column (presumably a saved
# index - confirm against the export step); drop it right after reading.
df_train_oof_resnext50 = pd.read_csv(
    DATA_ROOT_FOLDER + "df_train_resnext50_oof_preds.csv"
).drop(columns=["Unnamed: 0"])
df_train_oof_effnetb4 = pd.read_csv(
    DATA_ROOT_FOLDER + "df_train_effnet_b4_oof_preds.csv"
).drop(columns=["Unnamed: 0"])

In [108]:
def process_test_df(df, model_name, num_classes=19):
    """Turn one base model's per-class test scores into softmax feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``song_id`` and ``proba_0`` .. ``proba_{num_classes - 1}``.
        The ``proba_*`` values are passed through a row-wise softmax, so they
        are presumably raw scores/logits - confirm against the export step.
    model_name : str
        Prefix for the generated ``{model_name}_softmax_{i}`` columns.
    num_classes : int, default 19
        Number of ``proba_*`` columns to read.

    Returns
    -------
    pandas.DataFrame
        ``song_id`` followed by the softmax columns, one row per row of ``df``
        and carrying ``df``'s index.
    """
    proba_cols = [f"proba_{i}" for i in range(num_classes)]
    softmax_cols = [f"{model_name}_softmax_{i}" for i in range(num_classes)]
    result_softmax = softmax(df.loc[:, proba_cols].astype(float).values, axis=1)
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and pad with NaN) whenever df's index
    # is not 0..n-1, e.g. after filtering.
    df_softmax = pd.DataFrame(result_softmax, columns=softmax_cols, index=df.index)
    df_cols_subset = df.loc[:, ["song_id"]]
    return pd.concat([df_cols_subset, df_softmax], axis=1)

In [109]:
def process_oof_df(df, model_name, num_classes=19):
    """Turn one base model's out-of-fold scores into softmax feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``song_id``, ``genre_id``, ``kfold``, ``val_preds`` and
        ``proba_0`` .. ``proba_{num_classes - 1}``. The ``proba_*`` values are
        passed through a row-wise softmax, so they are presumably raw
        scores/logits - confirm against the export step.
    model_name : str
        Prefix for the generated ``{model_name}_softmax_{i}`` columns; also
        used to rename ``val_preds`` to ``{model_name}_val_preds``.
    num_classes : int, default 19
        Number of ``proba_*`` columns to read.

    Returns
    -------
    pandas.DataFrame
        ``song_id``, ``genre_id``, ``kfold``, ``{model_name}_val_preds`` and
        the softmax columns, one row per row of ``df`` and carrying ``df``'s
        index.
    """
    proba_cols = [f"proba_{i}" for i in range(num_classes)]
    softmax_cols = [f"{model_name}_softmax_{i}" for i in range(num_classes)]
    result_softmax = softmax(df.loc[:, proba_cols].astype(float).values, axis=1)
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and pad with NaN) whenever df's index
    # is not 0..n-1.
    df_softmax = pd.DataFrame(result_softmax, columns=softmax_cols, index=df.index)
    df_cols_subset = df.loc[:, ["song_id", "genre_id", "kfold", "val_preds"]].rename(
        columns={"val_preds": f"{model_name}_val_preds"}
    )
    return pd.concat([df_cols_subset, df_softmax], axis=1)

In [110]:
# Build per-model out-of-fold feature frames; the model name becomes the
# prefix of the generated softmax and val_preds columns.
df_train_oof_resnext50_final = process_oof_df(df_train_oof_resnext50, "resnext50")
df_train_oof_effnetb4_final = process_oof_df(df_train_oof_effnetb4, "effnetb4")

In [111]:
# Build per-model test feature frames (song_id plus prefixed softmax columns).
df_test_resnext50_final = process_test_df(df_test_resnext50, "resnext50")
df_test_effnetb4_final = process_test_df(df_test_effnetb4, "effnetb4")

In [112]:
# Preview the first rows of the resnext50 test features.
df_test_resnext50_final.head()

Unnamed: 0,song_id,resnext50_softmax_0,resnext50_softmax_1,resnext50_softmax_2,resnext50_softmax_3,resnext50_softmax_4,resnext50_softmax_5,resnext50_softmax_6,resnext50_softmax_7,resnext50_softmax_8,resnext50_softmax_9,resnext50_softmax_10,resnext50_softmax_11,resnext50_softmax_12,resnext50_softmax_13,resnext50_softmax_14,resnext50_softmax_15,resnext50_softmax_16,resnext50_softmax_17,resnext50_softmax_18
0,7072,0.056585,0.43182,0.221335,0.095421,0.002688,0.015298,0.01254721,0.100401,0.023768,0.001646,0.022204,0.005581,2.7e-05,0.00888,0.0004552833,0.0001257025,0.0007568391,0.0004152336,4.446321e-05
1,10207,1.1e-05,1.3e-05,1e-06,4.6e-05,1e-06,6e-06,1.789244e-07,8e-06,6e-06,4e-06,1e-06,5e-06,0.999895,1e-06,6.134041e-09,1.101776e-08,8.764869e-07,9.427575e-07,1.068574e-07
2,20008,0.638078,0.046274,0.011235,0.103736,0.00311,0.000525,0.1624136,0.006229,0.005406,0.000589,0.019372,0.000252,1.6e-05,0.002374,4.781622e-05,6.270782e-05,0.0002250669,3.560082e-05,2.073471e-05
3,10924,0.557568,0.023448,0.011676,0.023453,0.05783,0.000902,0.2767621,0.004719,0.024925,0.004445,0.007735,3.4e-05,8e-06,0.003341,0.0004451062,0.001783987,9.806292e-05,0.0007498062,7.663517e-05
4,21896,0.047341,0.027849,0.009823,0.04497,0.821797,0.008706,0.002372168,0.001906,0.014753,0.005695,0.00407,0.000292,5.5e-05,0.000328,0.0006122166,0.0003833099,0.007680539,0.0009570738,0.0004089861


In [114]:
# Join the two models' out-of-fold features side by side; only rows whose
# (song_id, genre_id, kfold) triple appears in both frames are kept.
df_train_final = df_train_oof_resnext50_final.merge(
    df_train_oof_effnetb4_final,
    on=["song_id", "genre_id", "kfold"],
    how="inner",
)

In [115]:
# Join the two models' test features on song_id; only songs present in both
# frames are kept.
df_test_final = df_test_resnext50_final.merge(
    df_test_effnetb4_final,
    on=["song_id"],
    how="inner",
)

In [116]:
# Preview the merged test features from both base models.
df_test_final.head()

Unnamed: 0,song_id,resnext50_softmax_0,resnext50_softmax_1,resnext50_softmax_2,resnext50_softmax_3,resnext50_softmax_4,resnext50_softmax_5,resnext50_softmax_6,resnext50_softmax_7,resnext50_softmax_8,...,effnetb4_softmax_9,effnetb4_softmax_10,effnetb4_softmax_11,effnetb4_softmax_12,effnetb4_softmax_13,effnetb4_softmax_14,effnetb4_softmax_15,effnetb4_softmax_16,effnetb4_softmax_17,effnetb4_softmax_18
0,7072,0.056585,0.43182,0.221335,0.095421,0.002688,0.015298,0.01254721,0.100401,0.023768,...,0.002271,0.066919,0.022075,5.5e-05,0.010613,0.0004814314,0.0001724179,0.001282,0.000506,4e-05
1,10207,1.1e-05,1.3e-05,1e-06,4.6e-05,1e-06,6e-06,1.789244e-07,8e-06,6e-06,...,4e-06,4e-06,6e-06,0.999715,6e-06,6.848597e-08,5.255151e-07,1.2e-05,1.2e-05,3.2e-05
2,20008,0.638078,0.046274,0.011235,0.103736,0.00311,0.000525,0.1624136,0.006229,0.005406,...,0.000694,0.036751,0.000325,3e-05,0.001613,2.685051e-05,0.0001387402,0.000189,2.6e-05,9e-06
3,10924,0.557568,0.023448,0.011676,0.023453,0.05783,0.000902,0.2767621,0.004719,0.024925,...,0.000637,0.009676,8.2e-05,2.1e-05,0.000479,2.769872e-05,0.0028406,3e-05,6.1e-05,1.8e-05
4,21896,0.047341,0.027849,0.009823,0.04497,0.821797,0.008706,0.002372168,0.001906,0.014753,...,0.002262,0.000617,3e-05,9e-06,0.000102,0.0002502954,0.000256118,0.000266,0.000822,0.000102


In [117]:
# Target column of the meta-model.
y_col = "genre_id"
# Feature columns: one softmax column per class for every base model, in
# MODELS_TO_USE order. The class count comes from NUM_CLASSES rather than a
# repeated literal so the configuration lives in one place.
X_cols = [
    f"{model_name}_softmax_{i}"
    for model_name in MODELS_TO_USE
    for i in range(NUM_CLASSES)
]

In [118]:
def get_fold_data(fold, df, X_cols, y_col):
    """Split ``df`` into train/validation arrays for one cross-validation fold.

    Rows whose ``kfold`` equals ``fold`` form the validation set; all other
    rows form the training set.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)`` as NumPy arrays, with features
        taken from ``X_cols`` and targets from ``y_col``.
    """
    def _to_arrays(part):
        # Features and target of one partition as NumPy arrays.
        return part[X_cols].to_numpy(), part[y_col].to_numpy()

    features_train, target_train = _to_arrays(df[df.kfold != fold])
    features_val, target_val = _to_arrays(df[df.kfold == fold])
    return features_train, target_train, features_val, target_val

In [119]:
def run_training(train_X, train_y, val_X, val_y, params=None):
    """Fit a liblinear logistic-regression meta-model and score it on validation data.

    Parameters
    ----------
    train_X, train_y : array-like
        Training features and labels.
    val_X, val_y : array-like
        Validation features and labels.
    params : dict, optional
        May provide ``"C"`` and ``"penalty"``. Missing keys (or ``params=None``)
        fall back to ``C=1.0`` / ``penalty="l2"`` instead of raising.

    Returns
    -------
    tuple
        ``(f1, model, val_y_pred)``: the micro-averaged F1 on the validation
        set, the fitted model, and the predicted class labels for ``val_X``.
    """
    # Previously params=None crashed on params["C"]; merge over defaults instead.
    params = {"C": 1.0, "penalty": "l2", **(params or {})}
    # n_jobs is not passed: the liblinear solver ignores it (see the
    # scikit-learn LogisticRegression documentation).
    model = LogisticRegression(
        solver="liblinear",
        max_iter=600,
        C=params["C"],
        penalty=params["penalty"],
        random_state=42,
    )
    model = model.fit(train_X, train_y)
    # argmax gives a position within model.classes_, not a label. Map it back
    # so predictions stay correct when a training fold lacks a class or when
    # labels are not exactly 0..K-1.
    best_class_pos = np.argmax(model.predict_proba(val_X), axis=1)
    val_y_pred = model.classes_[best_class_pos]
    f1 = f1_score(val_y, val_y_pred, average="micro")
    return f1, model, val_y_pred


In [120]:
# import optuna

# train_X, train_y, val_X, val_y = get_fold_data(0, df_train_final, X_cols, y_col)

# def objective(trial):   
#     penalty = ['l1', 'l2']
#     params = {        
#         "C": trial.suggest_loguniform("C", 1e-3, 1),
#         "penalty": trial.suggest_categorical("penalty", penalty)
#     }
#     f1, _, _ = run_training(train_X, train_y, val_X, val_y, params)    
#     return f1

# study = optuna.create_study(direction="maximize", study_name="ModelTuning")    
# study.optimize(objective, n_trials=30)
# print("Best trial:")
# print(study.best_params)

In [124]:
# Train one meta-model per fold, record its validation predictions on the
# training frame and its test predictions on the test frame.
fold_metrics_model = []
test_preds = {}  # not used in the cells shown; kept in case a later cell reads it
model_params = {'C': 0.24, "penalty": "l1"}

# The test feature matrix is identical for every fold, so build it once.
X_test = df_test_final[X_cols].to_numpy()

for fold in range(NUM_FOLDS):
    X_train, y_train, X_val, y_val = get_fold_data(fold, df_train_final, X_cols, y_col)
    fold_f1, model, fold_val_preds = run_training(X_train, y_train, X_val, y_val, params=model_params)
    print(f"fold {fold} f1 = {fold_f1}")
    # store the meta-model's predicted class ids (not probabilities) for this
    # fold's validation rows in a new column of the train data
    df_train_final.loc[df_train_final.kfold == fold, "meta_val_preds"] = fold_val_preds
    # argmax yields a position within model.classes_; map it back to the
    # actual label so the prediction is right even if a class was absent
    # from this fold's training data.
    fold_test_preds = model.classes_[np.argmax(model.predict_proba(X_test), axis=1)]
    pred_col_name = f"fold_{fold}_genre_id"
    df_test_final.loc[:, pred_col_name] = fold_test_preds
    # keep the (rounded) fold score together with the fitted model
    fold_metrics_model.append((round(fold_f1, 4), model))

fold 0 f1 = 0.5190858864892014
fold 1 f1 = 0.5391762933199398
fold 2 f1 = 0.5293822199899548
fold 3 f1 = 0.5394274234053239
fold 4 f1 = 0.5420748555639286


In [125]:
import statistics

# Per-fold F1 scores as stored by the training loop (already rounded to 4 dp).
fold_metrics = [score for score, _ in fold_metrics_model]
print(f"f1 scores = {fold_metrics}")
# NOTE: despite the "auc" in their names, these two variables hold the mean
# and sample standard deviation of the F1 scores.
cv_auc_mean, cv_auc_stdev = statistics.mean(fold_metrics), statistics.stdev(fold_metrics)
print(f"mean f1 across folds = {cv_auc_mean}, f1 stdev across folds = {cv_auc_stdev}")

f1 scores = [0.5191, 0.5392, 0.5294, 0.5394, 0.5421]
mean f1 across folds = 0.53384, f1 stdev across folds = 0.009550549722398188


In [126]:
# Micro-averaged F1 over all out-of-fold meta-model predictions pooled
# together (as opposed to the mean of the per-fold scores).
cv_f1 = f1_score(
    df_train_final.genre_id,
    df_train_final.meta_val_preds,
    average="micro",
)
print(f"Cross validation F1 score across {len(fold_metrics)} folds = {cv_f1}")

Cross validation F1 score across 5 folds = 0.5338289215932492


In [128]:
# Inspect the test frame, which now also carries one fold_{k}_genre_id
# prediction column per fold.
df_test_final

Unnamed: 0,song_id,resnext50_softmax_0,resnext50_softmax_1,resnext50_softmax_2,resnext50_softmax_3,resnext50_softmax_4,resnext50_softmax_5,resnext50_softmax_6,resnext50_softmax_7,resnext50_softmax_8,...,effnetb4_softmax_14,effnetb4_softmax_15,effnetb4_softmax_16,effnetb4_softmax_17,effnetb4_softmax_18,fold_0_genre_id,fold_1_genre_id,fold_2_genre_id,fold_3_genre_id,fold_4_genre_id
0,7072,0.056585,0.431820,2.213345e-01,0.095421,2.688377e-03,1.529788e-02,1.254721e-02,1.004014e-01,2.376840e-02,...,4.814314e-04,1.724179e-04,0.001282,0.000506,0.000040,1,1,1,1,1
1,10207,0.000011,0.000013,1.455328e-06,0.000046,1.161467e-06,5.858841e-06,1.789244e-07,7.952931e-06,6.318192e-06,...,6.848597e-08,5.255151e-07,0.000012,0.000012,0.000032,12,12,12,12,12
2,20008,0.638078,0.046274,1.123479e-02,0.103736,3.110118e-03,5.249538e-04,1.624136e-01,6.228733e-03,5.405520e-03,...,2.685051e-05,1.387402e-04,0.000189,0.000026,0.000009,0,0,0,0,0
3,10924,0.557568,0.023448,1.167569e-02,0.023453,5.782992e-02,9.022776e-04,2.767621e-01,4.718635e-03,2.492536e-02,...,2.769872e-05,2.840600e-03,0.000030,0.000061,0.000018,0,6,6,0,6
4,21896,0.047341,0.027849,9.823382e-03,0.044970,8.217968e-01,8.706402e-03,2.372168e-03,1.906367e-03,1.475291e-02,...,2.502954e-04,2.561180e-04,0.000266,0.000822,0.000102,4,4,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5071,6427,0.117352,0.050071,2.960082e-02,0.664346,1.944399e-03,4.280234e-03,2.346022e-02,5.978660e-02,1.039580e-02,...,3.616243e-04,3.324445e-05,0.003006,0.000394,0.000024,3,3,3,3,3
5072,16903,0.594144,0.053450,1.599247e-02,0.006315,1.946459e-02,1.994264e-03,5.400405e-02,2.527278e-02,4.808720e-02,...,1.267324e-04,6.151938e-03,0.000042,0.000198,0.000585,0,0,0,0,0
5073,1731,0.028183,0.000269,6.861578e-05,0.003083,7.069880e-04,2.131417e-06,9.665425e-01,2.426378e-05,4.097531e-04,...,7.497674e-06,2.363183e-04,0.000020,0.000006,0.000001,6,6,6,6,6
5074,12871,0.640112,0.019862,2.806539e-02,0.011417,6.120078e-02,9.174632e-04,8.198505e-02,2.583728e-03,6.424158e-02,...,1.371638e-03,2.271060e-02,0.000286,0.001121,0.003979,0,0,0,0,0


In [129]:
def combine_preds(test_row, num_folds=None, num_classes=None):
    """Majority-vote the per-fold class predictions of one test row.

    Parameters
    ----------
    test_row : mapping-like (e.g. a pandas Series)
        Must provide ``fold_{k}_genre_id`` for every ``k`` in
        ``range(num_folds)``; values are converted with ``int()``.
    num_folds : int, optional
        Number of fold columns to read. Defaults to the global ``NUM_FOLDS``.
    num_classes : int, optional
        Size of the vote vector. Defaults to the global ``NUM_CLASSES``.

    Returns
    -------
    numpy integer
        The class id with the most votes; ties go to the lowest class id
        because ``np.argmax`` returns the first maximum.
    """
    if num_folds is None:
        num_folds = NUM_FOLDS
    if num_classes is None:
        num_classes = NUM_CLASSES
    votes = np.zeros(num_classes)
    for fold in range(num_folds):
        votes[int(test_row[f"fold_{fold}_genre_id"])] += 1
    return np.argmax(votes)

# Majority vote across the per-fold predictions. The vote runs
# unconditionally: with a single fold it simply returns that fold's
# prediction, so "genre_id" always exists for the submission cells below
# (previously it was never created when NUM_FOLDS == 1).
fold_pred_cols = [f"fold_{fold}_genre_id" for fold in range(NUM_FOLDS)]
# Only the fold columns are passed to the row-wise vote; the feature columns
# are not needed for it.
df_test_final["genre_id"] = df_test_final[fold_pred_cols].apply(combine_preds, axis=1)
if NUM_FOLDS > 1:
    df_test_final.to_csv(DATA_ROOT_FOLDER + "test_fold_preds_final.csv")

In [130]:
from IPython.display import display

df_submission = pd.read_csv(DATA_ROOT_FOLDER + "sample_submission.csv")
# Right join: every sample_submission row is kept. Rows with no match in
# df_test get NaN in the columns that come from df_test (per the recorded
# output these include filename/filepath), which the next cell uses to find
# submission ids that have no test record.
# The left frame must be df_test, not df_test_final: df_test_final has no
# "filename" column, and its own "genre_id" would be suffixed on merge,
# breaking the "filename"/"genre_id" lookups in the next cell.
df_subm_test = pd.merge(
    left=df_test,
    right=df_submission,
    how="right",
    on="song_id",
    suffixes=("_test", "_subm")
)
print(f"len(df_subm_test) = {len(df_subm_test)}")
display(df_subm_test.head())

len(df_subm_test) = 5078


Unnamed: 0,song_id,filename,filepath,genre_id
0,7072,007072.ogg,test/007072.ogg,0
1,10207,010207.ogg,test/010207.ogg,0
2,20008,020008.ogg,test/020008.ogg,0
3,10924,010924.ogg,test/010924.ogg,0
4,21896,021896.ogg,test/021896.ogg,0


In [131]:
# Submission rows that have no filename after the join, kept with whatever
# genre_id the joined frame carries for them.
df_invalid = df_subm_test.loc[df_subm_test["filename"].isnull(), ["song_id", "genre_id"]]
# Final predictions: the meta-model's test predictions followed by the
# unmatched submission rows.
df_preds = pd.concat(
    [df_test_final.loc[:, ["song_id", "genre_id"]], df_invalid],
    axis=0,
)
df_preds["genre_id"] = df_preds["genre_id"].astype(int)
# NOTE(review): the recorded outputs show 5076 rows here versus 5078 rows in
# the joined submission frame - confirm no submission ids are being dropped.
print(f"len(df_preds) = {len(df_preds)}")

len(df_preds) = 5076


In [133]:
# Write the submission file; index=False keeps the (non-unique, post-concat)
# row index out of the CSV.
df_preds.to_csv(DATA_ROOT_FOLDER + "submission_lr_meta_model.csv", index=False)