In [1]:
import warnings

warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from autoxgb import AutoXGB
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity
0,0,212990.0,0.642286,0.85652,0.707073,0.002001,10.0,,-5.619088,0,0.08257,158.386236,4,0.734642,0
1,1,,0.054866,0.733289,0.835545,0.000996,8.0,0.436428,-5.236965,1,0.127358,102.752988,3,0.711531,1
2,2,193213.0,,0.188387,0.783524,-0.002694,5.0,0.170499,-4.951759,0,0.052282,178.685791,3,0.425536,0
3,3,249893.0,0.48866,0.585234,0.552685,0.000608,0.0,0.094805,-7.893694,0,0.035618,128.71563,3,0.453597,0
4,4,165969.0,0.493017,,0.740982,0.002033,10.0,0.094891,-2.684095,0,0.050746,121.928157,4,0.741311,0


In [4]:
X = train.drop(["id", "song_popularity"], axis=1)
y = train.song_popularity

In [5]:
test_df = test.drop("id", 1)

In [6]:
# col_after transform
col_x = X.columns.to_list()

In [7]:
imp = IterativeImputer(max_iter=10)
X = imp.fit_transform(X)
x_test = imp.transform(test_df)

In [8]:
X

array([[2.12990000e+05, 6.42285517e-01, 8.56520289e-01, ...,
        1.58386236e+02, 4.00000000e+00, 7.34641557e-01],
       [1.97014838e+05, 5.48656388e-02, 7.33288830e-01, ...,
        1.02752988e+02, 3.00000000e+00, 7.11530551e-01],
       [1.93213000e+05, 1.59724777e-01, 1.88387099e-01, ...,
        1.78685791e+02, 3.00000000e+00, 4.25536016e-01],
       ...,
       [1.60879000e+05, 8.77431248e-01, 4.09064592e-01, ...,
        9.95560744e+01, 3.00000000e+00, 1.77947389e-01],
       [1.93918000e+05, 1.73402378e-01, 3.65738206e-01, ...,
        1.39857384e+02, 3.00000000e+00, 7.72978406e-01],
       [1.96475000e+05, 7.11618441e-03, 3.54585290e-01, ...,
        1.01974949e+02, 3.00000000e+00, 5.88549127e-01]])

In [9]:
x_test

array([[3.08523000e+05, 1.98453405e-02, 4.35637804e-01, ...,
        1.26129304e+02, 4.00000000e+00, 3.99620451e-01],
       [2.00011000e+05, 7.01191036e-02, 7.31256023e-01, ...,
        8.64481489e+01, 3.00000000e+00, 4.99424461e-01],
       [2.79758000e+05, 8.10637132e-01, 5.68857904e-01, ...,
        9.95443512e+01, 3.00000000e+00, 5.64950591e-01],
       ...,
       [1.88434000e+05, 7.53472359e-01, 7.83946794e-01, ...,
        1.19090558e+02, 4.00000000e+00, 4.82961338e-01],
       [1.73044000e+05, 8.63578837e-01, 3.73283420e-01, ...,
        1.03048349e+02, 3.00000000e+00, 5.18741779e-01],
       [1.57896000e+05, 1.98523554e-02, 7.20380057e-01, ...,
        1.09873611e+02, 4.00000000e+00, 9.71367981e-01]])

In [10]:
X = pd.DataFrame(columns=col_x, data=X)
X.head()

Unnamed: 0,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,212990.0,0.642286,0.85652,0.707073,0.002001,10.0,0.215368,-5.619088,0.0,0.08257,158.386236,4.0,0.734642
1,197014.838414,0.054866,0.733289,0.835545,0.000996,8.0,0.436428,-5.236965,1.0,0.127358,102.752988,3.0,0.711531
2,193213.0,0.159725,0.188387,0.783524,-0.002694,5.0,0.170499,-4.951759,0.0,0.052282,178.685791,3.0,0.425536
3,249893.0,0.48866,0.585234,0.552685,0.000608,0.0,0.094805,-7.893694,0.0,0.035618,128.71563,3.0,0.453597
4,165969.0,0.493017,0.644204,0.740982,0.002033,10.0,0.094891,-2.684095,0.0,0.050746,121.928157,4.0,0.741311


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [12]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [13]:
pred = rf.predict(X_test)
f1_score(y_test, pred)

0.12269207861822513

In [14]:
from sklearn.metrics import classification_report

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.64      0.95      0.77      2557
           1       0.44      0.07      0.12      1443

    accuracy                           0.63      4000
   macro avg       0.54      0.51      0.44      4000
weighted avg       0.57      0.63      0.53      4000



In [15]:
from sklearn.metrics import roc_auc_score

In [16]:
roc_auc_score(y_test, pred)

0.5096824961901223

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity
0,0,212990.0,0.642286,0.85652,0.707073,0.002001,10.0,,-5.619088,0,0.08257,158.386236,4,0.734642,0
1,1,,0.054866,0.733289,0.835545,0.000996,8.0,0.436428,-5.236965,1,0.127358,102.752988,3,0.711531,1
2,2,193213.0,,0.188387,0.783524,-0.002694,5.0,0.170499,-4.951759,0,0.052282,178.685791,3,0.425536,0
3,3,249893.0,0.48866,0.585234,0.552685,0.000608,0.0,0.094805,-7.893694,0,0.035618,128.71563,3,0.453597,0
4,4,165969.0,0.493017,,0.740982,0.002033,10.0,0.094891,-2.684095,0,0.050746,121.928157,4,0.741311,0


In [19]:
df.drop("id", axis=1, inplace=True)

In [20]:
df.isna().sum()

song_duration_ms    4101
acousticness        3992
danceability        4026
energy              3975
instrumentalness    3985
key                 4065
liveness            4086
loudness            3957
audio_mode             0
speechiness            0
tempo                  0
time_signature         0
audio_valence          0
song_popularity        0
dtype: int64

In [21]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [22]:
lgr = make_pipeline(
    SimpleImputer(strategy="mean", add_indicator=True),
    StandardScaler(),
    LogisticRegression(random_state=0),
)

In [23]:
log_reg_scores = cross_val_score(lgr, X_train, y_train)

In [24]:
log_reg_scores

array([0.63444444, 0.63402778, 0.63430556, 0.635     , 0.63513889])

In [25]:
log_reg_scores.mean()

0.6345833333333333

# Cross validation Strategies

## KFold

In [26]:
from sklearn.model_selection import KFold

cross_val_score(lgr, X_train, y_train, cv=KFold(n_splits=5)).mean()

0.6346111111111111

# RepeatedFold

In [27]:
from sklearn.model_selection import RepeatedKFold

scores = cross_val_score(
    lgr, X_train, y_train, cv=RepeatedKFold(n_splits=5, n_repeats=3)
)

In [28]:
scores

array([0.62527778, 0.64333333, 0.63222222, 0.63902778, 0.63361111,
       0.63638889, 0.63125   , 0.63236111, 0.64111111, 0.63236111,
       0.63652778, 0.63805556, 0.63875   , 0.63152778, 0.62861111])

In [29]:
scores.shape

(15,)

In [30]:
scores.mean()

0.6346944444444446

# RepeatedStratifiedKfold

In [31]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [32]:
scores = cross_val_score(
    lgr, X_train, y_train, cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=3)
)

In [33]:
scores

array([0.635     , 0.63458333, 0.63486111, 0.63527778, 0.63375   ,
       0.63333333, 0.63527778, 0.63388889, 0.63541667, 0.63472222,
       0.63583333, 0.635     , 0.63472222, 0.63402778, 0.635     ])

In [34]:
scores.mean()

0.634712962962963

# StratifiedShuffleSplit

You will need to scale X

In [35]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

In [36]:
sss = StratifiedShuffleSplit(n_splits=15, random_state=42)

In [37]:
# scale = StandardScaler()
# X = scale.fit_transform(X)

In [38]:
# for train_idx, test_idx in sss.split(X, y):
#     print("TRAIN:", train_idx, "TEST:", test_idx)
#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = y[train_idx], y[test_idx]

In [44]:
from sklearn.metrics import auc, roc_auc_score, roc_curve
from xgboost import XGBClassifier

In [46]:
sss = StratifiedShuffleSplit(n_splits=10, random_state=1234)

preds = []
scores = []
feature_importance_df = pd.DataFrame()
for fold, (idx_train, idx_valid) in enumerate(sss.split(X, y)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model = XGBClassifier()
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
    pred_valid = model.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores.append(score)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = X.columns
    fold_importance_df["importance"] = model.feature_importances_
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0
    )

    print(f"Fold: {fold + 1} Score: {score}" "\n")
    print("||" * 40, "\n")

    test_preds = model.predict_proba(X_test)[:, 1]
    preds.append(test_preds)

print(f"Overall Validation Score: {np.mean(scores)}")

[0]	validation_0-logloss:0.67258
[1]	validation_0-logloss:0.66195
[2]	validation_0-logloss:0.65538
[3]	validation_0-logloss:0.65236
[4]	validation_0-logloss:0.65030
[5]	validation_0-logloss:0.64875
[6]	validation_0-logloss:0.64942
[7]	validation_0-logloss:0.64933
[8]	validation_0-logloss:0.65042
[9]	validation_0-logloss:0.64995
[10]	validation_0-logloss:0.64945
[11]	validation_0-logloss:0.64941
[12]	validation_0-logloss:0.64968
[13]	validation_0-logloss:0.64968
[14]	validation_0-logloss:0.64977
[15]	validation_0-logloss:0.64986
[16]	validation_0-logloss:0.64989
[17]	validation_0-logloss:0.65002
[18]	validation_0-logloss:0.64960
[19]	validation_0-logloss:0.64922
[20]	validation_0-logloss:0.64923
[21]	validation_0-logloss:0.64976
[22]	validation_0-logloss:0.64990
[23]	validation_0-logloss:0.65055
[24]	validation_0-logloss:0.65073
[25]	validation_0-logloss:0.65126
[26]	validation_0-logloss:0.65128
[27]	validation_0-logloss:0.65141
[28]	validation_0-logloss:0.65137
[29]	validation_0-loglos

In [47]:
from xgboost import XGBRFClassifier

In [48]:
sss = StratifiedShuffleSplit(n_splits=10, random_state=1234)

preds = []
scores = []
feature_importance_df = pd.DataFrame()
for fold, (idx_train, idx_valid) in enumerate(sss.split(X, y)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model = XGBRFClassifier()
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
    pred_valid = model.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores.append(score)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = X.columns
    fold_importance_df["importance"] = model.feature_importances_
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0
    )

    print(f"Fold: {fold + 1} Score: {score}" "\n")
    print("||" * 40, "\n")

    test_preds = model.predict_proba(X_test)[:, 1]
    preds.append(test_preds)

print(f"Overall Validation Score: {np.mean(scores)}")

[0]	validation_0-logloss:0.64881
Fold: 1 Score: 0.5755309429836631

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 

[0]	validation_0-logloss:0.64848
Fold: 2 Score: 0.5749350554039192

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 

[0]	validation_0-logloss:0.64907
Fold: 3 Score: 0.5700003993269722

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 

[0]	validation_0-logloss:0.65006
Fold: 4 Score: 0.5706710258062357

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 

[0]	validation_0-logloss:0.65034
Fold: 5 Score: 0.5630916379852767

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 

[0]	validation_0-logloss:0.65048
Fold: 6 Score: 0.5601509186139253

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 

[0]	validation_0-logloss:0.65002
Fold: 7 Score: 0.566472561380333

|||||||||||||||||||||||||||