In [13]:
import opendatasets as od
import pandas as pd
import numpy as np
import os
from scipy import stats

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from feature_engine import imputation

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [28]:
# load dataset
os.chdir("C:/Users/benlc/OneDrive/Desktop/python_learn/song-prediction")
train = pd.read_csv(os.getcwd() + "/data/train.csv")
test = pd.read_csv(os.getcwd() + "/data/test.csv")
test = test.iloc[:,1:]
train.head()

Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity
0,0,212990.0,0.642286,0.85652,0.707073,0.002001,10.0,,-5.619088,0,0.08257,158.386236,4,0.734642,0
1,1,,0.054866,0.733289,0.835545,0.000996,8.0,0.436428,-5.236965,1,0.127358,102.752988,3,0.711531,1
2,2,193213.0,,0.188387,0.783524,-0.002694,5.0,0.170499,-4.951759,0,0.052282,178.685791,3,0.425536,0
3,3,249893.0,0.48866,0.585234,0.552685,0.000608,0.0,0.094805,-7.893694,0,0.035618,128.71563,3,0.453597,0
4,4,165969.0,0.493017,,0.740982,0.002033,10.0,0.094891,-2.684095,0,0.050746,121.928157,4,0.741311,0


In [29]:
# convert column to categorical
cat_vars = ['key','audio_mode','time_signature','song_popularity']
train[cat_vars] = train[cat_vars].astype("category")
test[['key','audio_mode','time_signature']] = test[['key','audio_mode','time_signature']].astype("category")

cont_vars = [var for var in train.columns if var not in cat_vars]
cont_vars.remove('id')

In [30]:
# baseline
# create end to end modeling pipeline
pipe = Pipeline([

    ("median_imputer", imputation.MeanMedianImputer(imputation_method="median",
                                                    variables=cont_vars)),
    ("add_missing_ind_row", imputation.CategoricalImputer(imputation_method="frequent",
                                                    variables=['key'])),
    ('model', KNeighborsClassifier(n_neighbors=2))
])

# split independent and dependent variables
y = train['song_popularity']
X = train[cont_vars + ['key','audio_mode','time_signature']]

# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cv = StratifiedKFold(n_splits=10, random_state=1)
scores = cross_val_score(pipe, X, y, scoring='roc_auc', cv=cv, n_jobs=-1, error_score='raise')
print(f"list of scores: {scores}")
print(f"roc_auc mean: {scores.mean()}")

final_pipeline = pipe.fit(X, y)
test_pred = final_pipeline.predict(test)
test_pred



list of scores: [0.50387636 0.50439429 0.49832449 0.5068954  0.500097   0.50847167
 0.50018847 0.5107593  0.50377094 0.51349091]
roc_auc mean: 0.5050268827252091


array([0, 0, 0, ..., 0, 0, 1], dtype=int64)