In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 



def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
                       ).sort_values('imp', ascending=False)


In [8]:
train = pd.read_csv('../raw_data/train.csv', index_col = 0)
test = pd.read_csv('../raw_data/test.csv', index_col = 0)


train['instrumentalness'] = np.log(train['instrumentalness'] + 1) 
useful_cols = [col for col in train.columns if col not in ['id', 'song_popularity']]
missing_cols = [col for col in train.columns if train[col].isna().sum() > 0]
for col in missing_cols:
    train[col].fillna(train[col].median(), inplace = True)

X_train, X_valid, y_train, y_valid = train_test_split(train[useful_cols], train['song_popularity'], test_size=0.2, random_state=42)

In [9]:
clf = KNeighborsClassifier(n_neighbors=100)
clf.fit(X_train, y_train)

clf.score(X_valid, y_valid)

0.635

In [10]:
LR_AUC = roc_auc_score(y_valid, clf.predict(X_valid))
print("AUC: " + str(LR_AUC))

AUC: 0.49985085134215906


In [11]:
clf.predict(X_valid)

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
np.mean(clf.predict(X_valid))

0.003625

In [15]:
np.mean(y_valid)

0.363875