In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel


def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
                       ).sort_values('imp', ascending=False)

In [8]:
train = pd.read_csv('../raw_data/train.csv', index_col = 0)
test = pd.read_csv('../raw_data/test.csv', index_col = 0)

useful_cols = [col for col in train.columns if col not in ['id', 'song_popularity']]
missing_cols = [col for col in train.columns if train[col].isna().sum() > 0]
cols_dist_missing = [col for col in missing_cols if col not in ['key', 'audio_mode', 'time_signature']]
cols_mode_missing = [col for col in missing_cols if col in ['key', 'audio_mode', 'time_signature']]

# print(missing_cols)
for col in cols_dist_missing:
    # print(col)
    train[col].fillna(train[col].median(), inplace = True)

for col in cols_mode_missing:
    # print(col)
    train[col].fillna(train[col].mode()[0], inplace = True)

X_train, X_valid, y_train, y_valid = train_test_split(train[useful_cols], train['song_popularity'], test_size=0.2, random_state=42)


In [9]:
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))

In [10]:
sel_.fit(X_train, y_train)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [11]:
sel_.get_support()

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [13]:
selected_features = X_train.columns[(sel_.get_support())]
print(selected_features)

Index(['acousticness', 'danceability', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'audio_mode', 'speechiness', 'tempo',
       'time_signature', 'audio_valence'],
      dtype='object')


In [15]:
remove_cols = [col for col in train.columns if col not in selected_features]
print(remove_cols)

['song_duration_ms', 'song_popularity']


In [16]:
X_train_selected = sel_.transform(X_train)
X_valid_selected = sel_.transform(X_valid)


In [20]:
clf = RandomForestClassifier(n_estimators=100, min_samples_leaf = 10, random_state=42, class_weight='balanced_subsample')
clf.fit(X_train_selected, y_train)

y_pred = clf.predict(X_valid_selected)


In [21]:
LR_AUC = roc_auc_score(y_valid, clf.predict(X_valid_selected))
print("AUC: " + str(LR_AUC))

AUC: 0.5369250089728831
