In [1]:
# Import some libraries that will be used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 300)

%matplotlib inline

In [2]:
df = pd.read_csv("rap_df.csv").drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,danceability,energy,loudness,musical_mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,explicit,popular,duration_per_num_sections,key__1,key__2,key__3,key__4,key__5,key__6,key__7,key__8,key__9,key__10,key__11,time_sig__1,time_sig__3,time_sig__4,time_sig__5,no_instruments
0,0.32,0.934,-4.363,0,0.246,0.134,0.0,0.0925,0.495,138.982,196476,0,0,21830.666667,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1
1,0.282,0.579,-12.653,0,0.38,0.0524,0.000812,0.0977,0.205,178.866,386556,0,0,20345.052632,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,0.548,0.724,-6.681,0,0.275,0.39,0.0,0.469,0.481,110.066,516000,1,0,25800.0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1
3,0.735,0.89,-4.886,0,0.252,0.148,0.0,0.314,0.565,122.014,572547,1,0,28627.35,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1
4,0.713,0.883,-5.407,0,0.265,0.00588,2e-05,0.724,0.452,124.967,338627,1,0,19919.235294,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0


#### Recal the class imbalance and decide how to address
- I do not intend to use SMOTE. This artificially creates songs, and I don't want to artificially create songs that are considered popular. I do not believe a machine can accurately replicate a good song by simply using KNN to create new observations.
- I will likely use both upsampling and class weight = balanced as means to address class imbalance

### Create feature and target data 

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [4]:
X = df.drop('popular', axis=1)
y = df.popular

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Create upsampled feature and target data 

In [6]:
from sklearn.utils import resample

In [7]:
popular = df[df.popular == 1]
not_popular = df[df.popular == 0]

In [8]:
popular_upsampled = resample(popular,
                          replace=True, # sample with replacement
                          n_samples=len(not_popular), # match number in majority class
                          random_state=42) 

In [9]:
upsampled = pd.concat([not_popular, popular_upsampled])
upsampled.popular.value_counts()

1    771
0    771
Name: popular, dtype: int64

In [10]:
X_upsampled = upsampled.drop('popular', axis=1)
y_upsampled = upsampled.popular

In [11]:
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_upsampled, 
                                                                y_upsampled, test_size=0.20, random_state=42)

### Try a LogReg Gridsearch

In [12]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

#### Upsampled

In [38]:
upsampled_lr = LogisticRegressionCV(solver='liblinear', cv=10, penalty='l1', Cs = [5,10,15,20,25])

In [39]:
upsampled_lr.fit(X_train_up, y_train_up)



LogisticRegressionCV(Cs=[5, 10, 15, 20, 25], cv=10, penalty='l1',
                     solver='liblinear')

In [41]:
pred = upsampled_lr.predict(X_test_up)
print(accuracy_score(y_test_up, pred))
print(f1_score(y_test_up, pred))

0.6666666666666666
0.6925373134328359


In [42]:
print(confusion_matrix(y_test_up, pred))
print('---------------------------')
print(classification_report(y_test_up, pred))


[[ 90  67]
 [ 36 116]]
---------------------------
              precision    recall  f1-score   support

           0       0.71      0.57      0.64       157
           1       0.63      0.76      0.69       152

    accuracy                           0.67       309
   macro avg       0.67      0.67      0.66       309
weighted avg       0.67      0.67      0.66       309



#### Weighted

### Try a Pruned Decision Tree

### Try a Random Forest

### Try an XGBoost

### Try a Voting Classifier