In [1]:
import numpy as np 
import seaborn as sns # for data visualization 

from matplotlib import pyplot as plt # for plotting
%matplotlib inline
sns.set_style("whitegrid")
import pandas as pd 
import time 

In [4]:
df = pd.read_csv('df_final.csv')
del df['Unnamed: 0']

df.head()

Unnamed: 0,Name,Album,Artist,Release_date,Length,Popularity,Acousticness,Danceability,Energy,Instrumentalness,liveness,Loudness,Speechiness,Tempo,Time_signature,Key,Duration_ms,Mode,Speechiness.1,Valence
0,"It Must Have Been Love - From the Film ""Pretty...",It Must Have Been Love,Roxette,1990-05-20,258786,1,0.34,0.52,0.652,5.5e-05,0.256,-6.655,0.0274,80.609,0,11,258787,1,0.0274,0.722
1,Freedom! '90,Ladies And Gentlemen... The Best Of George Mic...,George Michael,1998-11-09,388400,1,0.0998,0.675,0.873,1.5e-05,0.0541,-7.697,0.0441,91.77,0,11,388400,1,0.0441,0.817
2,Nothing Compares 2 U,I Do Not Want What I Haven't Got,Sinéad O'Connor,1990-07-01,280040,1,0.0425,0.511,0.574,2.3e-05,0.105,-7.016,0.0273,119.917,0,11,280040,1,0.0273,0.161
3,Kingston Town,Labour Of Love II,UB40,1989-01-01,231733,1,0.119,0.957,0.226,0.0358,0.0796,-14.606,0.0594,102.072,0,7,231733,1,0.0594,0.768
4,Thunderstruck,The Razors Edge,AC/DC,1990-09-24,292880,1,0.000147,0.502,0.89,0.0117,0.217,-5.175,0.0364,133.52,0,8,292880,1,0.0364,0.259


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [8]:
features = ["Acousticness", "Danceability", "Duration_ms", "Energy", "Instrumentalness", "Key", "liveness", 
            "Mode", "Speechiness", "Tempo", "Time_signature", "Valence"]

In [9]:
training = df.sample(frac = 0.8,random_state = 420)
X_train = training[features]
y_train = training['Popularity']
X_test = df.drop(training.index)[features]

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2, random_state = 420)

In [11]:
LR_Model = LogisticRegression()
LR_Model.fit(X_train, y_train)
LR_Predict = LR_Model.predict(X_valid)
LR_Accuracy = accuracy_score(y_valid, LR_Predict)
print("Accuracy: " + str(LR_Accuracy))

LR_AUC = roc_auc_score(y_valid, LR_Predict) 
print("AUC: " + str(LR_AUC))

Accuracy: 0.8134920634920635
AUC: 0.5


In [12]:
RFC_Model = RandomForestClassifier()
RFC_Model.fit(X_train, y_train)
RFC_Predict = RFC_Model.predict(X_valid)
RFC_Accuracy = accuracy_score(y_valid, RFC_Predict)
print("Accuracy: " + str(RFC_Accuracy))

RFC_AUC = roc_auc_score(y_valid, RFC_Predict) 
print("AUC: " + str(RFC_AUC))

Accuracy: 0.8154761904761905
AUC: 0.5258173326414115


In [13]:
KNN_Model = KNeighborsClassifier()
KNN_Model.fit(X_train, y_train)
KNN_Predict = KNN_Model.predict(X_valid)
KNN_Accuracy = accuracy_score(y_valid, KNN_Predict)
print("Accuracy: " + str(KNN_Accuracy))

KNN_AUC = roc_auc_score(y_valid, KNN_Predict) 
print("AUC: " + str(KNN_AUC))

Accuracy: 0.748015873015873
AUC: 0.4884535547483135


In [14]:
DT_Model = DecisionTreeClassifier()
DT_Model.fit(X_train, y_train)
DT_Predict = DT_Model.predict(X_valid)
DT_Accuracy = accuracy_score(y_valid, DT_Predict)
print("Accuracy: " + str(DT_Accuracy))

DT_AUC = roc_auc_score(y_valid, DT_Predict) 
print("AUC: " + str(DT_AUC))

Accuracy: 0.6865079365079365
AUC: 0.5408406850025946


In [15]:
training_LSVC = training.sample(1000)
X_train_LSVC = training_LSVC[features]
y_train_LSVC = training_LSVC['Popularity']
X_test_LSVC = df.drop(training_LSVC.index)[features]
X_train_LSVC, X_valid_LSVC, y_train_LSVC, y_valid_LSVC = train_test_split(
    X_train_LSVC, y_train_LSVC, test_size = 0.2, random_state = 420)

In [16]:
LSVC_Model = DecisionTreeClassifier()
LSVC_Model.fit(X_train_LSVC, y_train_LSVC)
LSVC_Predict = LSVC_Model.predict(X_valid_LSVC)
LSVC_Accuracy = accuracy_score(y_valid_LSVC, LSVC_Predict)
print("Accuracy: " + str(LSVC_Accuracy))

LSVC_AUC = roc_auc_score(y_valid_LSVC, LSVC_Predict) 
print("AUC: " + str(LSVC_AUC))

Accuracy: 0.63
AUC: 0.4928728070175439


In [17]:
model_performance_accuracy = pd.DataFrame({'Model': ['LogisticRegression', 
                                                      'RandomForestClassifier', 
                                                      'KNeighborsClassifier',
                                                      'DecisionTreeClassifier',
                                                      'LinearSVC'],
                                            'Accuracy': [LR_Accuracy,
                                                         RFC_Accuracy,
                                                         KNN_Accuracy,
                                                         DT_Accuracy,
                                                         LSVC_Accuracy]})

model_performance_AUC = pd.DataFrame({'Model': ['LogisticRegression', 
                                                      'RandomForestClassifier', 
                                                      'KNeighborsClassifier',
                                                      'DecisionTreeClassifier',
                                                      'LinearSVC'],
                                            'AUC': [LR_AUC,
                                                         RFC_AUC,
                                                         KNN_AUC,
                                                         DT_AUC,
                                                         LSVC_AUC]})

In [18]:
model_performance_accuracy.sort_values(by = "Accuracy", ascending = False)

Unnamed: 0,Model,Accuracy
1,RandomForestClassifier,0.815476
0,LogisticRegression,0.813492
2,KNeighborsClassifier,0.748016
3,DecisionTreeClassifier,0.686508
4,LinearSVC,0.63


In [19]:
model_performance_AUC.sort_values(by = "AUC", ascending = False)

Unnamed: 0,Model,AUC
3,DecisionTreeClassifier,0.540841
1,RandomForestClassifier,0.525817
0,LogisticRegression,0.5
4,LinearSVC,0.492873
2,KNeighborsClassifier,0.488454
