## Preprocessing the data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("data/SpotifyFeatures.csv").drop_duplicates(subset=['artist_name', 'track_name'])
top2017 = pd.read_csv("data/featuresdf2017.csv").drop(columns=['id', 'time_signature', 'mode'])
top2018 = pd.read_csv("data/featuresdf2017.csv").drop(columns=['id', 'time_signature', 'mode'])


In [3]:
data = data.replace({'key':{'A':0, 'A#':1, 'B':2, 'C':3, 'C#':4, 'D':5, 'D#':6, 'E':7, 'F':8, 'F#':9, 'G':10, 'G#':11}})

data['track_name'] = data['track_name'].apply(lambda x: len(x)); 
data['artist_name'] = data['artist_name'].apply(lambda x: len(x))

top2017['name'] = top2017['name'].apply(lambda x: len(x)); top2017['artists'] = top2017['artists'].apply(lambda x: len(x))
top2018['name'] = top2018['name'].apply(lambda x: len(x)); top2018['artists'] = top2018['artists'].apply(lambda x: len(x))

# unique tracks
data = data.drop_duplicates(subset=['track_id']).drop(columns=['genre','track_id','time_signature'])

# discard songs somewhere in between
data = data[(data['popularity'] <40) | (data['popularity'] > 60)] # < 30 improves result even more

# > 50 is popular
data['popularity'] = data['popularity'].apply(lambda x: 1 if x > 60 else 0);

data= data.rename(columns={'artist_name':'artists', 'track_name':'name','popularity':'popularity_class'})
data = data[['name', 'artists', 'danceability', 'energy', 'key', 'loudness','speechiness', 'acousticness', 'instrumentalness', 'liveness','valence', 'tempo', 'duration_ms','popularity_class']]

In [4]:
print(f"data\t\ttop2017\n------------------------")
for a, b in zip(data.columns,top2017.columns):
    print(a,'\t\t' if len(a) < 7 else '\t' if len(a) < 14 else '',b)

data		top2017
------------------------
name 		 name
artists 	 artists
danceability 	 danceability
energy 		 energy
key 		 key
loudness 	 loudness
speechiness 	 speechiness
acousticness 	 acousticness
instrumentalness  instrumentalness
liveness 	 liveness
valence 	 valence
tempo 		 tempo
duration_ms 	 duration_ms


In [5]:
data['popularity_class'].value_counts()

0    97799
1    13320
Name: popularity_class, dtype: int64

In [6]:
top2017.head()

Unnamed: 0,name,artists,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,12,10,0.825,0.652,1.0,-3.183,0.0802,0.581,0.0,0.0931,0.931,95.977,233713.0
1,17,10,0.694,0.815,2.0,-4.328,0.12,0.229,0.0,0.0924,0.813,88.931,228827.0
2,34,10,0.66,0.786,2.0,-4.757,0.17,0.209,0.0,0.112,0.846,177.833,228200.0
3,24,16,0.617,0.635,11.0,-6.769,0.0317,0.0498,1.4e-05,0.164,0.446,103.019,247160.0
4,11,9,0.609,0.668,7.0,-4.284,0.0367,0.0552,0.0,0.167,0.811,80.924,288600.0


In [7]:
data.head()

Unnamed: 0,name,artists,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,popularity_class
0,27,14,0.389,0.91,4,-1.828,0.0525,0.611,0.0,0.346,0.814,166.969,99373,0
1,32,17,0.59,0.737,9,-5.559,0.0868,0.246,0.0,0.151,0.816,174.003,137373,0
2,30,15,0.663,0.131,3,-13.879,0.0362,0.952,0.0,0.103,0.368,99.488,170267,0
3,30,14,0.24,0.326,4,-12.178,0.0395,0.703,0.0,0.0985,0.227,171.758,152427,0
4,9,12,0.331,0.225,8,-21.15,0.0456,0.95,0.123,0.202,0.39,140.576,82625,0


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['popularity_class']), data['popularity_class'], test_size=0.2, random_state=0)

In [9]:
from collections import Counter
Counter(y_train)

Counter({0: 78290, 1: 10605})

## Model training and initial testing

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [11]:
rf = RandomForestClassifier(n_estimators=100)

In [12]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
knn = KNeighborsClassifier(n_neighbors=100)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=100, p=2,
                     weights='uniform')

In [14]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
y_pred_rf = rf.predict(X_test)
y_pred_knn = knn.predict(X_test)
y_pred_dtc = dtc.predict(X_test)

print("random forest accuracy", accuracy_score(y_test, y_pred_rf))
print("k nearest neighbors accuracy", accuracy_score(y_test, y_pred_knn))
print("decision tree accuracy", accuracy_score(y_test, y_pred_dtc))

random forest accuracy 0.8932685385169187
k nearest neighbors accuracy 0.8778347732181425
decision tree accuracy 0.8334683225341972


## Cross-validation

In [17]:
from sklearn.model_selection import cross_validate

In [18]:
rf = RandomForestClassifier(n_estimators=100)
cv_results_rf = cross_validate(rf, data, data['popularity_class'], cv=5, return_train_score=True)

In [19]:
print("random forest cross-validation")
print(cv_results_rf["test_score"])
print(cv_results_rf["train_score"])
print(cv_results_rf["test_score"].mean())

random forest cross-validation
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
1.0


In [20]:
knn = KNeighborsClassifier(n_neighbors=100)
cv_results_knn = cross_validate(knn, data, data['popularity_class'], cv=5, return_train_score=True)

In [21]:
print("k nearest neighbors cross-validation")
print(cv_results_knn["test_score"])
print(cv_results_knn["train_score"])
print(cv_results_knn["test_score"].mean())

k nearest neighbors cross-validation
[0.88012959 0.88012959 0.88012959 0.88012959 0.8801242 ]
[0.88012824 0.88012824 0.88012824 0.88012824 0.88012959]
0.8801285108368939


In [22]:
dtc = DecisionTreeClassifier()
cv_results_dtc = cross_validate(dtc, data, data['popularity_class'], cv=5, return_train_score=True)

In [23]:
print("decision tree cross-validation")
print(cv_results_dtc["test_score"])
print(cv_results_dtc["train_score"])
print(cv_results_dtc["test_score"].mean())

decision tree cross-validation
[1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1.]
1.0


## Testing on 2017|2018 top

In [24]:
X_top2017 = top2017
X_top2017.columns

Index(['name', 'artists', 'danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms'],
      dtype='object')

still fitting on X_train and y_train because those are balanced

In [25]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=100, p=2,
                     weights='uniform')

In [27]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [28]:
y_pred_2017_rf = rf.predict(X_top2017)
y_pred_2017_knn = knn.predict(X_top2017)
y_pred_2017_dtc = dtc.predict(X_top2017)

print("random forest accuracy (2017 top)", accuracy_score([1]*100, y_pred_2017_rf))
print("k nearest neighbors accuracy (2017 top)", accuracy_score([1]*100, y_pred_2017_knn))
print("decision tree accuracy (2017 top)", accuracy_score([1]*100, y_pred_2017_dtc))

random forest accuracy (2017 top) 0.72
k nearest neighbors accuracy (2017 top) 0.0
decision tree accuracy (2017 top) 0.68


In [29]:
y_pred_2018_rf = rf.predict(top2018)
y_pred_2018_knn = knn.predict(top2018)
y_pred_2018_dtc = dtc.predict(top2018)

print("random forest accuracy (2018 top)", accuracy_score([1]*100, y_pred_2018_rf))
print("k nearest neighbors accuracy (2018 top)", accuracy_score([1]*100, y_pred_2018_knn))
print("decision tree accuracy (2018 top)", accuracy_score([1]*100, y_pred_2018_dtc))

random forest accuracy (2018 top) 0.72
k nearest neighbors accuracy (2018 top) 0.0
decision tree accuracy (2018 top) 0.68


In [30]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

reg = LinearRegression()
reg.fit(X_train, y_train)
print(f'LinearRegression: {mean_squared_error(y_test, reg.predict(X_test))}')

mean_squared_error
ridge = Ridge()
ridge.fit(X_train, y_train)
print(f'Ridge:            {mean_squared_error(y_test, ridge.predict(X_test))}')

lasso = Lasso()
lasso.fit(X_train, y_train)
print(f'Lasso:            {mean_squared_error(y_test, lasso.predict(X_test))}')

LinearRegression: 0.09374355787462633
Ridge:            0.09374357460940232
Lasso:            0.10715368964714947


In [31]:
for i, x in enumerate(lasso.coef_):
      print(data.columns[i],":\t", x)

name :	 -0.0
artists :	 -0.0
danceability :	 0.0
energy :	 0.0
key :	 0.0
loudness :	 0.0
speechiness :	 -0.0
acousticness :	 -0.0
instrumentalness :	 -0.0
liveness :	 -0.0
valence :	 0.0
tempo :	 0.0
duration_ms :	 -6.644119056356302e-08


In [32]:
print(accuracy_score(y_test, [1 if x > 0.5 else 0 for x in reg.predict(X_test)]))
print(accuracy_score(y_test, [1 if x > 0.5 else 0 for x in ridge.predict(X_test)]))
print(accuracy_score(y_test, [1 if x > 0.5 else 0 for x in lasso.predict(X_test)]))

0.8778347732181425
0.8778347732181425
0.8778347732181425


In [33]:
print(accuracy_score([1]*100, [1 if x > 0.5 else 0 for x in reg.predict(X_top2017)]))
print(accuracy_score([1]*100, [1 if x > 0.5 else 0 for x in ridge.predict(X_top2017)]))
print(accuracy_score([1]*100, [1 if x > 0.5 else 0 for x in lasso.predict(X_top2017)]))

0.0
0.0
0.0
