# Build basic models using audio features only

Models used: logistic regression (baseline), SVM, and random forest.

In [1]:
import pandas as pd
# Load the two pickled feature tables. Rows from the first file are counted as
# `not_top10` and rows from the second as `top10`.
# NOTE(review): read_pickle can execute arbitrary code; only load files from a trusted source.
audio_data = pd.read_pickle('Same_album_track_audio_features')
not_top10 = len(audio_data)
audio_data2 = pd.read_pickle('Spotify_audio_features')
top10 = len(audio_data2)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to stack frames.
# ignore_index=True gives every row a unique label instead of carrying over the
# source frames' repeated index values.
data = pd.concat([audio_data, audio_data2], ignore_index=True)

# Label by position: the first `not_top10` rows get 0, the remaining `top10` rows get 1.
data['is_top10'] = [0] * not_top10 + [1] * top10
data.head(2)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,uri,valence,is_top10
0,0.034,0.818,225983,0.803,0.0,1,0.153,-4.282,1,0.0797,106.97,4,spotify:track:6b8Be6ljOzmkOmFslEb23P,0.618,0
0,0.0362,0.886,161067,0.58,2e-06,7,0.0882,-2.144,0,0.0513,144.988,4,spotify:track:0WgBb7XgdtbUW0GlYel9mH,0.806,0


In [2]:
# Remove the 'uri' column before modelling.
# drop(..., errors='ignore') keeps this cell re-runnable: unlike `del data['uri']`,
# a second execution does not raise KeyError once the column is already gone.
data = data.drop(columns=['uri'], errors='ignore')
data.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,is_top10
0,0.034,0.818,225983,0.803,0.0,1,0.153,-4.282,1,0.0797,106.97,4,0.618,0
0,0.0362,0.886,161067,0.58,2e-06,7,0.0882,-2.144,0,0.0513,144.988,4,0.806,0
0,0.21,0.404,212933,0.758,0.0,11,0.161,-4.304,0,0.397,139.534,5,0.311,0
0,0.00532,0.853,215467,0.784,0.0,5,0.0985,-4.874,1,0.0457,132.072,4,0.306,0
0,0.408,0.378,202360,0.403,1.2e-05,9,0.0481,-7.472,1,0.0387,204.581,3,0.2,0


In [3]:
# Per-column flag: True where the column holds at least one missing value.
data.isna().any()

acousticness        False
danceability         True
duration_ms         False
energy              False
instrumentalness    False
key                 False
liveness             True
loudness            False
mode                False
speechiness          True
tempo               False
time_signature       True
valence              True
is_top10            False
dtype: bool

In [4]:
# Remove every row that contains a null value.
data_notnull = data.dropna()

# Standardize all feature columns, since they contain 0 values and their value ranges are quite different.
from sklearn import preprocessing

# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent.
# The slice keeps every column except the last one (the is_top10 label), which must not be scaled.
features_notnull = data_notnull.iloc[:, :-1]
data_scaled = pd.DataFrame(
    preprocessing.scale(features_notnull),
    columns=features_notnull.columns,
)

# `.values` attaches the labels by position; data_scaled has a fresh default index,
# so aligning on data_notnull's index would be wrong.
data_scaled['is_top10'] = data_notnull['is_top10'].values
data_scaled.head(2)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,is_top10
0,-0.943133,1.404985,0.102166,0.903164,-0.274262,-1.18427,-0.279805,1.262767,0.629122,0.131219,-0.465609,0.199308,-0.031404,0
1,-0.93543,1.849583,-0.86172,-0.17224,-0.274247,0.509469,-0.653368,1.839245,-1.589516,-0.211136,0.906921,0.199308,0.74884,0


In [5]:
# Fraction of rows labelled is_top10 == 1, printed as the reference accuracy.
top10_share = (data_scaled['is_top10'] == 1).mean()
print('Randomly select will produce prediction accuracy of %0.2f' % top10_share)

Randomly select will produce prediction accuracy of 0.51


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline: logistic regression with default settings, scored by 5-fold cross-validation.
# `.ix` was removed in pandas 1.0; `.iloc[:, :-1]` selects every column except the trailing label.
X = data_scaled.iloc[:, :-1].values
y = data_scaled.is_top10.values

clf = LogisticRegression()
scores = cross_val_score(clf, X, y, cv=5)

# Mean fold accuracy, with two standard deviations as the spread.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.47 (+/- 0.12)


In [7]:
from sklearn import svm

# Feature matrix and label vector are loop-invariant, so build them once.
# `.ix` was removed in pandas 1.0; `.iloc[:, :-1]` selects every column except the trailing label.
X = data_scaled.iloc[:, :-1].values
y = data_scaled.is_top10.values

# Sweep SVC's C parameter (regularization strength) for an RBF-kernel SVM, 5-fold CV each.
Cs = [0.2,0.5,0.8,1]
for c in Cs:
    clf = svm.SVC(kernel='rbf', C=c)
    scores = cross_val_score(clf, X, y, cv=5)

    print("Accuracy: %0.2f (+/- %0.2f) using margin %0.1f" % (scores.mean(), scores.std() * 2,c))

Accuracy: 0.45 (+/- 0.10) using margin 0.2
Accuracy: 0.45 (+/- 0.09) using margin 0.5
Accuracy: 0.46 (+/- 0.09) using margin 0.8
Accuracy: 0.46 (+/- 0.09) using margin 1.0


In [8]:
# Linear-kernel SVM scored by 5-fold cross-validation.
# `.ix` was removed in pandas 1.0; `.iloc[:, :-1]` selects every column except the trailing label.
X = data_scaled.iloc[:, :-1].values
y = data_scaled.is_top10.values

# C is fixed explicitly (1.0, the final value of the RBF sweep's Cs list) so this cell
# does not depend on a loop variable left over from another cell.
clf = svm.SVC(kernel='linear', C=1.0)
scores = cross_val_score(clf, X, y, cv=5)

print("Accuracy: %0.2f (+/- %0.2f) using linear kernel" % (scores.mean(), scores.std() * 2))

Accuracy: 0.46 (+/- 0.11) using linear kernel


In [9]:
from sklearn.ensemble import RandomForestClassifier

# Feature matrix and label vector are loop-invariant, so build them once.
# `.ix` was removed in pandas 1.0; `.iloc[:, :-1]` selects every column except the trailing label.
X = data_scaled.iloc[:, :-1].values
y = data_scaled.is_top10.values

# Sweep the number of trees in the forest, 5-fold CV each.
n_trees = [5,10,20,30]
for n in n_trees:
    # A fixed random_state makes the forest, and therefore the printed scores, reproducible on re-run.
    clf = RandomForestClassifier(n_estimators=n, random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)

    print("Accuracy: %0.2f (+/- %0.2f) using %s estimators" % (scores.mean(), scores.std() * 2,n))

Accuracy: 0.45 (+/- 0.05) using 5 estimators
Accuracy: 0.45 (+/- 0.10) using 10 estimators
Accuracy: 0.44 (+/- 0.08) using 20 estimators
Accuracy: 0.44 (+/- 0.08) using 30 estimators
