# Build basic model using audio features only

Models used: logistic regression (baseline), SVM, and random forest.

In [1]:
import pandas as pd
# Load the two pickled audio-feature tables. Rows from the first file are
# labelled 0 and rows from the second file are labelled 1 in `is_top10`.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
audio_data = pd.read_pickle('MSD_audio_features')
not_top10 = len(audio_data)
audio_data2 = pd.read_pickle('Spotify_audio_features')
top10 = len(audio_data2)

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement and,
# like append, keeps each frame's original index values.
data = pd.concat([audio_data, audio_data2])

# Labels are assigned by position: the first `not_top10` rows get 0, the rest 1.
data['is_top10'] = [0] * not_top10 + [1] * top10
data.head(2)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,uri,valence,is_top10
0,0.063,0.751,218933,0.549,2.5e-05,6,0.106,-10.508,0,0.429,92.224,4,spotify:track:01TR6aAKrA2cI3Z0gnCOsu,0.564,0
0,0.195,0.749,147760,0.53,3.5e-05,9,0.0748,-12.94,1,0.0326,121.395,4,spotify:track:4yPl1mK1oluIrCwI4HInPR,0.968,0


In [2]:
# Drop the track-URI column; it is a string identifier, not an audio feature.
# drop(..., errors='ignore') is used instead of `del` so the cell can be re-run:
# a second execution no longer raises KeyError once the column is already gone.
data = data.drop('uri', axis=1, errors='ignore')
data.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,is_top10
0,0.063,0.751,218933,0.549,2.5e-05,6,0.106,-10.508,0,0.429,92.224,4,0.564,0
0,0.195,0.749,147760,0.53,3.5e-05,9,0.0748,-12.94,1,0.0326,121.395,4,0.968,0
0,0.197,0.596,232107,0.934,0.00741,0,0.184,-3.812,1,0.0459,120.055,4,0.846,0
0,0.000356,0.525,209600,0.918,0.566,2,0.0904,-4.512,1,0.0543,129.727,4,0.652,0
0,0.813,0.677,112947,0.183,0.0,1,0.26,-16.059,1,0.926,87.661,4,0.735,0


In [3]:
# For each column, report whether it contains at least one missing value.
pd.isnull(data).any(axis=0)

acousticness        False
danceability         True
duration_ms         False
energy              False
instrumentalness    False
key                 False
liveness            False
loudness            False
mode                False
speechiness          True
tempo               False
time_signature       True
valence              True
is_top10            False
dtype: bool

In [4]:
# Drop every row that has at least one missing value.
data_notnull = data.dropna()

# Standardize the feature columns (everything except the trailing `is_top10`
# label) because they contain zeros and their value ranges differ widely.
# `.ix` was removed in pandas 1.0; `.iloc` is the positional equivalent.
# NOTE(review): the scaler sees all rows, including those later used as
# cross-validation test folds -- consider scaling inside a Pipeline instead.
from sklearn import preprocessing
feature_columns = data_notnull.columns[:-1]
data_scaled = preprocessing.scale(data_notnull.iloc[:, :-1])  # the label is left unscaled
data_scaled = pd.DataFrame(data_scaled, columns=feature_columns)

# Re-attach the label by position: the scaled frame has a fresh 0..n-1 index,
# so `.values` is used to bypass index alignment with `data_notnull`.
data_scaled['is_top10'] = data_notnull['is_top10'].values
data_scaled.head(2)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,is_top10
0,-0.813976,0.999522,-0.155646,-0.277563,-0.421397,0.220124,-0.537636,-0.313251,-1.572732,3.352853,-0.962947,0.238822,-0.061014,0
1,-0.387663,0.987877,-0.962531,-0.360561,-0.421355,1.064178,-0.703653,-0.878755,0.635836,-0.447199,0.051558,0.238822,1.502051,0


In [5]:
# Share of the most frequent class = accuracy of always predicting that class.
# The positive-class share alone is only a valid baseline when top-10 tracks
# happen to be the majority, so take the larger of the two class shares.
class_shares = data_scaled.is_top10.value_counts(normalize=True)
print('Always predicting the majority class gives a baseline accuracy of %0.2f' % class_shares.max())

Randomly select will produce prediction accuracy of 0.52


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline model: logistic regression scored with 5-fold cross-validation.
# `.ix` was removed in pandas 1.0; `.iloc` selects the same columns by position.
X = data_scaled.iloc[:, :-1].values  # every column except the trailing label
y = data_scaled.is_top10.values

clf = LogisticRegression()
scores = cross_val_score(clf, X, y, cv=5)

# Report the mean fold accuracy and twice the standard deviation across folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.67 (+/- 0.02)


In [7]:
from sklearn import svm

# Sweep the SVC regularization parameter C for an RBF-kernel SVM, scoring each
# setting with 5-fold cross-validation.
Cs = [0.2, 0.5, 0.8, 1]

# Features and labels do not change between iterations, so extract them once.
# `.ix` was removed in pandas 1.0; `.iloc` selects the same columns by position.
X = data_scaled.iloc[:, :-1].values  # every column except the trailing label
y = data_scaled.is_top10.values

for c in Cs:
    clf = svm.SVC(kernel='rbf', C=c)
    scores = cross_val_score(clf, X, y, cv=5)

    print("Accuracy: %0.2f (+/- %0.2f) using margin %0.1f" % (scores.mean(), scores.std() * 2,c))

Accuracy: 0.70 (+/- 0.02) using margin 0.2
Accuracy: 0.70 (+/- 0.01) using margin 0.5
Accuracy: 0.70 (+/- 0.01) using margin 0.8
Accuracy: 0.70 (+/- 0.01) using margin 1.0


In [8]:
# Linear-kernel SVM scored with 5-fold cross-validation.
# C is set explicitly rather than reusing the loop variable `c` left over from
# the RBF sweep cell; 1.0 is the value that variable held after that loop.
LINEAR_SVM_C = 1.0

# `.ix` was removed in pandas 1.0; `.iloc` selects the same columns by position.
X = data_scaled.iloc[:, :-1].values  # every column except the trailing label
y = data_scaled.is_top10.values

clf = svm.SVC(kernel='linear', C=LINEAR_SVM_C)
scores = cross_val_score(clf, X, y, cv=5)

print("Accuracy: %0.2f (+/- %0.2f) using linear kernel" % (scores.mean(), scores.std() * 2))

Accuracy: 0.66 (+/- 0.02) using linear kernel


In [9]:
from sklearn.ensemble import RandomForestClassifier

# Sweep the number of trees in a random forest, scoring each setting with
# 5-fold cross-validation.
n_trees = [5, 10, 20, 30]

# Fixed seed so the printed accuracies are reproducible across runs; without
# it every execution builds different trees and reports different scores.
RF_RANDOM_STATE = 42

# Features and labels do not change between iterations, so extract them once.
# `.ix` was removed in pandas 1.0; `.iloc` selects the same columns by position.
X = data_scaled.iloc[:, :-1].values  # every column except the trailing label
y = data_scaled.is_top10.values

for n in n_trees:
    clf = RandomForestClassifier(n_estimators=n, random_state=RF_RANDOM_STATE)
    scores = cross_val_score(clf, X, y, cv=5)

    print("Accuracy: %0.2f (+/- %0.2f) using %s estimators" % (scores.mean(), scores.std() * 2,n))

Accuracy: 0.66 (+/- 0.02) using 5 estimators
Accuracy: 0.67 (+/- 0.02) using 10 estimators
Accuracy: 0.69 (+/- 0.01) using 20 estimators
Accuracy: 0.70 (+/- 0.01) using 30 estimators
