# Build basic model using audio features only

Models used: logistic regression (baseline), SVM, and random forest.

In [1]:
import pandas as pd
# Load the two pickled audio-feature tables. Judging by the file names and the
# label assigned below, 'Extra_audio_features' holds the non-top-10 tracks and
# 'Billboard_audio_features' the top-10 tracks.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
audio_data = pd.read_pickle('Extra_audio_features')
not_top10 = len(audio_data)
audio_data2 = pd.read_pickle('Billboard_audio_features')
top10 = len(audio_data2)

# Stack the two tables row-wise. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0. The original row index is
# kept as-is, exactly as append did.
data = pd.concat([audio_data, audio_data2])

# Label rows by origin: 0 for rows of the first table, 1 for rows of the second.
data['is_top10'] = [0] * not_top10 + [1] * top10
data.head(2)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,uri,valence,is_top10
0,0.0362,0.886,161067,0.58,2e-06,7,0.0882,-2.144,0,0.0513,144.988,4,spotify:track:0WgBb7XgdtbUW0GlYel9mH,0.806,0
0,0.21,0.404,212933,0.758,0.0,11,0.161,-4.304,0,0.397,139.534,5,spotify:track:5HxyJa6E8OXdYKKfufrbKE,0.311,0


In [2]:
# Report the row counts of the two source tables (label 0 first, then label 1).
# Formatting into a single string and calling print(...) produces the same
# output under Python 2 and Python 3; the bare print statement is Python 2 only.
print('%d %d' % (not_top10, top10))

3119 2950


In [3]:
# Drop the Spotify track URI: it is a per-track identifier, not an audio feature.
# Rebinding `data` with errors='ignore' keeps the cell idempotent: re-running it
# no longer raises KeyError once the column is already gone.
data = data.drop('uri', axis=1, errors='ignore')
data.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,is_top10
0,0.0362,0.886,161067,0.58,2e-06,7,0.0882,-2.144,0,0.0513,144.988,4,0.806,0
0,0.21,0.404,212933,0.758,0.0,11,0.161,-4.304,0,0.397,139.534,5,0.311,0
0,0.00532,0.853,215467,0.784,0.0,5,0.0985,-4.874,1,0.0457,132.072,4,0.306,0
0,0.408,0.378,202360,0.403,1.2e-05,9,0.0481,-7.472,1,0.0387,204.581,3,0.2,0
0,0.749,0.453,83400,0.38,2e-06,9,0.145,-7.914,0,0.0298,99.893,4,0.154,0


In [4]:
# Per column: does it contain at least one missing value?
pd.isnull(data).any(axis=0)

acousticness        False
danceability         True
duration_ms         False
energy              False
instrumentalness    False
key                 False
liveness             True
loudness            False
mode                False
speechiness          True
tempo               False
time_signature       True
valence              True
is_top10            False
dtype: bool

In [6]:
from sklearn import preprocessing

# Discard every row that has at least one missing value.
data_notnull = data.dropna()

# Standardize the feature columns, because their value ranges differ widely.
# The label is the last column and is deliberately left unscaled.
# Positional .iloc replaces .ix, which was deprecated in pandas 0.20 and
# removed in pandas 1.0.
features = data_notnull.iloc[:, :-1]
data_scaled = pd.DataFrame(preprocessing.scale(features), columns=features.columns)

# Re-attach the label by position (.values): data_scaled gets a fresh default
# index, which need not line up with data_notnull's index.
data_scaled['is_top10'] = data_notnull['is_top10'].values
data_scaled.head(2)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,is_top10
0,-0.887515,1.857172,-0.987611,-0.217135,-0.237975,0.500834,-0.639775,1.777625,-1.534333,-0.226638,0.922991,0.191563,0.801502,0
1,-0.263664,-1.329468,-0.196773,0.638699,-0.237994,1.62814,-0.204393,1.195628,-1.534333,3.891784,0.724554,3.547246,-1.256792,0


In [7]:
# Fraction of rows labelled top-10, printed as the reference accuracy.
top10_share = (data_scaled.is_top10 == 1).mean()
print('Randomly select will produce prediction accuracy of %0.2f' % top10_share)

Randomly select will produce prediction accuracy of 0.49


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline model: logistic regression scored with 5-fold cross-validation.
# Every column except the last is a feature; the last column is the label.
# .iloc replaces .ix (deprecated in pandas 0.20, removed in pandas 1.0).
X = data_scaled.iloc[:, :-1].values
y = data_scaled.is_top10.values

clf = LogisticRegression()
scores = cross_val_score(clf, X, y, cv=5)

# Mean fold accuracy, with twice the standard deviation across folds as spread.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.52 (+/- 0.17)


In [9]:
from sklearn import svm

# RBF-kernel SVM: try several values of the C parameter, scoring each with
# 5-fold cross-validation. The feature matrix and label vector do not change
# between iterations, so they are extracted once, outside the loop.
# .iloc replaces .ix (deprecated in pandas 0.20, removed in pandas 1.0).
X = data_scaled.iloc[:, :-1].values
y = data_scaled.is_top10.values

Cs = [0.2,0.5,0.8,1]
for c in Cs:
    clf = svm.SVC(kernel='rbf', C=c)
    scores = cross_val_score(clf, X, y, cv=5)

    # The value printed after "margin" is the C passed to SVC above.
    print("Accuracy: %0.2f (+/- %0.2f) using margin %0.1f" % (scores.mean(), scores.std() * 2,c))

Accuracy: 0.52 (+/- 0.16) using margin 0.2
Accuracy: 0.51 (+/- 0.15) using margin 0.5
Accuracy: 0.51 (+/- 0.14) using margin 0.8
Accuracy: 0.51 (+/- 0.14) using margin 1.0


In [10]:
# Linear-kernel SVM scored with 5-fold cross-validation.
# C is set explicitly instead of relying on a loop variable left over from an
# earlier cell, so this cell gives the same result regardless of what ran
# before it.
# .iloc replaces .ix (deprecated in pandas 0.20, removed in pandas 1.0).
clf = svm.SVC(kernel='linear', C=1.0)
scores = cross_val_score(clf, data_scaled.iloc[:, :-1].values, data_scaled.is_top10.values, cv=5)

print("Accuracy: %0.2f (+/- %0.2f) using linear kernel" % (scores.mean(), scores.std() * 2))

Accuracy: 0.52 (+/- 0.18) using linear kernel


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: compare several forest sizes, scoring each with 5-fold
# cross-validation. Features and labels are extracted once, outside the loop.
# .iloc replaces .ix (deprecated in pandas 0.20, removed in pandas 1.0).
X = data_scaled.iloc[:, :-1].values
y = data_scaled.is_top10.values

n_trees = [5,10,20,30]
for n in n_trees:
    # A fixed random_state makes the reported accuracies reproducible; without
    # it every run builds different trees and prints different numbers.
    clf = RandomForestClassifier(n_estimators=n, random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)

    print("Accuracy: %0.2f (+/- %0.2f) using %s estimators" % (scores.mean(), scores.std() * 2,n))

Accuracy: 0.51 (+/- 0.10) using 5 estimators
Accuracy: 0.50 (+/- 0.09) using 10 estimators
Accuracy: 0.49 (+/- 0.11) using 20 estimators
Accuracy: 0.50 (+/- 0.13) using 30 estimators


In [14]:
# Random forest with very small forests (1 to 9 trees), each scored with
# 5-fold cross-validation. Features and labels are extracted once, outside
# the loop.
# .iloc replaces .ix (deprecated in pandas 0.20, removed in pandas 1.0).
X = data_scaled.iloc[:, :-1].values
y = data_scaled.is_top10.values

n_trees = range(1,10)
for n in n_trees:
    # A fixed random_state makes the reported accuracies reproducible; without
    # it every run builds different trees and prints different numbers.
    clf = RandomForestClassifier(n_estimators=n, random_state=0)
    scores = cross_val_score(clf, X, y, cv=5)

    print("Accuracy: %0.2f (+/- %0.2f) using %s estimators" % (scores.mean(), scores.std() * 2,n))

Accuracy: 0.50 (+/- 0.08) using 1 estimators
Accuracy: 0.51 (+/- 0.03) using 2 estimators
Accuracy: 0.49 (+/- 0.08) using 3 estimators
Accuracy: 0.49 (+/- 0.07) using 4 estimators
Accuracy: 0.51 (+/- 0.09) using 5 estimators
Accuracy: 0.50 (+/- 0.09) using 6 estimators
Accuracy: 0.49 (+/- 0.07) using 7 estimators
Accuracy: 0.49 (+/- 0.08) using 8 estimators
Accuracy: 0.50 (+/- 0.09) using 9 estimators
