In [115]:
## Packages 
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
import time
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer

In [137]:
# Load data
train = pd.read_csv("training_data.csv")
test = pd.read_csv("songs_to_classify.csv")
train.shape, test.shape

((750, 14), (200, 13))

In [166]:
## Take a peek
train.sample(10)

Unnamed: 0,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,label,duration_energy
389,0.05,0.761,12.278891,0.838,0.0,8,0.113,-4.656,1,0.227,109.961,4,0.805,0,10.289711
166,0.808,0.55,12.329175,0.154,0.0,7,0.184,-14.917,1,0.0539,82.126,4,0.751,1,1.898693
122,6.3e-05,0.448,12.447095,0.991,0.0,8,0.354,-2.507,0,0.113,130.032,4,0.242,0,12.335072
352,0.129,0.788,12.482996,0.918,0.000471,1,0.0513,-2.756,1,0.102,124.981,4,0.736,0,11.45939
135,0.0246,0.371,12.476405,0.823,0.0,7,0.0894,-6.766,1,0.0568,188.055,4,0.471,0,10.268081
439,0.514,0.573,12.588293,0.387,1e-06,4,0.117,-12.733,0,0.0388,84.236,3,0.394,1,4.871669
473,0.194,0.882,12.335521,0.442,0.0177,4,0.0904,-15.274,1,0.0549,115.895,4,0.965,1,5.4523
739,0.102,0.803,12.285692,0.631,7e-06,8,0.0858,-4.213,1,0.0444,104.999,4,0.659,0,7.752272
82,0.137,0.666,12.264016,0.948,0.0,10,0.192,-2.776,1,0.0638,100.996,4,0.523,0,11.626287
563,0.424,0.723,12.196361,0.679,5.6e-05,2,0.317,-7.517,1,0.0357,126.028,4,0.71,1,8.281329


In [139]:
# Range of values for time_signature
print(min(train['time_signature']), max(train['time_signature']))

1 5


In [140]:
# should time_signature be categorical?
for i in range(1,6):
    count = 0;
    for row in test["time_signature"]:
        if(row == i):
            count += 1
    print(f"Frequency of {i} is: {count/len(test)}")    

Frequency of 1 is: 0.01
Frequency of 2 is: 0.0
Frequency of 3 is: 0.065
Frequency of 4 is: 0.9
Frequency of 5 is: 0.025


Feature engineering:
- Maybe log duration? It is not that interesting if it is 9 or 12 minutes but more if it is short or long

- Normalize all data 
- Time signature has 6 ones, 0 twos, 64 threes, 671 fours, 9 fives and none else. Four categories? 
- test data has about the same frequency distribution as train regarding time_signature

Key is 0 to 11, maybe just normalize?

Divide instrumentalness with speechiness for an amplified speech/instrument divergence? **Not a good idea**


In [141]:
# log duration
train["duration"] = np.log(train["duration"])
test["duration"] = np.log(test["duration"])

In [205]:
# Artificial features
train_duration_energy = train["duration"]*train["energy"]
train_acoustic_loud = train["acousticness"]*train["loudness"]
test_duration_energy = test["duration"]*test["energy"]
test_acoustic_loud = test["acousticness"]*test["loudness"]

train["duration_energy"] = train_duration_energy
train["acoustic_loud"] = train_acoustic_loud
test["duration_energy"] = test_duration_energy
test["acoustic_loud"] = test_acoustic_loud

In [168]:
train_1 = train.loc[train["label"] == 1]
train_0 = train.loc[train["label"] == 0]

In [195]:
train.head()

Unnamed: 0,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,label,duration_energy,acoustic_loud
0,0.713,0.514,11.514175,0.521,0.816,8,0.112,-14.835,0,0.0444,119.879,4,0.143,1,5.998885,-10.577355
1,0.192,0.714,12.240566,0.614,0.0,4,0.263,-6.935,1,0.0319,123.969,4,0.582,1,7.515707,-1.33152
2,0.333,0.63,12.283959,0.455,4e-06,5,0.127,-9.29,1,0.0292,139.931,4,0.199,1,5.589201,-3.09357
3,0.601,0.81,11.823442,0.221,0.21,5,0.184,-11.005,1,0.0429,109.96,4,0.798,1,2.612981,-6.614005
4,0.883,0.465,12.10868,0.459,0.000173,6,0.0692,-8.137,0,0.0351,90.807,4,0.288,1,5.557884,-7.184971


In [206]:
# Feature selection
features = ['acousticness','danceability','duration','energy','instrumentalness','key','liveness','loudness','mode','speechiness','tempo','time_signature','valence','duration_energy', 'acoustic_loud']#without speechiness
X_train = train.loc[:,features].values
y_train = train.loc[:,'label'].values
X_test = test.loc[:,features].values

In [200]:
# Choose which columns to encode and which to scale
scaling_columns = [0,1,2,3,4,6,7,9,10,12,13,14]
OHE_columns = [5,11]

In [201]:
# ColumnTransformer making it possible to do different preprocessing on different columns
ct = ColumnTransformer(
    [("scaler", preprocessing.StandardScaler(),scaling_columns),
    ("OHE",preprocessing.OneHotEncoder(sparse = False),OHE_columns)]
)

In [207]:
# Fitting and tranforming test and train data 
train_trans = ct.fit_transform(X_train)
test_trans = ct.fit_transform(X_test)

In [208]:
# Determine intervals for extended grid search
interv_est = [5, 15, 40, 100, 200, 400, 800, 1200]
interv_lr = [0.001, 0.005, 0.01, 0.015, 0.02, 0.04, 0.065, 0.1, 0.12, 0.2, 0.4, 0.7, 1.1, 1.6] 
interv_d = list(range(1,11))
interv_C = list(np.around(np.arange(0.1,10.11,0.8),2))

In [209]:
# what classifiers and what gridspace to test
classifiers = [('GBC', GradientBoostingClassifier(),{'loss': ('deviance', 'exponential'), 'n_estimators': interv_est, 'learning_rate': interv_lr}),
               ('ADA', AdaBoostClassifier(), {'n_estimators': interv_est, 'learning_rate': interv_lr}),
               ('RFC', RandomForestClassifier(), {'n_estimators': interv_est, 'criterion': ('gini', 'entropy'), 'max_depth': interv_d, 'max_features': ('sqrt', 'log2')}),
               ('LogReg',LogisticRegression(solver = 'saga', max_iter = 100000), {'penalty': ('l1', 'l2'), 'C': interv_C}),
               ('KNN', KNeighborsClassifier(), {'n_neighbors':interv_d, 'weights': ('uniform','distance')}),
               ('SVC',SVC(probability=True), {'kernel':('linear','poly', 'rbf'), 'C':interv_C})
              ]

In [210]:
## Extended grid search. Computationally inefficient but since it is a very small dataset and simple models 
## it is not too much work. 
clfs_scores_names = []
for name, classifier, param_grid in classifiers:
    print(f"Now running tests on {name}. We will test the parameters {param_grid}\n")
    t1 = time.time()
    
    clf = GridSearchCV(estimator = classifier, param_grid = param_grid)
    clf.fit(train_trans, y_train)
    
    print(f"The best estimator was: {clf.best_estimator_}")
    print(f"It recieved a score of: {np.around(clf.best_score_, 5)}")
    if(clf.best_score_ > 0.7):
        clfs_scores_names.append((clf.best_estimator_, clf.best_score_,name))    
        
    t2 = time.time()
    print(f"It took {np.around((t2-t1)/60,2)} minutes\n")

Now running tests on GBC. We will test the parameters {'loss': ('deviance', 'exponential'), 'n_estimators': [5, 15, 40, 100, 200, 400, 800, 1200], 'learning_rate': [0.001, 0.005, 0.01, 0.015, 0.02, 0.04, 0.065, 0.1, 0.12, 0.2, 0.4, 0.7, 1.1, 1.6]}

The best estimator was: GradientBoostingClassifier(learning_rate=0.005, n_estimators=800)
It recieved a score of: 0.84133
It took 11.97 minutes

Now running tests on ADA. We will test the parameters {'n_estimators': [5, 15, 40, 100, 200, 400, 800, 1200], 'learning_rate': [0.001, 0.005, 0.01, 0.015, 0.02, 0.04, 0.065, 0.1, 0.12, 0.2, 0.4, 0.7, 1.1, 1.6]}

The best estimator was: AdaBoostClassifier(learning_rate=0.04, n_estimators=800)
It recieved a score of: 0.82
It took 6.15 minutes

Now running tests on RFC. We will test the parameters {'n_estimators': [5, 15, 40, 100, 200, 400, 800, 1200], 'criterion': ('gini', 'entropy'), 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'max_features': ('sqrt', 'log2')}

The best estimator was: RandomForestC

In [211]:
clfs_scores_names.sort(key = lambda tup: tup[1], reverse = True)
estimators = []
votes = []
for row in clfs_scores_names:
    estimators.append((row[2],row[0]))
    votes.append(row[1])

In [212]:
riksdagen = VotingClassifier(estimators = estimators, voting = 'soft', weights = votes)
cv = 8
print(f"Mean score from {cv} cross validations: {np.mean(cross_val_score(riksdagen, train_trans, y_train))}")

Mean score from 8 cross validations: 0.8413333333333334


In [213]:
ans = riksdagen.fit(train_trans,y_train).predict(test_trans)

In [214]:
answer = ""
for i in ans:
    answer = answer + str(i)
print(len(answer))
print(answer)

200
00010011001101101011001100100111011111010101110110001101100011001111101011110110110101101000011011111011110111110011101001101110101011111111101011001011001111100101111111111000111011111110100110100111
