In [43]:
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report, confusion_matrix


In [50]:
conn = sqlite3.connect('/Users/aaronlevi/Documents/sql_db/pick-a-tune.db')
# conn = sqlite3.connect('/Users/aaronlevi/Documents/sql_db/chords_list.db')
cur = conn.cursor()

# cur.close()
# conn.close()

In [65]:
allFeatures = pd.read_sql_query("SELECT * FROM features", conn)
labels = pd.read_sql_query("SELECT label FROM basic_info", conn)

# allFeatures = allFeatures.reset_index(drop=True)
# allFeatures.drop('index', axis=1, inplace=True)

# labels = labels.reset_index(drop=True)

allFeatures = allFeatures.iloc[57:1329].reset_index(drop=True)
allFeatures.drop('index', axis=1, inplace=True)

labels = labels.iloc[57:1329].reset_index(drop=True)

In [66]:
dummyLabel=pd.get_dummies(labels)
dummyLabel.head()

Unnamed: 0,Label_advanced,Label_intermediate,Label_novice
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,0,1


In [67]:
# making a few df's for ease of plotting... kinda clunky
featuresWithLabels = pd.concat([allFeatures, dummyLabel], axis=1, sort=False)
featuresWithLabels.dropna(axis=0, how='any', inplace=True)

featuresNovice = pd.concat([allFeatures, dummyLabel['Label_novice']], axis=1, sort=False)
featuresNovice.dropna(axis=0, how='any', inplace=True)

featuresAdv = pd.concat([allFeatures, dummyLabel['Label_advanced']], axis=1, sort=False)
featuresAdv.dropna(axis=0, how='any', inplace=True)
featuresAdv.head()

Unnamed: 0,tempo,duration_m,time_signature,energy,n_unique_chords,n_difficult_chords,chord_per_min,Label_advanced
0,123.015,1.786167,4.0,0.245,15,14,23.514043,1
1,92.64,3.737333,4.0,0.139,12,0,21.673207,0
2,131.712,12.112833,4.0,0.26,8,4,18.823011,0
3,129.937,4.4761,4.0,0.364,14,8,21.447242,1
4,83.72,2.723117,4.0,0.307,13,4,23.135256,0


In [68]:
small_adv = featuresAdv[featuresAdv['Label_advanced']==1]['n_unique_chords']<=3 
small_adv_ix = [i for i, x in enumerate(small_adv) if x]
sum(small_adv)

2

In [69]:
# advanced label
X = featuresAdv.drop(featuresAdv.index[small_adv_ix])
X.dropna(axis=0, how='any', inplace=True)
X.reset_index(drop=True)

y = X['Label_advanced']
X.drop('Label_advanced', axis=1, inplace=True)

In [70]:
def balanced_subsample(x,y,subsample_size=1.0):

    class_xs = []
    min_elems = None

    for yi in np.unique(y):
        elems = x[(y == yi)]
        class_xs.append((yi, elems))
        if min_elems == None or elems.shape[0] < min_elems:
            min_elems = elems.shape[0]

    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems*subsample_size)

    xs = []
    ys = []

    for ci,this_xs in class_xs:
        if len(this_xs) > use_elems:
#             np.random.shuffle(this_xs)
            this_xs = this_xs.reindex(np.random.permutation(this_xs.index))


        x_ = this_xs[:use_elems]
        y_ = np.empty(use_elems)
        y_.fill(ci)

        xs.append(x_)
        ys.append(y_)

#     xs = np.concatenate(xs)
#     ys = np.concatenate(ys)
    xs = pd.concat(xs)
    ys = pd.Series(data=np.concatenate(ys),name='target')

    return xs,ys

In [71]:
Xs, ys = balanced_subsample(X,y)
sum(ys) / len(ys)

0.5

In [72]:
scaler = StandardScaler()
scaler.fit(Xs)
X_scale = scaler.transform(Xs)

X_train, X_test, y_train, y_test = train_test_split(X_scale, ys, test_size=0.2, random_state=10)

lr_model_adv = LogisticRegression()
lr_model_adv.fit(X_train, y_train) 

LogisticRegression()

In [73]:
adv_predictions = lr_model_adv.predict(X_test)
adv_true = y_test

print(confusion_matrix(y_test, adv_predictions))
print(classification_report(y_test, adv_predictions))

[[87  8]
 [23 63]]
              precision    recall  f1-score   support

         0.0       0.79      0.92      0.85        95
         1.0       0.89      0.73      0.80        86

    accuracy                           0.83       181
   macro avg       0.84      0.82      0.83       181
weighted avg       0.84      0.83      0.83       181



In [74]:
pd.DataFrame(lr_model_adv.coef_, columns=X.columns)

Unnamed: 0,tempo,duration_m,time_signature,energy,n_unique_chords,n_difficult_chords,chord_per_min
0,0.02886,0.200514,-0.09744,0.130787,1.072434,1.761474,0.294124


In [75]:
import pickle
# save the model to disk
filename = 'model_files/lr_adv_model_1004.sav'
pickle.dump(lr_model_adv, open(filename, 'wb'))

In [79]:
# novice label
# X = featuresNovice.drop(featuresNovice.index[small_adv_ix])
X = featuresNovice.drop(featuresAdv.index[small_adv_ix])
X.reset_index(drop=True, inplace=True)

X.dropna(axis=0, how='any', inplace=True)
X.reset_index(drop=True, inplace=True)

y = X['Label_novice']
X.drop('Label_novice', axis=1, inplace=True)

In [82]:
Xs, ys = balanced_subsample(X,y)

scaler.fit(Xs)
X_scale = scaler.transform(Xs)

splitIx=y_test.index.tolist()
X_train, X_test, y_train, y_test = train_test_split(X_scale, ys, test_size=0.2, random_state=101)

lr_model_nov = LogisticRegression()
lr_model_nov.fit(X_train, y_train) 

nov_predictions = lr_model_nov.predict(X_test)
nov_true = y_test

In [83]:
print(confusion_matrix(y_test, nov_predictions))
print(classification_report(y_test, nov_predictions))

[[56 27]
 [14 61]]
              precision    recall  f1-score   support

         0.0       0.80      0.67      0.73        83
         1.0       0.69      0.81      0.75        75

    accuracy                           0.74       158
   macro avg       0.75      0.74      0.74       158
weighted avg       0.75      0.74      0.74       158



In [84]:
pd.DataFrame(lr_model_nov.coef_, columns=X.columns)

Unnamed: 0,tempo,duration_m,time_signature,energy,n_unique_chords,n_difficult_chords,chord_per_min
0,0.105185,-0.024694,0.100003,-0.217765,-1.427649,-0.796986,0.035308


In [85]:
import pickle
# save the model to disk
filename = 'model_files/lr_nov_model_1004.sav'
pickle.dump(lr_model_nov, open(filename, 'wb'))