# Data import

In [1]:
import sqlite3
conn = sqlite3.connect('C:/JupData/mxm_dataset.db')
print(conn)

<sqlite3.Connection object at 0x000002278CC03E30>


In [2]:
res = conn.execute("SELECT * FROM sqlite_master WHERE type='table'")
print(res)

<sqlite3.Cursor object at 0x000002278CF0FCE0>


Metadata import

In [3]:
conn_tmdb = sqlite3.connect('C:/JupData/track_metadata.db')

In [4]:
res.fetchall()

[('table', 'words', 'words', 2, 'CREATE TABLE words (word TEXT PRIMARY KEY)'),
 ('table',
  'lyrics',
  'lyrics',
  4,
  'CREATE TABLE lyrics (track_id, mxm_tid INT, word TEXT, count INT, is_test INT, FOREIGN KEY(word) REFERENCES words(word))')]

# Playing around in sql

In [None]:
res = conn.execute("SELECT word FROM words")
len(res.fetchall())

In [None]:
res = conn.execute("SELECT word FROM words WHERE ROWID BETWEEN 50 AND 60")
res.fetchall()

In [None]:
res = conn.execute("SELECT track_id FROM lyrics WHERE word='countdown' ORDER BY RANDOM() LIMIT 1")

In [None]:
res.fetchall()

In [None]:
res = conn_tmdb.execute("SELECT artist_name, title FROM songs WHERE track_id='TRCKLDK12903CEA5A9'")
res.fetchall()

# Got sick of SQL and switched to pandas

<b> Import data - select quantity with the number after "limit"

In [5]:
#Import words data

import pandas as pd
df = pd.read_sql_query("select * from lyrics limit 50000000;", conn)

In [6]:
print(df.shape)
print(df.tail())
print(len(df['word'].value_counts()))

(19045332, 5)
                    track_id  mxm_tid    word  count  is_test
19045327  TRZZZZD128F4236844  2466899  easili      1        1
19045328  TRZZZZD128F4236844  2466899  disast      1        1
19045329  TRZZZZD128F4236844  2466899   frown      1        1
19045330  TRZZZZD128F4236844  2466899    teas      1        1
19045331  TRZZZZD128F4236844  2466899   upset      1        1
5000


In [7]:
#Import track information data to maps track id to title

df_meta=pd.read_sql_query("SELECT * from songs limit 5000000;", conn_tmdb)

In [1]:
print(df_meta.shape)
print(df_meta.tail(1))

NameError: name 'df_meta' is not defined

In [9]:
df_titles=df_meta.merge(df, on='track_id', how='right')

In [12]:
#print(df_titles.shape, '\n', df_titles[df_titles['track_id']=='TRAAPKW128F428BC93'])

In [10]:
df_titles.drop(labels=['song_id', 'shs_perf', 'shs_work', 'mxm_tid'],axis=1, inplace=True)

<b> first genres dataset, didn't end up using it

In [None]:
#first genres dataset

df_genres=pd.read_csv('C:\JupData\msd-topMAGD-genreAssignment.cls', delimiter="\t", header=None)

In [None]:
df_genres.columns=['track_id','genre']
print(df_genres.head(1))

In [None]:
print(df_genres[df_genres['genre']=='Country'])

<b> second genre dataset, nicer (more merges) 

In [11]:
df_genres2=pd.read_csv('C:\JupData\msd_tagtraum_cd1.cls', delimiter="\t", skiprows=8, usecols=[0,1], header=None)

In [12]:
df_genres2.columns=['track_id', 'genre']
print(df_genres2.head())

             track_id       genre
0  TRAAAAW128F429D538         Rap
1  TRAAABD128F429CF47    Pop_Rock
2  TRAAAED128E0783FAB        Jazz
3  TRAAAEF128F4273421    Pop_Rock
4  TRAAAEM128F93347B9  Electronic


<b> merging genres data with titles and words data

In [13]:
df_merged=df_titles.merge(df_genres2, on='track_id', how='inner')

In [14]:
print(df_merged.shape)

(5215622, 15)


In [15]:
print(df_merged.head(1))

             track_id           title         release           artist_id  \
0  TRMMMNO128F93539AA  In The Journey  In The Journey  AR4TLW81187B99683D   

                            artist_mbid    artist_name   duration  \
0  0685ac4a-5cfc-408a-b391-903ea20e00bf  Martin Sexton  319.81669   

   artist_familiarity  artist_hotttnesss  year  track_7digitalid word  count  \
0            0.641198           0.448653  2001           5749967    i     30   

   is_test genre  
0        0  Folk  


In [16]:
#checking that we have enough country data, making sure the data looks ok

import numpy as np
df_merged['is_country'] = np.where((df_merged['genre'] ==  'Country') | (df_merged['genre']=='Folk'), 1,0)

#df_merged['is_country']=0
#df_merged['is_country'][df_merged['genre']=='Country']=1

In [22]:
print(df_merged.genre.value_counts())

Pop_Rock         3610146
Rap               510046
Country           283100
RnB               217097
Electronic        165838
Folk              120531
Reggae             92112
Latin              81019
Jazz               64214
Blues              42939
International      14898
Vocal               7277
New Age             6405
Name: genre, dtype: int64


In [23]:
print(df_merged[df_merged['is_country']==1])

                   track_id                     title  \
0        TRMMMNO128F93539AA            In The Journey   
1        TRMMMNO128F93539AA            In The Journey   
2        TRMMMNO128F93539AA            In The Journey   
3        TRMMMNO128F93539AA            In The Journey   
4        TRMMMNO128F93539AA            In The Journey   
5        TRMMMNO128F93539AA            In The Journey   
6        TRMMMNO128F93539AA            In The Journey   
7        TRMMMNO128F93539AA            In The Journey   
8        TRMMMNO128F93539AA            In The Journey   
9        TRMMMNO128F93539AA            In The Journey   
10       TRMMMNO128F93539AA            In The Journey   
11       TRMMMNO128F93539AA            In The Journey   
12       TRMMMNO128F93539AA            In The Journey   
13       TRMMMNO128F93539AA            In The Journey   
14       TRMMMNO128F93539AA            In The Journey   
15       TRMMMNO128F93539AA            In The Journey   
16       TRMMMNO128F93539AA    

In [21]:
#excluding pop rock
df_merged2=df_merged[df_merged.genre!='Pop_Rock']

In [17]:
df_model=df_merged[['track_id', 'is_country', 'word', 'count']]

# Sparse matrix representation - efficient

In [77]:
df_model

Unnamed: 0,track_id,is_country,word,count
0,TRMMMNO128F93539AA,1,i,30
1,TRMMMNO128F93539AA,1,the,15
2,TRMMMNO128F93539AA,1,you,4
3,TRMMMNO128F93539AA,1,to,6
4,TRMMMNO128F93539AA,1,and,15
5,TRMMMNO128F93539AA,1,a,5
6,TRMMMNO128F93539AA,1,me,3
7,TRMMMNO128F93539AA,1,it,13
8,TRMMMNO128F93539AA,1,not,11
9,TRMMMNO128F93539AA,1,in,6


In [18]:
from scipy.sparse import csr_matrix

track_sorted=sorted(df_model.track_id.unique())
words_sorted=sorted(df_model.word.unique())

track_id_u = list(track_sorted)
word_u = list(words_sorted)

data = df_model['count'].tolist()
row = df_model.track_id.astype('category', categories=track_id_u).cat.codes
col = df_model.word.astype('category', categories=word_u).cat.codes
sparse_matrix = csr_matrix((data, (row, col)), shape=(len(track_id_u), len(word_u)))

In [19]:
x_train=np.array(sparse_matrix.todense())

In [23]:
y_train=df_model.groupby('track_id', sort=False)['is_country'].agg('mean')
y_train=np.array(y_train)
print(y_train[2980:3000])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
pd.DataFrame(x_train).to_pickle('x_train.pickle')
pd.DataFrame(y_train).to_pickle('y_train.pickle')

<b> TFIDF weighting

In [116]:
counts=sparse_matrix.todense()
from sklearn.feature_extraction.text import TfidfTransformer
x_train_tfidf = TfidfTransformer(use_idf=False).fit_transform(counts)
x_train=x_train_tfidf

# Extra data pre-processing for more expensive models
SVM, Logistic and RF take a while to run, and could be prone to overfitting. The code below only keeps the top "top" most popular words (change 1 cell down to implement this)

In [2]:
#lOAD IN PRELOADED DATASET
import pandas as pd
import numpy as np
x_train=pd.read_csv('x_train.csv')
y_train=pd.read_csv('y_train.csv')

In [20]:
#REMOVE THIS LATER IF NO LONGER LOADING IN CSVS
y_train.drop('Unnamed: 0', inplace=True, axis=1)

In [11]:
#Creates an array that sums up each column of x_train, sorts it, keeps the top x and subsets based on this
#CHANGED TO DENSE LINE BECAUSE I LOADED IN A CSV
#x_train_pd=pd.DataFrame(x_train.todense())
x_train.index=x_train.iloc[:,0]
x_train.drop('Unnamed: 0', inplace=True, axis=1)
x_train_pd=x_train
x_train_sum=pd.DataFrame(x_train_pd.sum(axis=0))
print(x_train_sum.shape)
x_train_sum['idx']=range(len(x_train_sum))
x_train_sum.sort_values(by=0, axis=0, ascending=False, inplace=True)

(5000, 1)


In [13]:
#Change top and the code below to exclude some observations.
_from=0
_to=1000
keep_columns=x_train_sum.iloc[_from:_to]['idx']
keep_columns_array=np.array(keep_columns)
x_train_subset=x_train_pd.iloc[:,keep_columns_array]
x_train2=np.array(x_train_subset)

Removing stop words (the stop words list is at the end of the notebook, for style)

In [14]:
#This significantly decreases our sample size :( hopefully it's all noise
df_model_nostop=df_model.loc[~df_model.word.isin(stop_words)]
df_model_nostop.shape

NameError: name 'df_model' is not defined

In [None]:
from scipy.sparse import csr_matrix

track_sorted=sorted(df_model_nostop.track_id.unique())
words_sorted=sorted(df_model_nostop.word.unique())

track_id_u = list(track_sorted)
word_u = list(words_sorted)

data = df_model_nostop['count'].tolist()
row = df_model_nostop.track_id.astype('category', categories=track_id_u).cat.codes
col = df_model_nostop.word.astype('category', categories=word_u).cat.codes
sparse_matrix_nostop = csr_matrix((data, (row, col)), shape=(len(track_id_u), len(word_u)))

x_train_nostop=np.array(sparse_matrix_nostop.todense())
y_train_nostop=df_model_nostop.groupby('track_id', sort=False)['is_country'].agg('mean')
y_train_nostop=np.array(y_train_nostop)

I'm afraid of overfitting without stop words. Create a subset of the top non stop words (note the index is shorter because of removal of stop words)

In [None]:
x_train_nostop_pd=pd.DataFrame(x_train_nostop)
x_train_nostop_sum=pd.DataFrame(x_train_nostop_pd.sum(axis=0))
print(x_train_nostop_sum.shape)
x_train_nostop_sum['idx']=range(len(x_train_nostop_sum))
x_train_nostop_sum.sort_values(by=0, axis=0, ascending=False, inplace=True)

(4897, 1)


In [None]:
#Change top and the code below to exclude some observations.
_from=0
_to=1000
keep_columns=x_train_nostop_sum.iloc[_from:_to]['idx']
keep_columns_array=np.array(keep_columns)
x_train_nostop_subset=x_train_nostop_pd.iloc[:,keep_columns_array]
x_train_nostop2=np.array(x_train_nostop_subset)

In [None]:
print(x_train_nostop2.shape,y_train_nostop.shape)

(62689, 1000) (62689,)


Dataset with only stop words

In [None]:
#This significantly decreases our sample size :( hopefully it's all noise
df_model_stop=df_model.loc[df_model.word.isin(stop_words)]
df_model_stop.shape

(1726154, 4)

In [None]:
from scipy.sparse import csr_matrix

track_sorted=sorted(df_model_stop.track_id.unique())
words_sorted=sorted(df_model_stop.word.unique())

track_id_u = list(track_sorted)
word_u = list(words_sorted)

data = df_model_stop['count'].tolist()
row = df_model_stop.track_id.astype('category', categories=track_id_u).cat.codes
col = df_model_stop.word.astype('category', categories=word_u).cat.codes
sparse_matrix_stop = csr_matrix((data, (row, col)), shape=(len(track_id_u), len(word_u)))

x_train_stop=np.array(sparse_matrix_stop.todense())
y_train_stop=df_model_stop.groupby('track_id', sort=False)['is_country'].agg('mean')
y_train_stop=np.array(y_train_stop)

<b> PCA

In [70]:
from sklearn.decomposition import PCA
pca_test=PCA(n_components=10, whiten=True)

In [None]:
x_train_pca=pca_test.fit_transform(x_train)

In [73]:
sum(pca_test.explained_variance_ratio_)

0.27942661918653444

Test/train split - choose data here. Options are:
- x_train (all data, top 5k words)
- x_train2 (all data, subset of words - see _from and _to 6 cells up)
- x_train_nostop (no stop words)
- x_train_nostop2 (no stop words, subset of words - see _from and _to 2 cells up)

In [45]:
pd.DataFrame(x_train).to_csv('x_train.csv')

In [46]:
pd.DataFrame(y_train).to_csv('y_train.csv')

In [3]:
#data import to skip above
import pandas as pd
y_train=pd.read_csv('y_train.csv')
x_train=pd.read_csv('x_train.csv')

In [15]:
# RUN THIS CELL TO CHANGE SAMPLE
import numpy as np
x=x_train
y=np.array(y_train['0'])
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Naive bayes

In [75]:
# Naive bayes 
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

model1=MultinomialNB(fit_prior=False, alpha=1)
model1.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


ValueError: Input X must be non-negative

In [48]:
#Metrics
expected=Y_test
predicted=model1.predict(X_test)
print('training data \n' + metrics.classification_report(expected, predicted))
matrix=metrics.confusion_matrix(Y_test, predicted)
print(matrix)

training data 
             precision    recall  f1-score   support

          0       0.93      0.92      0.93     17449
          1       0.07      0.07      0.07      1359

avg / total       0.87      0.86      0.86     18808

[[16100  1349]
 [ 1259   100]]


# Random Forest


In [86]:
from sklearn import ensemble
rf = ensemble.RandomForestClassifier(n_estimators=300, max_features=200, max_depth=3, class_weight='auto', criterion='gini')

In [87]:
rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=3, max_features=200, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [88]:
from sklearn import metrics
#Metrics test
expected=Y_test
predicted=rf.predict(X_test)
print('test data \n' + metrics.classification_report(expected, predicted))
matrix=metrics.confusion_matrix(Y_test, predicted)
print(matrix)

#Metrics train
expected=Y_train
predicted=rf.predict(X_train)
print('training data \n' + metrics.classification_report(expected, predicted))
matrix=metrics.confusion_matrix(Y_train, predicted)
print(matrix)

test data 
             precision    recall  f1-score   support

          0       0.93      0.80      0.86     17449
          1       0.07      0.19      0.10      1359

avg / total       0.86      0.76      0.80     18808

[[13974  3475]
 [ 1106   253]]
training data 
             precision    recall  f1-score   support

          0       0.95      0.82      0.88     40722
          1       0.17      0.48      0.25      3163

avg / total       0.90      0.79      0.84     43885

[[33305  7417]
 [ 1635  1528]]


In [None]:
for z in rf.feature_importances_:
    print(X_train)

# Logistic

In [16]:
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(class_weight='balanced')

In [17]:
print(X_train.shape, Y_train.shape)

(43885, 5001) (43885,)


In [18]:
model2=logit.fit(X_train,Y_train)

In [19]:
from sklearn import metrics
#Metrics test
expected=Y_test
predicted=model2.predict(X_test)
print('test data \n' + metrics.classification_report(expected, predicted))
matrix=metrics.confusion_matrix(Y_test, predicted)
print(matrix)

#Metrics train
expected=Y_train
predicted=model2.predict(X_train)
print('training data \n' + metrics.classification_report(expected, predicted))
matrix=metrics.confusion_matrix(Y_train, predicted)
print(matrix)

test data 
             precision    recall  f1-score   support

          0       0.93      0.59      0.72     17449
          1       0.07      0.42      0.13      1359

avg / total       0.87      0.58      0.68     18808

[[10335  7114]
 [  790   569]]
training data 
             precision    recall  f1-score   support

          0       0.95      0.61      0.74     40722
          1       0.10      0.56      0.17      3163

avg / total       0.89      0.60      0.70     43885

[[24694 16028]
 [ 1379  1784]]


# SVM

In [52]:
from sklearn import svm
svmlin=svm.LinearSVC(class_weight='balanced')

In [79]:
model4=svmlin.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


In [80]:
#Metrics test
expected=Y_test
predicted=model4.predict(X_test)
print('test data \n' + metrics.classification_report(expected, predicted))
matrix=metrics.confusion_matrix(Y_test, predicted)
print(matrix)

#Metrics train
expected=Y_train
predicted=model4.predict(X_train)
print('training data \n' + metrics.classification_report(expected, predicted))
matrix=metrics.confusion_matrix(Y_train, predicted)
print(matrix)

test data 
             precision    recall  f1-score   support

          0       0.93      0.67      0.78     17449
          1       0.07      0.32      0.12      1359

avg / total       0.86      0.65      0.73     18808

[[11778  5671]
 [  930   429]]
training data 
             precision    recall  f1-score   support

          0       0.93      0.68      0.79     40722
          1       0.07      0.33      0.12      3163

avg / total       0.87      0.66      0.74     43885

[[27763 12959]
 [ 2126  1037]]


# SVM hyper paramater tuning
maybe this will help with overfitting by adjusting the C parameter? Idk

In [54]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from __future__ import print_function


In [55]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
scores = ['precision', 'recall']


In [56]:
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, Y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = Y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


KeyboardInterrupt: 

In [None]:
print("x")

# Close sql connection

In [None]:
conn.close()
conn_tmdb.close()

# Sparse matrix inefficient
This is legacy code, it's ridiculously slow

In [None]:
df_model_piv=df_model.pivot(index='track_id', columns='word', values='count')
print(df_model_piv.head(20))

In [None]:
df_model_piv=df_model_piv.iloc[0:100000]
df_model_piv.replace(to_replace='Nan', value=0, inplace=True)
df_model_piv = df_model_piv.rename(columns={'word': 'track_id'})
print(df_model_piv.head(1))

In [None]:
df_is_country=df_merged[['track_id', 'is_country']]
df_is_country=df_is_country.groupby(by='track_id', as_index=True).mean()
print(df_is_country[df_is_country.is_country==True])

In [None]:
df_model_sparse=pd.merge(df_model_piv, df_is_country, left_index=True, right_index=True, how='left')

In [None]:
#df_model_sparse.to_csv('C:/JupData/abc.csv')

In [None]:
from sklearn import preprocessing
import numpy as np

y_train=np.array(df_model_sparse['is_country'])
df_model_sparse.drop('is_country', axis=1, inplace=True)

In [None]:
df_model_sparse.drop('track_id', axis=1, inplace=True)

In [None]:
x_train=np.array(df_model_sparse)

In [None]:
print(x_train.shape, y_train.shape)

In [None]:
print(x_train[2860], y_train[2980:3000])

# Stop words list

In [36]:
stop_words=["a",
"about",
"above",
"after",
"again",
"against",
"all",
"am",
"an",
"and",
"any",
"are",
"aren't",
"as",
"at",
"be",
"because",
"been",
"before",
"being",
"below",
"between",
"both",
"but",
"by",
"can't",
"cannot",
"could",
"couldn't",
"did",
"didn't",
"do",
"does",
"doesn't",
"doing",
"don't",
"down",
"during",
"each",
"few",
"for",
"from",
"further",
"had",
"hadn't",
"has",
"hasn't",
"have",
"haven't",
"having",
"he",
"he'd",
"he'll",
"he's",
"her",
"here",
"here's",
"hers",
"herself",
"him",
"himself",
"his",
"how",
"how's",
"i",
"i'd",
"i'll",
"i'm",
"i've",
"if",
"in",
"into",
"is",
"isn't",
"it",
"it's",
"its",
"itself",
"let's",
"me",
"more",
"most",
"mustn't",
"my",
"myself",
"no",
"nor",
"not",
"of",
"off",
"on",
"once",
"only",
"or",
"other",
"ought",
"our",
"ours",
"ourselves",
"out",
"over",
"own",
"same",
"shan't",
"she",
"she'd",
"she'll",
"she's",
"should",
"shouldn't",
"so",
"some",
"such",
"than",
"that",
"that's",
"the",
"their",
"theirs",
"them",
"themselves",
"then",
"there",
"there's",
"these",
"they",
"they'd",
"they'll",
"they're",
"they've",
"this",
"those",
"through",
"to",
"too",
"under",
"until",
"up",
"very",
"was",
"wasn't",
"we",
"we'd",
"we'll",
"we're",
"we've",
"were",
"weren't",
"what",
"what's",
"when",
"when's",
"where",
"where's",
"which",
"while",
"who",
"who's",
"whom",
"why",
"why's",
"with",
"won't",
"would",
"wouldn't",
"you",
"you'd",
"you'll",
"you're",
"you've",
"your",
"yours",
"yourself",
"yourselves",
]


# Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(class_weight='balanced_subsample')

In [None]:
model3=rfc.fit(X_train, Y_train)

In [None]:
#Metrics test
expected=Y_test
predicted=model3.predict(X_test)
print('test data \n' + metrics.classification_report(expected, predicted))
matrix=metrics.confusion_matrix(Y_test, predicted)
print(matrix)

#Metrics train
expected=Y_train
predicted=model3.predict(X_train)
print('training data \n' + metrics.classification_report(expected, predicted))
matrix=metrics.confusion_matrix(Y_train, predicted)
print(matrix)