In [1]:
import pandas as pd
import numpy as np
from utils import *
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import hamming_loss
import time
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score

# Timestamp taken when this cell starts. Nothing in this cell reads it; the
# later timing cells reset `start` before measuring their own work.
start = time.time()

# Upper bound on vocabulary size (CountVectorizer max_features).
n = 20000

# Number of most frequent tags kept as prediction targets.
num_tags = 100

# Load all lyric data into a pandas dataframe.
# NOTE(review): absolute local path -- only resolves on the original author's
# machine; consider a configurable data directory.
df = pd.read_csv('/Users/Hadoop/Desktop/DS1003_MLProject/lyric_data.csv', index_col=0)

# Sometimes the API returns an error message rather than actual lyrics. The
# single most frequent 'lyrics' value is taken to be that message.
bad_song = df['lyrics'].value_counts().index[0]

# Keep only the rows we have real lyrics for: drop missing/empty lyrics and the
# error message. Filtering directly (instead of blanking every column of the
# bad rows and filtering afterwards) removes the same rows without writing ''
# into unrelated columns.
df = df.fillna('')
df = df[(df['lyrics'] != '') & (df['lyrics'] != bad_song)]

# List of lists of tags, one per example.
# (clean_tags / tag_freq / tag2index are not defined in this notebook --
# presumably they come from `from utils import *`.)
tags = [clean_tags(raw) for raw in list(df['tags'])]

# List of (tag, frequency) tuples in descending order.
tf = tag_freq(tags)

# Choose which tags to restrict the model to, and give each a column index.
important_tags = [x[0] for x in tf[0:num_tags]]
important_tags = dict(zip(important_tags, range(len(important_tags))))

# Maps the tags of each example in 'tags' to integer indices.
indices = tag2index(tags, important_tags)

# Convert the per-example index lists to binary indicator rows.
# The loop variable is deliberately NOT called `tags`, so the list built above
# is not overwritten by the last example's indices.
y = np.zeros((len(indices), num_tags))
for i, tag_ids in enumerate(indices):
    for tag in tag_ids:
        y[i, tag] = 1

# Build vocabulary and tokenizer.
vect = CountVectorizer(max_features=n, stop_words='english')
X = vect.fit_transform(df['lyrics'])
vocab = vect.vocabulary_
tok = vect.build_analyzer()

# Hold out 20% for testing. (This used to be test_size=0.8, which trained on
# only a fifth of the data and disagreed with the re-split in the next cell.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [7]:
# Re-split with a fixed seed: 80% of the rows for training, 20% held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Compare one-vs-rest classifiers on the full tag matrix.
# Each entry of `results` is a tuple: (label, model.score on the test split,
# F-score). list.append takes exactly one argument, so the three values are
# wrapped in a tuple.
# NOTE(review): f_score is not defined in this notebook -- presumably provided
# by `from utils import *`; confirm it expects (y_true, predicted probabilities).
results = []

# NAIVE BAYES
model = OneVsRestClassifier(MultinomialNB())
model.fit(X_train, y_train)
results.append((
    'Naive Bayes',
    model.score(X_test, y_test),
    f_score(y_test, model.predict_proba(X_test)),
))

# Values of C tried for both the SVM and the logistic regression.
C = [.01, .1, 1, 10]

# LINEAR SVM
# LinearSVC has no predict_proba, so the F-score is computed with sklearn's
# f1_score over the flattened hard predictions instead of f_score.
for c in C:
    model = OneVsRestClassifier(LinearSVC(max_iter=4, C=c))
    model.fit(X_train, y_train)
    results.append((
        'SVM ' + str(c),
        model.score(X_test, y_test),
        f1_score(y_test.flatten(), model.predict(X_test).flatten()),
    ))

# LOGISTIC REGRESSION
for c in C:
    model = OneVsRestClassifier(LogisticRegression(max_iter=4, C=c))
    model.fit(X_train, y_train)
    results.append((
        'Logistic Regresion ' + str(c),
        model.score(X_test, y_test),
        f_score(y_test, model.predict_proba(X_test)),
    ))

In [9]:
# One-vs-rest Multinomial Naive Bayes, fitted and scored on the first five
# tag columns only. Prints the test score followed by the seconds elapsed
# since `start` (fit + scoring).
start = time.time()
model = OneVsRestClassifier(MultinomialNB())
model.fit(X_train, y_train[:, :5])
print(model.score(X_test, y_test[:, :5]), time.time() - start)


0.228381256656 1.4968719482421875


In [19]:
from sklearn.metrics import f1_score

# Sweep LinearSVC's iteration cap over 1, 6, 11, ..., 46 using only the first
# five tag columns. For each setting print: test score, F1 over the flattened
# label matrix, and seconds elapsed since the iteration started.
for max_iter in range(1, 51, 5):

    start = time.time()
    model = OneVsRestClassifier(LinearSVC(max_iter=max_iter))
    model.fit(X_train, y_train[:, :5])
    print(
        model.score(X_test, y_test[:, :5]),
        f1_score(y_test[:, :5].flatten(), model.predict(X_test).flatten()),
        time.time() - start,
    )

0.455431309904 0.0979994327727 3.4441399574279785
0.451783812567 0.205297359221 16.343451023101807


KeyboardInterrupt: 

In [18]:
# Sweep LogisticRegression's iteration cap over 1..5 using only the first five
# tag columns. For each setting print: test score, f_score on the predicted
# probabilities, and seconds elapsed since the iteration started.
# (f_score is not defined in this notebook -- presumably from utils.)
for max_iter in range(1, 6):

    start = time.time()
    model = OneVsRestClassifier(LogisticRegression(max_iter=max_iter, dual=False))
    model.fit(X_train, y_train[:, :5])
    print(
        model.score(X_test, y_test[:, :5]),
        f_score(y_test[:, :5], model.predict_proba(X_test)),
        time.time() - start,
    )

0.455431309904 0.401055292631 3.6266438961029053
0.44928115016 0.441968190211 5.783310890197754
0.450186368477 0.457843112449 8.269033908843994


KeyboardInterrupt: 