# Text Cleaning & Processing 

In [1]:
import pandas as pd
import numpy as np
# Show full lyric text in previews. None disables column-width truncation;
# the older -1 sentinel was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Raise the row cap far above the dataset size so displayed frames are never elided.
pd.set_option('display.max_rows', 500000)
# Load the lyrics table; index_col=0 makes the first CSV column the row index.
logic_df = pd.read_csv('logic_df_final.csv', index_col=0)

In [2]:
# Report the frame's dimensions as (rows, columns).
logic_df.shape

(5890, 4)

# Processing 

In [3]:
#clean text
#tokenize text
#use vectorizers
#CountVectorizer builds bag-of-words token counts (it is one-hot only when binary=True)

**Remove all irrelevant characters, i.e. anything that is not a letter, digit, or whitespace (underscores are removed too)**

In [4]:
# Delete every run of characters that is neither whitespace nor a word character,
# plus underscores (which \w would otherwise keep).
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to literal matching, under which this pattern would never
# match and the lyrics would be left uncleaned without any error.
logic_df['lyric'] = logic_df['lyric'].str.replace(r'([^\s\w]|_)+', '', regex=True)

In [5]:
# Preview the first rows to check the result of the character clean-up.
logic_df.head()

Unnamed: 0,album,artist,lyric,song
0,Undeniable (2012),Young Sinatra,Yeah pass the mic before I jack it like goretex,Disgusting
1,Undeniable (2012),Young Sinatra,Bust like raw sex rappers suck like vortex,Disgusting
2,Undeniable (2012),Young Sinatra,The life of a Don We living like kings and killing our pawns,Disgusting
3,Undeniable (2012),Young Sinatra,Boy the seconds its on dont know where we going,Disgusting
4,Undeniable (2012),Young Sinatra,Im flowing and killing this shit from dusk till dawn,Disgusting


**Make all the text data lowercase**

In [6]:
# Normalise case so that e.g. "Yeah" and "yeah" become the same token later on.
logic_df['lyric'] = logic_df['lyric'].str.lower()

In [7]:
# Preview the first rows to check the lowercasing.
logic_df.head()

Unnamed: 0,album,artist,lyric,song
0,Undeniable (2012),Young Sinatra,yeah pass the mic before i jack it like goretex,Disgusting
1,Undeniable (2012),Young Sinatra,bust like raw sex rappers suck like vortex,Disgusting
2,Undeniable (2012),Young Sinatra,the life of a don we living like kings and killing our pawns,Disgusting
3,Undeniable (2012),Young Sinatra,boy the seconds its on dont know where we going,Disgusting
4,Undeniable (2012),Young Sinatra,im flowing and killing this shit from dusk till dawn,Disgusting


In [8]:
# Build the modelling frame without the album and song columns;
# logic_df itself is left untouched.
model_df = logic_df.drop(columns=["album", "song"])

In [9]:
# Preview the reduced modelling frame.
model_df.head()

Unnamed: 0,artist,lyric
0,Young Sinatra,yeah pass the mic before i jack it like goretex
1,Young Sinatra,bust like raw sex rappers suck like vortex
2,Young Sinatra,the life of a don we living like kings and killing our pawns
3,Young Sinatra,boy the seconds its on dont know where we going
4,Young Sinatra,im flowing and killing this shit from dusk till dawn


In [10]:
from nltk.tokenize import word_tokenize
import nltk
# Fetch the Punkt tokenizer data used by word_tokenize (no-op when already present).
# NOTE(review): recent NLTK releases reportedly look for 'punkt_tab' instead of
# 'punkt' - confirm against the installed NLTK version before re-running.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Benjamin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Split each lyric line into word tokens and store the lists in a new column.
model_df['tokenized_lyrics'] = model_df['lyric'].apply(word_tokenize)
# Show the first rows including the new column.
model_df.head()

Unnamed: 0,artist,lyric,tokenized_lyrics
0,Young Sinatra,yeah pass the mic before i jack it like goretex,"[yeah, pass, the, mic, before, i, jack, it, like, goretex]"
1,Young Sinatra,bust like raw sex rappers suck like vortex,"[bust, like, raw, sex, rappers, suck, like, vortex]"
2,Young Sinatra,the life of a don we living like kings and killing our pawns,"[the, life, of, a, don, we, living, like, kings, and, killing, our, pawns]"
3,Young Sinatra,boy the seconds its on dont know where we going,"[boy, the, seconds, its, on, dont, know, where, we, going]"
4,Young Sinatra,im flowing and killing this shit from dusk till dawn,"[im, flowing, and, killing, this, shit, from, dusk, till, dawn]"


In [12]:
# The raw text is no longer needed once the token lists exist.
model_df = model_df.drop(columns=["lyric"])

In [13]:
# Preview the frame now that only artist and token lists remain.
model_df.head()

Unnamed: 0,artist,tokenized_lyrics
0,Young Sinatra,"[yeah, pass, the, mic, before, i, jack, it, like, goretex]"
1,Young Sinatra,"[bust, like, raw, sex, rappers, suck, like, vortex]"
2,Young Sinatra,"[the, life, of, a, don, we, living, like, kings, and, killing, our, pawns]"
3,Young Sinatra,"[boy, the, seconds, its, on, dont, know, where, we, going]"
4,Young Sinatra,"[im, flowing, and, killing, this, shit, from, dusk, till, dawn]"


# Word2Vec (W2V) Models

In [14]:
import gensim

In [15]:
# Location of the pretrained GoogleNews vectors (gzip-compressed file).
# NOTE(review): the path points into one user's Downloads folder - consider
# moving it to a configurable data directory.
word2vec_path = "~/Downloads/GoogleNews-vectors-negative300.bin.gz"
# binary=True selects the binary word2vec file format instead of the text format.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [16]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn (< 0.18) only ships the legacy module
    from sklearn.cross_validation import train_test_split

def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Collapse a list of tokens into one vector by averaging their embeddings.

    Parameters
    ----------
    tokens_list : sequence of str
        Tokens to embed.
    vector : mapping-like
        Any object supporting ``word in vector`` and ``vector[word]``.
    generate_missing : bool, default False
        If True, each token absent from ``vector`` gets a fresh
        ``np.random.rand(k)`` vector; otherwise it gets ``np.zeros(k)``.
    k : int, default 300
        Length of the placeholder vectors (and of the result for an empty
        token list). Assumed to match the embedding size of ``vector``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean over all tokens, placeholders included, or
        ``np.zeros(k)`` when ``tokens_list`` is empty.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    # Pick the placeholder factory once; it is called per missing token below.
    placeholder = np.random.rand if generate_missing else np.zeros

    per_token = []
    for token in tokens_list:
        per_token.append(vector[token] if token in vector else placeholder(k))

    return np.divide(np.sum(per_token, axis=0), len(per_token))

def get_word2vec_embeddings(vectors, dfmerged, generate_missing=False):
    """Embed every row of a frame as the average of its token vectors.

    Parameters
    ----------
    vectors : mapping-like
        Word-vector lookup passed through to ``get_average_word2vec``.
    dfmerged : pandas.DataFrame
        Frame with a ``'tokenized_lyrics'`` column holding one token list per row.
    generate_missing : bool, default False
        Passed through to ``get_average_word2vec``.

    Returns
    -------
    list
        One averaged vector per row of ``dfmerged``, in row order.
    """
    # Read from the frame that was passed in. The previous version ignored
    # `dfmerged` and always used the notebook-global `model_df`, so the
    # argument had no effect and the function failed without that global.
    embeddings = dfmerged['tokenized_lyrics'].apply(
        lambda tokens: get_average_word2vec(tokens, vectors, generate_missing=generate_missing)
    )
    return list(embeddings)

# Embed every lyric line, then hold out 20% of the lines for testing.
# random_state fixes the shuffle so the same split is produced on every run.
embeddings = get_word2vec_embeddings(word2vec, model_df)
(X_train_word2vec,
 X_test_word2vec,
 y_train_word2vec,
 y_test_word2vec) = train_test_split(
    embeddings,
    model_df['artist'],
    test_size=0.2,
    random_state=40,
)



In [17]:
# Peek at the first held-out artist labels.
y_test_word2vec.head()

5067    Logic          
416     Young Sinatra  
3076    Bobby Tarantino
1932    Young Sinatra  
5539    Bobby Tarantino
Name: artist, dtype: object

# Gaussian NB 

In [84]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


# One-step pipeline; the step is named 'nb'.
pipe = Pipeline(steps=[('nb', GaussianNB())])
# Empty grid: nothing is tuned, so the search evaluates a single candidate.
param_grid = dict()
# 3-fold cross-validation scored on accuracy, with refit enabled.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='accuracy', refit=True)
# Fit on the averaged word2vec features of the training lines.
grid.fit(X_train_word2vec, y_train_word2vec)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None, steps=[('nb', GaussianNB(priors=None))]),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [85]:
# Predict artist labels for the held-out lines with the fitted grid search.
y_test_pred = grid.predict(X_test_word2vec)

In [86]:
from sklearn.metrics import accuracy_score
# Share of held-out lines whose predicted artist equals the true artist.
accuracy_score(y_test_word2vec, y_test_pred)

0.41765704584040747

# K-Nearest Neighbors Classifier (KNC)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# One-step pipeline, so grid keys carry the 'knc__' prefix.
pipe = Pipeline([('knc', KNeighborsClassifier())])
# Search only n_neighbors. leaf_size was removed from the grid: it tunes the
# speed and memory of the neighbour-search tree, not which neighbours are
# found, so each extra value repeated the same cross-validation with the same
# accuracy (35 candidates instead of 7).
param_grid = {'knc__n_neighbors': [1, 2, 3, 4, 5, 6, 7]}
# 3-fold cross-validation scored on accuracy, with refit enabled.
grid3 = GridSearchCV(pipe, cv=3, param_grid=param_grid, scoring='accuracy', refit=True)
# Fit on the averaged word2vec features of the training lines.
grid3.fit(X_train_word2vec, y_train_word2vec)

In [59]:
# k-NN classifier with every hyperparameter left at its default.
# KNeighborsClassifier is imported in the grid-search cell, which must run first.
knc = KNeighborsClassifier()

In [60]:
# Fit the default k-NN model on the training embeddings and labels.
knc.fit(X_train_word2vec, y_train_word2vec)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [61]:
# Predict artist labels for the held-out lines (overwrites the earlier y_test_pred).
y_test_pred = knc.predict(X_test_word2vec)

In [62]:
from sklearn.metrics import accuracy_score
# Held-out accuracy of the default k-NN model.
accuracy_score(y_test_word2vec, y_test_pred)

0.5925297113752123

# SVC

In [63]:
from sklearn.svm import SVC

In [79]:
# Support-vector classifier with C=2000.0; all other settings stay at their defaults.
# NOTE(review): the value 2000.0 is not justified anywhere in the notebook - confirm how it was chosen.
svc = SVC(C=2000.0)

In [80]:
# Fit the SVC on the training embeddings and labels.
svc.fit(X_train_word2vec, y_train_word2vec)

SVC(C=2000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [81]:
# Predict the held-out labels with the fitted SVC and report accuracy.
# accuracy_score is not imported in this cell; an earlier cell that imports it must have run.
y_test_pred = svc.predict(X_test_word2vec)
accuracy_score(y_test_word2vec, y_test_pred)

0.5747028862478778