# Text Cleaning & Processing 

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500000)
logic_df = pd.read_csv('logic_df_final.csv', index_col=0)

In [2]:
logic_df.shape

(5890, 4)

In [3]:
#clean text 
#tokenize text
#use vectorizers 
#Count Vectorizer does one hot encoding 

**Remove all irrelevant characters such as any non alphanumeric characters**

In [53]:
logic_df.lyric = logic_df.lyric.str.replace(r'([^\s\w]|_)+', '')

In [54]:
logic_df.head()

Unnamed: 0,album,artist,lyric,song
0,Undeniable (2012),Young Sinatra,Yeah pass the mic before I jack it like goretex,Disgusting
1,Undeniable (2012),Young Sinatra,Bust like raw sex rappers suck like vortex,Disgusting
2,Undeniable (2012),Young Sinatra,The life of a Don We living like kings and killing our pawns,Disgusting
3,Undeniable (2012),Young Sinatra,Boy the seconds its on dont know where we going,Disgusting
4,Undeniable (2012),Young Sinatra,Im flowing and killing this shit from dusk till dawn,Disgusting


**make all the text data lowercase**

In [55]:
logic_df.lyric = logic_df.lyric.str.lower()

In [56]:
logic_df.head()

Unnamed: 0,album,artist,lyric,song
0,Undeniable (2012),Young Sinatra,yeah pass the mic before i jack it like goretex,Disgusting
1,Undeniable (2012),Young Sinatra,bust like raw sex rappers suck like vortex,Disgusting
2,Undeniable (2012),Young Sinatra,the life of a don we living like kings and killing our pawns,Disgusting
3,Undeniable (2012),Young Sinatra,boy the seconds its on dont know where we going,Disgusting
4,Undeniable (2012),Young Sinatra,im flowing and killing this shit from dusk till dawn,Disgusting


### use spacy to get the POS for each row... to add that to the existing dataframe.. then put that in a pipeline

**let's use spacy to process our text**

In [57]:
#Lemmatization first
logic_df.lyric = logic_df.lyric.str.replace(r" i i ", " I I ")
logic_df.lyric = logic_df.lyric.str.replace(r" i ", " I ")
logic_df.lyric = logic_df.lyric.str.replace(r" i'", " I'")

In [58]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

from tqdm import tqdm
tqdm.pandas()

In [59]:
# Run Spacy NLP pipeline on text_document, creates DOC object filled with tokens
doc = nlp(logic_df.lyric[0])

In [60]:
#Owen's Spacy Tutorial

# https://spacy.io/usage/linguistic-features
# https://spacy.io/usage/processing-pipelines



def nlp_pipeline(text_document):
    """
    Takes in a string and runs it through Spacy's NLP pipeline consisting of a 
    Tokenizer, Tagger, Dependency Parser, Entity Recognizer, Text Categorizer.
    NLP features are then extracted from each Token's Spacy attributes.
    NLP features are then aggregated for the entire text_document and returned a dictionary.
    
    Args:
        text_document (str): Text data.
    
    Returns:
        dict: Aggregated NLP features.
    """

    # Run Spacy NLP pipeline on text_document, creates DOC object filled with tokens
    doc = nlp(text_document)

    # Tokenization
    tokens = [tok.orth_ for tok in doc]

    # Lemmatization
    lemmas = [tok.lemma_ for tok in doc]

    # Create counter dictionaries to collect counts of NLP features
    pos_counter = {}   # Coarse-grained part-of-speech
    tag_counter = {}   # Fine-grained part-of-speech.
    stop_counter = {}   # Stop word or not

    # Loop through each Token object contained in Doc object
    for token in doc:

        # Add token 'POS' to dictionary
        pos = "pos_" + token.pos_
        if pos in pos_counter.keys():
            pos_counter[pos] += 1
        else:
            pos_counter[pos] = 1

        # Add token 'TAG' to dictionary
        tag = "tag_" + token.tag_
        if tag in tag_counter.keys():
            tag_counter[tag] += 1
        else:
            tag_counter[tag] = 1

        # Add token 'STOP' to dictionary
        stop = "stop_" + str(token.is_stop)
        if stop in stop_counter.keys():
            stop_counter[stop] += 1
        else:
            stop_counter[stop] = 1


        # Print NLP features for token
#         print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#               token.shape_, token.is_alpha, token.is_stop)

    # Print NLP feature counters -- these are for the entire input 
#     print(pos_counter, end = '\n\n')
#     print(tag_counter, end = '\n\n')
#     print(stop_counter, end = '\n\n')
#     print(dep_counter, end = '\n\n')

    # Combine NLP features into one dictionary
    nlp_dictionary = {'pos_counter' : pos_counter,
                      'tag_counter' : tag_counter,
                      'stop_counter' : stop_counter}

    return nlp_dictionary

In [61]:
# Create NLP Features Column
logic_df['NLP Features'] = logic_df['lyric'].progress_apply(lambda x: nlp_pipeline(x))

100%|██████████| 5890/5890 [01:44<00:00, 56.49it/s]


In [62]:
logic_df['NLP Features'][0]

{'pos_counter': {'pos_INTJ': 1,
  'pos_VERB': 2,
  'pos_DET': 1,
  'pos_NOUN': 2,
  'pos_ADP': 2,
  'pos_PRON': 2},
 'tag_counter': {'tag_UH': 1,
  'tag_VB': 1,
  'tag_DT': 1,
  'tag_NN': 2,
  'tag_IN': 2,
  'tag_PRP': 2,
  'tag_VBP': 1},
 'stop_counter': {'stop_False': 7, 'stop_True': 3}}

In [63]:
logic_df.head(5)

Unnamed: 0,album,artist,lyric,song,NLP Features
0,Undeniable (2012),Young Sinatra,yeah pass the mic before I jack it like goretex,Disgusting,"{'pos_counter': {'pos_INTJ': 1, 'pos_VERB': 2, 'pos_DET': 1, 'pos_NOUN': 2, 'pos_ADP': 2, 'pos_PRON': 2}, 'tag_counter': {'tag_UH': 1, 'tag_VB': 1, 'tag_DT': 1, 'tag_NN': 2, 'tag_IN': 2, 'tag_PRP': 2, 'tag_VBP': 1}, 'stop_counter': {'stop_False': 7, 'stop_True': 3}}"
1,Undeniable (2012),Young Sinatra,bust like raw sex rappers suck like vortex,Disgusting,"{'pos_counter': {'pos_NOUN': 4, 'pos_ADP': 2, 'pos_ADJ': 1, 'pos_VERB': 1}, 'tag_counter': {'tag_NN': 3, 'tag_IN': 2, 'tag_JJ': 1, 'tag_NNS': 1, 'tag_VBP': 1}, 'stop_counter': {'stop_False': 8}}"
2,Undeniable (2012),Young Sinatra,the life of a don we living like kings and killing our pawns,Disgusting,"{'pos_counter': {'pos_DET': 2, 'pos_NOUN': 4, 'pos_ADP': 2, 'pos_PRON': 1, 'pos_VERB': 2, 'pos_CCONJ': 1, 'pos_ADJ': 1}, 'tag_counter': {'tag_DT': 2, 'tag_NN': 2, 'tag_IN': 2, 'tag_PRP': 1, 'tag_VBG': 2, 'tag_NNS': 2, 'tag_CC': 1, 'tag_PRP$': 1}, 'stop_counter': {'stop_True': 6, 'stop_False': 7}}"
3,Undeniable (2012),Young Sinatra,boy the seconds its on dont know where we going,Disgusting,"{'pos_counter': {'pos_INTJ': 1, 'pos_DET': 1, 'pos_NOUN': 1, 'pos_ADJ': 1, 'pos_ADP': 1, 'pos_SPACE': 1, 'pos_VERB': 3, 'pos_ADV': 2, 'pos_PRON': 1}, 'tag_counter': {'tag_UH': 1, 'tag_DT': 1, 'tag_NNS': 1, 'tag_PRP$': 1, 'tag_IN': 1, 'tag_': 1, 'tag_VBP': 1, 'tag_RB': 1, 'tag_VB': 1, 'tag_WRB': 1, 'tag_PRP': 1, 'tag_VBG': 1}, 'stop_counter': {'stop_False': 6, 'stop_True': 6}}"
4,Undeniable (2012),Young Sinatra,im flowing and killing this shit from dusk till dawn,Disgusting,"{'pos_counter': {'pos_PRON': 1, 'pos_VERB': 3, 'pos_CCONJ': 1, 'pos_DET': 1, 'pos_NOUN': 3, 'pos_ADP': 2}, 'tag_counter': {'tag_PRP': 1, 'tag_VBP': 1, 'tag_VBG': 2, 'tag_CC': 1, 'tag_DT': 1, 'tag_NN': 3, 'tag_IN': 2}, 'stop_counter': {'stop_True': 4, 'stop_False': 7}}"


Now i want to create separate columns that track each of the metrics in NLP features

In [64]:
def nlp_dict_to_df(nlp_features, feature):
    """
    Take in nlp_features dictionary, outputs bag of words
    
    Args:
        nlp_features (str): Aggregated NLP features
    
    Returns:
        pd.Series: Spacy DEP fractions
    """
    
    # Dep dictionary
    nlp_dict = nlp_features[feature]

    # Total number of entries in dep dictionary
    nlp_total = np.sum(list(nlp_dict.values()))

    # Calculating dep fractions
    nlp_dict_fractions = {k: v / nlp_total for k, v in nlp_dict.items()}

    # Turn into a pandas Series to return
    nlp_series = pd.Series(nlp_dict_fractions)
    
    return nlp_series

In [65]:
# Create dataframe of coarse-grained parts-of-speech counts 'pos'
df_pos = logic_df['NLP Features'].apply(lambda x: nlp_dict_to_df(x, 'pos_counter'))
df_pos = df_pos.fillna(value = 0)

# Create dataframe of stop word counts 'stop'
df_stop = logic_df['NLP Features'].apply(lambda x: nlp_dict_to_df(x, 'stop_counter'))
df_stop = df_stop.fillna(value = 0)

# Create dataframe of fine-grained parts-of-speech counts 'tag'
df_tag = logic_df['NLP Features'].apply(lambda x: nlp_dict_to_df(x, 'tag_counter'))
df_tag = df_tag.fillna(value = 0)

# Combine all NLP dataframes
logic_df_nlp = pd.concat([df_stop, df_pos, df_tag], axis = 1)

In [66]:
logic_df_nlp.shape

(5890, 56)

In [67]:
logic_df_nlp.head

<bound method NDFrame.head of       stop_False  stop_True  pos_INTJ  pos_VERB   pos_DET  pos_NOUN   pos_ADP  \
0     0.700000    0.300000   0.100000  0.200000  0.100000  0.200000  0.200000   
1     1.000000    0.000000   0.000000  0.125000  0.000000  0.500000  0.250000   
2     0.538462    0.461538   0.000000  0.153846  0.153846  0.307692  0.153846   
3     0.500000    0.500000   0.083333  0.250000  0.083333  0.083333  0.083333   
4     0.636364    0.363636   0.000000  0.272727  0.090909  0.272727  0.181818   
5     0.500000    0.500000   0.000000  0.166667  0.166667  0.333333  0.083333   
6     0.500000    0.500000   0.000000  0.100000  0.200000  0.400000  0.200000   
7     0.500000    0.500000   0.000000  0.250000  0.083333  0.166667  0.083333   
8     0.500000    0.500000   0.071429  0.214286  0.000000  0.142857  0.000000   
9     0.384615    0.615385   0.000000  0.153846  0.000000  0.153846  0.076923   
10    0.500000    0.500000   0.000000  0.300000  0.000000  0.100000  0.100000  

In [68]:
logic_df_nlp

Unnamed: 0,stop_False,stop_True,pos_INTJ,pos_VERB,pos_DET,pos_NOUN,pos_ADP,pos_PRON,pos_ADJ,pos_CCONJ,...,tag_EX,tag_JJR,tag_ADD,tag_FW,tag_NNP,tag_NFP,tag_.,tag_RBS,tag_LS,tag_SYM
0,0.7,0.3,0.1,0.2,0.1,0.2,0.2,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.125,0.0,0.5,0.25,0.0,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.538462,0.461538,0.0,0.153846,0.153846,0.307692,0.153846,0.076923,0.076923,0.076923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.5,0.5,0.083333,0.25,0.083333,0.083333,0.083333,0.083333,0.083333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.636364,0.363636,0.0,0.272727,0.090909,0.272727,0.181818,0.090909,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.5,0.5,0.0,0.166667,0.166667,0.333333,0.083333,0.0,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.5,0.5,0.0,0.1,0.2,0.4,0.2,0.0,0.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.5,0.5,0.0,0.25,0.083333,0.166667,0.083333,0.166667,0.083333,0.083333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.5,0.5,0.071429,0.214286,0.0,0.142857,0.0,0.142857,0.142857,0.071429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.384615,0.615385,0.0,0.153846,0.0,0.153846,0.076923,0.230769,0.153846,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Models - just Spacy features 

In [18]:
personas_df = logic_df.artist

In [19]:
final_df = pd.concat([logic_df_nlp, personas_df ], axis=1)

In [21]:
final_df.head()

Unnamed: 0,stop_False,stop_True,pos_INTJ,pos_VERB,pos_DET,pos_NOUN,pos_ADP,pos_PRON,pos_ADJ,pos_CCONJ,...,tag_JJR,tag_ADD,tag_FW,tag_NNP,tag_NFP,tag_.,tag_RBS,tag_LS,tag_SYM,artist
0,0.7,0.3,0.1,0.2,0.1,0.2,0.2,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Young Sinatra
1,1.0,0.0,0.0,0.125,0.0,0.5,0.25,0.0,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Young Sinatra
2,0.538462,0.461538,0.0,0.153846,0.153846,0.307692,0.153846,0.076923,0.076923,0.076923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Young Sinatra
3,0.5,0.5,0.083333,0.25,0.083333,0.083333,0.083333,0.083333,0.083333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Young Sinatra
4,0.636364,0.363636,0.0,0.272727,0.090909,0.272727,0.181818,0.090909,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Young Sinatra


In [22]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(final_df, test_size=0.2, random_state=1234)

In [23]:
#split both into predictor and response
x_train = train_data.loc[:, train_data.columns != 'artist']
y_train = train_data.artist


x_test = test_data.loc[:, test_data.columns != 'artist']
y_test = test_data.artist

## Naive Bayes

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


#create pipeline
pipe = Pipeline([('nb', MultinomialNB())])
#set param_grid
param_grid = {'nb__alpha': [0.0000001,0.0001,0.001,0.001,0.1, 0.5, 1, 2]}
#grid search through the parameters 
grid = GridSearchCV(pipe, cv=3, param_grid=param_grid, scoring='accuracy', refit=True)
#fit it to my data 
grid.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'nb__alpha': [1e-07, 0.0001, 0.001, 0.001, 0.1, 0.5, 1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [25]:
grid.best_params_

{'nb__alpha': 1e-07}

In [26]:
from sklearn.model_selection import cross_val_score
grid_score = cross_val_score(grid, x_train, y_train, cv=5).mean()
grid_score

0.4115046587597314

In [27]:
y_test_pred = grid.predict(x_test)
y_test_pred

array(['Logic', 'Logic', 'Logic', ..., 'Logic', 'Logic', 'Logic'],
      dtype='<U15')

In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_test_pred)

0.42784380305602715

## KN Classifier

In [29]:
from sklearn.neighbors import KNeighborsClassifier

#create pipeline
pipe = Pipeline([('knc', KNeighborsClassifier())])
#set param_grid
param_grid = {'knc__n_neighbors': [1,2,3,4,5,6,7], 'knc__leaf_size': [30,40,50,60], 'knc__weights':['uniform', 'distance']}
#grid search through the parameters 
grid3 = GridSearchCV(pipe, cv=3, param_grid=param_grid, scoring='accuracy', refit=True)
#fit it to my data 
grid3.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('knc', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'knc__n_neighbors': [1, 2, 3, 4, 5, 6, 7], 'knc__leaf_size': [30, 40, 50, 60], 'knc__weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [30]:
grid3.best_params_

{'knc__leaf_size': 30, 'knc__n_neighbors': 6, 'knc__weights': 'distance'}

In [31]:
y_test_pred = grid3.predict(x_test)
accuracy_score(y_test, y_test_pred)

0.532258064516129

## SVC 

In [32]:
from sklearn.svm import SVC

#create pipeline
pipe = Pipeline([('svm', SVC(decision_function_shape='ovo'))])
#set param_grid
param_grid = {'svm__C':[1, 2,5,10]}
#grid search through the parameters 
grid4 = GridSearchCV(pipe, cv=3, param_grid=param_grid, scoring='accuracy', refit=True)
#fit it to my data 
grid4.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svm__C': [1, 2, 5, 10]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=0)

In [33]:
y_test_pred = grid4.predict(x_test)
accuracy_score(y_test, y_test_pred)

0.4295415959252971

# Models - BOTH Spacy features & bag of words

In [25]:
final_df.head()

Unnamed: 0,stop_False,stop_True,pos_INTJ,pos_VERB,pos_DET,pos_NOUN,pos_ADP,pos_PRON,pos_ADJ,pos_CCONJ,...,tag_JJR,tag_ADD,tag_FW,tag_NNP,tag_NFP,tag_.,tag_RBS,tag_LS,tag_SYM,artist
0,0.7,0.3,0.1,0.2,0.1,0.2,0.2,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Young Sinatra
1,1.0,0.0,0.0,0.125,0.0,0.5,0.25,0.0,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Young Sinatra
2,0.538462,0.461538,0.0,0.153846,0.153846,0.307692,0.153846,0.076923,0.076923,0.076923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Young Sinatra
3,0.5,0.5,0.083333,0.25,0.083333,0.083333,0.083333,0.083333,0.083333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Young Sinatra
4,0.636364,0.363636,0.0,0.272727,0.090909,0.272727,0.181818,0.090909,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Young Sinatra


In [69]:
final_df.shape

(5890, 57)

In [70]:
final_df2 = final_df.copy()

In [71]:
final_df2.shape

(5890, 57)

In [72]:
bagofwords_df = pd.read_csv('THEbagofwords_df.csv', index_col=0)

In [73]:
bagofwords_df.shape

(5890, 4413)

In [74]:
bagofwords_df.drop(["artist"], axis=1, inplace=True)

In [75]:
bagofwords_df.isnull().values.any()

False

In [76]:
model_df = pd.concat([bagofwords_df, final_df2], axis=1)

**this is our final dataset**

In [78]:
model_df.shape

(5890, 4469)

In [79]:
model_df.isnull().values.any()

False

In [80]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(model_df, test_size=0.2, random_state=1234)

In [81]:
#split both into predictor and response
x_train = train_data.loc[:, train_data.columns != 'artist']
y_train = train_data.artist


x_test = test_data.loc[:, test_data.columns != 'artist']
y_test = test_data.artist

## NB

In [45]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


#create pipeline
pipe = Pipeline([('nb', MultinomialNB())])
#set param_grid
param_grid = {'nb__alpha': [0.0000001,0.0001,0.001,0.001,0.1, 0.5, 1, 2]}
#grid search through the parameters 
grid = GridSearchCV(pipe, cv=3, param_grid=param_grid, scoring='accuracy', refit=True)
#fit it to my data 
grid.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'nb__alpha': [1e-07, 0.0001, 0.001, 0.001, 0.1, 0.5, 1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [71]:
from sklearn.model_selection import cross_val_score
grid_score = cross_val_score(grid, x_train, y_train, cv=3).mean()
grid_score

0.6192692390339231

In [46]:
y_test_pred = grid.predict(x_test)
y_test_pred

array(['Young Sinatra', 'Logic', 'Young Sinatra', ..., 'Young Sinatra',
       'Logic', 'Logic'], dtype='<U15')

In [47]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_test_pred)

0.66553480475382

## KN Classifier

In [89]:
from sklearn.neighbors import KNeighborsClassifier

#create pipeline
pipe = Pipeline([('knc', KNeighborsClassifier())])
#set param_grid
param_grid = {'knc__n_neighbors': [1,2,3,4], 'knc__leaf_size': [10,30,40]}
#grid search through the parameters 
grid3 = GridSearchCV(pipe, cv=3, param_grid=param_grid, scoring='accuracy', refit=True)
#fit it to my data 
grid3.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('knc', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'knc__n_neighbors': [1, 2, 3, 4], 'knc__leaf_size': [10, 30, 40]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [84]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier()

knc.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [85]:
y_test_pred = knc.predict(x_test)
y_test_pred


accuracy_score(y_test, y_test_pred)

0.5772495755517827

## SVC

In [86]:
from sklearn.svm import SVC

#create pipeline
pipe = Pipeline([('svm', SVC())])
#set param_grid
param_grid = {'svm__C':[1, 2,5,10]}
#grid search through the parameters 
grid4 = GridSearchCV(pipe, cv=3, param_grid=param_grid, scoring='accuracy', refit=True)
#fit it to my data 
grid4.fit(x_train, y_train)

KeyboardInterrupt: 

In [87]:
from sklearn.svm import SVC
svc = SVC()

svc.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [88]:
y_test_pred = svc.predict(x_test)
y_test_pred


accuracy_score(y_test, y_test_pred)

0.4295415959252971

# OPTIONAL - MAYBE JUST FOCUS ON DEEP LEARNING NOW. 

# NB Model with ONLY SOME of the Spacy features & bag of words

**took out both stop words column and all tag columns**

In [81]:
pd.set_option('display.max_rows', 500000)
pd.set_option('display.max_colwidth', -1)
model_df.shape

(5890, 4469)

In [83]:
 model_df2 = model_df[model_df.columns.drop(list(model_df.filter(regex='stop')))]

In [85]:
 model_df2 = model_df[model_df.columns.drop(list(model_df.filter(regex='tag')))]

In [86]:
model_df2.shape

(5890, 4431)

In [87]:
train_data, test_data = train_test_split(model_df2, test_size=0.2, random_state=1234)

In [88]:
#split both into predictor and response
x_train = train_data.loc[:, train_data.columns != 'artist']
y_train = train_data.artist


x_test = test_data.loc[:, test_data.columns != 'artist']
y_test = test_data.artist

In [89]:
#create pipeline
pipe = Pipeline([('nb', MultinomialNB())])
#set param_grid
param_grid = {'nb__alpha': [0.0000001,0.0001,0.001,0.001,0.1, 0.5, 1, 2]}
#grid search through the parameters 
grid = GridSearchCV(pipe, cv=3, param_grid=param_grid, scoring='accuracy', refit=True)
#fit it to my data 
grid.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'nb__alpha': [1e-07, 0.0001, 0.001, 0.001, 0.1, 0.5, 1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [91]:
grid.best_params_

{'nb__alpha': 0.5}

In [90]:
from sklearn.model_selection import cross_val_score
grid_score = cross_val_score(grid, x_train, y_train, cv=3).mean()
grid_score

0.6203312155969193