# Modeling - Naive Bayes (after deep data cleaning)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import plot_confusion_matrix, balanced_accuracy_score

In [2]:
# Load the cleaned postings dataset

postings = pd.read_csv(
    '../datasets/postings_cleaned.csv'
)

In [3]:
postings

Unnamed: 0,topic,text
0,hiking,boulder flatiron loop hike
1,hiking,washington state lakes to swim
2,hiking,here a fun episode demonstrating why a is a fo...
3,hiking,picture i took of my friend at angel s landing...
4,hiking,hiking to bertha peak via cougar crest trail a...
...,...,...
11852,gardening,nope haha oklahoma so not too far away
11853,gardening,i transitioning to everbearing in strawberry p...
11854,gardening,so pretty
11855,gardening,i think it s chance of frost after that date


In [4]:
# Define X and y:
# X is the post text; y is a binary target that is 1 for 'hiking' posts and 0 for every other topic.

X = postings['text']
y = np.where(postings['topic'].eq('hiking'), 1, 0)

In [5]:
#checking X

X

0                               boulder flatiron loop hike
1                           washington state lakes to swim
2        here a fun episode demonstrating why a is a fo...
3        picture i took of my friend at angel s landing...
4        hiking to bertha peak via cougar crest trail a...
                               ...                        
11852               nope haha oklahoma so not too far away
11853    i transitioning to everbearing in strawberry p...
11854                                            so pretty
11855         i think it s chance of frost after that date
11856    my understanding is that the caterpillars eat ...
Name: text, Length: 11857, dtype: object

In [6]:
#checking y

y

array([1, 1, 1, ..., 0, 0, 0])

### Train, test, split

In [7]:
# Stratify on y so both splits keep the same class ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=123,
)

In [59]:
def input_to_df(model, cleaning, lemmatized):
    """
    Append one model's scores and best parameters to the model summary CSV.

    The summary file '../datasets/model_list_summary.csv' is read, one row is
    added, and the file is written back without the index.

    Parameters
    ----------
    model : fitted search object
        Must expose ``score``, ``predict`` and ``best_params_`` (e.g. the
        fitted GridSearchCV objects built in this notebook).
    cleaning : str
        Stored as-is; indicates if data was minimally or deeply cleaned.
    lemmatized : {'yes', 'no'}
        Stored as-is and also selects the data the model is scored on:
        'no' uses the notebook-level X_train / X_test, 'yes' uses
        X_train_lem / X_test_lem. Targets are always y_train / y_test.

    Raises
    ------
    ValueError
        If ``lemmatized`` is neither 'yes' nor 'no' (previously this surfaced
        as an unbound-variable error after the CSV had already been read).
    """
    # Pick the feature split once instead of duplicating the whole row per branch.
    if lemmatized == 'no':
        features_train, features_test = X_train, X_test
    elif lemmatized == 'yes':
        features_train, features_test = X_train_lem, X_test_lem
    else:
        raise ValueError(
            f"lemmatized must be 'yes' or 'no', got {lemmatized!r}"
        )

    df = pd.read_csv('../datasets/model_list_summary.csv')

    info = [cleaning,          #Indicates if data was minimally or deeply cleaned
            lemmatized,        #Indicates if text was lemmatized or not
            model.score(features_train, y_train),
            model.score(features_test, y_test),
            balanced_accuracy_score(y_train, model.predict(features_train)),
            balanced_accuracy_score(y_test, model.predict(features_test)),
            model.best_params_
           ]

    # DataFrame.append was removed in pandas 2.0; build a one-row frame with the
    # summary's own columns and concatenate instead. The values map to columns by position.
    info_row = pd.DataFrame([info], columns=df.columns)
    df = pd.concat([df, info_row], ignore_index=True)
    df.to_csv('../datasets/model_list_summary.csv', index=False)

### Building my pipe and GS

In [9]:
def model_fit_and_scores(model):
    """
    Fit `model` on the non-lemmatized training split and print its scores.

    Uses the notebook-level X_train / X_test / y_train / y_test. Prints
    `model.score` and the balanced accuracy for both splits, then
    `model.best_params_`, so `model` must be a search object that exposes
    that attribute after fitting (e.g. GridSearchCV).
    """
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    print(f' Score on train data:  {train_score}')

    test_score = model.score(X_test, y_test)
    print(f' Score on test data: {test_score}')

    train_balanced = balanced_accuracy_score(y_train, model.predict(X_train))
    print(f' balanced accuracy score on train data: {train_balanced}')

    test_balanced = balanced_accuracy_score(y_test, model.predict(X_test))
    print(f' balanced accuracy score on test data: {test_balanced}')

    print(model.best_params_)

In [14]:
# First search: raw token counts -> multinomial Naive Bayes, small vocabulary sizes.
# Grid keys use the step names make_pipeline derives from the class names.
pipe = make_pipeline(CountVectorizer(), MultinomialNB())

params = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[50, 150, 500, 1000],
    countvectorizer__ngram_range=[(1, 2), (2, 3)],
    multinomialnb__alpha=[0.5, 1],
)

gs = GridSearchCV(pipe, params, n_jobs=-1)

In [15]:
model_fit_and_scores(gs)

 Score on train data:  0.877080521817364
 Score on test data: 0.8782462057335582
 balanced accuracy score on train data: 0.8742356471890451
 balanced accuracy score on test data: 0.8751279320741716
{'countvectorizer__max_features': 1000, 'countvectorizer__ngram_range': (1, 2), 'countvectorizer__stop_words': 'english', 'multinomialnb__alpha': 0.5}


### the following is the best so far:

In [60]:
# Second CountVectorizer search: larger vocabularies, a bigram-only option and a finer alpha grid.
pipe2 = make_pipeline(CountVectorizer(), MultinomialNB())

params2 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[1000, 3000],
    countvectorizer__ngram_range=[(1, 2), (2, 2)],
    multinomialnb__alpha=[0.3, 0.5, 0.7],
)

gs2 = GridSearchCV(pipe2, params2, n_jobs=-1)

In [61]:
model_fit_and_scores(gs2)

 Score on train data:  0.9114934772829509
 Score on test data: 0.8974704890387858
 balanced accuracy score on train data: 0.91034709756017
 balanced accuracy score on test data: 0.8955302767201855
{'countvectorizer__max_features': 3000, 'countvectorizer__ngram_range': (1, 2), 'countvectorizer__stop_words': 'english', 'multinomialnb__alpha': 0.3}


In [62]:
# Since this is the best so far using CountVectorizer, I will input this into my model summary

input_to_df(gs2, 'deep', 'no')

In [18]:
# Same idea as before, but the counts are scaled before Naive Bayes.
# with_mean=False skips centering, which keeps the sparse count matrix sparse.
pipe3 = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    MultinomialNB(),
)

params3 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[3000, 4000],
    countvectorizer__ngram_range=[(1, 2), (1, 1)],
    multinomialnb__alpha=[0.1, 0.3],
)

gs3 = GridSearchCV(pipe3, params3, n_jobs=-1)

In [19]:
model_fit_and_scores(gs3)

 Score on train data:  0.9201529464687359
 Score on test data: 0.890725126475548
 balanced accuracy score on train data: 0.9195007424033281
 balanced accuracy score on test data: 0.8896055592761923
{'countvectorizer__max_features': 3000, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': None, 'multinomialnb__alpha': 0.1}


In [20]:
# Scaled-count pipeline again, searching vocabulary sizes at and just below 3000 and smaller alphas.
pipe4 = make_pipeline(
    CountVectorizer(),
    StandardScaler(with_mean=False),
    MultinomialNB(),
)

params4 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[2700, 2800, 3000],
    countvectorizer__ngram_range=[(1, 1), (1, 2)],
    multinomialnb__alpha=[0.05, 0.1, 0.3],
)

gs4 = GridSearchCV(pipe4, params4, n_jobs=-1)

In [21]:
model_fit_and_scores(gs4)

 Score on train data:  0.9186909581646424
 Score on test data: 0.8897133220910624
 balanced accuracy score on train data: 0.9179832899720628
 balanced accuracy score on test data: 0.8885436424553009
{'countvectorizer__max_features': 2800, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': None, 'multinomialnb__alpha': 0.05}


### TfidfVectorizer - non-lemmatized

In [22]:
# Switch the vectorizer to TF-IDF weights; compares a near-zero alpha against 0.3.
pipe10 = make_pipeline(TfidfVectorizer(), MultinomialNB())

params10 = dict(
    tfidfvectorizer__stop_words=[None, 'english'],
    tfidfvectorizer__max_features=[3000, 3200],
    tfidfvectorizer__ngram_range=[(1, 2), (1, 1)],
    multinomialnb__alpha=[0.00005, 0.3],
)

gs10 = GridSearchCV(pipe10, params10, n_jobs=-1)

In [23]:
model_fit_and_scores(gs10)

 Score on train data:  0.9237516869095816
 Score on test data: 0.9005059021922428
 balanced accuracy score on train data: 0.9220166946652195
 balanced accuracy score on test data: 0.8984901021337827
{'multinomialnb__alpha': 0.3, 'tfidfvectorizer__max_features': 3200, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}


In [24]:
# TF-IDF search refined around 3200 features, with alpha values on both sides of 0.3.
pipe11 = make_pipeline(TfidfVectorizer(), MultinomialNB())

params11 = dict(
    tfidfvectorizer__stop_words=[None, 'english'],
    tfidfvectorizer__max_features=[3100, 3200, 3300],
    tfidfvectorizer__ngram_range=[(1, 2), (1, 1)],
    multinomialnb__alpha=[0.1, 0.3, 0.5],
)

gs11 = GridSearchCV(pipe11, params11, n_jobs=-1)

In [25]:
model_fit_and_scores(gs11)

 Score on train data:  0.9263382816014395
 Score on test data: 0.9021922428330523
 balanced accuracy score on train data: 0.9248478130787423
 balanced accuracy score on test data: 0.9003101690683966
{'multinomialnb__alpha': 0.1, 'tfidfvectorizer__max_features': 3300, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}


In [63]:
# Narrow TF-IDF search: English stop words and unigrams are fixed; only the
# vocabulary size and small alpha values vary.
pipe12 = make_pipeline(TfidfVectorizer(), MultinomialNB())

params12 = dict(
    tfidfvectorizer__stop_words=['english'],
    tfidfvectorizer__max_features=[3300, 3350],
    tfidfvectorizer__ngram_range=[(1, 1)],
    multinomialnb__alpha=[0.04, 0.05],
)

gs12 = GridSearchCV(pipe12, params12, n_jobs=-1)

In [64]:
model_fit_and_scores(gs12)

 Score on train data:  0.9272379667116509
 Score on test data: 0.903204047217538
 balanced accuracy score on train data: 0.9257334737485723
 balanced accuracy score on test data: 0.9014473942389803
{'multinomialnb__alpha': 0.05, 'tfidfvectorizer__max_features': 3300, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}


### ^^ TfidfVectorizer (non-lemmatized) scores higher on the test set than the best CountVectorizer model

In [65]:
# Since this is better than the CountVectorizer model, I will add it to my model summary

input_to_df(gs12, 'deep', 'no')

### Lemmatized

In [66]:
def model_lem_fit_and_scores(model):
    """
    Fit `model` on the lemmatized training split and print its scores.

    Uses the notebook-level X_train_lem / X_test_lem with y_train / y_test.
    Prints `model.score` and the balanced accuracy for both splits, then
    `model.best_params_`, so `model` must be a search object that exposes
    that attribute after fitting (e.g. GridSearchCV).
    """
    model.fit(X_train_lem, y_train)

    train_score = model.score(X_train_lem, y_train)
    print(f' Score on train data:  {train_score}')

    test_score = model.score(X_test_lem, y_test)
    print(f' Score on test data: {test_score}')

    train_balanced = balanced_accuracy_score(y_train, model.predict(X_train_lem))
    print(f' balanced accuracy score on train data: {train_balanced}')

    test_balanced = balanced_accuracy_score(y_test, model.predict(X_test_lem))
    print(f' balanced accuracy score on test data: {test_balanced}')

    print(model.best_params_)

In [67]:
# Loaded once at cell level so every call to lemmatized() reuses the same spaCy pipeline.
nlp = spacy.load('en_core_web_sm')


def lemmatized(sentence):
    """Return `sentence` with each spaCy token replaced by its lemma, joined by single spaces."""
    return ' '.join(token.lemma_ for token in nlp(sentence))

In [34]:
X_train_lem = X_train.apply(lemmatized)

In [35]:
X_test_lem = X_test.apply(lemmatized)

In [36]:
# CountVectorizer + Naive Bayes search that is fitted on the lemmatized text in the next cell.
pipe5 = make_pipeline(CountVectorizer(), MultinomialNB())

params5 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[2800, 3000],
    countvectorizer__ngram_range=[(1, 1), (1, 2)],
    multinomialnb__alpha=[0.05, 0.1],
)

gs5 = GridSearchCV(pipe5, params5, n_jobs=-1)

In [37]:
model_lem_fit_and_scores(gs5)

 Score on train data:  0.9212775528565003
 Score on test data: 0.9075885328836425
 balanced accuracy score on train data: 0.9207893671558924
 balanced accuracy score on test data: 0.9069025284260233
{'countvectorizer__max_features': 3000, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': None, 'multinomialnb__alpha': 0.05}


In [38]:
# Lemmatized CountVectorizer search: larger vocabularies, a bigram-only option and smaller alphas.
pipe6 = make_pipeline(CountVectorizer(), MultinomialNB())

params6 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[3000, 4000],
    countvectorizer__ngram_range=[(1, 1), (2, 2)],
    multinomialnb__alpha=[0.01, 0.05, 0.07],
)

gs6 = GridSearchCV(pipe6, params6, n_jobs=-1)

In [39]:
model_lem_fit_and_scores(gs6)

 Score on train data:  0.9324111560953666
 Score on test data: 0.9102866779089376
 balanced accuracy score on train data: 0.9312955456568446
 balanced accuracy score on test data: 0.908203036837989
{'countvectorizer__max_features': 4000, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english', 'multinomialnb__alpha': 0.07}


In [68]:
# Lemmatized CountVectorizer search with unigrams fixed; pushes the vocabulary size up
# and explores alpha from 0.07 upward.
pipe7 = make_pipeline(CountVectorizer(), MultinomialNB())

params7 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[4000, 5000, 6000],
    countvectorizer__ngram_range=[(1, 1)],
    multinomialnb__alpha=[0.07, 0.08, 0.09],
)

gs7 = GridSearchCV(pipe7, params7, n_jobs=-1)

In [69]:
model_lem_fit_and_scores(gs7)

 Score on train data:  0.939608636977058
 Score on test data: 0.9129848229342327
 balanced accuracy score on train data: 0.9385811608530302
 balanced accuracy score on test data: 0.9114615623419562
{'countvectorizer__max_features': 5000, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english', 'multinomialnb__alpha': 0.07}


## ^ Lemmatized CountVectorizer scores higher on the test set than the non-lemmatized TfidfVectorizer model

In [70]:
# I will input this into my model summary

input_to_df(gs7, 'deep', 'yes')

### TfidfVectorizer - lemmatized

In [50]:
# TF-IDF + Naive Bayes search that is fitted on the lemmatized text in the next cell.
pipe8 = make_pipeline(TfidfVectorizer(), MultinomialNB())

params8 = dict(
    tfidfvectorizer__stop_words=['english', None],
    tfidfvectorizer__max_features=[3000, 4000],
    tfidfvectorizer__ngram_range=[(1, 1), (1, 2)],
    multinomialnb__alpha=[0.1, 0.5, 1],
)

gs8 = GridSearchCV(pipe8, params8, n_jobs=-1)

In [51]:
model_lem_fit_and_scores(gs8)

 Score on train data:  0.9345479082321188
 Score on test data: 0.9045531197301855
 balanced accuracy score on train data: 0.9334459420533654
 balanced accuracy score on test data: 0.9032649278651949
{'multinomialnb__alpha': 0.5, 'tfidfvectorizer__max_features': 4000, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}


In [71]:
# Lemmatized TF-IDF search with English stop words fixed; larger vocabularies,
# a bigram-only option and alpha values around 0.5.
pipe9 = make_pipeline(TfidfVectorizer(), MultinomialNB())

params9 = dict(
    tfidfvectorizer__stop_words=['english'],
    tfidfvectorizer__max_features=[4000, 5000],
    tfidfvectorizer__ngram_range=[(1, 1), (2, 2)],
    multinomialnb__alpha=[0.8, 0.5, 0.3],
)

gs9 = GridSearchCV(pipe9, params9, n_jobs=-1)

In [72]:
model_lem_fit_and_scores(gs9)

 Score on train data:  0.9429824561403509
 Score on test data: 0.9059021922428331
 balanced accuracy score on train data: 0.9423218289622561
 balanced accuracy score on test data: 0.9049318447920246
{'multinomialnb__alpha': 0.3, 'tfidfvectorizer__max_features': 5000, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': 'english'}


In [None]:
# Lemmatized TfidfVectorizer is NOT better than the lemmatized CountVectorizer model

In [73]:
# But since this is the best so far using TfidfVectorizer, I will add it to my model summary

input_to_df(gs9, 'deep', 'yes')

In [None]:
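# Checking which words have the highest and lowest log-probability under the hiking class (these are Naive Bayes log-probabilities, not correlations)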

In [74]:
best = gs7.best_estimator_

In [75]:
# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1. For a binary
# problem it returned feature_log_prob_[1:], i.e. the per-word log-probabilities for
# class 1 (hiking) as a (1, n_features) array, so slicing keeps `coefs[0]` valid downstream.
coefs = best.named_steps['multinomialnb'].feature_log_prob_[1:]

# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# fall back to the old name only on versions that predate the new one.
count_vec = best.named_steps['countvectorizer']
if hasattr(count_vec, 'get_feature_names_out'):
    vocab = count_vec.get_feature_names_out()
else:
    vocab = count_vec.get_feature_names()



In [76]:
coef_df = pd.DataFrame({'coefs': coefs[0], 'word': vocab})

In [77]:
coef_df.nlargest(10,'coefs')

Unnamed: 0,coefs,word
1932,-3.425864,hike
4521,-3.977466,trail
2983,-4.096973,park
4657,-4.320086,usa
2696,-4.469873,mountain
2742,-4.537772,national
1074,-4.730923,day
2295,-4.795228,lake
1448,-4.898234,fall
4190,-4.93373,state


In [78]:
coef_df.nsmallest(10,'coefs')

Unnamed: 0,coefs,word
14,-13.110724,acclimate
16,-13.110724,accord
17,-13.110724,accordingly
22,-13.110724,acid
36,-13.110724,adapt
39,-13.110724,addition
41,-13.110724,address
42,-13.110724,adhere
55,-13.110724,advertising
59,-13.110724,aerogarden
