# Modeling - Naive Bayes (after minimal data cleaning)

In [25]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import plot_confusion_matrix, balanced_accuracy_score

In [26]:
# Load the minimally cleaned postings from disk

postings = pd.read_csv(filepath_or_buffer='../datasets/postings_minimally_cleaned.csv')

In [27]:
# Preview the loaded frame (pandas elides the middle rows when displaying it)
postings

Unnamed: 0,topic,text
0,hiking,Boulder Flatiron Loop Hike
1,hiking,Washington state lakes to swim
2,hiking,"Here's a fun episode, demonstrating why a fly-..."
3,hiking,Picture I took of my friend at Angel’s Landing...
4,hiking,Hiking to Bertha Peak via Cougar Crest Trail a...
...,...,...
12448,gardening,Nope haha Oklahoma so not too far away.
12449,gardening,I'm transitioning to everbearing in strawberry...
12450,gardening,So pretty
12451,gardening,I think it’s 30% chance of frost after that date.


In [28]:
# Features are the raw post text; the target is 1 for 'hiking' posts and 0 for every other topic

X = postings['text']
y = np.where(postings['topic'].eq('hiking'), 1, 0)

In [29]:
# Sanity check: X should hold the post text, one entry per row of `postings`

X

0                               Boulder Flatiron Loop Hike
1                           Washington state lakes to swim
2        Here's a fun episode, demonstrating why a fly-...
3        Picture I took of my friend at Angel’s Landing...
4        Hiking to Bertha Peak via Cougar Crest Trail a...
                               ...                        
12448              Nope haha Oklahoma so not too far away.
12449    I'm transitioning to everbearing in strawberry...
12450                                            So pretty
12451    I think it’s 30% chance of frost after that date.
12452    My understanding is that the caterpillars eat ...
Name: text, Length: 12453, dtype: object

In [30]:
# Sanity check: y should be an array of 0/1 labels (1 = hiking)

y

array([1, 1, 1, ..., 0, 0, 0])

In [31]:
y.sum()  # number of positive labels; this should be 6986 (number of hiking rows) -- confirmed!

6986

### Train/test split

In [32]:
# Stratify on y so both splits keep the same hiking / non-hiking ratio;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=123,
)

In [33]:
def input_to_df(model, cleaning, lemmatized):
    """
    Append one row of model results to the shared model summary CSV.

    Scores `model` on the train and test splits (its own `.score` plus
    balanced accuracy), reads '../datasets/model_list_summary.csv', adds the
    new row at the bottom and writes the file back in place.

    Parameters
    ----------
    model : fitted search object
        Must expose `.score`, `.predict` and `.best_params_` (for example a
        fitted GridSearchCV).
    cleaning : str
        Recorded in the first column; indicates if the data was minimally or
        deeply cleaned.
    lemmatized : {'no', 'yes'}
        Recorded in the second column. It also selects which notebook globals
        are scored: 'no' uses X_train / X_test, 'yes' uses
        X_train_lem / X_test_lem.
        NOTE(review): X_train_lem / X_test_lem are not defined in the part of
        the notebook visible here -- confirm they exist before passing 'yes'.

    Raises
    ------
    ValueError
        If `lemmatized` is neither 'no' nor 'yes'. The check runs before any
        file access, so a bad flag never touches the summary CSV.

    Notes
    -----
    The summary CSV must already exist and have exactly seven columns, in the
    order of the values built below. Every call appends a row, so re-running
    the calling cell records the same model again.
    """
    # Resolve the feature splits lazily so the *_lem globals are only looked
    # up when they are actually requested.
    if lemmatized == 'no':
        features_train, features_test = X_train, X_test
    elif lemmatized == 'yes':
        features_train, features_test = X_train_lem, X_test_lem
    else:
        raise ValueError(
            f"lemmatized must be 'yes' or 'no', got {lemmatized!r}"
        )

    df = pd.read_csv('../datasets/model_list_summary.csv')

    info = [cleaning,          #Indicates if data was minimally or deeply cleaned
            lemmatized,        #Indicates if text was lemmatized or not
            model.score(features_train, y_train),
            model.score(features_test, y_test),
            balanced_accuracy_score(y_train, model.predict(features_train)),
            balanced_accuracy_score(y_test, model.predict(features_test)),
            model.best_params_
           ]

    # DataFrame.append was removed in pandas 2.0; build a one-row frame with
    # the summary's columns and concatenate instead.
    new_row = pd.DataFrame([info], columns=df.columns)
    df = pd.concat([df, new_row], ignore_index=True)
    df.to_csv('../datasets/model_list_summary.csv', index=False)

### Building the pipeline and grid search

In [34]:
def model_fit_and_scores(model):
    """
    Fit `model` on the training split and print its train/test metrics.

    Uses the notebook-level X_train, y_train, X_test and y_test. The model is
    fitted in place, then its own `.score` and the balanced accuracy are
    printed for both splits, followed by `model.best_params_` (so `model` is
    expected to be a search object such as GridSearchCV).
    """
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    print(f' Score on train data:  {train_score}')

    test_score = model.score(X_test, y_test)
    print(f' Score on test data: {test_score}')

    train_balanced = balanced_accuracy_score(y_train, model.predict(X_train))
    print(f' balanced accuracy score on train data: {train_balanced}')

    test_balanced = balanced_accuracy_score(y_test, model.predict(X_test))
    print(f' balanced accuracy score on test data: {test_balanced}')

    print(model.best_params_)

In [12]:
# Word counts feeding a multinomial Naive Bayes classifier
pipe = make_pipeline(CountVectorizer(), MultinomialNB())

# Round 1: coarse grid over stop words, vocabulary size, n-gram range and smoothing
params = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[500, 4000, 6000],
    countvectorizer__ngram_range=[(1, 1), (2, 2)],
    multinomialnb__alpha=[0.005, 0.1, 1, 10],
)

gs = GridSearchCV(estimator=pipe, param_grid=params)

In [13]:
model_fit_and_scores(gs)

 Score on train data:  0.934468358496627
 Score on test data: 0.9165061014771998
 balanced accuracy score on train data: 0.9337440118436307
 balanced accuracy score on test data: 0.915641779470209
{'countvectorizer__max_features': 6000, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english', 'multinomialnb__alpha': 0.1}


In [14]:
pipe2 = make_pipeline(CountVectorizer(), MultinomialNB())

# Round 2: tighten the grid around the round-1 winner (max_features=6000, alpha=0.1)
# and compare unigrams against unigrams+bigrams
params2 = dict(
    countvectorizer__stop_words=[None, 'english'],
    countvectorizer__max_features=[5500, 6000, 6500],
    countvectorizer__ngram_range=[(1, 1), (1, 2)],
    multinomialnb__alpha=[0.05, 0.1, 0.5],
)

gs2 = GridSearchCV(estimator=pipe2, param_grid=params2)

In [15]:
model_fit_and_scores(gs2)

 Score on train data:  0.9373594603276582
 Score on test data: 0.9171483622350675
 balanced accuracy score on train data: 0.9367980763411375
 balanced accuracy score on test data: 0.9162937488406293
{'countvectorizer__max_features': 6500, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english', 'multinomialnb__alpha': 0.1}


In [16]:
pipe3 = make_pipeline(CountVectorizer(), MultinomialNB())

# Round 3: English stop words and unigrams won twice, so fix them;
# round 2 picked the largest max_features offered (6500), so search just above it
params3 = dict(
    countvectorizer__stop_words=['english'],
    countvectorizer__max_features=[6500, 6600, 6700],
    countvectorizer__ngram_range=[(1, 1)],
    multinomialnb__alpha=[0.09, 0.1, 0.12],
)

gs3 = GridSearchCV(estimator=pipe3, param_grid=params3)

In [17]:
model_fit_and_scores(gs3)

 Score on train data:  0.9376806938644394
 Score on test data: 0.9171483622350675
 balanced accuracy score on train data: 0.9371639299996741
 balanced accuracy score on test data: 0.9163733083656003
{'countvectorizer__max_features': 6700, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english', 'multinomialnb__alpha': 0.12}


In [35]:
pipe4 = make_pipeline(CountVectorizer(), MultinomialNB())

# Round 4: round 3 chose the top of both ranges (max_features=6700, alpha=0.12),
# so extend each range upward to check whether the optimum lies beyond them
params4 = dict(
    countvectorizer__stop_words=['english'],
    countvectorizer__max_features=[6500, 6700, 7000],
    countvectorizer__ngram_range=[(1, 1)],
    multinomialnb__alpha=[0.12, 0.14, 0.15],
)

gs4 = GridSearchCV(estimator=pipe4, param_grid=params4)

In [36]:
model_fit_and_scores(gs4)

 Score on train data:  0.9376806938644394
 Score on test data: 0.9171483622350675
 balanced accuracy score on train data: 0.9371639299996741
 balanced accuracy score on test data: 0.9163733083656003
{'countvectorizer__max_features': 6700, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__stop_words': 'english', 'multinomialnb__alpha': 0.12}


In [37]:
# gs4 is the best Naive Bayes search so far, so record it in the model summary

input_to_df(model=gs4, cleaning='minimal', lemmatized='no')