### Modeling

In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [20]:
# read in data

data = pd.read_csv('./data/reddit_working_cleaned.csv')

In [21]:
# inspect scifi data - first and last five rows

data.head()
data.tail()

Unnamed: 0,created_utc,selftext,subreddit,title
47641,1421701672,how do they get the particles and find their e...,askscience,How do people who study entangled particles ge...
47642,1417723027,the delta iv rocket isn t that new so what is...,askscience,What is the goal/purpose of the Orion test lau...
47643,1519983877,this doesn t seem to be true of all icicles e...,askscience,Why do large icicles appear ribbed?
47644,1432885551,i know that nature only deals with the d and l...,askscience,What would happen if you tried to eat a sandwi...
47645,1491402075,i ve seen the maps of the ice sheets in north ...,askscience,Why was the southern hemisphere ice sheet smal...


In [22]:
# create function - to inspect dataframe

def inspect(dataframe):
    print('Rows, columns:', dataframe.shape)
    print('')
    print(dataframe.info())      

In [23]:
# print info about dataframe

inspect(data)

Rows, columns: (47646, 4)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47646 entries, 0 to 47645
Data columns (total 4 columns):
created_utc    47646 non-null int64
selftext       47646 non-null object
subreddit      47646 non-null object
title          47646 non-null object
dtypes: int64(1), object(3)
memory usage: 1.5+ MB
None


**Set Variables**

In [24]:
# recode subreddit variable 
# scifi = 0, askscience = 1

data['is_science'] = data['subreddit'].map({'askscience': 1, 'scifi': 0})

In [25]:
# define X and y variables

X = data['selftext']

y = data['is_science']

In [26]:
# train-test-split the data
# stratify split - for balanced test set

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [32]:
# check X_train, X_test, y_train, y_test

X_train.head()
X_test.head()
y_train.head()
y_test.head()

47374    1
21288    0
27823    1
25463    1
18991    0
Name: is_science, dtype: int64

#### Modeling

In [52]:
# create function that instantiates pipeline
# arguments: transformer, estimator

def piper(transformer, estimator):
    pipe = make_pipeline(transformer, estimator)
    return pipe

In [53]:
# define parameters for gridsearch

def pipe_params(transformer):
    transf = str(transformer).split('(')[0].lower()
    params = {
        transf + '__max_features': [100, 500],
        transf + '__stop_words': ['english', None],
        transf + '__ngram_range': [(1, 1), (1, 2), (1, 3)]
    }
    return params

In [60]:
# create function: arguments = pipeline, params
# runs gridsearch cv
# gives back: gridsearch cv score, best parameters

def gridsearch(transformer, estimator):

    pipe = piper(transformer, estimator)
    params = pipe_params(transformer)
    gs = GridSearchCV(pipe,
                      param_grid=params,
                      cv=5,
                      n_jobs=-1)
    gs.fit(X_train, y_train)
    best_model = gs.best_estimator_
    
    print(str(transformer).split('(')[0])
    print('Best model:', gs.best_params_)
    print('')
    print('Best model:', best_model)
    print('')
    print('Cross-val score of the best model:', gs.best_score_)
    print('Accuracy of best model on the training data:', best_model.score(X_train, y_train))
    print('Accuracy of best model on the testing data:', best_model.score(X_test, y_test))
    print('')
    print('')

**Baseline Accuracy**

In [33]:
# value count on y_test

y_test.value_counts(normalize=True)

1    0.504113
0    0.495887
Name: is_science, dtype: float64

If we predict for every single submission that it came from the askscience subreddit - we will be right 50.41% of the time. 

**Model 1** <br>
*count vectorizer - logistic regression*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams, 3-grams

In [41]:
# run the gridsearch function with CountVectorizer and Logistic Regression

gridsearch(CountVectorizer(), LogisticRegression(max_iter=500))

CountVectorizer

Best model: Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=500, min_df=1, ngram_range=(1, 1),
                                 preprocessor=None, stop_words=None,
                                 strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=500,
                                    multi

Best parameters: stop_words = None, max_features = 500, ngrams = 1-grams <br>
Cross-Val accuracy: 90.9%, Training accuracy: 91.5%, Testing accuracy: 91%

**Model 2** <br>
*tfidf-vectorizer - logistic regression*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500
        - ngrams: 1-grams, 2-grams, 3-grams

In [42]:
# run gridsearch function on TfidfVectorizer and LogisticRegression

gridsearch(TfidfVectorizer(), LogisticRegression(max_iter=500))

TfidfVectorizer

Best model: Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=500,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                    

Best parameters: stop_words = None, max_features = 500, ngrams = 1-grams <br>
Cross-Val accuracy: 91.28%, Training accuracy: 91.8%, Testing accuracy: 91.34%
Comparing the cross-val accuracy score with the 90.9% of Model 1 - Model 2 performed slightly (0.38 percentage points) better.

**Model 3** <br>
*count vectorizer - Bernoulli Naive Bayes*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams, 3-grams

In [45]:
# run gridsearch function on CountVectorizer and BernoulliNB (a binary classifier)

gridsearch(CountVectorizer(), BernoulliNB())



CountVectorizer

Best model: Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=500, min_df=1, ngram_range=(1, 2),
                                 preprocessor=None, stop_words='english',
                                 strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('bernoullinb',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=True))],
         verbose=False)

Cross-val score of the best model: 0.8451054425377027
Accuracy of best model on the training data: 0.8467845749146

Best parameters: stop_words = English, max_features = 500, ngrams = 2-grams <br>
Cross-Val accuracy: 84.51%, Training accuracy: 84.68%, Testing accuracy: 84.7%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 3 performed 6.77 percentage points worse.

**Model 4** <br>
*tfidf-vectorizer - Bernoulli Naive Bayes*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams, 3-grams

In [46]:
# run gridsearch function on TfidfVectorizer and BernoulliNB (a binary classifier)

gridsearch(TfidfVectorizer(), BernoulliNB())

TfidfVectorizer

Best model: Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=500,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('bernoullinb',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                        

Best parameters: stop_words = English, max_features = 500, ngrams = 2-grams <br>
Cross-Val accuracy: 84.51%, Training accuracy: 84.68%, Testing accuracy: 84.7%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 3 performed 6.77 percentage points worse. (The Bernoulli NB models with CountVectorizer and with Tfidf Vectorizer performed exactly the same; this was because the Bernoulli NB model classifies based on whether a feature is in a certain document or it is missing from there.)

**Model 5** <br>
*count vectorizer - Multinomial Naive Bayes*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams, 3-grams

In [56]:
# run gridsearch

gridsearch(CountVectorizer(), MultinomialNB())



CountVectorizer

Best model: Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=500, min_df=1, ngram_range=(1, 1),
                                 preprocessor=None, stop_words=None,
                                 strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('multinomialnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

Cross-val score of the best model: 0.9023338682987321
Accuracy of best model on the training data: 0.9028936027312924
Accuracy of best model on the testing da

Best parameters: stop_words = None, max_features = 500, ngrams = 1-grams <br>
Cross-Val accuracy: 90.23%, Training accuracy: 90.29%, Testing accuracy: 90.21%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 3 performed 1.05 percentage points worse.

**Model 6** <br>
*Tfidf vectorizer - Multinomial Naive Bayes*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams, 3-grams

In [57]:
# run gridsearch

gridsearch(TfidfVectorizer(), MultinomialNB())



TfidfVectorizer

Best model: Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=500,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('multinomialnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=Fal

Best parameters: stop_words = None, max_features = 500, ngrams = 1-grams <br>
Cross-Val accuracy: 90.44%, Training accuracy: 90.50%, Testing accuracy: 90.47%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 6 performed 0.84 percentage points worse.

***Decision Tree Models***

**Model 7** <br>
*count vectorizer - decision tree*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams, 3-grams

In [58]:
# run gridsearch

gridsearch(CountVectorizer(), DecisionTreeClassifier(max_depth=2))



CountVectorizer

Best model: Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=100, min_df=1, ngram_range=(1, 1),
                                 preprocessor=None, stop_words='english',
                                 strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None...lary=None)),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=2,
                                        max_features=None, max_leaf_nodes=None,
                            

Best parameters: stop_words = English, max_features = 100, ngrams = 1-grams <br>
Cross-Val accuracy: 65.16%, Training accuracy: 65.16%, Testing accuracy: 65.58%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 7 performed 26.12 percentage points worse.

**Model 8** <br>
*tfidf vectorizer - decision tree*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams, 3-grams

In [59]:
# run gridsearch

gridsearch(TfidfVectorizer(), DecisionTreeClassifier(max_depth=4))



TfidfVectorizer

Best model: Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=100,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 t...
                                 vocabulary=None)),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=4,
                                 

Best parameters: stop_words = English, max_features = 100, ngrams = 1-grams <br>
Cross-Val accuracy: 72.62%, Training accuracy: 72.68%, Testing accuracy: 73.05%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 8 performed 18.66 percentage points worse.

**Model 9** <br>
*count vectorizer - ada boost*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams, 3-grams

In [62]:
# run gridsearch

gridsearch(CountVectorizer(), AdaBoostClassifier())



CountVectorizer
Best model: {'countvectorizer__max_features': 500, 'countvectorizer__ngram_range': (1, 2), 'countvectorizer__stop_words': None}

Best model: Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=500, min_df=1, ngram_range=(1, 2),
                                 preprocessor=None, stop_words=None,
                                 strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('adaboostclassifier',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1

Best parameters: stop_words = None, max_features = 500, ngrams = 2-grams <br>
Cross-Val accuracy: 87.52%, Training accuracy: 87.7%, Testing accuracy: 87.57%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 9 performed 3.76 percentage points worse.

**Model 10** <br>
*tfidf vectorizer - ada boost*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams, 3-grams

In [69]:
# run gridsearch

gridsearch(TfidfVectorizer(), AdaBoostClassifier())



TfidfVectorizer
Best model: {'tfidfvectorizer__max_features': 500, 'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__stop_words': None}

Best model: Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=500,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
     

Best parameters: stop_words = None, max_features = 500, ngrams = 2-grams <br>
Cross-Val accuracy: 87.89%, Training accuracy: 88%, Testing accuracy: 87.99%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 10 performed 3.39 percentage points worse.

**Model 11** <br>
*tfidf-vectorizer - AdaBoost - decision tree*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams, 3-grams

In [71]:
# run gridsearch

gridsearch(TfidfVectorizer(), AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=4)))



TfidfVectorizer
Best model: {'tfidfvectorizer__max_features': 500, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': None}

Best model: Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=500,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_...
                                    base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        

Best parameters: stop_words = None, max_features = 500, ngrams = 1-grams <br>
Cross-Val accuracy: 89.4%, Training accuracy: 93.8%, Testing accuracy: 89.38%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 11 performed 1.88 percentage points worse. (note for myself: this model looks promising)

**Model 12** <br>
*count vectorizer - gradient boosting*

    parameters: 
        - stop words: english, None 
        - max_features: 100, 500 
        - ngrams: 1-grams, 2-grams
        - gradientboosting max_depth: 3, 4, 5

In [66]:
# instantiate pipeline - with GradientBoostingClassifier

pipe_grad = Pipeline([
    ('cvec', CountVectorizer()),
    ('grad', GradientBoostingClassifier())
])

In [67]:
# parameters for gs_grad

param_pipe_grad = {
    'cvec__max_features': [100, 500],
    'cvec__stop_words': ['english', None],
    'cvec__ngram_range': [(1, 1), (1, 2)],
    'grad__max_depth': [3, 4, 5]
}

In [68]:
# instantiate grid search

gs_grad = GridSearchCV(
    pipe_grad,
    param_grid=param_pipe_grad,
    cv=5,
    n_jobs=-1
)

gs_grad.fit(X_train, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [70]:
# print the accuracy scores of the best GradientBoost model

best_model = gs_grad.best_estimator_
print('Best model:', gs_grad.best_params_)
print('')
print('Cross-val score of the best model:', gs_grad.best_score_)
print('Accuracy of best model on the training data:', best_model.score(X_train, y_train))
print('Accuracy of best model on the testing data:', best_model.score(X_test, y_test))
print('')
print('')

Best model: {'cvec__max_features': 500, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': None, 'grad__max_depth': 5}

Cross-val score of the best model: 0.894302475568928
Accuracy of best model on the training data: 0.9057480270890469
Accuracy of best model on the testing data: 0.8938045668233714




Best parameters: stop_words = None, max_features = 500, ngrams = 1-grams, gradient boosting max_depth = 5 <br>
Cross-Val accuracy: 89.43%, Training accuracy: 90.57%, Testing accuracy: 89.38%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 12 performed 1.85 percentage points worse.

#### Models with Lemmatizer and Stemmer <br>
Used best model - Model 2 - Tfidf Vectorizer - Logistic Regression

**Tokenize Documents**

In [73]:
# instantiate tokenizer

tokenizer = RegexpTokenizer('\w+')

In [74]:
# define function to
# tokenize the documents

def tokens(column):
    list_of_tokens = []
    for string in column:
        list_of_tokens.append(tokenizer.tokenize(string))
    return list_of_tokens

In [75]:
# tokenize X_train and X_test data

X_train_tokens = tokens(X_train)
X_test_tokens = tokens(X_test)

**Model 13** <br>
*lemmatizer - Tfidf-Vectorizer - logistic regression*

    parameters: 
        - stop words: english, None 
        - max_features: 500, 600 
        - ngrams: 1-grams, 2-grams
    tokenizer: removed punctuation, all non-word character

In [76]:
# instantiate WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [77]:
# define lemmatizer function

def lemma(token_list):
    list_of_lemmas = []
    for token in token_list:
        list_of_lemmas.append(lemmatizer.lemmatize(str(token)))
    return list_of_lemmas

In [78]:
# lemmatize X_train and X_test tokens

X_train = lemma(X_train_tokens)
X_test = lemma(X_test_tokens)

In [79]:
# run gridsearch on TfidfVectorizer and LogisticRegressin after lemmatizing

gridsearch(TfidfVectorizer(), LogisticRegression())



TfidfVectorizer
Best model: {'tfidfvectorizer__max_features': 500, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': None}

Best model: Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=500,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
     

Best parameters: stop_words = None, max_features = 500, ngrams = 1-grams<br>
Cross-Val accuracy: 91.28%, Training accuracy: 91.84%, Testing accuracy: 91.33%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 13 performed equally well.

**Model 14** <br>
*stemmer - tfidf-vectorizer - logistic regression*

    parameters: 
        - stop words: english, None 
        - max_features: 500, 600 
        - ngrams: 1-grams, 2-grams, 3-grams

In [80]:
# instantiate stemmer
stemmer = PorterStemmer()

# create function to
# stem tokenized documents
def stem(token_list):
    list_of_stems = []
    for string in token_list:
        list_of_stems.append(stemmer.stem(str(string)))
    return list_of_stems

In [81]:
# run stemmer function on X_train_tokens
# set X_train to stemmed tokens (to run gridsearch function)
X_train = stem(X_train_tokens)

# run stemmer function on X_test_tokens
# set X_test to stemmed tokens (to run gridsearch function)
X_test = stem(X_test_tokens)

In [82]:
# run gridsearch function on stemmed tokens

gridsearch(TfidfVectorizer(), LogisticRegression())

TfidfVectorizer
Best model: {'tfidfvectorizer__max_features': 500, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': None}

Best model: Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=500,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
     

Best parameters: stop_words = None, max_features = 500, ngrams = 1-grams<br>
Cross-Val accuracy: 91.29%, Training accuracy: 91.84%, Testing accuracy: 91.33%
Comparing the cross-val accuracy scores with the 91.28% of Model 2 (most accurate model up to this point) - Model 14 performed 0.01 percentage points better.

Based on the analysis above I hypertune the Logistic Regression model with Tfidf Vectorizer. Since lemmatizing and stemming produced similar results, I use only the basic model.