In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from nltk.corpus import stopwords

In [2]:
# reading in data
# the first CSV column holds 0, 1, 2, ... and would otherwise be loaded as a
# stray 'Unnamed: 0' data column; it looks like a previously saved row index,
# so it is read back as the index rather than as a feature
main = pd.read_csv('../data/results.csv', index_col=0)
main.head()


Unnamed: 0,is_fifth,all_text
0,1,Please help ! This coded message keep appearin...
1,1,Help : I accidentally changed `` dime dozen ''...
2,1,My universal remote controller n't controlling...
3,1,Accidentally programmed universe rain nuke eve...
4,1,Help I accidentally The Thing . I accidentally...


In [4]:
# separating the post text (features) from the is_fifth label (target)
X, y = main['all_text'], main['is_fifth']

# showing X as the cell output to eyeball the text column
X

0       Please help ! This coded message keep appearin...
1       Help : I accidentally changed `` dime dozen ''...
2       My universal remote controller n't controlling...
3       Accidentally programmed universe rain nuke eve...
4       Help I accidentally The Thing . I accidentally...
                              ...                        
4609    My wife us shared calendar personal reminder G...
4610    Ca n't find car key car still start Ca n't fin...
4611    𝕴 𝖉𝖎𝖘𝖈𝖔𝖛𝖊𝖗𝖊𝖉 𝖘𝖔𝖒𝖊 𝖓𝖊𝖜 𝖋𝖔𝖓𝖙𝖘 𝔹𝕦𝕥 𝕕𝕠𝕟 ' 𝕥 𝕜𝕟𝕠𝕨 𝕨...
4612    Not Do ever go gargle mayonnaise pick wrong ja...
4613    I ca n't read funny factoid Tropico 5 loading ...
Name: all_text, Length: 4614, dtype: object

In [5]:
# holding out a test set; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# NLTK's English stopword list, offered to the vectorizer as one grid option
stop = stopwords.words('english')

# grid of CountVectorizer settings to search over; the 'cvec__' prefix routes
# each setting to the pipeline step named 'cvec'
params = {
    'cvec__max_features': [1500, 2000, None],
    'cvec__stop_words': [stop, None],
    'cvec__min_df': [2, 3, 4, 5, 6],
    'cvec__ngram_range': [(1, 1), (1, 2)],
}

In [6]:
# creating a pipeline for the predictive model:
# word counts ('cvec') feeding a multinomial naive Bayes classifier ('nb')
pipe = Pipeline(steps=[('cvec', CountVectorizer()), ('nb', MultinomialNB())])

In [7]:
# instantiating a 5-fold cross-validated grid search over the pipeline
gs = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=5,
)

In [8]:
# fitting the default-parameter pipeline on the training data (baseline)
pipe.fit(X_train, y=y_train)

# fitting the grid search on the same training data; kept as the last
# expression so the fitted search object is the cell output
gs.fit(X_train, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [9]:
# scoring the default pipeline: (train score, test score)
(
    pipe.score(X_train, y_train),
    pipe.score(X_test, y_test),
)

(0.9222543352601156, 0.8535528596187175)

In [10]:
# checking the grid search best parameters
# (the combination from `params` that the fitted search selected)
gs.best_params_

{'cvec__max_features': None,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None}

In [11]:
# evaluating the grid search: (train score, best cross-validation score, test score)
# thanks Elizabeth for telling me to clean my data properly (I nearly didn't remove posts without selftext)
(
    gs.score(X_train, y_train),
    gs.best_score_,
    gs.score(X_test, y_test),
)

(0.9008670520231213, 0.8317919075144509, 0.8518197573656846)