In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import TransformerMixin

from nltk.corpus import stopwords

In [2]:
class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a sparse matrix into a dense ``numpy.ndarray``.

    Meant to sit between a vectorizer that emits a scipy sparse matrix and an
    estimator that requires dense input.

    Adapted from
    https://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required
    """

    def fit(self, X, y=None, **fit_params):
        """No-op: the transformer keeps no state, so simply return ``self``."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()`` because the latter
        returns a ``numpy.matrix``, a type NumPy discourages and that recent
        scikit-learn releases refuse as estimator input. Input that is
        already dense (no ``toarray`` method) goes through ``np.asarray``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

In [3]:
# The first CSV column is an unnamed row counter; read it as the index so it
# does not come back as a redundant 'Unnamed: 0' data column.
main = pd.read_csv('../data/results.csv', index_col=0)
main.head()


Unnamed: 0,is_fifth,all_text
0,1,Please help ! This coded message keep appearin...
1,1,Help : I accidentally changed `` dime dozen ''...
2,1,My universal remote controller n't controlling...
3,1,Accidentally programmed universe rain nuke eve...
4,1,Help I accidentally The Thing . I accidentally...


In [4]:
# Features are the text column, labels are the `is_fifth` column.
X, y = main['all_text'], main['is_fifth']

# Show the text Series as the cell output.
X

0       Please help ! This coded message keep appearin...
1       Help : I accidentally changed `` dime dozen ''...
2       My universal remote controller n't controlling...
3       Accidentally programmed universe rain nuke eve...
4       Help I accidentally The Thing . I accidentally...
                              ...                        
4609    My wife us shared calendar personal reminder G...
4610    Ca n't find car key car still start Ca n't fin...
4611    𝕴 𝖉𝖎𝖘𝖈𝖔𝖛𝖊𝖗𝖊𝖉 𝖘𝖔𝖒𝖊 𝖓𝖊𝖜 𝖋𝖔𝖓𝖙𝖘 𝔹𝕦𝕥 𝕕𝕠𝕟 ' 𝕥 𝕜𝕟𝕠𝕨 𝕨...
4612    Not Do ever go gargle mayonnaise pick wrong ja...
4613    I ca n't read funny factoid Tropico 5 loading ...
Name: all_text, Length: 4614, dtype: object

In [5]:
# Train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# NLTK's English stop-word list, offered to the vectorizer as one grid option.
stop = stopwords.words('english')

# Grid of TfidfVectorizer settings; the 'tvec__' prefix targets the pipeline
# step named 'tvec'.
params = {
    'tvec__max_features': [1250, 1500, None],
    'tvec__stop_words': [stop, None],
    'tvec__min_df': list(range(2, 7)),                     # 2, 3, 4, 5, 6
    'tvec__ngram_range': [(1, n) for n in range(1, 5)],    # (1,1) ... (1,4)
}

In [6]:
# Text classifier assembled from three named steps.
pipe = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),       # raw text -> TF-IDF features
    ('to_dense', DenseTransformer()),  # sparse features -> dense array
    ('gnb', GaussianNB()),             # Gaussian naive Bayes on the dense features
])

In [7]:
# 5-fold cross-validated search over `params`, with `pipe` as the estimator.
gs = GridSearchCV(estimator=pipe, param_grid=params, cv=5)

In [8]:
# Baseline: fit the pipeline as constructed (vectorizer left at its defaults).
pipe.fit(X_train, y_train)

# Tuned model: run the grid search on the same training split; being the last
# expression, its return value is what the cell displays.
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [9]:
# Baseline pipeline: score on the training split, then on the held-out test split.
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9604046242774567, 0.7582322357019065)

In [10]:
# Parameter combination selected by the grid search.
gs.best_params_

{'tvec__max_features': 1500,
 'tvec__min_df': 6,
 'tvec__ngram_range': (1, 3),
 'tvec__stop_words': None}

In [11]:
# Grid-search model: training-split score, best cross-validation score, test-split score.
gs.score(X_train, y_train), gs.best_score_, gs.score(X_test, y_test)

(0.9020231213872832, 0.8367052023121387, 0.8509532062391681)