In [1]:
import numpy as np 
import pandas as pd 
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import nltk
nltk.download('stopwords')
nltk.download('punkt') # for nltk's sentence tokenization within word_tokenize


[nltk_data] Downloading package stopwords to /home/aqm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/aqm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
custom_stop_words = set(nltk.corpus.stopwords.words('english'))

In [3]:
path = 'kaggle-fn-dataset'
path_real = os.path.join(path, 'True.csv')
path_fake = os.path.join(path, 'Fake.csv')

realdf = pd.read_csv(path_real)
realdf['label'] = True
fakedf = pd.read_csv(path_fake)
fakedf['label'] = False
realdf.columns, len(realdf), len(fakedf)

(Index(['title', 'text', 'subject', 'date', 'label'], dtype='object'),
 21417,
 23481)

In [4]:
df = pd.concat([realdf, fakedf])
traindf, testdf = train_test_split(df)
len(df), len(traindf), len(testdf)

(44898, 33673, 11225)

In [5]:
ytrain = traindf.label
ytest = testdf.label

In [6]:
def train_and_score(model, xtrain, ytrain, xtest, ytest):
    model.fit(xtrain, ytrain)
    # train_score = model.score(xtrain, ytrain)
    test_score = model.score(xtest, ytest)
    print('For', model)
    # print('Train score: ', train_score)    
    print('Test score: ', test_score)

In [53]:
model_bow_nb = make_pipeline(CountVectorizer(max_features=50), MultinomialNB())
train_and_score(model_bow_nb, traindf.text, ytrain, testdf.text, ytest) # 88%

For Pipeline(steps=[('countvectorizer', CountVectorizer(max_features=50)),
                ('multinomialnb', MultinomialNB())])
Train score:  0.878300121759273
Test score:  0.8769710467706013


In [60]:
model_bigram_nb = make_pipeline(CountVectorizer(max_features=50, ngram_range=(2, 2)), MultinomialNB())
train_and_score(model_bigram_nb, traindf.text, ytrain, testdf.text, ytest) # 84%

For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=50, ngram_range=(2, 2))),
                ('multinomialnb', MultinomialNB())])
Train score:  0.8404062602084756
Test score:  0.8457015590200445


In [68]:
model_tfidf_nb = make_pipeline(TfidfVectorizer(max_features=50), MultinomialNB())
train_and_score(model_tfidf_nb, traindf.text, ytrain, testdf.text, ytest) # 87%

For Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(max_features=50)),
                ('multinomialnb', MultinomialNB())])
Train score:  0.8659757075401657
Test score:  0.8679732739420936


In [89]:
cv_with_stop_words = CountVectorizer(
    max_features=50, 
    stop_words=custom_stop_words
)
model_bow_sw_nb = make_pipeline(cv_with_stop_words, MultinomialNB())

train_and_score(model_bow_sw_nb, traindf.text, ytrain, testdf.text, ytest) # 92%

For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=50,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', 'can',
                                             'couldn', "couldn't", ...})),
                ('multinomialnb', MultinomialNB())])
Train score:  0.9129866658747364
Test score:  0.9168819599109131


In [91]:
cv_bigram_with_stop_words = CountVectorizer(
    max_features=50, 
    ngram_range=(2, 2),
    stop_words=custom_stop_words
)
model_bigram_sw_nb = make_pipeline(cv_bigram_with_stop_words, MultinomialNB())

train_and_score(model_bigram_sw_nb, traindf.text, ytrain, testdf.text, ytest) # 86%

For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=50, ngram_range=(2, 2),
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', 'can',
                                             'couldn', "couldn't", ...})),
                ('multinomialnb', MultinomialNB())])
Train score:  0.8733406586879696
Test score:  0.864053452115813


In [26]:
%%time
cv = CountVectorizer(
    max_features=50, 
    stop_words=custom_stop_words,
    tokenizer=nltk.word_tokenize
)
temp = Pipeline(steps=[
    ('nltk\'s tokenizer', cv), 
    ('mnb', MultinomialNB()),
])
train_and_score(temp, traindf.text, ytrain, testdf.text, ytest) # 0.9889; 3 mins

For Pipeline(steps=[("nltk's tokenizer",
                 CountVectorizer(max_features=50,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', 'can',
                                             'couldn', "couldn't", ...},
                                 tokenizer=<function word_tokenize at 0x7fd612dd8670>)),
                ('mnb', MultinomialNB())])
Test score:  0.9889532293986637
CPU times: user 3min 2s, sys: 463 ms, total: 3min 2s
Wall time: 3min 3s


In [27]:
%%time
cv = TfidfVectorizer(
    max_features=50, 
    stop_words=custom_stop_words,
    tokenizer=nltk.word_tokenize
)
temp = Pipeline(steps=[
    ('nltk\'s tokenizer', cv), 
    ('mnb', MultinomialNB()),
])
train_and_score(temp, traindf.text, ytrain, testdf.text, ytest) # 0.9890; 3 mins

For Pipeline(steps=[("nltk's tokenizer",
                 TfidfVectorizer(max_features=50,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', 'can',
                                             'couldn', "couldn't", ...},
                                 tokenizer=<function word_tokenize at 0x7fd612dd8670>)),
                ('mnb', MultinomialNB())])
Test score:  0.9890423162583519
CPU times: user 2min 38s, sys: 508 ms, total: 2min 38s
Wall time: 2min 39s


In [33]:
class TokenizerAndStemmer(object):

    stem = nltk.PorterStemmer().stem
    def __call__(self, text):
        return (self.stem(itoken) for itoken in nltk.word_tokenize(text))

In [34]:
%%time
cv = CountVectorizer(
    max_features=50, 
    stop_words=custom_stop_words,
    tokenizer=TokenizerAndStemmer()
)
temp = Pipeline(steps=[
    ('porter stemmer + nltk\'s tokenizer', cv), 
    ('mnb', MultinomialNB()),
])
train_and_score(temp, traindf.text, ytrain, testdf.text, ytest) # 0.9897; 7 mins

For Pipeline(steps=[("porter stemmer + nltk's tokenizer",
                 CountVectorizer(max_features=50,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', 'can',
                                             'couldn', "couldn't", ...},
                                 tokenizer=<__main__.TokenizerAndStemmer object at 0x7fd60e21ab20>)),
                ('mnb', MultinomialNB())])
Test score:  0.9897550111358575
CPU times: user 6min 47s, sys: 188 ms, total: 6min 47s
Wall time: 6min 47s
