In [1]:
import numpy as np 
import pandas as pd 
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import nltk
nltk.download('stopwords')


In [3]:
path = 'kaggle-fn-dataset'
path_real = os.path.join(path, 'True.csv')
path_fake = os.path.join(path, 'Fake.csv')

In [4]:
realdf = pd.read_csv(path_real)
realdf['label'] = True
realdf.columns, len(realdf)

(Index(['title', 'text', 'subject', 'date'], dtype='object'), 21417)

In [6]:
fakedf = pd.read_csv(path_fake)
fakedf['label'] = False
fakedf.columns, len(fakedf)

(Index(['title', 'text', 'subject', 'date', 'label'], dtype='object'), 23481)

In [7]:
df = pd.concat([realdf, fakedf])
traindf, testdf = train_test_split(df)
len(df), len(traindf), len(testdf)

(Index(['title', 'text', 'subject', 'date', 'label'], dtype='object'), 44898)

In [32]:
ytrain = traindf.label
ytest = testdf.label

In [31]:
def train_and_score(model, xtrain, ytrain, xtest, ytest):
    model.fit(xtrain, ytrain)
    train_score = model.score(xtrain, ytrain)
    test_score = model.score(xtest, ytest)
    print('For', model)
    print('Train score: ', train_score)    
    print('Test score: ', test_score)

In [35]:
mnb_model = MultinomialNB()
train_and_score(mnb_model, xtrain, ytrain, xtest, ytest)

For MultinomialNB()
Train score:  0.878300121759273
Test score:  0.8769710467706013


In [53]:
model_bow_nb = make_pipeline(CountVectorizer(max_features=50), MultinomialNB())
train_and_score(model_bow_nb, traindf.text, ytrain, testdf.text, ytest)

For Pipeline(steps=[('countvectorizer', CountVectorizer(max_features=50)),
                ('multinomialnb', MultinomialNB())])
Train score:  0.878300121759273
Test score:  0.8769710467706013


In [60]:
model_bigram_nb = make_pipeline(CountVectorizer(max_features=50, ngram_range=(2, 2)), MultinomialNB())
train_and_score(model_bigram_nb, traindf.text, ytrain, testdf.text, ytest)

For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=50, ngram_range=(2, 2))),
                ('multinomialnb', MultinomialNB())])
Train score:  0.8404062602084756
Test score:  0.8457015590200445


In [68]:
model_tfidf_nb = make_pipeline(TfidfVectorizer(max_features=50), MultinomialNB())
train_and_score(model_tfidf_nb, traindf.text, ytrain, testdf.text, ytest)

For Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(max_features=50)),
                ('multinomialnb', MultinomialNB())])
Train score:  0.8659757075401657
Test score:  0.8679732739420936


In [85]:
custom_stop_words = set(nltk.corpus.stopwords.words('english'))

In [89]:
cv_with_stop_words = CountVectorizer(
    max_features=50, 
    stop_words=custom_stop_words
)
model_bow_sw_nb = make_pipeline(cv_with_stop_words, MultinomialNB())

train_and_score(model_bow_sw_nb, traindf.text, ytrain, testdf.text, ytest) # 0.92

For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=50,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', 'can',
                                             'couldn', "couldn't", ...})),
                ('multinomialnb', MultinomialNB())])
Train score:  0.9129866658747364
Test score:  0.9168819599109131


In [91]:
cv_bigram_with_stop_words = CountVectorizer(
    max_features=50, 
    ngram_range=(2, 2),
    stop_words=custom_stop_words
)
model_bigram_sw_nb = make_pipeline(cv_bigram_with_stop_words, MultinomialNB())

train_and_score(model_bigram_sw_nb, traindf.text, ytrain, testdf.text, ytest) # 0.86

For Pipeline(steps=[('countvectorizer',
                 CountVectorizer(max_features=50, ngram_range=(2, 2),
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all',
                                             'am', 'an', 'and', 'any', 'are',
                                             'aren', "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', 'can',
                                             'couldn', "couldn't", ...})),
                ('multinomialnb', MultinomialNB())])
Train score:  0.8733406586879696
Test score:  0.864053452115813
