In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import numpy as np
import itertools
import string
from nltk.corpus import stopwords

In [7]:
df = pd.read_csv("fake_or_real_news.csv")
df['label'] = df['label'].replace({'FAKE' : 0, 'REAL' : 1})
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [9]:
def text_cleaning(text):
    text = text.lower()
    # split into tokens by white space
    tokens = text.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

def dummy_fun(doc):
    return doc

In [11]:
df.columns

Index(['title', 'text', 'label'], dtype='object')

In [17]:
df['title'] = df['title'].apply(text_cleaning)
df['text'] = df['text'].apply(text_cleaning)

In [26]:
X = df['text']
y = df['label']


cv = CountVectorizer(analyzer='word',
                              tokenizer=dummy_fun,
                              preprocessor=dummy_fun,
                              max_features =100000,
                              token_pattern=None)

tfidf = TfidfVectorizer(analyzer='word',
                              tokenizer=dummy_fun,
                              preprocessor=dummy_fun,
                              max_features =100000,
                              token_pattern=None)

kf = KFold(n_splits=10, shuffle=True)




for train_index, test_index in kf.split(X):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    X_train_cv = cv.fit_transform(X_train)
    X_test_cv = cv.transform(X_test)
    lr_clf = LogisticRegression()
    nb_clf = MultinomialNB()
    lr_clf.fit(X_train_cv, y_train)
    lr_pred = lr_clf.predict(X_test_cv)
    score = accuracy_score(y_test, lr_pred)
    print("CountVectorizer - Logistic Regression Accuracy: %.3f" % score)
    nb_clf.fit(X_train_cv, y_train)
    nb_pred = nb_clf.predict(X_test_cv)
    score = accuracy_score(y_test, nb_pred)
    print("CountVectorizer - MultinomialNB Accuracy: %.3f" % score)
    
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)
    lr_clf = LogisticRegression()
    nb_clf = MultinomialNB()
    lr_clf.fit(X_train_tfidf, y_train)
    lr_pred = lr_clf.predict(X_test_tfidf)
    score = accuracy_score(y_test, lr_pred)
    print("TfidfVectorizer - Logistic Regression Accuracy: %.3f" % score)
    nb_clf.fit(X_train_tfidf, y_train)
    nb_pred = nb_clf.predict(X_test_tfidf)
    score = accuracy_score(y_test, nb_pred)
    print("TfidfVectorizer - MultinomialNB Accuracy: %.3f" % score)
    



CountVectorizer - Logistic Regression Accuracy: 0.927
CountVectorizer - MultinomialNB Accuracy: 0.899




TfidfVectorizer - Logistic Regression Accuracy: 0.937
TfidfVectorizer - MultinomialNB Accuracy: 0.822




CountVectorizer - Logistic Regression Accuracy: 0.916
CountVectorizer - MultinomialNB Accuracy: 0.877




TfidfVectorizer - Logistic Regression Accuracy: 0.910
TfidfVectorizer - MultinomialNB Accuracy: 0.847




CountVectorizer - Logistic Regression Accuracy: 0.916
CountVectorizer - MultinomialNB Accuracy: 0.901




TfidfVectorizer - Logistic Regression Accuracy: 0.921
TfidfVectorizer - MultinomialNB Accuracy: 0.847




CountVectorizer - Logistic Regression Accuracy: 0.902
CountVectorizer - MultinomialNB Accuracy: 0.891




TfidfVectorizer - Logistic Regression Accuracy: 0.899
TfidfVectorizer - MultinomialNB Accuracy: 0.806




CountVectorizer - Logistic Regression Accuracy: 0.924
CountVectorizer - MultinomialNB Accuracy: 0.907




TfidfVectorizer - Logistic Regression Accuracy: 0.923
TfidfVectorizer - MultinomialNB Accuracy: 0.834




CountVectorizer - Logistic Regression Accuracy: 0.921
CountVectorizer - MultinomialNB Accuracy: 0.882




TfidfVectorizer - Logistic Regression Accuracy: 0.924
TfidfVectorizer - MultinomialNB Accuracy: 0.821




CountVectorizer - Logistic Regression Accuracy: 0.927
CountVectorizer - MultinomialNB Accuracy: 0.893




TfidfVectorizer - Logistic Regression Accuracy: 0.918
TfidfVectorizer - MultinomialNB Accuracy: 0.839




CountVectorizer - Logistic Regression Accuracy: 0.929
CountVectorizer - MultinomialNB Accuracy: 0.923




TfidfVectorizer - Logistic Regression Accuracy: 0.942
TfidfVectorizer - MultinomialNB Accuracy: 0.882




CountVectorizer - Logistic Regression Accuracy: 0.927
CountVectorizer - MultinomialNB Accuracy: 0.882




TfidfVectorizer - Logistic Regression Accuracy: 0.926
TfidfVectorizer - MultinomialNB Accuracy: 0.839




CountVectorizer - Logistic Regression Accuracy: 0.927
CountVectorizer - MultinomialNB Accuracy: 0.869




TfidfVectorizer - Logistic Regression Accuracy: 0.907
TfidfVectorizer - MultinomialNB Accuracy: 0.815
