# Importing all required packages

In [None]:
# Silence warnings by swapping warnings.warn for a function that does nothing
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accept any arguments, do nothing."""
    return None


warnings.warn = warn

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Computations
import itertools

# Modelling Algorithms
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Modelling Helpers
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.model_selection import KFold, cross_val_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


import seaborn as sns
import nltk
import re
from wordcloud import WordCloud

# Visualization
import matplotlib.pyplot as plt

# Lift pandas' column display limit (None = no limit) for printed DataFrames
pd.set_option('display.max_columns', None)

# Reading Fake and Real Data from CSV Files

In [None]:
# Load the fake-news and real-news articles as two separate DataFrames
fake = pd.read_csv('./bin_dataset/Fake.csv', sep=',')
true = pd.read_csv('./bin_dataset/True.csv', sep=',')

### Removing the 'Reuters' Keyword

In [None]:
def drop_prefix(text, prefix='(Reuters)', n=5):
    """Strip a leading agency tag such as ``(Reuters)`` from an article body.

    The tag counts as leading when it is one of the first ``n``
    space-separated tokens of ``text``. In that case everything up to and
    including the first occurrence of ``prefix`` is removed; the remainder is
    returned as-is (no whitespace or dash trimming).

    Args:
        text: Article text. Non-string values (e.g. a missing CSV cell) are
            returned untouched instead of raising.
        prefix: Exact token to look for.
        n: Number of leading tokens to inspect.

    Returns:
        The text after the leading tag, or ``text`` unchanged when the tag is
        not among the first ``n`` tokens.
    """
    if not isinstance(text, str):
        return text
    # Only the first n tokens are inspected, so stop splitting after them
    # instead of tokenising the whole article.
    leading_tokens = text.split(' ', n)[:n]
    if prefix not in leading_tokens:
        return text
    # Cut at the FIRST occurrence only: taking the last split piece would drop
    # most of the article whenever the tag is mentioned again further down.
    return text.split(prefix, 1)[1]

# Assigning 0 and 1 labels to Fake and Real Data

In [None]:
# Tag each source frame with its class: 0 = fake, 1 = real
fake['label'] = 0
true['label'] = 1

# Stack both frames, shuffle them reproducibly and renumber the rows 0..N-1.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
dataset = (
    pd.concat([true, fake])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

In [None]:
# Strip the leading "(Reuters)" tag from every article body.
# The whole column is assigned at once: the chained form
# `dataset['text'][i] = ...` may write to a temporary copy and is a no-op
# under pandas copy-on-write.
dataset['text'] = dataset['text'].apply(drop_prefix)

In [None]:
# dataset[0]

## Displaying Dataset Info

In [None]:
# Print pandas' summary of the combined frame (columns, dtypes, non-null counts)
dataset.info()

### Checking for NULL values in the data

In [None]:
# Count the missing values in each column
dataset.isna().sum()

### Combining the text and title fields for "full text"

In [None]:
# Join title and body with a space so the last word of the title and the
# first word of the body stay separate tokens for the vectorizers.
dataset['total'] = dataset['title'] + ' ' + dataset['text']

### 5-fold cross validation

In [None]:
# Shuffled 5-fold splitter shared by every cross_val_score call; the fixed
# seed gives each model the same folds
cv = KFold(n_splits=5, shuffle=True, random_state=1)

### Train-Test 80-20 Split

In [None]:
# Hold out 20% of the rows for final testing; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    dataset['total'],
    dataset['label'],
    test_size=0.20,
    random_state=0,
)

## Count Vectorizer

In [None]:
# Bag-of-words model over unigrams and bigrams, English stop words removed
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
# Learn the vocabulary on the training texts and encode them in one step
count_train = count_vectorizer.fit_transform(X_train)
# Encode the test texts with that same vocabulary (no refitting)
count_test = count_vectorizer.transform(X_test)

## TF-IDF Vectorizer

In [None]:
# TF-IDF weighting over unigrams and bigrams, English stop words removed
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
# Learn vocabulary and IDF weights on the training texts and encode them
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
# Encode the test texts with the fitted vocabulary and weights (no refitting)
tfidf_test = tfidf_vectorizer.transform(X_test)

### Defining Confusion Matrix Visualisation

In [None]:
# Creating a function that outputs a confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts. Rows are plotted on the "True label" axis
            and columns on the "Predicted label" axis.
        classes: Tick labels, in the same order as the rows/columns of ``cm``.
        normalize: When True, every row is divided by its sum so the cells
            show fractions instead of raw counts. Rows that sum to zero are
            shown as zeros.
        title: Text placed above the plot.
        cmap: Matplotlib colormap used for the cell colours.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row has samples; empty rows stay at 0.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than the midpoint get white text so the number stays legible
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are shown with two decimals; raw counts are passed as-is
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

### Defining Performance Evaluation Metrics

In [None]:
def precision_recall(true_label, predicted_label):
    """Print precision, recall, accuracy and F1 score for a set of predictions.

    Args:
        true_label: Ground-truth class labels.
        predicted_label: Labels predicted by a model, aligned with
            ``true_label``.

    Each score is computed and printed in turn; nothing is returned.
    """
    report_lines = (
        ('Precision: %f', precision_score),
        ('Recall: %f', recall_score),
        ('Accuracy: %f', metrics.accuracy_score),
        ('F1 Score: %f', metrics.f1_score),
    )
    for template, scorer in report_lines:
        print(template % scorer(true_label, predicted_label))

## Multinomial Naive Bayes Classifier + CountVectorizer

In [None]:
# Baseline: MultinomialNB (alpha=0.1) on bag-of-words counts
nb_classifier_k = MultinomialNB(alpha=0.1)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(
    nb_classifier_k, count_train, y_train,
    cv=cv, scoring='accuracy', n_jobs=-1,
)
print(scores.mean(), scores.std())
# Refit on the full training split, then score the held-out test set
pred_nb_c = nb_classifier_k.fit(count_train, y_train).predict(count_test)
precision_recall(y_test, pred_nb_c)

In [None]:
# Tune the smoothing hyperparameter alpha for MultinomialNB on count features.
# Candidates are compared by cross-validated accuracy on the training split,
# so the held-out test set plays no part in model selection.
# The grid drops its first value (alpha = 0), which would disable smoothing.
for alpha in np.arange(0, 1, .05)[1:]:
    nb_classifier_tune = MultinomialNB(alpha=alpha)
    tune_scores = cross_val_score(nb_classifier_tune, count_train, y_train,
                                  scoring='accuracy', cv=cv, n_jobs=-1)
    # Print the candidate first so each score line is clearly attributed
    print("Alpha: {:.2f} ".format(alpha))
    print('CV accuracy: %f (+/- %f)' % (np.mean(tune_scores), np.std(tune_scores)))

In [None]:
# Running our fine-tuned model with alpha=0.05 and plotting the results
nb_classifier = MultinomialNB(alpha = 0.05)
# Cross-validate the tuned estimator itself, not the alpha=0.1 baseline
# nb_classifier_k, so the reported scores belong to this model.
scores = cross_val_score(nb_classifier, count_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
nb_classifier.fit(count_train, y_train)
pred_nb_count = nb_classifier.predict(count_test)
precision_recall(y_test, pred_nb_count)
cm = metrics.confusion_matrix(y_test, pred_nb_count, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm, classes=['FAKE','TRUE'], title ='Confusion matrix for a MultinomialNB with Count Vectorizer')

### Multinomial Naive Bayes + TF-IDF Vectorizer

In [None]:
# Baseline: MultinomialNB (alpha=0.1) on TF-IDF features
nb_classifier = MultinomialNB(alpha = 0.1)
# Cross-validate the estimator defined in this cell instead of relying on
# nb_classifier_k left over from the Count Vectorizer section.
scores = cross_val_score(nb_classifier, tfidf_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
nb_classifier.fit(tfidf_train, y_train)
pred_nb_tfidf = nb_classifier.predict(tfidf_test)
precision_recall(y_test, pred_nb_tfidf)

In [None]:
# Tune the smoothing hyperparameter alpha for MultinomialNB on TF-IDF features.
# Candidates are compared by cross-validated accuracy on the training split,
# so the held-out test set plays no part in model selection.
# The grid drops its first value (alpha = 0), which would disable smoothing.
for alpha in np.arange(0, 0.1, .01)[1:]:
    nb_classifier_tune = MultinomialNB(alpha=alpha)
    tune_scores = cross_val_score(nb_classifier_tune, tfidf_train, y_train,
                                  scoring='accuracy', cv=cv, n_jobs=-1)
    # Print the candidate first so each score line is clearly attributed
    print("Alpha: {:.2f} ".format(alpha))
    print('CV accuracy: %f (+/- %f)' % (np.mean(tune_scores), np.std(tune_scores)))

In [None]:
# Running our fine-tuned model with alpha=0.05 and plotting the results
nb_classifier = MultinomialNB(alpha = 0.05)
# Cross-validate the tuned estimator itself, not the alpha=0.1 baseline
# nb_classifier_k, so the reported scores belong to this model.
scores = cross_val_score(nb_classifier, tfidf_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
nb_classifier.fit(tfidf_train, y_train)
pred_nb_tfidf = nb_classifier.predict(tfidf_test)
precision_recall(y_test, pred_nb_tfidf)
cm2 = metrics.confusion_matrix(y_test, pred_nb_tfidf, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm2, classes=['FAKE','TRUE'], title ='Confusion matrix for a MultinomialNB with Tf-IDF')

### Logistic Regression + CountVectorizer

In [None]:
# Logistic regression on count features; C=1e5 means very weak regularisation
logreg = LogisticRegression(C=1e5)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(logreg, count_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
logreg.fit(count_train, y_train)
pred_logreg_count = logreg.predict(count_test)
precision_recall(y_test, pred_logreg_count)

cm3 = metrics.confusion_matrix(y_test, pred_logreg_count, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm3, classes=['FAKE','TRUE'], title ='Confusion matrix for a Logistic Regression with Count Vectorizer')

### Logistic Regression + TF-IDF Vectorizer

In [None]:
# Logistic regression on TF-IDF features; C=1e5 means very weak regularisation
logreg = LogisticRegression(C=1e5)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(logreg, tfidf_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
logreg.fit(tfidf_train, y_train)
pred_logreg_tfidf = logreg.predict(tfidf_test)
# Probability of class 1 (real) for every test article
pred_logreg_tfidf_proba = logreg.predict_proba(tfidf_test)[:,1]
precision_recall(y_test, pred_logreg_tfidf)

cm4 = metrics.confusion_matrix(y_test, pred_logreg_tfidf, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm4, classes=['FAKE','TRUE'], title ='Confusion matrix for a Logistic Regression with Tf-IDF')

### Linear SVM + CountVectorizer

In [None]:
# Linear-kernel SVM on count features
svclass = SVC(kernel = 'linear', random_state = 0)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(svclass, count_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
svclass.fit(count_train, y_train)
pred_sv_count = svclass.predict(count_test)
precision_recall(y_test, pred_sv_count)

cm6 = metrics.confusion_matrix(y_test, pred_sv_count, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm6, classes=['FAKE','TRUE'], title ='Confusion matrix for a LSVM with Count Vectorizer')

### Linear SVM + TF-IDF Vectorizer

In [None]:
# Linear-kernel SVM on TF-IDF features
svclass = SVC(kernel = 'linear', random_state = 0)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(svclass, tfidf_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
svclass.fit(tfidf_train, y_train)
# Stored under its own name so the Count Vectorizer predictions held in
# pred_sv_count are not overwritten by TF-IDF results.
pred_sv_tfidf = svclass.predict(tfidf_test)
precision_recall(y_test, pred_sv_tfidf)

cm6 = metrics.confusion_matrix(y_test, pred_sv_tfidf, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm6, classes=['FAKE','TRUE'], title ='Confusion matrix for a LSVM with TFIDF Vectorizer')

### Decision Tree + CountVectorizer

In [None]:
# Entropy-based decision tree on count features
dtclass = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(dtclass, count_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
dtclass.fit(count_train, y_train)
pred_dt_count = dtclass.predict(count_test)
precision_recall(y_test, pred_dt_count)

cm4 = metrics.confusion_matrix(y_test, pred_dt_count, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm4, classes=['FAKE','TRUE'], title ='Confusion matrix for a DT with Count Vectorizer')

### Decision Tree + TF-IDF Vectorizer

In [None]:
# Entropy-based decision tree on TF-IDF features
dtclass = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(dtclass, tfidf_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
dtclass.fit(tfidf_train, y_train)
# Stored under its own name so the Count Vectorizer predictions held in
# pred_dt_count are not overwritten by TF-IDF results.
pred_dt_tfidf = dtclass.predict(tfidf_test)
precision_recall(y_test, pred_dt_tfidf)

cm5 = metrics.confusion_matrix(y_test, pred_dt_tfidf, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm5, classes=['FAKE','TRUE'], title ='Confusion matrix for a DT with TFIDF')

### Random Forest + CountVectorizer

In [None]:
# Random forest of 10 entropy-based trees on count features
rfclass = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(rfclass, count_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
rfclass.fit(count_train, y_train)
pred_rf_count = rfclass.predict(count_test)
precision_recall(y_test, pred_rf_count)

cm6 = metrics.confusion_matrix(y_test, pred_rf_count, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm6, classes=['FAKE','TRUE'], title ='Confusion matrix for a RF with Count Vectorizer')

### Random Forest + TF-IDF Vectorizer

In [None]:
# Random forest of 10 entropy-based trees on TF-IDF features
rfclass = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(rfclass, tfidf_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
rfclass.fit(tfidf_train, y_train)
# Stored under its own name so the Count Vectorizer predictions held in
# pred_rf_count are not overwritten by TF-IDF results.
pred_rf_tfidf = rfclass.predict(tfidf_test)
precision_recall(y_test, pred_rf_tfidf)

cm6 = metrics.confusion_matrix(y_test, pred_rf_tfidf, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm6, classes=['FAKE','TRUE'], title ='Confusion matrix for a RF with TFIDF Vectorizer')

### K-Nearest Neighbor + CountVectorizer

In [None]:
# 5-nearest-neighbours with Euclidean distance (Minkowski, p=2) on count features
knnclass = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(knnclass, count_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
knnclass.fit(count_train, y_train)
pred_knn_count = knnclass.predict(count_test)
precision_recall(y_test, pred_knn_count)

cm6 = metrics.confusion_matrix(y_test, pred_knn_count, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order.
plot_confusion_matrix(cm6, classes=['FAKE','TRUE'], title ='Confusion matrix for a KNN with Count Vectorizer')

### K-Nearest Neighbor + TF-IDF Vectorizer

In [None]:
# 5-nearest-neighbours with Euclidean distance (Minkowski, p=2) on TF-IDF features
knnclass = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
# Mean and spread of the 5-fold accuracy on the training split
scores = cross_val_score(knnclass, tfidf_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(np.mean(scores), np.std(scores))
# Refit on the full training split and score the held-out test set
knnclass.fit(tfidf_train, y_train)
# Stored under its own name so the Count Vectorizer predictions held in
# pred_knn_count are not overwritten by TF-IDF results.
pred_knn_tfidf = knnclass.predict(tfidf_test)
precision_recall(y_test, pred_knn_tfidf)

cm6 = metrics.confusion_matrix(y_test, pred_knn_tfidf, labels=[0,1])
# labels=[0, 1] orders the matrix fake (0) first, real (1) second, so the tick
# labels must follow the same order. The title now names the TF-IDF features
# this model was actually trained on.
plot_confusion_matrix(cm6, classes=['FAKE','TRUE'], title ='Confusion matrix for a KNN with TFIDF Vectorizer')