# Fake News Detector

### Alex Hedrick

### About the data (downloaded from Kaggle):

(WELFake) is a dataset of 72,134 news articles with 35,028 real and 37,106 fake news. For this, authors merged four popular news datasets (i.e. Kaggle, McIntire, Reuters, BuzzFeed Political) to prevent over-fitting of classifiers and to provide more text data for better ML training.

Published in:
IEEE Transactions on Computational Social Systems: pp. 1-13 (doi: 10.1109/TCSS.2021.3068519).

0 = fake, 1 = real

In [5]:
# import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix , classification_report
%matplotlib inline
import nltk

# read data
news = pd.read_csv('news_data_2_recent/WELFake_Dataset.csv', sep=',', names = ['index','title','body','label'], header = None, skiprows = 1)
news = news.drop('index', axis=1)
news.head()

Unnamed: 0,title,body,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


### Data Preprocessing

In [6]:
import string
from warnings import simplefilter
import time
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
start_time = time.time()

# remove all rows with missing values
news = news.dropna()
print('removed missing values in ' + str(np.round(time.time() - start_time,2)) + ' seconds')

# # only keep first however many rows of news
# news = news[:15000]

# define set of allowed characters
allowed_chars = set(string.ascii_letters + string.digits + ' ')

# remove non-alphanumeric characters from news
# news['title'] = news['title'].apply(lambda x: ''.join(c for c in x if c in allowed_chars))
news['body'] = news['body'].apply(lambda x: ''.join(c for c in x if c in allowed_chars))
print('removed non-alphanumeric characters in ' + str(np.round(time.time() - start_time,2)) + ' seconds')

# convert all news to lowercase
# news['title'] = news['title'].str.lower()
news['body'] = news['body'].str.lower()
print('converted to lowercase in ' + str(np.round(time.time() - start_time,2)) + ' seconds')

# replace 0 with -1 for label
# now, -1 = fake news, 1 = real news
news['label'] = news['label'].replace(0, -1)

news.head()


removed missing values in 0.03 seconds
removed non-alphanumeric characters in 14.14 seconds
converted to lowercase in 14.33 seconds


Unnamed: 0,title,body,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,no comment is expected from barack obama membe...,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,now most of the demonstrators gathered last n...,1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",a dozen politically active pastors came here f...,-1
4,SATAN 2: Russia unvelis an image of its terrif...,the rs28 sarmat missile dubbed satan 2 will re...,1
5,About Time! Christian Group Sues Amazon and SP...,all we can say on this one is it s about time ...,1


### Generate Multinomial Features

Generate multinomial features based on the number of occurrences of the most common words in 'title' and 'body' using sklearn CountVectorizer

In [7]:
# generate multinomial features based on the number of occurrences of a set of words in the message using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import text 

# add some extra stop words
more_stop_words = ["s", "wa", "u", "ha"]
stop_words = text.ENGLISH_STOP_WORDS.union(more_stop_words)

# split data into training and validation sets
xtrain, xvali, training_labels, vali_labels = train_test_split(news['body'], news['label'], test_size=0.33, random_state=1)

# return the number of unique words in all training data
def get_unique_words_count(training_data):
    unique_words = set()
    for message in training_data:
        for word in message.split():
            unique_words.add(word)
    return len(unique_words)

print(f'number of unique words: {get_unique_words_count(xtrain)}')

# look at only most frequent m words
m = [50, 1000, 10000, 30000, 50000, 70000]

# initialize lists to hold training features and vali features
training_features = []
vali_features = []

# apply CountVectorizer to messages
for j in m:
    count_vectorizer = CountVectorizer(stop_words=stop_words, max_features=j)
    # count_vectorizer = CountVectorizer(stop_words=stop_words, max_features=m)
    # count_vectorizer = CountVectorizer(stop_words=stop_words)

    # get count features for training data
    training_features_temp = count_vectorizer.fit_transform(xtrain)

    # get count features for vali data
    vali_features_temp = count_vectorizer.transform(xvali)

    # append training features to list
    training_features.append(training_features_temp)
    vali_features.append(vali_features_temp)

    # print time elapsed
    print(f'finished m = {j}')

# now we have a list of count features and feature names for each value of m
print(training_features[0].shape)
print(vali_features[0].shape)

number of unique words: 327023
finished m = 50
finished m = 1000
finished m = 10000
finished m = 30000
finished m = 50000
finished m = 70000
(47929, 50)
(23608, 50)


#### Homemade Naive Bayes

In [8]:
# get sparse matrices from features

from scipy.sparse import csr_matrix
time_start = time.time()

# convert training features to sparse matrices using csr_matrix
training_features = [csr_matrix(x) for x in training_features]
vali_features = [csr_matrix(x) for x in vali_features]

print(f'finished in {np.round(time.time() - time_start,2)} seconds')


finished in 0.0 seconds


In [15]:
# naive bayes classifier
from collections import Counter
import numpy as np

# note: data labels must be -1 and 1
class NaiveBayes:
    def fit(self, xtrain, ytrain):
        # tells what the classes are
        self.classes = {-1,1}

        # initialize prior and conditional dicts
        self.priors = {}
        self.conditionals = {}

        for i in self.classes:
            # prior for class i
            self.priors[i] = np.mean(ytrain == i)
            # conditional probability of each feature given class i with Laplace smoothing
            class_indices = np.where(ytrain == i)[0]
            class_data = xtrain[class_indices]
            class_counts = Counter(class_data.flatten())
            self.conditionals[i] = {k: (v + 1) / (len(class_data) + len(class_counts))
                                    for k, v in class_counts.items()}


    def predict(self, xtest):
        # calculate posterior probability for each class
        posteriors = []
        for i in range(len(self.classes)):
            prior = np.log(self.class_priors[i])
            likelihood = np.sum(np.log(self.feature_likelihoods[i]) * xtest + np.log(1 - self.feature_likelihoods[i]) * (1 - xtest), axis=1)
            posterior = prior + likelihood
            posteriors.append(posterior)

        # get the class with maximum posterior probability
        pred_index = np.argmax(posteriors, axis=0)
        pred_class = self.classes[pred_index]

        # calculate confidence of prediction
        probabilities = np.exp(posteriors)
        pred_confidence = probabilities[pred_index] / np.sum(probabilities, axis=0)

        return pred_class, pred_confidence


In [16]:
time_start = time.time()

# train naive bayes model
nb = NaiveBayes()
nb.fit(training_features[1], training_labels)
print('model trained in ' + str(np.round(time.time() - time_start,2)) + ' seconds')

# Make predictions on test data
ybayes, confidences = nb.predict(vali_features[1])
print('predictions made in ' + str(np.round(time.time() - time_start,2)) + ' seconds')

# Print results
print(f'm = {m[1]}')
print(classification_report(vali_labels, ybayes))
print("Confidences:", confidences)


AttributeError: flatten not found

In [10]:

#     # print classification report
#     print(f'm = {m[i]}')
#     print(classification_report(vali_labels, ybayes))

#     # print confusion matrix
#     plt.figure(figsize = (4,3))

#     sns.heatmap(confusion_matrix(vali_labels,ybayes), annot=True, 
#                 fmt='', cmap='Blues')

#     plt.title('Confusion Matrix for Naive Bayes Classifier with m = ' + str(m[i]))
#     plt.xlabel('Predicted Labels')
#     plt.ylabel('Real Labels')