# Fake News Classifier
 
Data : https://www.kaggle.com/jruvika/fake-news-detection/home

In [1]:
# Hide deprecated warnings of sklearn package
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

Import the data.

In [2]:
import pandas as pd
df = pd.read_csv("Data/data.csv")
df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [3]:
# Drop the data with null or undefined values
df = df.dropna()
# Get the number of each label in the data
fake = df[df.Label ==  0]
real = df[df.Label ==  1]

print('Number of Fake Articles -> ', fake.shape)
print('Number of Real Articles -> ', real.shape)

# Max Count of words in Document
max = 0
total = 0
count = 0;
for i in range(df.shape[0]):
    length = len(df.iloc[i,2])
    total += length
    count += 1
    if(length > max):
        max = length
    
print("Maximum Length ", max)
print("Avg Length", total / count)

Number of Fake Articles ->  (2120, 4)
Number of Real Articles ->  (1868, 4)
Maximum Length  32767
Avg Length 2941.288365095286


### Train/Test Split

Using Stratified sampling, split the data into 70-30.

In [4]:
from sklearn.model_selection import train_test_split
y = df.pop('Label')
x = df

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

train_count = y_train.value_counts()
test_count = y_test.value_counts()

print('Number of Fake articles in Training set -> ', train_count[0])
print('Number of Real articles in Training set -> ', train_count[1])
print('Number of Fake articles in Testing set -> ', test_count[0])
print('Number of Real articles in Testing set -> ', test_count[1])

Number of Fake articles in Training set ->  1696
Number of Real articles in Training set ->  1494
Number of Fake articles in Testing set ->  424
Number of Real articles in Testing set ->  374


### Data Preprocessing

* Tokenization
* Normalization
    * Lowercase all the words
    * Negation Handling
    * Remove Stopwords
    * Remove punctuations and Empty Strings from the array
* Stemming

Source - https://medium.com/@annabiancajones/sentiment-analysis-of-reviews-text-pre-processing-6359343784fb

#### Setup:

* Import NLTK
* Download and import stopwords from NLTK.corpus
* Import PorterStemmer which is the module used for stemming
* Import NLTK Vader Sentiment Analysis Library

In [5]:
# Import the NLTK library and its needed modules
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# Import the Vader Sentiment Analysis Library
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

# Load the Apostrophes connecting words
appos_file = open('appos.txt','r')
appos = eval(appos_file.read())
appos_file.close()

# Function returns the negation handled word if it is presend in the appos dictionary
# Else returns the word itself
def negationHandling(word):
    if word in appos:
        return appos[word]
    else:
        return word
    
# Check if a word is a Stopword
# Stopword is a word that is commonly present in most of the documents and does not affect the model
def isNotStopWord(word):
    return word not in stopwords.words('english')

# Function to preprocess a single article
# Document refers to the text of the Article.
def processDocument(document):
    sentences = nltk.sent_tokenize(document)
    tokens = []
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        
        #Converting to LowerCase
        words = map(str.lower, words)
        
        # Negation Handling map is'nt to is not : 
        words = map(lambda x: negationHandling(x), words)
        
        # Remove stop words
        words = filter(lambda x: isNotStopWord(x), words)
        
        # Removing punctuations except '<.>/<?>/<!>'
        punctuations = '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'
        words = map(lambda x: x.translate(str.maketrans('', '', punctuations)), words)
        
        # Remove empty strings
        words = filter(lambda x: len(x) > 0, words)
        
        # stemming
        words = map(lambda x: ps.stem(x), words)
        
        # Adding the preprocessed words to the document
        tokens = tokens + list(words)
        
    return tokens    

#### Process the data:

In [6]:
# Processing the body i.e. text of the Article
train_body = x_train.loc[:,'Body']
train_raw_body = []
train_body_sentiment = []

for i in range(x_train.shape[0]):
    train_body_sentiment.append(list(analyser.polarity_scores(train_body.iloc[i]).values()))
    train_raw_body.append(train_body.iloc[i])
        
train_body_wordArray = list(map(lambda x: processDocument(x), train_raw_body))
print("Preprocessing Completed for Body of training data")

# Process the Headlines of the training data.
train_headline = x_train.loc[:,'Headline']
train_raw_headline = []
train_headline_sentiment = []

for i in range(x_train.shape[0]):
    train_headline_sentiment.append(list(analyser.polarity_scores(train_headline.iloc[i]).values()))
    train_raw_headline.append(train_headline.iloc[i])
        
train_headline_wordArray = list(map(lambda x: processDocument(x), train_raw_headline))
print("Preprocessing Completed for HeadLine of training data")

Preprocessing Completed for Body of training data
Preprocessing Completed for HeadLine of training data


### Doc2Vec Model Training

In [7]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_body_data = [TaggedDocument(
    words = train_body_wordArray[i], 
    tags = [str(i)]) for i, _d in enumerate(train_body_wordArray)]

max_epochs = 100
vec_size = 300
alpha = 0.025

model = Doc2Vec(
    vector_size = vec_size,
    alpha = alpha, 
    min_alpha = 0.025,
    min_count = 5,
    window = 10,
    dm = 1)

model.build_vocab(tagged_body_data)
print('Training Doc2Vec Model')

for epoch in range(max_epochs):
    if ((epoch + 1) % 10 == 0):
        print('Training iteration {0}'.format(epoch + 1))
    model.train(tagged_body_data,
                total_examples = model.corpus_count,
                epochs = model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

Training Doc2Vec Model
Training iteration 10
Training iteration 20
Training iteration 30
Training iteration 40
Training iteration 50
Training iteration 60
Training iteration 70
Training iteration 80
Training iteration 90
Training iteration 100
Model Saved


### Preprocess the testing data

In [8]:
# Pre-process the body of the article of test Data set 
test_body = x_test.loc[:,'Body']
test_raw_body = []
test_body_sentiment = []

for i in range(x_test.shape[0]):
    test_body_sentiment.append(list(analyser.polarity_scores(test_body.iloc[i]).values()))
    test_raw_body.append(test_body.iloc[i])
        
test_body_wordArray = list(map(lambda x: processDocument(x), test_raw_body))
print("Preprocessing Completed for Body of testing data")

# Preprocess the Headline of the article for testing dataset
test_headLine = x_test.loc[:,'Headline']
test_raw_headline = []
test_headline_sentiment = []

for i in range(x_test.shape[0]):
    test_headline_sentiment.append(list(analyser.polarity_scores(test_body.iloc[i]).values()))
    test_raw_headline.append(test_headLine.iloc[i])
        
test_headline_wordArray = list(map(lambda x: processDocument(x), test_raw_headline))
print("Preprocessing Completed for HeadLine of testing data")

Preprocessing Completed for Body of testing data
Preprocessing Completed for HeadLine of testing data


### URL Matching

In [10]:
from difflib import SequenceMatcher

# Contains the list of Fake News website URL's
fake_URL_df = pd.read_csv("Data/fake_news_websites.csv")
# Contains the list of Authentic News website URL's
fact_URL_df = pd.read_csv("Data/fact_news_websites.csv")

##Computes the similarity score between 2 strings
def similarityRatio(url_one, url_two):
    return SequenceMatcher(None, url_one, url_two).ratio()

##Extracts the domain from a URL - for example: 'https://www.bbc.com' will become 'bbc'
def getDomain(url):
    return url.lstrip("https://www.").split(".")[0]

##Assigns a score to the URL by string matching with URL of fake websites
def fakeURLCheckAssign(url):
    minSimilarity = 0.5
    for i in range(len(fake_URL_df['SiteName'])):
        similarity = similarityRatio(getDomain(url),getDomain(fake_URL_df['SiteName'][i]))
        if (similarity > 0.75):
            minSimilarity = min(minSimilarity,(1 - similarity))
            
    return minSimilarity

##Assigns a score to the URL by string matching with URL of authentic websites
def factURLCheckAssign(url):
    for i in range(len(fact_URL_df['SiteName'])):
        minSimilarity = 0.5
        similarity = similarityRatio(getDomain(url), getDomain(fact_URL_df['SiteName'][i]))

        if similarity == 1:
            return similarity
            
        if similarity > 0.75:
            minSimilarity = min(minSimilarity,1 - similarity)
            
    return minSimilarity
          
def URLScore(url):
    fakeScore = fakeURLCheckAssign(url)
    factScore = factURLCheckAssign(url)
    if factScore == 1:
        return 1
    if fakeScore == 0:
        return 0
    if factScore == 0.5 and fakeScore == 0.5:
        return 0.5
    else:
        return min(fakeScore, factScore)

In [11]:
test_URL = x_test.loc[:,'URLs']
train_URL = x_train.loc[:,'URLs']
# Higher Score denotes that the article is more authentic
# Completely Real Article URL Score - 1 & Completely Fake Article URL Score - 0

# Training data set URL Score
train_URLScore_vector = []    
for i in range(x_train.shape[0]):
    train_URLScore_vector.append(URLScore(train_URL.iloc[i]))

# Testing data set URL Score
test_URLScore_vector = []    
for i in range(x_test.shape[0]):
    test_URLScore_vector.append(URLScore(test_URL.iloc[i]))

In [12]:
print(train_URLScore_vector)
print(test_URLScore_vector)

[0.5, 1, 0.23076923076923073, 1, 0.23076923076923073, 1, 0.23076923076923073, 0.125, 1, 1, 0.23076923076923073, 1, 0.5, 1, 1, 0.125, 0.23076923076923073, 1, 0.23076923076923073, 1, 0.5, 0.5, 0.23076923076923073, 1, 1, 0.23076923076923073, 0.23076923076923073, 0.5, 1, 0.23076923076923073, 0.23076923076923073, 0.5, 0.23076923076923073, 0.23076923076923073, 1, 0.23076923076923073, 1, 0.5, 0.23076923076923073, 0.23076923076923073, 0.23076923076923073, 1, 1, 1, 1, 1, 0.23076923076923073, 0.23076923076923073, 1, 0.5, 0.23076923076923073, 0.5, 1, 0.23076923076923073, 0.23076923076923073, 0.23076923076923073, 0.23076923076923073, 0.5, 0.23076923076923073, 0.5, 1, 0.125, 0.23076923076923073, 0.5, 0.23076923076923073, 0.23076923076923073, 0.5, 0.5, 0.23076923076923073, 0.23076923076923073, 0.5, 0.5, 0.23076923076923073, 0.23076923076923073, 0.23076923076923073, 1, 0.125, 1, 1, 1, 0.23076923076923073, 0.23076923076923073, 0.5, 1, 1, 0.5, 0.125, 0.5, 0.5, 0.23076923076923073, 1, 0.5, 0.23076923076

### Get word vectors using the trained doc2vec model

In [13]:
import numpy as np
model = Doc2Vec.load("d2v.model")

# Training set Body Word Vector  
train_body_vector = []
for i in range(x_train.shape[0]):
    trainBodyConcat = np.concatenate([model.docvecs[i], np.asarray(train_body_sentiment[i])])
    train_body_vector.append(trainBodyConcat)

# Training data set Headline Word Vectors
train_headline_vector = []
for i in range(x_train.shape[0]):
    train_headline_sentiment[i].append(train_URLScore_vector[i])
    trainHeadConcat = np.concatenate([model.infer_vector(train_headline_wordArray[i]), 
                                      np.asarray(train_headline_sentiment[i])]) 
    train_headline_vector.append(trainHeadConcat)

# Testing set Body Word Vector
test_body_vector = []
for i in range(x_test.shape[0]):
    testBodyConcat = np.concatenate([model.infer_vector(test_body_wordArray[i]), 
                                     np.asarray(test_body_sentiment[i])])
    test_body_vector.append(testBodyConcat)
    
# Testing set Headline Word Vectors
test_headline_vector = []
for i in range(x_test.shape[0]):
    test_headline_sentiment[i].append(test_URLScore_vector[i])
    testHeadConcat = np.concatenate([model.infer_vector(test_headline_wordArray[i]), 
                                     np.asarray(test_headline_sentiment[i])])
    test_headline_vector.append(testHeadConcat)    

# Create Numpy Array for training data to train sklearn models
np_train_headline = np.array([np.array(xi) for xi in train_headline_vector]) 
np_train_body = np.array([np.array(xi) for xi in train_body_vector])

inp_x_train = []
for i in range(x_train.shape[0]):
    inp_x_train.append(np.concatenate((np_train_headline[i], np_train_body[i])))

inp_x_train = np.array(inp_x_train)

# Create np Array for testing data to train sklearn models
np_test_headline = np.array([np.array(xi) for xi in test_headline_vector])
np_test_body = np.array([np.array(xi) for xi in test_body_vector])

inp_x_test = []
for i in range(x_test.shape[0]):
    inp_x_test.append(np.concatenate((np_test_headline[i], np_test_body[i])))

inp_x_test = np.array(inp_x_test)

print('Shape of the np training data', inp_x_train.shape)
print('Shape of the np training data', inp_x_test.shape)

Shape of the np training data (3190, 609)
Shape of the np training data (798, 609)


In [14]:
from sklearn import svm

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

C = [0.1, 0.5, 1, 5, 10, 50]
param_grid = [
     {'C': C, 'kernel': ['linear']},
     {'C': C, 'gamma': [0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
     {'degree': [2,3,4], 'kernel': ['poly']},
     {'coef0': [0.0], 'kernel': ['sigmoid']} 
]

table = {}

score_metric = 'accuracy'
clf = GridSearchCV(svm.SVC(), param_grid, cv = 5, scoring = score_metric)
clf.fit(inp_x_train, y_train)
print("Best parameters set found :", clf.best_params_)
means = clf.cv_results_['mean_test_score']

for mean, params in zip(means, clf.cv_results_['params']):
    if params == clf.best_params_:
        print("%s -> %0.3f" % (score_metric, mean))
        key = str(params)
    if key not in table:
        table[key] = []
        table[key].append("%0.3f" % (mean))
    print()

Best parameters set found : {'C': 0.1, 'kernel': 'linear'}
accuracy -> 0.937




































### SVM

Using GridSearchCV to find the optimal parameters for this training the model

#### Train the SVC with above parameter

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

#svm_model = svm.SVC(C = 10, gamma = 0.0001, kernel = 'rbf')
svm_model = svm.SVC(C = 0.1, kernel = 'linear')
svm_model.fit(inp_x_train, y_train)
y_pred = svm_model.predict(inp_x_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy -> ', accuracy)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, average = 'weighted')
print('Precision -> ', precision)
print('Recall -> ', recall)
print('F-Score -> ', fscore)
print('Support -> ', support)

Accuracy ->  0.9210526315789473
Precision ->  0.9239792535716824
Recall ->  0.9210526315789473
F-Score ->  0.9211256938956043
Support ->  None


### Gaussian Naive Bayes

In [17]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(inp_x_train, y_train)

nb_y_pred = gnb.predict(inp_x_test)

accuracy = accuracy_score(y_test, nb_y_pred)
print('Accuracy -> ', accuracy)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, nb_y_pred, average = 'weighted')
print('Precision -> ', precision)
print('Recall -> ', recall)
print('F-Score -> ', fscore)
print('Support -> ', support)

Accuracy ->  0.5626566416040101
Precision ->  0.7601117930661063
Recall ->  0.5626566416040101
F-Score ->  0.43514383284862107
Support ->  None


### Decision Tree 

In [18]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(inp_x_train, y_train)

dt_y_pred = dt_clf.predict(inp_x_test)

accuracy = accuracy_score(y_test, dt_y_pred)
print('Accuracy -> ', accuracy)
precision, recall, fscore, support = precision_recall_fscore_support(y_test, dt_y_pred, average = 'weighted')
print('Precision -> ', precision)
print('Recall -> ', recall)
print('F-Score -> ', fscore)
print('Support -> ', support)

Accuracy ->  0.931077694235589
Precision ->  0.9340160315027508
Recall ->  0.931077694235589
F-Score ->  0.9311414787977496
Support ->  None


### Deep Learning
#### Neural Network

* Import Tensorflow
* Import Keras
* Set the seed value to 7

In [19]:
from numpy.random import seed
# Set the seed as 7 to get reproducible results
seed(7)

import tensorflow as tf

# Set the seed as 7 here as well
from tensorflow import set_random_seed
set_random_seed(7)

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K

train_accuracy = {}
test_accuracy = {}
models = []

for i in range(1,20):
    no_of_hidden_neurons = i * 2
    
    # Create a new Model
    model = Sequential()
    
    model.add(Dense(no_of_hidden_neurons, input_dim = inp_x_train.shape[1], activation='relu'))
    model.add(Dense(units = 1, activation = 'sigmoid'))

    model.compile(loss = 'mse',optimizer = 'adam',metrics = ['accuracy'])
    
    history = model.fit(inp_x_train, y_train, epochs = 10, batch_size = 100, verbose=2)
    train_accuracy[no_of_hidden_neurons] = history.history['acc'][9]
    
    # Store the model
    models.append(model)
    
    # Calculate the score on the testing data set
    test_scores = model.evaluate(inp_x_test, y_test, batch_size = 100)
    test_accuracy[no_of_hidden_neurons] = test_scores[1]
    # Reset keras and tf
    K.clear_session()
    tf.reset_default_graph()

print(train_accuracy)
print(test_accuracy)

Using TensorFlow backend.


Epoch 1/10
 - 1s - loss: 0.2610 - acc: 0.4759
Epoch 2/10
 - 0s - loss: 0.2494 - acc: 0.5317
Epoch 3/10
 - 0s - loss: 0.2415 - acc: 0.5317
Epoch 4/10
 - 0s - loss: 0.2159 - acc: 0.5574
Epoch 5/10
 - 0s - loss: 0.1932 - acc: 0.7931
Epoch 6/10
 - 0s - loss: 0.1773 - acc: 0.8417
Epoch 7/10
 - 0s - loss: 0.1654 - acc: 0.8680
Epoch 8/10
 - 0s - loss: 0.1560 - acc: 0.8881
Epoch 9/10
 - 0s - loss: 0.1478 - acc: 0.8991
Epoch 10/10
 - 0s - loss: 0.1407 - acc: 0.9056
Epoch 1/10
 - 0s - loss: 0.2613 - acc: 0.6185
Epoch 2/10
 - 0s - loss: 0.1817 - acc: 0.7455
Epoch 3/10
 - 0s - loss: 0.1311 - acc: 0.8276
Epoch 4/10
 - 0s - loss: 0.0999 - acc: 0.8749
Epoch 5/10
 - 0s - loss: 0.0815 - acc: 0.9003
Epoch 6/10
 - 0s - loss: 0.0685 - acc: 0.9197
Epoch 7/10
 - 0s - loss: 0.0601 - acc: 0.9320
Epoch 8/10
 - 0s - loss: 0.0536 - acc: 0.9420
Epoch 9/10
 - 0s - loss: 0.0485 - acc: 0.9476
Epoch 10/10
 - 0s - loss: 0.0454 - acc: 0.9514
Epoch 1/10
 - 0s - loss: 0.2600 - acc: 0.5749
Epoch 2/10
 - 0s - loss: 0.2295 

Epoch 1/10
 - 0s - loss: 0.2121 - acc: 0.7188
Epoch 2/10
 - 0s - loss: 0.0860 - acc: 0.8928
Epoch 3/10
 - 0s - loss: 0.0579 - acc: 0.9317
Epoch 4/10
 - 0s - loss: 0.0436 - acc: 0.9511
Epoch 5/10
 - 0s - loss: 0.0352 - acc: 0.9630
Epoch 6/10
 - 0s - loss: 0.0287 - acc: 0.9740
Epoch 7/10
 - 0s - loss: 0.0234 - acc: 0.9799
Epoch 8/10
 - 0s - loss: 0.0201 - acc: 0.9840
Epoch 9/10
 - 0s - loss: 0.0176 - acc: 0.9881
Epoch 10/10
 - 0s - loss: 0.0152 - acc: 0.9890
Epoch 1/10
 - 0s - loss: 0.2259 - acc: 0.6871
Epoch 2/10
 - 0s - loss: 0.0936 - acc: 0.8881
Epoch 3/10
 - 0s - loss: 0.0628 - acc: 0.9260
Epoch 4/10
 - 0s - loss: 0.0471 - acc: 0.9483
Epoch 5/10
 - 0s - loss: 0.0358 - acc: 0.9624
Epoch 6/10
 - 0s - loss: 0.0282 - acc: 0.9730
Epoch 7/10
 - 0s - loss: 0.0235 - acc: 0.9812
Epoch 8/10
 - 0s - loss: 0.0189 - acc: 0.9850
Epoch 9/10
 - 0s - loss: 0.0156 - acc: 0.9893
Epoch 10/10
 - 0s - loss: 0.0135 - acc: 0.9909
Epoch 1/10
 - 0s - loss: 0.2102 - acc: 0.7125
Epoch 2/10
 - 0s - loss: 0.0829 