Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

Importing Dataset

In [2]:
df_fake = pd.read_csv("D:/Desktop/NLP/FakeNews2/Fake.csv")
df_true = pd.read_csv("D:/Desktop/NLP/FakeNews2/True.csv")

In [3]:
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
df_fake["class"] = 0
df_true["class"] = 1

In [5]:
df_fake.shape, df_true.shape

((23481, 5), (21417, 5))

In [6]:
df_fake.shape, df_true.shape

((23481, 5), (21417, 5))

Merging True and Fake Dataframes

In [7]:
df_merge = pd.concat([df_fake, df_true], axis =0 )
df_merge.head(10)

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017",0
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017",0
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017",0
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017",0
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017",0


In [8]:
df_merge.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

Removing columns which are not required

In [9]:
df = df_merge.drop(["title", "subject","date"], axis = 1)

In [10]:
df.isnull().sum()

text     0
class    0
dtype: int64

Random Shuffling the dataframe

In [11]:
df = df.sample(frac = 1)

In [12]:
df.head()

Unnamed: 0,text,class
5516,"Just before midnight on Friday, July 8, LaNayd...",0
976,Set off by a comment about his small hands on ...,0
19349,WASHINGTON (Reuters) - It would be a game-cha...,1
4921,WASHINGTON (Reuters) - The Republican chairman...,1
7807,NEW YORK (Reuters) - After a brutal week for ...,1


In [13]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

In [14]:
df.columns

Index(['text', 'class'], dtype='object')

Creating a function to process the texts

In [15]:
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

In [16]:
df["text"] = df["text"].apply(wordopt)

Defining dependent and independent variables

In [17]:
x = df["text"]
y = df["class"]

Splitting Training and Testing

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

Convert text to vectors

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(xv_train,y_train)

In [21]:
pred_lr=LR.predict(xv_test)

In [22]:
LR.score(xv_test, y_test)

0.9892204899777283

In [23]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5843
           1       0.99      0.99      0.99      5382

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [39]:

from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,pred_lr)
print(cm)

[[5780   63]
 [  58 5324]]


Naive Bayes

In [24]:
from sklearn.naive_bayes import MultinomialNB
NB=MultinomialNB()
NB.fit(xv_train, y_train)

In [25]:
pred_nb = NB.predict(xv_test)

In [26]:
NB.score(xv_test, y_test)

0.9366592427616927

In [27]:
print(classification_report(y_test, pred_nb))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      5843
           1       0.95      0.92      0.93      5382

    accuracy                           0.94     11225
   macro avg       0.94      0.94      0.94     11225
weighted avg       0.94      0.94      0.94     11225



In [40]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,pred_nb)
print(cm)

[[5578  265]
 [ 446 4936]]


Decision Tree Classification

In [28]:
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

In [29]:
pred_dt = DT.predict(xv_test)

In [30]:
DT.score(xv_test, y_test)


0.9951002227171493

In [31]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5843
           1       1.00      0.99      0.99      5382

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225



In [41]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,pred_dt)
print(cm)

[[5822   21]
 [  34 5348]]


Random Forest Classifier

In [32]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

In [33]:
pred_rfc = RFC.predict(xv_test)

In [34]:
RFC.score(xv_test, y_test)

0.9901113585746102

In [35]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5843
           1       0.99      0.99      0.99      5382

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [42]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,pred_rfc)
print(cm)

[[5802   41]
 [  70 5312]]


Model Testing

In [36]:
def output_lable(n):
    if n == 0:
        return "Fake News"
    elif n == 1:
        return "Not A Fake News"
    
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_def_test["text"] = new_def_test["text"].apply(wordopt) 
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_NB = NB.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)

    return print("\n\nLR Prediction: {} \nDT Prediction: {} \nNB Prediction: {} \nRFC Prediction: {}".format(output_lable(pred_LR[0]),
                                                                                                             output_lable(pred_DT[0]),                                                                                                       output_lable(pred_DT[0]), 
                                                                                                              output_lable(pred_NB[0]), 
                                                                                                              output_lable(pred_RFC[0])))

In [46]:
news = str(input())
manual_testing(news)



LR Prediction: Not A Fake News 
DT Prediction: Fake News 
NB Prediction: Fake News 
RFC Prediction: Not A Fake News
