## This is an end-to-end project to detect the fake news

In [1]:
# importing the modules needed to build the model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re #regular expressions
import sklearn
import string as str

In [2]:
# Read the csv files

T = pd.read_csv("true.csv")
T.head()

F = pd.read_csv("Fake.csv")
F.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [3]:
# Since these are 2 seperate dataframes we need to include label column before combining
T['target']=1
F['target']=0

In [4]:
# Let's combine both the dataframes and shuffle them
Data = pd.concat([T,F],axis=0)

# Let's shuffle the dataframe
Data=Data.sample(frac=1)

In [5]:
Data.head()

Unnamed: 0,title,text,subject,date,target
17701,Ukraine's controversial law reforms open to re...,SYDNEY (Reuters) - Ukrainian Justice Minister ...,worldnews,"October 12, 2017",1
2348,#NeverTrump Conservative Has The PERFECT Desc...,Donald Trump is being graded on a curve. He al...,News,"March 1, 2017",0
16355,Pakistan's youthful population creates educati...,ISLAMABAD (Reuters) - Pakistani private school...,worldnews,"October 27, 2017",1
6653,Obama says has 'great confidence' that Russia ...,WASHINGTON (Reuters) - U.S. President Barack O...,politicsNews,"December 16, 2016",1
480,There’s A Bill In The House That Would Comple...,"For Republicans, the Robert Mueller investigat...",News,"August 28, 2017",0


In [6]:
# Let's check if any missing values is present in the DataFrame
Data.isna().sum()

#There are no missing values

title      0
text       0
subject    0
date       0
target     0
dtype: int64

In [7]:
"""
As far we know ML model's perform well when we don't make the model complex
So the goal is to remove the unimportant features
In this dataframe we can remove the columns title , subject and date which make the model
a bit complex , since they are actually part of the text column
"""

#Removing the columns
Data.drop('title',axis=1,inplace=True)
Data.drop('subject',axis=1,inplace=True)
Data.drop('date',axis=1,inplace=True)

In [8]:
Data.head()

Unnamed: 0,text,target
17701,SYDNEY (Reuters) - Ukrainian Justice Minister ...,1
2348,Donald Trump is being graded on a curve. He al...,0
16355,ISLAMABAD (Reuters) - Pakistani private school...,1
6653,WASHINGTON (Reuters) - U.S. President Barack O...,1
480,"For Republicans, the Robert Mueller investigat...",0


In [9]:
# Let's check the value counts of the target
Data['target'].value_counts()

0    23481
1    21417
Name: target, dtype: int64

In [10]:
# Let's check the length of the dataframe
len(Data)

44898

In [11]:
"""
As we know that the machine learning models can only deal with numerical data,
our first task is to convert this into numeric format.
But we cannot directly convert the text data to numeric.
We first need to process the text(to avoid complexities in the data or unnecessary text)
and then we need to encode them using various encoding techniques

** for processing the text we can use many modules - re, string , textwrap and many more**

You can try on your own using various other modules

"""


# Let's write a function to process the text

def process(text):
    text = text.lower()
    text = re.sub('\[.*?\]','',text)
    text = re.sub("\\W"," ",text)
    text = re.sub('https?://\S+|www\.\S+','',text)
    text = re.sub('<.*?>+','',text)
    text = re.sub('[%s]' % re.escape(str.punctuation),'',text)
    text = re.sub('\n','',text)
    text = re.sub('\w*\d\w*','',text)
    
    return text




In [12]:
# Let's apply the function to the text column
Data['text'] = Data['text'].apply(process)

In [13]:
Data.head()

Unnamed: 0,text,target
17701,sydney reuters ukrainian justice minister ...,1
2348,donald trump is being graded on a curve he al...,0
16355,islamabad reuters pakistani private school...,1
6653,washington reuters u s president barack o...,1
480,for republicans the robert mueller investigat...,0


In [14]:
# In order to use the data to train and test the model ,
# we need to split it

from sklearn.model_selection import train_test_split

# Splits the data in 80:20 ratio
X_train,X_test,Y_train,Y_test = train_test_split(Data['text'],Data['target'],test_size=0.2)

In [15]:
len(X_train),len(Y_train)

(35918, 35918)

In [16]:
len(X_test),len(Y_test)

(8980, 8980)

In [17]:
'''
As we know we can't train a model with the numeric data, 
we have to convert this to numerical using encoding techniques

In this we use the most common tool to convert text to numeric that is
TfidfVectorizer
'''
from sklearn.feature_extraction.text import TfidfVectorizer
# This method is the most commonly adopted to encode the categorical data
Vec = TfidfVectorizer()
XV_train = Vec.fit_transform(X_train)
# Here the fit_tranform method is basically fitting 
#the data on the vectorizer and transforming the data
XV_test = Vec.transform(X_test)
# This transform method is used to encode based on the previous learnings of the vectorizer so that the
# dimension will also remain same

In [18]:
XV_train

<35918x97341 sparse matrix of type '<class 'numpy.float64'>'
	with 7336944 stored elements in Compressed Sparse Row format>

In [19]:
XV_test

<8980x97341 sparse matrix of type '<class 'numpy.float64'>'
	with 1857468 stored elements in Compressed Sparse Row format>

In [20]:
len(X_test),len(X_train)

(8980, 35918)

In [21]:
'''
Now it's time to build a model
Many machine learning projects take 80% of the time at 
feature engineering because the result of the model whether the best or worst it's
all because of the features

** Now let's use the RandomForestClassifier Model**
** Can use other classification models too**
'''

from sklearn.ensemble import RandomForestClassifier

Model = RandomForestClassifier(n_estimators=100)
#Let's fit the data in to model

Model.fit(XV_train,Y_train)

In [22]:
# Let's check how well it's performing on the training set
Model.score(XV_train,Y_train)

0.9999721588061696

In [23]:
'''
Let's see the performance of the model on testing data
'''
Model.score(XV_test,Y_test)

0.9904231625835189

In [24]:
'''
Let's plot the confusion matrix and get the classification report of the testing data
'''

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(Y_test,Model.predict(XV_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4706
           1       0.99      0.99      0.99      4274

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [25]:
## Confusion Matrix
confusion_matrix(Y_test,Model.predict(XV_test),labels=['0','1'])

array([[0, 0],
       [0, 0]], dtype=int64)

In [27]:
## Let's use the Logistic Regression Model

from sklearn.linear_model import LogisticRegression
Model1 = LogisticRegression()
Model1.fit(XV_train,Y_train)

In [28]:
# Let's check how well it's performing on the training set
Model1.score(XV_train,Y_train)

0.9914805946879002

In [29]:
'''
Let's see the performance of the model on testing data
'''
Model1.score(XV_test,Y_test)

0.9875278396436525

In [30]:
'''
Let's plot the confusion matrix and get the classification report of the testing data
'''

from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(Y_test,Model1.predict(XV_test)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4706
           1       0.98      0.99      0.99      4274

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [33]:
"""
As far we observe, we can see both the models are performing at similar level,
but RandomForestClassfier is better when compared to LogisticRegression
"""

print("You can try with other models too..!")

You can try with other models too..!
