In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the labelled news CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='fake_or_real_news.csv')

In [3]:
# Preview the first rows to confirm which columns were loaded
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [4]:
# (rows, columns) of the loaded frame
df.shape

(6335, 4)

In [5]:
# Summary statistics, transposed so each summarised column becomes one row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,6335.0,5280.415627,3038.503953,2.0,2674.5,5271.0,7901.0,10557.0


In [6]:
# Number of missing values in each column
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [7]:
# Remove every column whose name matches "Unname" (the leftover CSV index column).
# The matching labels are passed explicitly via `columns=` and `df` is rebound to the
# result instead of being mutated with inplace=True, so the step reads as a plain
# transformation and is safe to re-run.
df = df.drop(columns=df.filter(regex="Unname").columns)

In [8]:
# Display the frame after the "Unnamed" column has been removed
df

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [9]:
# Replace the string class labels in `label` with integer codes
# (the frame shown in the next cell has FAKE as 0 and REAL as 1)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(df['label'])
df['label'] = le.transform(df['label'])

In [10]:
# Display the frame again to check the encoded `label` column
df

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,1
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,0
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,0
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",1


In [11]:
# Importing essential libraries for performing Natural Language Processing 
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [12]:
# Cleaning the news titles: keep letters only, lowercase, drop English stop words,
# stem each remaining word, and collect one cleaned string per title in `corpus`.
#
# The stop-word set and the stemmer do not change between rows, so they are built
# once here instead of once per word / once per row.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate the column values directly so the loop does not depend on the frame
# having a default 0..n-1 index (df.title[i] is a label lookup).
for raw_title in df['title']:
    title = re.sub('[^a-zA-Z]', ' ', str(raw_title)).lower()
    words = [ps.stem(word) for word in title.split() if word not in stop_words]
    corpus.append(' '.join(words))

In [13]:
# Inspect the first ten cleaned titles
corpus[0:10]

['smell hillari fear',
 'watch exact moment paul ryan commit polit suicid trump ralli video',
 'kerri go pari gestur sympathi',
 'berni support twitter erupt anger dnc tri warn',
 'battl new york primari matter',
 'tehran usa',
 'girl horrifi watch boyfriend left facetim',
 'britain schindler die',
 'fact check trump clinton command chief forum',
 'iran reportedli make new push uranium concess nuclear talk']

In [14]:
# Bag of words: learn the vocabulary from the cleaned titles, then turn every title
# into a row of token counts and materialise the result as a dense array
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv.fit(corpus)
x = cv.transform(corpus).toarray()

In [15]:
# Inspect the first ten rows of the count matrix
x[0:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# Target vector: the `label` column of the frame
y = df['label']

# Naive Bayes Classification

In [17]:
# Hold out 20% of the rows as a test split; random_state=0 keeps the split repeatable
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [18]:
# Train a multinomial Naive Bayes model (default settings) on the training split
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X=x_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
# Label every row of the test split with the fitted Naive Bayes model
y_pred = classifier.predict(X=x_test)

In [20]:
# Display the predicted labels for the test split
y_pred

array([1, 1, 0, ..., 0, 1, 0])

In [21]:
# Score the current test-set predictions: accuracy, precision, recall, confusion matrix
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# The three scalar metrics share the same (y_true, y_pred) call signature
score1, score2, score3 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
cm = confusion_matrix(y_test, y_pred)

# Scores are reported as percentages rounded to two decimals
print('Accuracy score:{}'.format(round(score1 * 100, 2)))
print('Precision score:{}'.format(round(score2 * 100, 2)))
print('Recall score:{}'.format(round(score3 * 100, 2)))
print('Confusion matrix:\n{}'.format(cm))

Accuracy score:81.37
Precision score:79.89
Recall score:85.28
Confusion matrix:
[[475 140]
 [ 96 556]]


In [22]:
# Hyperparameter tuning for the Naive Bayes model: try alpha = 0.1, 0.2, ..., 1.0 and
# remember the value with the highest accuracy.
# Note: every candidate is scored on x_test / y_test, i.e. the same split that the
# final metrics are reported on.
alpha_val = 0.0
best_accuracy = 0.0
for alpha in np.arange(0.1, 1.1, 0.1):
    candidate = MultinomialNB(alpha=alpha).fit(x_train, y_train)
    accuracy = accuracy_score(y_test, candidate.predict(x_test))
    print('Accuracy score for {} is {}'.format(round(alpha, 1), round(accuracy * 100, 2)))

    # Strict comparison: on a tie the earlier (smaller) alpha is kept
    if accuracy > best_accuracy:
        best_accuracy, alpha_val = accuracy, alpha

print('The best accuracy is {}% with alpha value as {}'.format(round(best_accuracy*100, 2), round(alpha_val,1)))

Accuracy score for 0.1 is 80.9
Accuracy score for 0.2 is 81.53
Accuracy score for 0.3 is 81.77
Accuracy score for 0.4 is 81.53
Accuracy score for 0.5 is 81.29
Accuracy score for 0.6 is 81.37
Accuracy score for 0.7 is 81.37
Accuracy score for 0.8 is 81.53
Accuracy score for 0.9 is 81.37
Accuracy score for 1.0 is 81.37
The best accuracy is 81.77% with alpha value as 0.3


In [23]:
# Refit Naive Bayes with alpha=0.3 and predict the test split again
classifier = MultinomialNB(alpha=0.3).fit(x_train, y_train)
y_pred = classifier.predict(x_test)

In [24]:
# Re-score the test-set predictions of the refitted Naive Bayes model
score1, score2, score3 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
cm = confusion_matrix(y_test, y_pred)

# Scores are reported as percentages rounded to two decimals
print('Accuracy score:{}'.format(round(score1 * 100, 2)))
print('Precision score:{}'.format(round(score2 * 100, 2)))
print('Recall score:{}'.format(round(score3 * 100, 2)))
print('Confusion matrix:\n{}'.format(cm))

Accuracy score:81.77
Precision score:80.91
Recall score:84.51
Confusion matrix:
[[485 130]
 [101 551]]


# Logistic Regression

In [25]:
# Train a logistic-regression model (default settings, fixed seed) on the training split
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
lr.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Label every row of the test split with the fitted logistic-regression model
y_pred = lr.predict(X=x_test)

In [27]:
# Display the logistic-regression predictions for the test split
y_pred

array([1, 1, 0, ..., 0, 1, 0])

In [28]:
# Score the logistic-regression predictions: accuracy, precision, recall, confusion matrix
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# The three scalar metrics share the same (y_true, y_pred) call signature
score1, score2, score3 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
cm = confusion_matrix(y_test, y_pred)

# Scores are reported as percentages rounded to two decimals
print('Accuracy score:{}'.format(round(score1 * 100, 2)))
print('Precision score:{}'.format(round(score2 * 100, 2)))
print('Recall score:{}'.format(round(score3 * 100, 2)))
print('Confusion matrix:\n{}'.format(cm))

Accuracy score:82.24
Precision score:83.0
Recall score:82.36
Confusion matrix:
[[505 110]
 [115 537]]


In [29]:
# Hyperparameter tuning for the LogisticRegression model: try C = 0.1, 0.2, ..., 1.0
# and remember the value with the highest accuracy.
# Note: every candidate is scored on x_test / y_test, i.e. the same split that the
# final metrics are reported on.
c_val = 0.0
best_accuracy = 0.0
for i in np.arange(0.1, 1.1, 0.1):
    # random_state=0 matches the baseline and final LogisticRegression models
    temp_lr = LogisticRegression(C=i, random_state=0)
    temp_lr.fit(x_train, y_train)
    temp_y_pred = temp_lr.predict(x_test)
    score = accuracy_score(y_test, temp_y_pred)
    print('Accuracy score for {} is {}'.format(round(i, 1), round(score * 100, 2)))

    if score > best_accuracy:
        best_accuracy = score
        # Store the winner in c_val; the previous code wrote it to alpha_val, which
        # overwrote the Naive Bayes result and left c_val stuck at 0.0.
        c_val = i

print('The best accuracy is {}% with C value as {}'.format(round(best_accuracy*100, 2), round(c_val,1)))

Accuracy score for 0.1 is 81.22
Accuracy score for 0.2 is 81.69
Accuracy score for 0.3 is 81.77
Accuracy score for 0.4 is 82.0
Accuracy score for 0.5 is 82.24
Accuracy score for 0.6 is 82.24
Accuracy score for 0.7 is 82.4
Accuracy score for 0.8 is 82.24
Accuracy score for 0.9 is 82.24
Accuracy score for 1.0 is 82.24
The best accuracy is 82.4% with alpha value as 0.7


In [30]:
# Refit logistic regression with C=0.7 (same fixed seed as the baseline model)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0, C=0.7)
lr.fit(X=x_train, y=y_train)

LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
# Predict the test split with the refitted logistic-regression model
y_pred = lr.predict(X=x_test)

In [32]:
# Re-score the test-set predictions of the refitted logistic-regression model
score1, score2, score3 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
cm = confusion_matrix(y_test, y_pred)

# Scores are reported as percentages rounded to two decimals
print('Accuracy score:{}'.format(round(score1 * 100, 2)))
print('Precision score:{}'.format(round(score2 * 100, 2)))
print('Recall score:{}'.format(round(score3 * 100, 2)))
print('Confusion matrix:\n{}'.format(cm))

Accuracy score:82.4
Precision score:83.46
Recall score:82.06
Confusion matrix:
[[509 106]
 [117 535]]


# Prediction

In [33]:
def prediction(sample_news):
    """Classify one piece of news text with the notebook's fitted `classifier`.

    The text goes through the same cleaning steps as the training titles
    (letters only, lowercase, English stop words removed, Porter stemming),
    is vectorised with the already-fitted `cv`, and is passed to
    `classifier.predict`.

    Parameters
    ----------
    sample_news : str
        Raw text to classify; it is passed through ``str()`` before cleaning.

    Returns
    -------
    The value returned by ``classifier.predict`` for the single vectorised row.

    Notes
    -----
    The previous version wrapped this logic in ``for i in range(df.shape[0])``
    and returned on the first pass, so it returned ``None`` whenever ``df`` was
    empty. The loop is removed because the prediction does not depend on ``df``.
    """
    cleaned = re.sub('[^a-zA-Z]', ' ', str(sample_news)).lower()

    # Build the stop-word set once per call instead of once per word
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    final_words = [ps.stem(word) for word in cleaned.split() if word not in stop_words]
    final_news = ' '.join(final_words)

    # cv must be the vectoriser fitted on the training corpus so the columns line up
    temp = cv.transform([final_news]).toarray()
    return classifier.predict(temp)

In [34]:
def sample_news(i):
    """Print the title found at ``df.title[i]`` together with its predicted class.

    The title is passed to ``prediction``; a truthy result prints the REAL
    message, anything else prints the FAKE message.
    """
    headline = df.title[i]
    print('News:{}'.format(headline))
    verdict = 'It is a REAL news' if prediction(headline) else 'It is a FAKE news'
    print(verdict)

In [36]:
# Classify the title at df.title[566] and print the verdict
sample_news(566)

News:Hillary Clinton BETRAYED by Her Own Family: “We’re Voting for Trump!”
It is a FAKE news


In [37]:
# Classify the title at df.title[1000] and print the verdict
sample_news(1000)

News:Donald Trump Wins The Presidency In Historic Mandate Victory As Hillary Clinton Concedes
It is a REAL news


In [38]:
# Classify the title at df.title[1493] and print the verdict
sample_news(1493)

News:Trump &amp; Clinton Were Very Convincing...on How Lousy the Other One Is
It is a REAL news
