# Import Necessary Modules

In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Preprocessing the Data Using Pandas

In [3]:
df = pd.read_csv("fake_or_real_news.csv")

In [4]:
df.shape

(6335, 4)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [6]:
df = df.set_index('Unnamed: 0')

In [7]:
df.loc[df["label"] == 'REAL',"label"] = 1
df.loc[df["label"] == 'FAKE',"label"] = 0
df.head()

Unnamed: 0_level_0,title,text,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


# Preparing Data For Training

In [8]:
y = df.label

In [9]:
df = df.drop('label', axis=1)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.2, random_state=53)

# Building Vectorizer Classifiers

# Using Count Vectorizer

In [11]:
cv = CountVectorizer()

In [12]:
x_traincv = cv.fit_transform(["This is a fake news","It is a real news","I like real"])

In [13]:
x_traincv.toarray()

array([[1, 1, 0, 0, 1, 0, 1],
       [0, 1, 1, 0, 1, 1, 0],
       [0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [14]:
cv.get_feature_names()

['fake', 'is', 'it', 'like', 'news', 'real', 'this']

In [15]:
cv1 = CountVectorizer()

In [16]:
x_traincv1 = cv1.fit_transform(x_train)
# x_testcv1 = cv1.fit_transform(x_test)

In [17]:
a = x_traincv1.toarray()
# b = x_testcv1.toarray()

In [18]:
a

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
# b

In [20]:
print(len(a[0]))
# print(len(b[0]))

61666


In [21]:
x_train.iloc[2]

'During the campaign, Trump had threatened to impose a large tariff to keep the jobs in the United States.'

# TFIDF Vectorizer

In [22]:
tv = TfidfVectorizer(min_df=1,stop_words='english')

In [23]:
x_traintv = tv.fit_transform(["This is a fake news","It is a real news","I like real"])

In [24]:
x_traintv.toarray()

array([[0.79596054, 0.        , 0.60534851, 0.        ],
       [0.        , 0.        , 0.70710678, 0.70710678],
       [0.        , 0.79596054, 0.        , 0.60534851]])

In [25]:
tv.get_feature_names()

['fake', 'like', 'news', 'real']

In [26]:
tv1 = TfidfVectorizer(min_df=1,stop_words='english')

In [27]:
x_traintv1 = tv1.fit_transform(x_train)

In [28]:
x_traintv1.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
cv1.get_feature_names()

['00',
 '000',
 '0000',
 '00000031',
 '000035',
 '00006',
 '0001',
 '0001pt',
 '000billion',
 '000ft',
 '000km',
 '000x',
 '001',
 '0011',
 '002',
 '003',
 '004',
 '006',
 '006s',
 '007',
 '007s',
 '008',
 '008s',
 '009',
 '0099',
 '00am',
 '00p',
 '00pm',
 '01',
 '010',
 '011',
 '013',
 '013c2812c9',
 '014',
 '015',
 '016',
 '018',
 '01am',
 '02',
 '020',
 '022',
 '023',
 '024',
 '025',
 '027',
 '02714',
 '028',
 '02870',
 '02welcome',
 '03',
 '031',
 '032',
 '0325',
 '033',
 '034',
 '035',
 '037',
 '03747',
 '039',
 '03eb',
 '04',
 '040',
 '0400',
 '042',
 '044',
 '047',
 '048',
 '049',
 '04pm',
 '05',
 '050',
 '0509245d29',
 '052',
 '056',
 '058',
 '06',
 '0600',
 '062',
 '066',
 '068',
 '06pm',
 '07',
 '0700',
 '075',
 '0750',
 '076',
 '079',
 '07dryempjx',
 '08',
 '080',
 '081',
 '082',
 '084',
 '0851',
 '089',
 '0891',
 '09',
 '091',
 '098263',
 '09am',
 '09pm',
 '0_65b67362bd',
 '0_jgdktlmn',
 '0a_merrill',
 '0d',
 '0dpbdk6rjd',
 '0fjjvowyhg8qtskiz',
 '0h4at2yetra17uxetni02ls2je

In [30]:
x_train.iloc[0]

'The supreme court justice Ruth Bader Ginsburg executed a full U-turn on Thursday morning, over remarks about the presumptive Republican presidential nominee Donald Trump that ignited controversy on the eve of the GOP convention.\n\n\n\nHer remarks about Trump were “ill-advised”, she said, adding: “I regret making them.”\n\nIn an interview with the New York Times last week, Ginsburg said she could not “imagine what the country would be with Donald Trump as our president” and suggested her late husband would have taken such a scenario as a reason to emigrate – as far away as New Zealand.\n\nGinsburg, 83, was the first supreme court justice in decades to comment publicly on a candidate in the middle of the presidential campaign.\n\nHer comments sparked indignation, dismay and accusations that she had violated judicial ethics. Trump called for her to resign.\n\nDespite this, Ginsburg doubled down, calling Trump a faker and telling CNN in an interview published on Tuesday: “He really has a

# Multinomial Naive-Bayes For Tfid Vectorizer

In [31]:
mnb = MultinomialNB()

In [32]:
#to make the training data integer
y_train=y_train.astype('int')

In [33]:
mnb.fit(x_traintv1,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [34]:
x_testtv = tv1.transform(x_test)

In [35]:
df.head()

Unnamed: 0_level_0,title,text
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello..."
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T..."
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...


In [36]:
pred = mnb.predict(x_testtv)

In [37]:
#prediction the test datas
pred

array([1, 1, 1, ..., 1, 1, 1])

In [38]:
#taking y_test to numpy array
actual = np.array(y_test)

In [39]:
actual

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [40]:
count = 0
for i in range(len(pred)):
    if pred[i] == actual[i]:
        count = count+1

In [41]:
count

1098

In [42]:
accuracy = count/len(pred)
accuracy

0.8666140489344909

# Using Naive-Bayes For Count Vectorizer

In [43]:
mnbcv =  MultinomialNB()

In [44]:
#to make the training data integer
y_train = y_train.astype('int')
# y_train

In [45]:
x_traincv1

<5068x61666 sparse matrix of type '<class 'numpy.int64'>'
	with 1726397 stored elements in Compressed Sparse Row format>

In [46]:
mnbcv.fit(x_traincv1,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [47]:
x_testcv = cv1.transform(x_test)

In [48]:
x_testcv

<1267x61666 sparse matrix of type '<class 'numpy.int64'>'
	with 425470 stored elements in Compressed Sparse Row format>

In [49]:
pred = mnbcv.predict(x_testcv)

In [50]:
#prediction the test datas
pred

array([1, 1, 1, ..., 1, 1, 1])

In [51]:
#taking y_test to numpy array
actual = np.array(y_test)

In [52]:
actual

array([1, 0, 1, ..., 1, 1, 1], dtype=object)

In [53]:
countcv = 0
for i in range(len(pred)):
    if pred[i] == actual[i]:
        countcv = countcv + 1

In [54]:
countcv

1128

In [55]:
len(pred)

1267

In [56]:
accuracy = countcv/len(pred)
accuracy

0.8902920284135754

# Using Logistic Regression to Classify as FAKE or REAL

In [57]:
#Reading the Dataset from CSV file
data = pd.read_csv('./fake_or_real_news.csv', header=None)

In [58]:
#Splitting the Data into Train and Test with 0.33 as test sets
train_x, test_x, train_y, test_y = train_test_split(data[2], data[3])

In [59]:
data.head()

Unnamed: 0,0,1,2,3
0,,title,text,label
1,8476.0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
2,10294.0,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3,3608.0,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
4,10142.0,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE


In [60]:
#Using the Tfid Vectorizer because the size of the dataset is Large
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_train_x = vectorizer.fit_transform(train_x)

In [61]:
#Now using the Logistic Regression Algorithm to characterize as Fake or Real
classifier = LogisticRegression()
classifier.fit(tfidf_train_x, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [62]:
#After transform the test data will be stored as a float in case of TFID
tfidf_test_x = vectorizer.transform(test_x)
print(tfidf_test_x.shape)
tfidf_test_x

(1584, 59774)


<1584x59774 sparse matrix of type '<class 'numpy.float64'>'
	with 405271 stored elements in Compressed Sparse Row format>

In [63]:
#Finding the Accuracy Score for the Algorithm
accuracy = cross_val_score(classifier, tfidf_test_x, test_y, cv=5)
acc = accuracy.mean()
print(acc * 100)



88.76378334593653


In [66]:
#Entering the news to check its validity
print("Enter the news you want to validate: \n\n")
news = [input()]
output = classifier.predict(vectorizer.transform(news))

Enter the news you want to validate: 


KATHMANDU: The Business Advisory Committee meeting is underway to convene the meeting of the House of Representatives today.  Speaker Krishna Bahadur Mahara had been trying to create environment to forge consensus in order to convene the House meeting since the ruling Nepal Communist Party (NCP) and opposition parties — Nepali Congress and Rastriya Janata Party-Nepal — left the House on July 15.  The NCP wanted to allow Home Minister Ram Bahadur Thapa to speak first in the House, whereas the opposition parties jointly registered a motion on a matter of public importance to discuss ways to support people affected by floods and landslides and they wanted to discuss this agenda first.


In [67]:
print((output[0]+" ")*3+"!!!") 

FAKE FAKE FAKE !!!
