## Fake News Classifier Using LSTM

In [30]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/fake-news/submit.csv
/kaggle/input/fake-news/train.csv
/kaggle/input/fake-news/test.csv


In [31]:
import pandas as pd

In [32]:
df=pd.read_csv('/kaggle/input/fake-news/train.csv')

In [33]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [34]:
#Checking for Null values
df.isnull().sum()


id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [35]:
#We will replace the null values with blanks

df.fillna('',inplace = True)

In [36]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [37]:
# We will do our analysis on author text and label so i was dropping the id and title
df.drop(['id','title'],axis = 1,inplace = True)

In [38]:
# We will combine author and text and make them a single column

df['content'] = df['author']+ ' '+ df['text']

In [39]:
df.head()

Unnamed: 0,author,text,label,content
0,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,Daniel J. Flynn Ever get the feeling your life...
2,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss Videos 15 Civilians Killed In ...
4,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Print \nAn Iranian woman has be...


In [40]:
# Dropping the author and text column
df.drop(['author','text'],axis = 1,inplace = True)

In [41]:
df.head()

Unnamed: 0,label,content
0,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,0,Daniel J. Flynn Ever get the feeling your life...
2,1,Consortiumnews.com Why the Truth Might Get You...
3,1,Jessica Purkiss Videos 15 Civilians Killed In ...
4,1,Howard Portnoy Print \nAn Iranian woman has be...


In [42]:
df.label.value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [43]:
## Get the Independent Features

X=df.drop('label',axis=1)

In [44]:
## Get the Dependent features
y=df['label']

In [45]:
X.shape

(20800, 1)

In [46]:
y.shape

(20800,)

In [47]:
import tensorflow as tf

In [48]:
tf.__version__

'2.6.2'

In [49]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [50]:
### Vocabulary size
voc_size=5000

### Onehot Representation

In [51]:
messages=X.copy()

In [52]:
messages

Unnamed: 0,content
0,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,Daniel J. Flynn Ever get the feeling your life...
2,Consortiumnews.com Why the Truth Might Get You...
3,Jessica Purkiss Videos 15 Civilians Killed In ...
4,Howard Portnoy Print \nAn Iranian woman has be...
...,...
20795,Jerome Hudson Rapper T. I. unloaded on black c...
20796,Benjamin Hoffman When the Green Bay Packers lo...
20797,Michael J. de la Merced and Rachel Abrams The ...
20798,"Alex Ansary NATO, Russia To Hold Parallel Exer..."


In [53]:
messages['content'][1]

'Daniel J. Flynn Ever get the feeling your life circles the roundabout rather than heads in a straight line toward the intended destination? [Hillary Clinton remains the big woman on campus in leafy, liberal Wellesley, Massachusetts. Everywhere else votes her most likely to don her inauguration dress for the remainder of her days the way Miss Havisham forever wore that wedding dress.  Speaking of Great Expectations, Hillary Rodham overflowed with them 48 years ago when she first addressed a Wellesley graduating class. The president of the college informed those gathered in 1969 that the students needed “no debate so far as I could ascertain as to who their spokesman was to be” (kind of the like the Democratic primaries in 2016 minus the   terms unknown then even at a Seven Sisters school). “I am very glad that Miss Adams made it clear that what I am speaking for today is all of us —  the 400 of us,” Miss Rodham told her classmates. After appointing herself Edger Bergen to the Charlie M

In [54]:
messages.reset_index(inplace=True)

In [55]:
import nltk
import re
from nltk.corpus import stopwords

In [56]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    print(i)
    review = re.sub('[^a-zA-Z]', ' ', messages['content'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
corpus

In [58]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[826,
  496,
  847,
  3803,
  116,
  1335,
  182,
  2032,
  4757,
  1943,
  520,
  2390,
  826,
  496,
  2651,
  3367,
  1943,
  520,
  3396,
  4261,
  1204,
  2315,
  4469,
  3226,
  838,
  3711,
  1284,
  3286,
  996,
  4141,
  2456,
  4111,
  2710,
  3579,
  4398,
  1199,
  4763,
  1158,
  3091,
  71,
  1763,
  2032,
  3732,
  847,
  43,
  116,
  3597,
  4076,
  787,
  3903,
  4843,
  4398,
  1199,
  4671,
  963,
  2032,
  1720,
  2919,
  4757,
  1639,
  3091,
  3597,
  4336,
  1300,
  3111,
  1828,
  478,
  4336,
  4743,
  4188,
  43,
  741,
  3225,
  3785,
  2032,
  837,
  3124,
  2390,
  2901,
  864,
  3225,
  2712,
  3903,
  2032,
  1210,
  864,
  2712,
  43,
  4188,
  2816,
  847,
  1868,
  3434,
  793,
  3225,
  133,
  83,
  4336,
  3049,
  4865,
  2573,
  182,
  2873,
  4370,
  961,
  987,
  4757,
  4730,
  793,
  3225,
  891,
  1943,
  520,
  2425,
  859,
  4763,
  2462,
  2390,
  3091,
  1237,
  961,
  3091,
  3182,
  1942,
  4336,
  3196,
  601,
  4921,
  1226,
  838,
  1

### Embedding Representation

In [86]:
sent_length=500
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 1581 4957 4204]
 [   0    0    0 ... 4991 1327  219]
 [4803 4956  602 ... 2312  937 2860]
 ...
 [   0    0    0 ... 4784 4034 2799]
 [   0    0    0 ... 2181 4389   71]
 [4506 3733 3032 ... 4125 1645 4979]]


In [87]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  826,
        496,  847, 3803,  116, 1335,  182, 2032, 4757, 1943,  520, 2390,
        826,  496, 2651, 3367, 1943,  520, 3396, 4261, 1204, 2315, 4469,
       3226,  838, 3711, 1284, 3286,  996, 4141, 2456, 4111, 2710, 3579,
       4398, 1199, 4763, 1158, 3091,   71, 1763, 2032, 3732,  847,   43,
        116, 3597, 4076,  787, 3903, 4843, 4398, 1199, 4671,  963, 2032,
       1720, 2919, 4757, 1639, 3091, 3597, 4336, 1300, 3111, 1828,  478,
       4336, 4743, 4188,   43,  741, 3225, 3785, 2032,  837, 3124, 2390,
       2901,  864, 3225, 2712, 3903, 2032, 1210,  8

In [88]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 40)           200000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [89]:
len(embedded_docs),y.shape

(20800, (20800,))

In [90]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [91]:
X_final.shape,y_final.shape

((20800, 500), (20800,))

In [92]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

### Model Training

In [93]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4d24e11a10>

### Adding Dropout on Stacked LSTM

In [94]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=100
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(400, input_shape=(5,1)))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [95]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4cb0597d90>

### Performance Metrics And Accuracy

In [96]:
y_pred=model.predict(X_test)

y_pred = (y_pred > 0.5)

In [97]:
from sklearn.metrics import confusion_matrix

In [98]:
confusion_matrix(y_test,y_pred)

array([[3247,  202],
       [ 295, 3120]])

In [99]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9275932400932401

In [100]:
#Using Bidirectional LSTM

In [101]:
## Creating model
from tensorflow.keras.layers import Bidirectional
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dropout(0.3))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 40)           200000    
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               112800    
_________________________________________________________________
dropout_8 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________
None


In [102]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4c97c72250>

In [104]:
y_pred=model.predict(X_test)

y_pred = (y_pred > 0.5)

In [105]:
confusion_matrix(y_test,y_pred)

array([[3215,  234],
       [ 211, 3204]])

In [106]:
accuracy_score(y_test,y_pred)

0.9351689976689976