# Bidirectional LSTM RNN on Combined Snopes Data

In [1]:
# LSTM Bidirectional RNN
import pandas as pd

In [2]:
data = pd.read_csv('Data/SnopesGeneralDatav3.csv')

# 2 -> Other / Mixture/ All other labels
# 1 -> Fake / False
# 0 -> Not Fake / True

In [3]:
data.head()

Unnamed: 0,ID,Posts,Label
0,1,Did Kamala Harris Support Abortion Until the T...,Mixture
1,2,Did Hitler Invent the Inflatable Sex Doll?,FALSE
2,3,Pride Parade Fire Hydrant Mishap,FALSE
3,4,Did Trump Say He Might Sign an Exec Order Barr...,Mixture
4,5,Did the White House Watch the Benghazi Attack ...,FALSE


In [4]:
# For binary classsification
label = { 'FALSE': 0, 'TRUE': 1, 'Mixture': 1 } 

data.Label = [label[item] for item in data.Label]

In [5]:
data['Label'].unique()

array([1, 0], dtype=int64)

In [6]:
data['Label'].value_counts()

0    1756
1    1203
Name: Label, dtype: int64

In [7]:
# Remove label from data

X = data.drop('Label', axis=1)
X.head()

Unnamed: 0,ID,Posts
0,1,Did Kamala Harris Support Abortion Until the T...
1,2,Did Hitler Invent the Inflatable Sex Doll?
2,3,Pride Parade Fire Hydrant Mishap
3,4,Did Trump Say He Might Sign an Exec Order Barr...
4,5,Did the White House Watch the Benghazi Attack ...


In [8]:
# Store results

Y = data['Label']
Y.head()

0    1
1    0
2    0
3    1
4    0
Name: Label, dtype: int64

In [9]:
X.shape, Y.shape

((2959, 2), (2959,))

In [10]:
data.head()

Unnamed: 0,ID,Posts,Label
0,1,Did Kamala Harris Support Abortion Until the T...,1
1,2,Did Hitler Invent the Inflatable Sex Doll?,0
2,3,Pride Parade Fire Hydrant Mishap,0
3,4,Did Trump Say He Might Sign an Exec Order Barr...,1
4,5,Did the White House Watch the Benghazi Attack ...,0


In [11]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [12]:
# Vocabulary size
vocab_size = 5000

In [13]:
# Onehot Representation
messages = X.copy()

In [17]:
messages['Posts'][2]

'Pride Parade Fire Hydrant Mishap'

In [15]:
messages.reset_index(inplace = True)

In [18]:
# Stemming and stop word removal

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
#     type(messages['title'][i])
    review = re.sub('[^a-zA-Z]', ' ', str(messages['Posts'][i]))
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [19]:
corpus

['kamala harri support abort time give birth',
 'hitler invent inflat sex doll',
 'pride parad fire hydrant mishap',
 'trump say might sign exec order bar biden presid',
 'white hous watch benghazi attack noth',
 'color code toothpast tube identifi ingredi',
 'last surviv wwii veteran march alon memori day parad',
 'ruth bader ginsburg say pedophilia good children',
 'bow hunt wild hors',
 'dwayn johnson arrest',
 'firework explod woman shoplift attempt',
 'tim horton ad marijuana menu',
 'obama remov statu liberti offend muslim',
 'year old mama esifiho may oldest woman aliv',
 'cop shoot year old black babi mistak pacifi gun',
 'harvard studi reveal much damag instant noodl bodi',
 'alton sterl kill trump support robert kinnison',
 'monsanto buy whole food',
 'photograph show alton sterl children gun',
 'alton sterl longtim crimin gang member',
 'deray mckesson summer chao',
 'obama commiss gender neutral bibl',
 'dalla shooter identifi neo nazi jeffrey harri',
 'pride lion kill five

In [20]:
# One Hot encoding 

onehot_data = [one_hot(words, vocab_size) for words in corpus] 
onehot_data

[[169, 4826, 201, 3829, 3418, 1207, 743],
 [2877, 4464, 1809, 805, 2120],
 [370, 863, 800, 862, 3593],
 [2738, 3774, 4669, 1778, 4666, 4561, 1159, 1017, 113],
 [1689, 4866, 4635, 4515, 3902, 1406],
 [2123, 1533, 4694, 1658, 1236, 2986],
 [1030, 3916, 2049, 2711, 114, 1863, 1647, 1604, 863],
 [2865, 160, 2931, 3774, 134, 1681, 404],
 [4140, 324, 1159, 4829],
 [3044, 3734, 4315],
 [3794, 2037, 1543, 4340, 1399],
 [360, 2401, 2659, 4282, 4164],
 [2198, 1532, 2930, 4078, 3192, 1127],
 [4374, 986, 4214, 3860, 2672, 124, 1543, 2036],
 [272, 517, 4374, 986, 1914, 2146, 3362, 1452, 4117],
 [4109, 3887, 25, 48, 1181, 2764, 765, 566],
 [3830, 4608, 1499, 2738, 201, 964, 2192],
 [1646, 3005, 4413, 3642],
 [2694, 4058, 3830, 4608, 404, 4117],
 [3830, 4608, 3954, 3348, 654, 429],
 [3440, 2649, 302, 4983],
 [2198, 2714, 4234, 4650, 3120],
 [2864, 4925, 1236, 2535, 4861, 4448, 4826],
 [370, 2431, 1499, 2452, 3708, 4515, 630, 25, 946],
 [1525, 2332, 2961, 2037],
 [2104, 4462, 1788, 800, 768, 4145, 284

In [21]:
# Embedding

sentence_length = 20
embedded_data = pad_sequences(onehot_data, padding='pre', maxlen = sentence_length)
print(embedded_data)

[[   0    0    0 ... 3418 1207  743]
 [   0    0    0 ... 1809  805 2120]
 [   0    0    0 ...  800  862 3593]
 ...
 [   0    0    0 ... 4095  887 2034]
 [   0    0    0 ... 4943 1185 3771]
 [   0    0    0 ... 3735 2452 1516]]


In [22]:
embedded_data[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  169, 4826,  201, 3829, 3418, 1207,  743])

In [23]:
# Creating Bidirectional LSTM model

embedding_vector_features = 40
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length = sentence_length))
model.add(LSTM(100))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
# Creating second model

embedding_vector_features = 40
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_vector_features, input_length = sentence_length))
model1.add(Bidirectional(LSTM(100)))
model1.add(Dropout(0.3))
model1.add(Dense(1, activation = 'sigmoid'))
model1.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               112800    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
len(embedded_data), Y.shape

(2959, (2959,))

In [26]:
import numpy as np

X_final = np.array(embedded_data)
Y_final = np.array(Y)

In [27]:
X_final.shape, Y_final.shape

((2959, 20), (2959,))

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_final, Y_final, test_size = 0.33, random_state = 42)

In [29]:
# Training

model1.fit(X_train, y_train, validation_data = (X_test, y_test), batch_size = 64, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a728aa21d0>

In [30]:
# Prediction

y_pred = model1.predict(X_test)
y_pred

array([[9.78280902e-01],
       [8.57977271e-02],
       [2.16341019e-03],
       [4.23089981e-01],
       [1.95860863e-04],
       [9.99898791e-01],
       [9.99565244e-01],
       [1.90880895e-03],
       [3.05348635e-03],
       [3.81748229e-01],
       [8.11967254e-03],
       [2.27112800e-01],
       [3.68714333e-04],
       [9.90253329e-01],
       [3.29151953e-05],
       [3.40822339e-03],
       [9.98267889e-01],
       [1.97816789e-02],
       [9.48053539e-01],
       [9.89422023e-01],
       [3.90703572e-05],
       [7.16924667e-04],
       [9.99664783e-01],
       [1.44660473e-04],
       [2.88312131e-05],
       [2.93165445e-04],
       [3.82012129e-02],
       [3.07261944e-04],
       [3.39022445e-05],
       [8.32951963e-01],
       [7.89819678e-05],
       [9.81790304e-01],
       [2.81107724e-02],
       [7.29638577e-01],
       [7.70551205e-01],
       [3.12328339e-04],
       [9.99865413e-01],
       [7.12430477e-03],
       [8.48186016e-03],
       [2.01011389e-01],


In [32]:
# Performance and Accuracy
predictions = []
for i in range(len(y_pred)):
    if y_pred[i].item() > 0.5:
#         predictions.append(str(i) + " is Fake " + str(1))
        predictions.append(1)
    else:
#         predictions.append(str(i) +" is Not Fake " + str(0))
        predictions.append(0)

In [33]:
predictions

[1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,


In [34]:
results = model1.evaluate(X_test, y_test, batch_size = 64)
print("test loss, test acc:", results)

test loss, test acc: [2.539504289627075, 0.5404298901557922]


In [35]:
# Creating classification report 
from sklearn import metrics

print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.59      0.65      0.62       566
           1       0.45      0.38      0.41       411

    accuracy                           0.54       977
   macro avg       0.52      0.52      0.52       977
weighted avg       0.53      0.54      0.53       977

