# LSTM RNN on Combined Snopes Data

In [33]:
import pandas as pd

In [34]:
data = pd.read_csv('Data/SnopesGeneralDatav3.csv')

# 2 -> Other / Mixture/ All other labels
# 1 -> Fake / False
# 0 -> Not Fake / True

In [35]:
data.head()

Unnamed: 0,ID,Posts,Label
0,1,Did Kamala Harris Support Abortion Until the T...,Mixture
1,2,Did Hitler Invent the Inflatable Sex Doll?,FALSE
2,3,Pride Parade Fire Hydrant Mishap,FALSE
3,4,Did Trump Say He Might Sign an Exec Order Barr...,Mixture
4,5,Did the White House Watch the Benghazi Attack ...,FALSE


In [36]:
# Encoding target labels from text to number
# Note: change here to balance classes

# Creating a dict file 
# label = { 'FALSE': 0, 'TRUE': 1, 'Mixture': 2, 'Mostly False': 2, 'Mostly True': 2, 
#          'Unproven': 2, 'Correct Attribution': 2, 'Misattributed': 2, 'Miscaptioned': 2,
#          'Scam': 2, 'Labeled Satire': 2, 'Outdated': 2 }

# For combinded data
# label = { 'FALSE': 0, 'TRUE': 1, 'Mixture': 2 } 


# For binary classsification
label = { 'FALSE': 0, 'TRUE': 1, 'Mixture': 1 } 

data.Label = [label[item] for item in data.Label]

In [37]:
data['Label'].unique()

array([1, 0], dtype=int64)

In [38]:
data['Label'].value_counts()

0    1756
1    1203
Name: Label, dtype: int64

In [39]:
# Remove label from data

X = data.drop(['ID', 'Label'], axis=1)
X.head()

Unnamed: 0,Posts
0,Did Kamala Harris Support Abortion Until the T...
1,Did Hitler Invent the Inflatable Sex Doll?
2,Pride Parade Fire Hydrant Mishap
3,Did Trump Say He Might Sign an Exec Order Barr...
4,Did the White House Watch the Benghazi Attack ...


In [40]:
# Store results

Y = data['Label']
Y.head()

0    1
1    0
2    0
3    1
4    0
Name: Label, dtype: int64

In [41]:
X.shape, Y.shape

((2959, 1), (2959,))

In [42]:
data.head()

Unnamed: 0,ID,Posts,Label
0,1,Did Kamala Harris Support Abortion Until the T...,1
1,2,Did Hitler Invent the Inflatable Sex Doll?,0
2,3,Pride Parade Fire Hydrant Mishap,0
3,4,Did Trump Say He Might Sign an Exec Order Barr...,1
4,5,Did the White House Watch the Benghazi Attack ...,0


In [43]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [44]:
# Vocabulary size
vocab_size = 5000

In [45]:
# Onehot Representation
messages = X.copy()

In [46]:
messages['Posts'][0]

'Did Kamala Harris Support Abortion Until the Time of Giving Birth?'

In [47]:
messages.reset_index(inplace = True)

In [48]:
# Stemming and stop word removal

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
#     type(messages['title'][i])
    review = re.sub('[^a-zA-Z]', ' ', str(messages['Posts'][i]))
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [49]:
corpus

['kamala harri support abort time give birth',
 'hitler invent inflat sex doll',
 'pride parad fire hydrant mishap',
 'trump say might sign exec order bar biden presid',
 'white hous watch benghazi attack noth',
 'color code toothpast tube identifi ingredi',
 'last surviv wwii veteran march alon memori day parad',
 'ruth bader ginsburg say pedophilia good children',
 'bow hunt wild hors',
 'dwayn johnson arrest',
 'firework explod woman shoplift attempt',
 'tim horton ad marijuana menu',
 'obama remov statu liberti offend muslim',
 'year old mama esifiho may oldest woman aliv',
 'cop shoot year old black babi mistak pacifi gun',
 'harvard studi reveal much damag instant noodl bodi',
 'alton sterl kill trump support robert kinnison',
 'monsanto buy whole food',
 'photograph show alton sterl children gun',
 'alton sterl longtim crimin gang member',
 'deray mckesson summer chao',
 'obama commiss gender neutral bibl',
 'dalla shooter identifi neo nazi jeffrey harri',
 'pride lion kill five

In [50]:
# One Hot encoding 

onehot_data = [one_hot(words, vocab_size) for words in corpus] 
onehot_data

[[4698, 284, 4540, 1906, 4596, 3285, 115],
 [2874, 3930, 4011, 99, 1537],
 [3450, 2681, 992, 3005, 3098],
 [4486, 2565, 1850, 2363, 1655, 886, 3484, 3982, 2293],
 [2388, 2292, 442, 528, 1493, 3846],
 [1701, 490, 3524, 3794, 2182, 566],
 [3566, 1711, 247, 4790, 2347, 971, 1423, 1424, 2681],
 [3749, 62, 1471, 2565, 2026, 4758, 1020],
 [4752, 4742, 4025, 2576],
 [495, 1792, 2737],
 [906, 1197, 4930, 98, 2981],
 [3199, 2017, 1769, 846, 2337],
 [842, 2224, 4876, 1128, 23, 1597],
 [775, 2369, 2661, 2956, 2043, 2847, 4930, 306],
 [3935, 4877, 775, 2369, 2262, 1810, 1562, 1916, 1661],
 [930, 1023, 545, 1343, 4493, 1376, 502, 209],
 [1799, 1663, 3112, 4486, 4540, 3262, 4758],
 [1289, 2899, 159, 735],
 [690, 4286, 1799, 1663, 1020, 1661],
 [1799, 1663, 3966, 4288, 637, 3478],
 [2315, 4159, 131, 4232],
 [842, 758, 3158, 982, 3956],
 [3688, 2758, 2182, 2503, 1786, 8, 284],
 [3450, 4573, 3112, 761, 1866, 239, 3130, 607, 605],
 [2157, 879, 1862, 1197],
 [4366, 843, 3535, 992, 4089, 3623, 3544, 4017,

In [51]:
# Embedding

sentence_length = 20
embedded_data = pad_sequences(onehot_data, padding='pre', maxlen = sentence_length)
print(embedded_data)

[[   0    0    0 ... 4596 3285  115]
 [   0    0    0 ... 4011   99 1537]
 [   0    0    0 ...  992 3005 3098]
 ...
 [   0    0    0 ... 1255  554 1037]
 [   0    0    0 ... 4214 3082  942]
 [   0    0    0 ... 1359  895 1666]]


In [52]:
# Creating LSTM model

embedding_vector_features = 40
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length = sentence_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 40)            200000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [53]:
len(embedded_data), Y.shape

(2959, (2959,))

In [54]:
import numpy as np

X_final = np.asarray(embedded_data)
Y_final = np.asarray(Y)

In [55]:
X_final.shape, Y_final.shape

((2959, 20), (2959,))

In [56]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_final, Y_final, test_size = 0.20, random_state = 42)

In [57]:
len(X_train), len(y_train), len(X_test), len(y_test)

(2367, 2367, 592, 592)

In [58]:
# Training

model.fit(X_train, y_train, validation_data = (X_test, y_test), batch_size = 64, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2250f253ba8>

In [59]:
# Prediction

y_pred = model.predict(X_test)
y_pred

array([[1.37269497e-04],
       [5.46959817e-01],
       [4.50092316e-01],
       [6.24117851e-02],
       [4.63226438e-03],
       [6.62105799e-01],
       [9.91647065e-01],
       [7.49012828e-03],
       [2.24343508e-01],
       [9.75777447e-01],
       [9.63583231e-01],
       [3.51298779e-01],
       [2.12082267e-03],
       [4.51603532e-03],
       [7.08778858e-01],
       [1.81558728e-03],
       [6.82559311e-02],
       [3.90087189e-06],
       [5.79240918e-03],
       [2.17199147e-01],
       [3.64434719e-03],
       [9.54982638e-03],
       [9.99342799e-01],
       [9.90548730e-01],
       [7.29114981e-05],
       [1.34527683e-04],
       [1.31194890e-02],
       [9.94201660e-01],
       [3.05101275e-03],
       [1.45368278e-02],
       [9.78065968e-01],
       [4.43222225e-02],
       [2.07424164e-04],
       [3.96370888e-04],
       [9.54146504e-01],
       [4.25338745e-04],
       [1.76263303e-01],
       [9.93583322e-01],
       [1.76121086e-01],
       [9.24468040e-04],


In [60]:
predictions = []
for i in range(len(y_pred)):
    if y_pred[i].item() > 0.5:
#         predictions.append(str(i) + " is Fake " + str(1))
        predictions.append(1)
    else:
#         predictions.append(str(i) +" is Not Fake " + str(0))
        predictions.append(0)

In [61]:
predictions

[0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,


In [62]:
results = model.evaluate(X_test, y_test, batch_size = 64)
print("test loss, test acc:", results)

test loss, test acc: [2.1149985790252686, 0.5472972989082336]


In [63]:
# Creating classification report 
from sklearn import metrics

print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.61      0.66      0.64       354
           1       0.43      0.38      0.40       238

    accuracy                           0.55       592
   macro avg       0.52      0.52      0.52       592
weighted avg       0.54      0.55      0.54       592

