## Fake News Classifier Using LSTM

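Dataset link: https://www.kaggle.com/c/fake-news/data#

Note: the cells below do not read that dataset; they load `propaganda_train.tsv` and `propaganda_val.tsv` from Google Drive and train a binary propaganda / not_propaganda classifier on the `tagged_in_context` text.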

In [None]:
# Colab-only: mount Google Drive at /content/drive so the TSV files under
# MyDrive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd, csv
# Load the tab-separated training and validation splits from the mounted Drive.
# read_table uses a tab delimiter by default.
trainset = pd.read_table('/content/drive/MyDrive/propaganda_train.tsv')
valset = pd.read_table('/content/drive/MyDrive/propaganda_val.tsv')

In [None]:
# Split each frame into rows carrying a propaganda-technique label and rows
# labelled 'not_propaganda'. The propaganda halves are copied because their
# label column is overwritten below.
propaganda_set = trainset[trainset['label'] != 'not_propaganda'].copy()
not_propaganda_set = trainset[trainset['label'] == 'not_propaganda']
val_propaganda_set = valset[valset['label'] != 'not_propaganda'].copy()
val_not_propaganda_set = valset[valset['label'] == 'not_propaganda']

# Label columns taken before the overwrite (these names were defined by the
# original cell, so they are kept available).
prop_labels = propaganda_set.loc[:, ['label']]
val_prop_labels = val_propaganda_set['label']

# Collapse every technique label into the single class 'propaganda'.
# A scalar assignment broadcasts over the whole column. The previous loops
# iterated over column names / individual rows and re-assigned the entire
# column on every iteration, which only produced the right result by accident.
propaganda_set['label'] = 'propaganda'
val_propaganda_set['label'] = 'propaganda'

# Training frame: propaganda rows first, then not_propaganda rows, with a fresh
# 0..n-1 index and the text column exposed as 'title' for the later cells.
df = pd.concat(
    [propaganda_set, not_propaganda_set], ignore_index=True
).rename(columns={'tagged_in_context': 'title'})

# NOTE: the validation halves are prepared above but are not combined into a
# frame here; the train/test split further down is taken from `df` only.

In [None]:
# Build a label -> integer id map from the unique labels in `df`, sorted
# alphabetically so the ids are stable from run to run.
# (The previous version unioned df's label set with itself, which was a no-op.)
labellist = sorted(df['label'].unique())

labels = {label: i for i, label in enumerate(labellist)}
labels

{'not_propaganda': 0, 'propaganda': 1}

In [None]:
# Replace the string labels with their integer ids from `labels`.
# NOTE(review): this overwrites df['label'] in place, so the cell is not
# idempotent — re-running it would look up the already-encoded integers in a
# dict keyed by strings. Re-run from the cell that builds `df` instead.
df['label'] = df['label'].map(labels)
#val_set['label'] = val_set['label'].map(labels)

In [None]:
# Preview the first rows after label encoding.
df.head()

Unnamed: 0,label,title
0,1,The Obama administration misled the <BOS> Amer...
1,1,"Hitler <BOS> annihilated <EOS> 400,000 Germans..."
2,1,"As noted above, at this point literally every ..."
3,1,His account was suspended for violating Twitte...
4,1,A couple of seemingly unrelated events this pa...


In [None]:
### Drop every row that contains at least one missing value
df = df.dropna(axis=0, how='any')


In [None]:
## Independent features: everything except the target column
X = df.drop(columns='label')

In [None]:
## Dependent feature: the integer-encoded target column
y = df.loc[:, 'label']

In [None]:
# (rows, feature columns) of the input frame.
X.shape

(2414, 1)

In [None]:
# Number of targets; should match the row count of X.
y.shape

(2414,)

In [None]:
import tensorflow as tf

In [None]:
# Record the TensorFlow version this notebook is running against.
tf.__version__

'2.1.0'

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
### Vocabulary size
# Passed to one_hot() as the hashing range and to the Embedding layer as its
# input dimension, so both must keep using this same value.
voc_size=5000

### Onehot Representation

In [None]:
# Work on an independent copy so text preprocessing never touches X.
messages = X.copy(deep=True)

In [None]:
# Show the raw text of the row whose index label is 1.
messages.loc[1, 'title']

'Hitler <BOS> annihilated <EOS> 400,000 Germans who were handicapped or suffered from chronic diseases. '

In [None]:
# Renumber rows 0..n-1 so positional loops line up with the index after dropna().
# drop=True discards the old index instead of inserting it as a new column,
# which keeps this cell idempotent: the in-place version added another column
# ('index', then 'level_0', then an error) each time it was re-run.
messages = messages.reset_index(drop=True)

In [None]:
import nltk
import re
from nltk.corpus import stopwords

In [None]:
# Download the NLTK stopwords corpus, which the preprocessing cell reads via
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stopword lookup once. The previous loop evaluated
# stopwords.words('english') for every single token and then scanned the
# resulting list; a set built up front gives the same filtering with O(1) lookups.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly rather than by integer label, so the
# loop does not depend on the index being a gap-free 0..n-1 range.
for title in messages['title']:
    # Keep letters only (digits, punctuation and the <BOS>/<EOS> brackets become
    # spaces), lower-case, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    # Drop stopwords, stem what remains, and re-join into one cleaned string.
    review = ' '.join(ps.stem(word) for word in tokens if word not in english_stopwords)
    corpus.append(review)

In [None]:
# Preview a few cleaned documents instead of dumping the whole corpus into the
# notebook output.
corpus[:5]

In [None]:
# Encode every cleaned document as a list of integer word ids below voc_size.
# NOTE(review): Keras documents one_hot as hash-based, so distinct words can
# share an id and ids may differ between Python sessions — confirm if exact
# reproducibility matters.
onehot_repr = [one_hot(doc, voc_size) for doc in corpus]
# Show only the first few encodings rather than the entire list.
onehot_repr[:5]

### Embedding Representation

In [None]:
# Fixed sequence length fed to the network; shorter documents are left-padded
# with zeros up to this length.
sent_length = 20
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

[[   0    0    0 ...  673 1322 2746]
 [   0    0    0 ... 4010  916 2273]
 [4479 4971 1179 ... 1121 1119 3289]
 ...
 [ 817 1365 4359 ...  673  350 2393]
 [   0    0    0 ... 4297 1831 3374]
 [   0    0    0 ... 4697 4187 1831]]


In [None]:
# Inspect the first padded sequence (leading zeros are the 'pre' padding).
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0, 2556, 4711, 2882, 1673,  517,
        179, 1831, 4144, 2329, 3543, 4516,  673, 1322, 2746], dtype=int32)

In [None]:
## Creating model
# Embedding -> LSTM -> single sigmoid unit for binary classification.
embedding_vector_features = 40  # dimensionality of each learned word vector
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Sanity check: the number of padded sequences must equal the number of labels.
len(embedded_docs),y.shape

(2414, (2414,))

In [None]:
import numpy as np
# Materialise features and targets as NumPy arrays for scikit-learn / Keras.
X_final, y_final = np.array(embedded_docs), np.array(y)

In [None]:
# Shapes of the final feature matrix and target vector.
X_final.shape,y_final.shape

((2414, 20), (2414,))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.33,
    random_state=42,
)

### Model Training

In [None]:
### Finally Training
# Fit for 10 epochs in mini-batches of 64, reporting metrics on the held-out
# split after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0b2008ab20>

### Adding Dropout

In [None]:
from tensorflow.keras.layers import Dropout
## Creating model
# Same Embedding -> LSTM -> sigmoid architecture as before, with 30% dropout
# after the embedding and after the LSTM to reduce overfitting.
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Rebinding `model` above discards the previously fitted network, so this new
# model must be trained before it is evaluated. Without this call the metrics
# cells below score randomly initialised weights (about 50% accuracy).
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

### Performance Metrics And Accuracy

In [None]:
# predict() returns one sigmoid probability per sample, shape (n_samples, 1).
y_pred = model.predict(X_test)
# Threshold at 0.5 to obtain 0/1 class ids (what predict_classes used to do for
# a sigmoid output). argmax over axis 1 of a single-column array is always 0,
# so it labelled every sample as class 0.
classes_x = (y_pred > 0.5).astype('int32').ravel()



In [None]:
# Display the predicted class ids for the held-out split.
classes_x

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Rows are true classes, columns are predicted classes; probabilities are
# rounded to the nearest class id first.
confusion_matrix(y_true=y_test, y_pred=np.round(y_pred))

array([[257, 131],
       [275, 134]])

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out samples whose rounded prediction matches the true label.
accuracy_score(y_true=y_test, y_pred=np.round(y_pred))

0.4905897114178168