# Fake News Classifier using LSTM

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
df.shape

(20800, 5)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
(df.isnull().sum()/len(df))*100

id        0.000000
title     2.682692
author    9.408654
text      0.187500
label     0.000000
dtype: float64

In [6]:
df = df.dropna()

In [7]:
(df.isnull().sum()/len(df))*100

id        0.0
title     0.0
author    0.0
text      0.0
label     0.0
dtype: float64

In [8]:
messages = df.copy()

In [9]:
messages.reset_index(inplace=True)

In [10]:
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [11]:
stemmer = PorterStemmer()

In [12]:
title_corpus = []
for i in range(len(messages['title'])):
    review = re.sub('\W',' ',messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    title_corpus.append(review)

In [13]:
title_corpus[3]

'15 civilian kill singl us airstrik identifi'

In [14]:
y = messages.pop('label')
X = messages

In [15]:
X.shape

(18285, 5)

In [16]:
y.shape

(18285,)

In [17]:
X.drop(['index','id','author','text'],axis=1,inplace=True)

In [18]:
X.head()

Unnamed: 0,title
0,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,Why the Truth Might Get You Fired
3,15 Civilians Killed In Single US Airstrike Hav...
4,Iranian woman jailed for fictional unpublished...


In [19]:
import tensorflow as tf

In [20]:
tf.__version__

'2.6.0'

In [21]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [22]:
voc_size = 5000

## One Hot Representation

In [23]:
one_hot_corpus = [one_hot(sent,voc_size) for sent in title_corpus]
print(one_hot_corpus)

[[3685, 1793, 3235, 3300, 3727, 775, 1587, 4659, 4369, 4024], [3423, 4571, 4766, 3870, 3690, 409, 4030], [738, 3906, 1521, 1923], [4501, 3230, 2478, 4050, 101, 1707, 1481], [3949, 3690, 1582, 1177, 3075, 4466, 3690, 1281, 4889, 378], [2867, 531, 2684, 2340, 2195, 2704, 631, 3466, 675, 2952, 1243, 2100, 1121, 585, 4030], [2480, 2776, 3399, 3027, 1218, 820, 80, 685, 2953, 4346, 4236], [4168, 3039, 2175, 4057, 1096, 2668, 2704, 2809, 2953, 4346, 4236], [3795, 977, 3011, 3030, 4024, 2648, 655, 3426, 2704, 683], [1886, 2289, 4119, 3689, 3309, 2758, 4467, 1463], [1264, 1982, 3886, 1724, 3748, 2042, 3243, 4736, 1434, 4174, 1687], [101, 3256, 3727, 2648, 2704, 1096], [1286, 868, 635, 1520, 3084, 4376, 4505, 275, 1462], [2869, 4706, 301, 3713, 2240, 3397, 2915, 2953, 4346, 4236], [4944, 65, 2486, 4270, 3671, 2953, 4346, 4236], [4716, 318, 3339, 2455, 1654, 3286, 4482, 2957, 438, 1797, 4642, 2302], [2486, 1584, 4571], [3993, 2432, 1700, 2588, 2704, 4999, 3591, 4030], [2187, 795, 4766, 1123, 1609

## Embedding Representation

### Padding:

In [24]:
max_sent_length = 20
padded_docs = pad_sequences(one_hot_corpus,padding='pre',maxlen=max_sent_length)
print(padded_docs)

[[   0    0    0 ... 4659 4369 4024]
 [   0    0    0 ... 3690  409 4030]
 [   0    0    0 ... 3906 1521 1923]
 ...
 [   0    0    0 ... 2953 4346 4236]
 [   0    0    0 ... 1556 1401 4471]
 [   0    0    0 ... 1761 4705 3297]]


### Embedding model:

In [25]:
embedding_word_features = 40
model = Sequential()
model.add(Embedding(voc_size,embedding_word_features,input_length=max_sent_length))
model.add(LSTM(100))      # 100 neurons
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics='accuracy')
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 40)            200000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
import numpy as np

In [27]:
X_final = np.array(padded_docs)
y_final = np.array(y)

In [28]:
X_final.shape,y_final.shape

((18285, 20), (18285,))

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

### Model Training

In [30]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x24903449c88>

In [47]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
y_pred

array([[1],
       [0],
       [0],
       ...,
       [0],
       [1],
       [1]])

In [48]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [49]:
confusion_matrix(y_test,y_pred)

array([[3118,  301],
       [ 261, 2355]], dtype=int64)

In [51]:
accuracy_score(y_test,y_pred)

0.9068765534382767

In [53]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      3419
           1       0.89      0.90      0.89      2616

    accuracy                           0.91      6035
   macro avg       0.90      0.91      0.91      6035
weighted avg       0.91      0.91      0.91      6035

