dataset -> https://www.kaggle.com/c/fake-news/data

## 1). Loading Data

In [1]:
import pandas as pd

df=pd.read_csv('../datasets/news_data.csv')
df.sample(5)

Unnamed: 0,id,title,author,text,label
3852,3852,"Hilarion’s Message – October 30-November 5, 2016",Gillian,Leave a reply \nHilarion – As you endeavor to ...,1
14,14,"Re: Yes, There Are Paid Government Trolls On S...",AnotherAnnie,"Yes, There Are Paid Government Trolls On Socia...",1
12572,12572,Head of Veterans Health System Is Trump’s Pick...,Dave Philipps,In a move that left many veterans groups breat...,0
18523,18523,California Today: Fretting Over the ‘Netflix T...,Mike McPhate,Good morning. (Want to get California Today by...,0
10128,10128,Humiliated Hillary is SWARMED by Trump Support...,Amy Moreno,Humiliated Hillary is SWARMED by Trump Support...,1


In [2]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [3]:
df=df.dropna()

In [4]:
X=df.drop(columns='label',axis=1)
Y=df['label']
X.shape,Y.shape

((18285, 4), (18285,))

## 2). Importing Dependencies

In [5]:
import tensorflow as tf
tf.__version__

'2.16.1'

In [6]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [7]:
vocab_size=1000

## 3). Data preprocessing

In [8]:
messages=X.copy()
messages.reset_index(inplace=True)

In [9]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arpitpatel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
corpus=[]

In [11]:
for i in range(0,len(messages)):
    # print(i)
    review=re.sub('[^a-zA-Z]',' ',messages['title'][i])
    review=review.lower()
    review=review.split()
    
    review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [12]:
corpus

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri',
 'jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart',
 'beno hamon win french socialist parti presidenti nomin new york time',
 'back channel plan ukrain russia courtesi trump associ new york time',
 'obama organ action partner soro link indivis disrupt trump agenda',
 'bbc comedi sketch real housew isi caus outrag',
 'russian research discov secret nazi militari base treasur hunter arctic photo',
 'us offici see link trump russia',
 'ye paid govern troll social media blog forum websit',
 'major leagu soccer argentin find home success new york time',
 'well fargo chief abruptli step new york time',
 'anonym donor pay million releas everyon arrest dakota access pipelin',
 'fbi close hilla

## 4). OneHot representation

In [13]:
onehot_repr=[one_hot(word,vocab_size) for word in corpus]
onehot_repr

[[986, 38, 396, 254, 271, 876, 854, 346, 869, 56],
 [426, 764, 382, 578, 410, 893, 903],
 [720, 395, 313, 647],
 [878, 750, 225, 714, 832, 169],
 [742, 410, 693, 520, 442, 353, 410, 99, 142, 983],
 [534, 795, 595, 396, 255, 946, 671, 223, 609, 7, 409, 218, 171, 287, 903],
 [941, 666, 801, 506, 853, 9, 718, 957, 285, 695, 523],
 [896, 168, 492, 304, 404, 382, 946, 639, 285, 695, 523],
 [558, 596, 141, 927, 398, 693, 153, 366, 946, 383],
 [842, 895, 923, 742, 441, 294, 566, 932],
 [181, 270, 61, 385, 386, 60, 856, 424, 987, 62, 382],
 [714, 547, 271, 693, 946, 404],
 [589, 936, 307, 885, 473, 892, 291, 731, 408],
 [453, 5, 317, 574, 916, 666, 583, 285, 695, 523],
 [157, 218, 458, 398, 147, 285, 695, 523],
 [332, 597, 360, 428, 516, 250, 206, 369, 732, 948],
 [208, 66, 764],
 [743, 924, 259, 382, 946, 603, 225, 903],
 [145, 113, 382, 8, 826, 36, 20, 442, 353],
 [307, 326, 946, 916, 186, 903],
 [298, 299, 366, 492, 479, 20, 630, 640, 639, 285, 695, 523],
 [65, 697, 772, 807, 967, 327, 141]

## 5). Embedding with padding

In [14]:

maxlen=0
for word in onehot_repr:
    if len(word)>maxlen:
        maxlen=len(word)
        
maxlen

47

In [15]:
sent_length=50
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
embedded_docs

array([[  0,   0,   0, ..., 346, 869,  56],
       [  0,   0,   0, ..., 410, 893, 903],
       [  0,   0,   0, ..., 395, 313, 647],
       ...,
       [  0,   0,   0, ..., 285, 695, 523],
       [  0,   0,   0, ..., 458, 515, 187],
       [  0,   0,   0, ...,   7, 677, 212]], dtype=int32)

In [16]:
import numpy as np
X_final=np.array(embedded_docs)
Y_final=np.array(Y)

## 6). Train test split of features and target

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X_final,Y_final,random_state=2,test_size=0.2)

## 7). Model creation

In [23]:
embedding_vector_features=50

model=Sequential()
model.add(Embedding(vocab_size,embedding_vector_features))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(X_train,Y_train,validation_data=(X_test,Y_test),epochs=10,batch_size=64)
model.summary()

Epoch 1/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 39ms/step - accuracy: 0.8017 - loss: 0.4007 - val_accuracy: 0.9027 - val_loss: 0.2187
Epoch 2/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 39ms/step - accuracy: 0.9189 - loss: 0.1897 - val_accuracy: 0.9092 - val_loss: 0.2041
Epoch 3/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 39ms/step - accuracy: 0.9333 - loss: 0.1623 - val_accuracy: 0.9117 - val_loss: 0.1980
Epoch 4/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 38ms/step - accuracy: 0.9434 - loss: 0.1421 - val_accuracy: 0.9136 - val_loss: 0.1997
Epoch 5/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 38ms/step - accuracy: 0.9528 - loss: 0.1223 - val_accuracy: 0.9188 - val_loss: 0.2008
Epoch 6/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 39ms/step - accuracy: 0.9598 - loss: 0.1108 - val_accuracy: 0.9035 - val_loss: 0.2185
Epoch 7/10
[1m229/22

## 8). Metrics evaluation

In [28]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# Convert y_pred to binary labels using a threshold of 0.5
y_pred_binary = (y_pred > 0.5).astype(int)

# Check the unique values in the converted y_pred_binary
print("Unique values in y_pred_binary:", np.unique(y_pred_binary))

# Compute accuracy
accuracy = accuracy_score(Y_test, y_pred_binary)
print("Accuracy:", accuracy)

# Compute confusion matrix
cm = confusion_matrix(Y_test, y_pred_binary)
print("Confusion Matrix:\n", cm)


Unique values in y_pred_binary: [0 1]
Accuracy: 0.8999179655455292
Confusion Matrix:
 [[1881  170]
 [ 196 1410]]
