## Fake News Classifier Using LSTM

Dataset: https://www.kaggle.com/c/fake-news/data#

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/fake-news/train.csv')

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [35]:
df = df.dropna(ignore_index=True)

In [36]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [37]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding
import tensorflow as tf

In [38]:
## indepentent features
X=df.drop('label',axis=1)

In [39]:
## dependent feature
y = df['label']

In [40]:
X.shape

(18285, 4)

In [41]:
y.shape

(18285,)

In [42]:
### Vocabulary Size
voc_size = 5000

## Onehot Representation

In [43]:
messages = X.copy()

In [44]:
messages['title'][1]

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [45]:
messages.head(10)

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi..."
6,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi..."
7,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...
8,10,Obama’s Organizing for Action Partners with So...,Aaron Klein,"Organizing for Action, the activist group that..."
9,11,"BBC Comedy Sketch ""Real Housewives of ISIS"" Ca...",Chris Tomlinson,The BBC produced spoof on the “Real Housewives...


In [46]:
import nltk
import re
from nltk.corpus import stopwords

In [47]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abhilash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
messages['title'][6]

'Benoît Hamon Wins French Socialist Party’s Presidential Nomination - The New York Times'

In [49]:
review = re.sub('[^a-zA-Z]', ' ', messages['title'][2])
review

'Why the Truth Might Get You Fired'

In [50]:
ps = PorterStemmer()

In [51]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    print(review)
    

hous dem aid even see comey letter jason chaffetz tweet
flynn hillari clinton big woman campu breitbart
truth might get fire
civilian kill singl us airstrik identifi
iranian woman jail fiction unpublish stori woman stone death adulteri
jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart
beno hamon win french socialist parti presidenti nomin new york time
back channel plan ukrain russia courtesi trump associ new york time
obama organ action partner soro link indivis disrupt trump agenda
bbc comedi sketch real housew isi caus outrag
russian research discov secret nazi militari base treasur hunter arctic photo
us offici see link trump russia
ye paid govern troll social media blog forum websit
major leagu soccer argentin find home success new york time
well fargo chief abruptli step new york time
anonym donor pay million releas everyon arrest dakota access pipelin
fbi close hillari
chuck todd buzzfe donald trump polit favor breitbart
monica lew

In [53]:
corpus[1]

'flynn hillari clinton big woman campu breitbart'

In [54]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[2713, 3568, 3105, 3496, 4138, 4954, 4062, 1757, 350, 4571],
 [2264, 3763, 3179, 3694, 626, 3124, 67],
 [1351, 3856, 2687, 2336],
 [947, 4378, 1178, 3556, 2701, 272],
 [4058, 626, 3319, 3226, 3030, 3164, 626, 327, 2486, 1306],
 [1266,
  4817,
  4428,
  1106,
  1542,
  1759,
  4659,
  2534,
  1304,
  3263,
  3409,
  943,
  3807,
  2679,
  67],
 [3228, 1967, 3151, 1201, 4008, 4137, 3024, 2540, 4469, 287, 1835],
 [1460, 3928, 1769, 582, 4794, 4045, 1759, 4254, 4469, 287, 1835],
 [4279, 232, 3810, 2296, 1233, 3708, 1751, 1748, 1759, 1018],
 [1955, 1223, 1825, 4289, 676, 960, 3692, 4989],
 [356, 3358, 3646, 2134, 390, 4740, 4761, 71, 4627, 157, 601],
 [3556, 1215, 4138, 3708, 1759, 4794],
 [279, 495, 2606, 2943, 3390, 2270, 1640, 3761, 2753],
 [3620, 1360, 4505, 2256, 3183, 2320, 1757, 4469, 287, 1835],
 [2419, 2202, 4399, 3106, 359, 4469, 287, 1835],
 [248, 2789, 3418, 2923, 3366, 3304, 162, 3029, 4702, 207],
 [2643, 4381, 3763],
 [237, 4187, 1509, 3663, 1759, 3234, 424, 67],
 [3291, 1321

In [55]:
onehot_repr[1]

[2264, 3763, 3179, 3694, 626, 3124, 67]

## Embedding Represtation

In [56]:
sent_length =20
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)

In [57]:
embedded_docs[1]

array([2264, 3763, 3179, 3694,  626, 3124,   67,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])

In [58]:
## Model
embedding_vector_features=40 ## 40 is the dimension of the vector
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100)) ## 100 is the number of neurons
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 40)            200000    
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 256501 (1001.96 KB)
Trainable params: 256501 (1001.96 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [59]:
embedded_docs.shape, y.shape

((18285, 20), (18285,))

In [60]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [61]:
X_final.shape,y_final.shape

((18285, 20), (18285,))

In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [63]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x14861da8b80>

In [64]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [65]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x14867472e60>

### Performance Metrics And Accuracy

In [66]:
y_pred=model.predict(X_test)



In [67]:
y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve

In [68]:
from sklearn.metrics import confusion_matrix

In [69]:
confusion_matrix(y_test,y_pred)

array([[3133,  286],
       [ 264, 2352]], dtype=int64)

In [70]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9088649544324772

In [71]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      3419
           1       0.89      0.90      0.90      2616

    accuracy                           0.91      6035
   macro avg       0.91      0.91      0.91      6035
weighted avg       0.91      0.91      0.91      6035



## Rapid Api

In [3]:
import requests

url = "https://realtor.p.rapidapi.com/locations/v2/auto-complete"

querystring = {"input":"new york","limit":"10"}

headers = {
	"X-RapidAPI-Key": "746c0cdee5msh9106269a5b86db8p199414jsnd0cddda75543",
	"X-RapidAPI-Host": "realtor.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)

print(response.json())

{'meta': {'version': '0.105.0.0.105.0.2137.v105', 'es_took': 16}, 'autocomplete': [{'area_type': 'state', '_id': 'state:ny', '_score': 36035.562, 'state': 'New York', 'state_code': 'NY', 'country': 'USA', 'centroid': {'lon': -75.5965453188093, 'lat': 42.9212421566579}, 'slug_id': 'New-York', 'geo_id': '7a9bffcb-e43c-5cdd-89ff-08ead6d8225d'}, {'area_type': 'city', '_id': 'city:ny_new-york', '_score': 34.656803, 'city': 'New York', 'state_code': 'NY', 'counties': [{'name': 'Queens', 'fips': '36081', 'state_code': 'NY'}, {'name': 'Kings', 'fips': '36047', 'state_code': 'NY'}, {'name': 'Richmond', 'fips': '36085', 'state_code': 'NY'}, {'name': 'Bronx', 'fips': '36005', 'state_code': 'NY'}, {'name': 'New York', 'fips': '36061', 'state_code': 'NY'}], 'country': 'USA', 'centroid': {'lon': -73.9386968, 'lat': 40.6634682}, 'slug_id': 'New-York_NY', 'geo_id': 'a5cac742-26a8-5870-b961-472ec6f9d824', 'county_needed_for_uniq': False}, {'area_type': 'city', '_id': 'city:ny_manhattan', '_score': 34.5