<a href="https://colab.research.google.com/github/Pakhi27/Bi-Directional-LSTM-RNN-Fake-News-Classifier/blob/main/FakeNewsClassifier_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [6]:
def handle_bad_line(line):
    """Report a row that could not be parsed and signal that it should be dropped.

    Used as the ``on_bad_lines`` callback of ``pd.read_csv``.

    Args:
        line: The offending row, exactly as handed over by the CSV reader.

    Returns:
        Always ``None``.
    """
    message = f"Bad line: {line}"
    print(message)
    # None means "skip this row"; return a repaired row here instead if the
    # data should be kept in the expected format.
    return None

# Load the dataset; rows the parser rejects are routed to handle_bad_line.
df = pd.read_csv(
    'FNC.csv',
    delimiter=',',
    encoding='utf-8',
    engine='python',
    on_bad_lines=handle_bad_line,
)



In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [11]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
df.isnull().sum()

In [None]:
df.head()

In [14]:
# Independent features: every column except the target.
X = df.drop(columns=['label'])

In [15]:
# Dependent feature: the target column on its own.
y = df.loc[:, 'label']

In [None]:
X.shape

In [None]:
y.shape

In [None]:
import tensorflow as tf

In [None]:
tf.__version__

In [20]:
from tensorflow.keras.layers import Embedding# word 2 vec
from tensorflow.keras.preprocessing.sequence import pad_sequences# pre-padding and post padding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [21]:
# Vocabulary size shared by the word encoding and the embedding layer.
voc_size = 5_000

In [22]:
# One Hot representation

In [23]:
messages=X.copy()

In [None]:
# Peek at one headline. Positional access is used because the index has not
# been reset yet at this point: dropna() leaves gaps in the index labels, so
# the label 1 is not guaranteed to exist.
messages['title'].iloc[1]

In [None]:
messages

In [26]:
# Rebuild `messages` from X with a fresh 0..n-1 index; the previous labels are
# kept as a regular column. Deriving the frame from X instead of mutating
# `messages` in place means re-running this cell always gives the same result.
messages = X.reset_index()

In [None]:
messages

In [28]:
import nltk
import re
from nltk.corpus import stopwords

In [None]:
# stopwords
nltk.download('stopwords')

In [30]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer  # stemming


def _clean_title(title, stemmer, stop_words):
    """Turn one headline into a space-joined string of stemmed tokens.

    Args:
        title: Raw headline text.
        stemmer: Object with a ``stem(word)`` method.
        stop_words: Container of lower-case words to discard.

    Returns:
        The lower-cased, letter-only, stop-word-free, stemmed headline.
    """
    # Replace every character that is not an ASCII letter with a blank.
    letters_only = re.sub('[^a-zA-Z]', ' ', title)
    tokens = letters_only.lower().split()
    return ' '.join(stemmer.stem(tok) for tok in tokens if tok not in stop_words)


ps = PorterStemmer()
# Fetch the stop-word list once and keep it in a set, instead of calling
# stopwords.words('english') again for every single token.
english_stopwords = set(stopwords.words('english'))
# Iterate over the titles directly so the result does not depend on the
# index labels of `messages`.
corpus = [_clean_title(title, ps, english_stopwords) for title in messages['title']]

In [None]:
corpus

In [None]:
corpus[1]

In [None]:
# Encode every cleaned headline as a list of integer word indices.
# NOTE(review): Keras documents one_hot as hashing-based, so distinct words may
# share an index, and the default hash is not stable across interpreter
# sessions -- confirm before reusing a trained model in a new session.
onehot_repr = [one_hot(words, voc_size) for words in corpus]
# Show a short preview instead of the whole list to keep the output readable.
onehot_repr[:5]

In [None]:
corpus[1]

In [None]:
onehot_repr[1]

In [36]:
# Embedding representation

In [None]:
# Bring every encoded headline to sent_length entries; padding is appended at
# the end of the sequence (padding='post').
sent_length = 20
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='post',
)
print(embedded_docs)

In [None]:
embedded_docs[1]

In [None]:
embedded_docs[0]

In [None]:
## Creating model
from tensorflow.keras.layers import Input

# Each word index is mapped to a dense vector of this size.
embedding_vector_features = 40
model = Sequential()

# Declare the input shape up front so the model is built right away and
# summary() can report output shapes and parameter counts. This replaces the
# deprecated `input_length` argument of Embedding.
model.add(Input(shape=(sent_length,)))

# Embedding layer
model.add(Embedding(voc_size, embedding_vector_features))

# LSTM with 100 units
model.add(LSTM(100))

# Single sigmoid unit for the binary prediction
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM

# voc_size and sent_length are defined in earlier cells.
embedding_vector_features = 40  # Size of the embedding vector

model = Sequential()

# Declare the input shape explicitly: without it the model stays unbuilt and
# summary() cannot report output shapes or parameter counts.
model.add(Input(shape=(sent_length,)))

# Embedding layer with input_dim=voc_size and without the deprecated input_length
model.add(Embedding(input_dim=voc_size, output_dim=embedding_vector_features))

# LSTM layer
model.add(LSTM(100))

# Dense layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display the model summary. summary() prints the table itself and returns
# None, so wrapping it in print() would only add a stray "None" line.
model.summary()


In [42]:
model.build(input_shape=(None, sent_length))


In [None]:
import numpy as np

# Smoke test: push one dummy batch (batch size 1, sent_length word indices)
# through the model before its summary is shown.
dummy_input = np.random.randint(0, voc_size, (1, sent_length))
model.predict(dummy_input)

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()

# Embedding Layer
# Output Shape: (None, 20, 40)
# None: The batch size is flexible.
# 20: This is the input (sequence) length, i.e. the number of word indices in each input sequence (sent_length).
# 40: This is the embedding dimension (embedding_vector_features), i.e. the size of each word's embedding vector.
# Param # (200,000):
# This is the total number of parameters in the Embedding layer.
# Calculated as voc_size * embedding_vector_features = 5000 * 40 = 200,000.

# LSTM Layer
# Output Shape: (None, 100)
# None: Again, the batch size is flexible.
# 100: This is the number of LSTM units (neurons) in the layer.
# Param # (56,400):
# This is the total number of parameters in the LSTM layer.
# Each of the four gate blocks has input weights, recurrent weights, and a bias, so the count is:
# 4 * [(embedding_vector_features + LSTM_units) * LSTM_units + LSTM_units]
# Specifically: 4 * [(40 + 100) * 100 + 100] = 4 * [140 * 100 + 100] = 4 * [14,000 + 100] = 4 * 14,100 = 56,400.
# These parameters cover the input, forget, cell, and output gates of the LSTM.

# Dense Layer
# Output Shape: (None, 1)
# None: Again, the batch size is flexible.
# 1: This is the output size, which is 1 because the model is set up for binary classification (predicting one of two classes).
# Param # (101):
# This is the total number of parameters in the Dense layer.
# Calculated as LSTM_units + 1 = 100 + 1 = 101.

# Total Parameters:
# Total params: 256,501

# This is the sum of all the parameters across all layers: 200,000 (Embedding) + 56,400 (LSTM) + 101 (Dense) = 256,501.
# Trainable params: 256,501

# All the parameters in the model are trainable, meaning they will be updated during training to minimize the loss.
# Non-trainable params: 0

# There are no non-trainable parameters in this model. Non-trainable parameters might exist in models with layers like Batch Normalization where some parameters are not updated during training.


In [None]:
len(embedded_docs),y.shape

In [45]:
import numpy as np

# Materialise the padded sequences and the labels as NumPy arrays.
X_final, y_final = np.array(embedded_docs), np.array(y)

In [None]:
X_final.shape,y_final.shape

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.33,
    random_state=42,
)

In [48]:
# Model Training

In [None]:
# Train for 10 epochs in mini-batches of 64, using the held-out split as
# validation data.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
)

In [50]:
# Performance Metrics & Accuracy

In [None]:
y_pred=model.predict(X_test)

In [52]:
# Binarise the model outputs at a 0.5 threshold: above 0.5 -> 1, otherwise 0.
y_pred = (y_pred > 0.5).astype(int)

In [53]:
# The predictions were thresholded at 0.5: values above 0.5 became 1, all others became 0.
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)
#91%

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

In [57]:
# Bidirectional LSTM RNN

In [58]:
from tensorflow.keras.layers import Bidirectional

In [None]:
from tensorflow.keras.layers import Input

# Each word index is mapped to a dense vector of this size.
embedding_vector_features = 40
model = Sequential()

# Declare the input shape up front so the model is built right away and
# summary() can report output shapes and parameter counts.
model.add(Input(shape=(sent_length,)))

# Embedding layer
model.add(Embedding(voc_size, embedding_vector_features))

# Bidirectional LSTM: an LSTM with 200 units wrapped so the sequence is read
# in both directions.
model.add(Bidirectional(LSTM(200)))

# Single sigmoid unit for the binary prediction
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()

In [60]:
model.build(input_shape=(None, sent_length))

In [None]:
import numpy as np

# Smoke test: push one dummy batch (batch size 1, sent_length word indices)
# through the bidirectional model before its summary is shown.
dummy_input = np.random.randint(0, voc_size, (1, sent_length))
model.predict(dummy_input)

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()

In [62]:
import numpy as np

# Rebuild the feature and label arrays used to train the bidirectional model.
X_final, y_final = np.array(embedded_docs), np.array(y)

In [63]:
from sklearn.model_selection import train_test_split

# Same 67/33 split and seed as for the first model, so both models are scored
# on identical held-out samples.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.33,
    random_state=42,
)

In [64]:
# Model Training

In [None]:
# Train the bidirectional model for 10 epochs in mini-batches of 64, using the
# held-out split as validation data.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
)

In [None]:
# Performance Metrics & Accuracy
y_pred=model.predict(X_test)

In [None]:
# Binarise the model outputs at a 0.5 threshold: above 0.5 -> 1, otherwise 0.
y_pred = (y_pred > 0.5).astype(int)
y_pred

In [68]:
# The predictions were thresholded at 0.5: values above 0.5 became 1, all others became 0.
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))