In [1]:
pip install tensorflow



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import InputLayer, LSTM, Concatenate, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Read the preprocessed train/test splits from disk into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train_news.csv", "/content/test_news.csv")
)

In [4]:
# Build the vocabulary from the training text only (capped via num_words=8000)
tokenizer = Tokenizer(num_words=8000)
tokenizer.fit_on_texts(train_df["content"])

# Persist the fitted tokenizer so the same word index can be reused later
with open('tokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Encode both splits as lists of integer word indices
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split["content"])
    for split in (train_df, test_df)
)

In [5]:
# Use the 95th percentile of the training sequence lengths as the common length
train_lengths = list(map(len, X_train_seq))
optimal_len = int(np.percentile(train_lengths, 95))

# Pad short sequences and cut long ones, both at the end, to that length
pad_kwargs = dict(maxlen=optimal_len, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_kwargs)
X_test_pad = pad_sequences(X_test_seq, **pad_kwargs)

# Target labels for each split
y_train, y_test = train_df["label"].values, test_df["label"].values

In [6]:
# Report the padded array shapes, then show one padded training row
print(
    f"Training shape: {X_train_pad.shape}, Test shape: {X_test_pad.shape}",
    f"Sample padded sequence:\n{X_train_pad[1]}",
    sep="\n",
)

Training shape: (62172, 414), Test shape: (15544, 414)
Sample padded sequence:
[   1  371   99   33 4643  136 7135  139    4 5564 1711 3002 4909   43
   17  371  663  395    6   18    1  140  151  635  334 1354   75    4
  100 1859   75 1244  862  137   10  156   16  240  138 1012  526  434
   87 1522   99  371   99 1313  805   12 4643  296 3539    4  100  784
  603  177 7887 3531  100  177 1378 2898    3 1275   16  240    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    

In [7]:
# Count training sequences longer than the padded length; their tails are cut
# off by the 'post' truncation, so this is the share that loses information.
n_truncated = sum(1 for seq in X_train_seq if len(seq) > optimal_len)
print(f"Percentage of total sequences truncated: {round(n_truncated/len(X_train_seq)*100, 2)}%")

Percentage of total sequences truncated: 5.0%


In [8]:
# LSTM Model Architecture

# Seed Python, NumPy and TensorFlow before any weights are created so that
# initialisation, dropout masks and batch shuffling are repeatable across a
# "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Padding uses index 0 and the tokenizer only emits indices below num_words,
# so the embedding table needs exactly `num_words` rows. Reading the value from
# the tokenizer keeps the two settings from drifting apart.
vocab_size = tokenizer.num_words

model = Sequential([
    # Input: one padded sequence of `optimal_len` word indices.
    # tf.keras.Input is used instead of InputLayer(input_shape=...) because the
    # `input_shape` argument is deprecated in Keras 3.
    tf.keras.Input(shape=(optimal_len,)),

    # Embedding layer: converts word indices into dense vectors; mask_zero=True
    # makes the downstream LSTMs skip the zero-padded positions
    Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True),

    # 1st hidden layer: returns the full sequence so the next LSTM can consume it
    LSTM(64, dropout=0.2, return_sequences=True),

    # 2nd hidden layer
    LSTM(32, dropout=0.2, return_sequences=True),

    # 3rd hidden layer: returns only the final state (a single vector per sample)
    LSTM(16, dropout=0.2),

    # 4th hidden layer: small non-linear projection before the output
    Dense(8, activation='relu'),

    # Output layer: single sigmoid unit for binary classification
    Dense(1, activation='sigmoid'),
])

model.summary()



In [9]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the padded training sequences; `history` keeps the per-epoch metrics
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=64,
    epochs=5,
)
print("Model Training Completed")

Epoch 1/5
972/972 ━━━━━━━━━━━━━━━━━━━━ 1150s 1s/step - accuracy: 0.8475 - loss: 0.3652
Epoch 2/5
972/972 ━━━━━━━━━━━━━━━━━━━━ 1174s 1s/step - accuracy: 0.9240 - loss: 0.1930
Epoch 3/5
972/972 ━━━━━━━━━━━━━━━━━━━━ 1160s 1s/step - accuracy: 0.9359 - loss: 0.1618
Epoch 4/5
972/972 ━━━━━━━━━━━━━━━━━━━━ 1165s 1s/step - accuracy: 0.9146 - loss: 0.2182
Epoch 5/5
972/972 ━━━━━━━━━━━━━━━━━━━━ 1226s 1s/step - accuracy: 0.9492 - loss: 0.1298
Model Training Completed


In [10]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the compiled metrics (accuracy here)
test_metrics = model.evaluate(x=X_test_pad, y=y_test)
loss, accuracy = test_metrics

print(f"Test Loss: {loss:.4f}", f"Test Accuracy: {accuracy:.4f}", sep="\n")

[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 170ms/step - accuracy: 0.9370 - loss: 0.1606
Test Loss: 0.1558
Test Accuracy: 0.9387


In [11]:
# Keep the output path in one place so the file written and the message
# printed cannot drift apart.
MODEL_PATH = 'Fake_News_Detector_Model.keras'

# Persist the trained model in the native Keras format
model.save(MODEL_PATH)
print(f"Model saved as {MODEL_PATH}")

Model saved as Fake_News_Detector_Model.keras
