In [1]:
pip install tensorflow



In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import InputLayer, LSTM, Concatenate, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Read the preprocessed train/test splits from /content
TRAIN_CSV_PATH = "/content/train_news.csv"
TEST_CSV_PATH = "/content/test_news.csv"

train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

In [6]:
# Fit the tokenizer on the training text only (vocabulary capped via num_words=8000)
tokenizer = Tokenizer(num_words=8000)
train_texts = train_df["content"]
test_texts = test_df["content"]
tokenizer.fit_on_texts(train_texts)

# Persist the fitted tokenizer to disk
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Map every document to its list of integer word indices
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

In [7]:
# Sequence length at the 95th percentile of the training sequence lengths
train_seq_lengths = [len(seq) for seq in X_train_seq]
optimal_len = int(np.percentile(train_seq_lengths, 95))

# Pad and truncate at the end so every row has exactly optimal_len entries
pad_options = dict(maxlen=optimal_len, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Target labels taken straight from the "label" column of each split
y_train, y_test = train_df["label"].values, test_df["label"].values

In [8]:
# Sanity check: shapes of the padded matrices and one padded example (row 1)
train_shape, test_shape = X_train_pad.shape, X_test_pad.shape
print(f"Training shape: {train_shape}, Test shape: {test_shape}")

sample_sequence = X_train_pad[1]
print(f"Sample padded sequence:\n{sample_sequence}")

Training shape: (54475, 433), Test shape: (13619, 433)
Sample padded sequence:
[4262   94 1811  140   86 4306  213  564   55 2557    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    

In [9]:
# Share of training sequences longer than optimal_len, i.e. cut short by truncation
n_truncated = sum(1 for seq in X_train_seq if len(seq) > optimal_len)
truncated_pct = round(n_truncated / len(X_train_seq) * 100, 2)
print(f"Percentage of total sequences truncated: {truncated_pct}%")

Percentage of total sequences truncated: 4.96%


In [15]:
# LSTM Model Architecture
# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so
# that initialisation (and the training that follows) is repeatable on re-run.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Input: one padded sequence of optimal_len token ids per sample.
    # tf.keras.Input is used instead of InputLayer(input_shape=...), whose
    # `input_shape` argument is deprecated in current Keras.
    tf.keras.Input(shape=(optimal_len,)),

    # Embedding layer: converts token ids into dense 128-dimensional vectors.
    # input_dim is read from the fitted tokenizer so it cannot drift from the
    # vocabulary cap; mask_zero=True marks the 0-valued padding as masked.
    Embedding(input_dim=tokenizer.num_words, output_dim=128, mask_zero=True),

    # 1st hidden layer: returns the full sequence so the next LSTM can consume it
    LSTM(64, dropout=0.2, return_sequences=True),

    # 2nd hidden layer: returns only the final state as a fixed-size vector
    LSTM(32, dropout=0.2),

    # 3rd hidden layer: adds non-linearity on top of the LSTM summary
    Dense(16, activation='relu'),

    # Output layer: single sigmoid unit for binary classification
    Dense(1, activation='sigmoid')
])

model.summary()



In [18]:
# Compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split doubles as the validation set below, and both
# callbacks select on it, so the val_* numbers are not an unbiased hold-out
# estimate. Consider carving a validation split out of the training data.
training_callbacks = [
    # Stop after 2 epochs without val_loss improvement. restore_best_weights=True
    # asks Keras to put the best-val_loss weights back on the in-memory model when
    # early stopping triggers (whether a full-length run is also rolled back
    # depends on the Keras version), rather than keeping the last epoch's weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2, restore_best_weights=True
    ),
    # Write a checkpoint only when val_loss improves, so the file holds the best epoch.
    tf.keras.callbacks.ModelCheckpoint(
        'Fake_News_Detector_Model.h5', monitor='val_loss', save_best_only=True
    ),
]

# Training the model
history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_test_pad, y_test),
    epochs=5,
    batch_size=32,
    callbacks=training_callbacks
)
print("Model Training Completed")

Epoch 1/5
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 704ms/step - accuracy: 0.8575 - loss: 0.3368



[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1263s[0m 738ms/step - accuracy: 0.8575 - loss: 0.3368 - val_accuracy: 0.9137 - val_loss: 0.2100
Epoch 2/5
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1300s[0m 748ms/step - accuracy: 0.9185 - loss: 0.2013 - val_accuracy: 0.9092 - val_loss: 0.2335
Epoch 3/5
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 694ms/step - accuracy: 0.9292 - loss: 0.1779



[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1270s[0m 742ms/step - accuracy: 0.9292 - loss: 0.1779 - val_accuracy: 0.9267 - val_loss: 0.1799
Epoch 4/5
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 697ms/step - accuracy: 0.9518 - loss: 0.1170



[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1288s[0m 746ms/step - accuracy: 0.9518 - loss: 0.1170 - val_accuracy: 0.9306 - val_loss: 0.1628
Epoch 5/5
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1278s[0m 744ms/step - accuracy: 0.9603 - loss: 0.0948 - val_accuracy: 0.9309 - val_loss: 0.1751
Model Training Completed
