In [44]:
# Import libraries
import re

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [46]:
# Read both source CSVs into DataFrames (paths are relative to the working directory)
fake, true = (pd.read_csv(csv_path) for csv_path in ("Data/Fake.csv", "Data/True.csv"))

In [47]:
# Tag every row with its source file: 0 = loaded from Fake.csv, 1 = loaded from True.csv
fake["label"], true["label"] = 0, 1

In [48]:
# Stack both sources into a single frame, then shuffle the rows so the two
# labels are interleaved instead of sitting in two contiguous runs.
# The fixed random_state makes the shuffle identical on every
# "Restart Kernel -> Run All"; without it the row order (and therefore which
# rows end up in the later train/test partition) changes on each run.
data = pd.concat([fake, true], ignore_index=True)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [49]:
# Text cleaning function
def clean_text(text):
    """Normalise one piece of article text before vectorisation/tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. delete every ``http`` / ``www`` occurrence together with the run of
         non-whitespace characters that follows it (URL-like tokens);
      3. delete every character that is not ``a``-``z`` or whitespace
         (digits, punctuation, non-ASCII letters).

    Whitespace is left as-is, so removed tokens can leave double spaces.

    Args:
        text: Raw text. Non-string values -- e.g. ``None`` or the float NaN
            that pandas uses for empty CSV cells -- are treated as empty
            rather than raising ``AttributeError`` on ``.lower()``.

    Returns:
        str: The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # `http\S+` already matches anything `https\S+` would, so a separate
    # alternative for https is redundant.
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    return text

# Run the cleaner over every row; the "text" column is overwritten with the cleaned strings
data["text"] = data["text"].map(clean_text)

In [50]:
# Model input is the "text" column; the prediction target is the 0/1 "label" column
X, y = data["text"], data["label"]

In [51]:
# Hold out 20% of the rows for evaluation. random_state pins which rows are
# held out for a given input row order.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [None]:
# Baseline features: TF-IDF restricted to a 5,000-term vocabulary, with
# scikit-learn's built-in English stop-word list filtered out.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)

# Vocabulary and IDF weights are learned from the training split only and then
# reused unchanged on the test split, so no test-set statistics reach the features.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit the linear baseline on the TF-IDF matrix. max_iter is raised to 1000
# (scikit-learn's default is 100), presumably to give the solver room to converge.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Hard 0/1 predictions for the held-out rows
y_pred_lr = lr_model.predict(X_test_tfidf)

# Report accuracy, per-class precision/recall/F1, and the confusion matrix
print("LOGISTIC REGRESSION RESULTS")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_lr))
print(classification_report(y_true=y_test, y_pred=y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_lr))


LOGISTIC REGRESSION RESULTS
Accuracy: 0.9887527839643653
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4657
           1       0.99      0.99      0.99      4323

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Confusion Matrix:
 [[4593   64]
 [  37 4286]]


In [53]:
# Build the word -> integer-index vocabulary from the training texts only.
# num_words=5000 caps which word indices are emitted when texts are later
# converted to sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts=X_train)

In [54]:
# Encode each split as lists of integer word indices using the fitted tokenizer
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts) for split_texts in (X_train, X_test)
)

In [55]:
# Force every sequence to exactly max_len tokens so they stack into one 2-D array.
max_len = 300

# "pre" is the Keras default for both options and is spelled out here for the
# reader: shorter sequences are zero-padded at the front, longer ones lose
# their leading tokens (the last max_len tokens are kept).
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="pre", truncating="pre")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding="pre", truncating="pre")

In [None]:

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling start from the same state on every run.
set_random_seed(42)

# Binary classifier over padded token-id sequences:
# token ids -> 128-d embeddings -> 64-unit LSTM (final state only) -> dropout -> sigmoid.
model = Sequential([
    # Declaring the input shape explicitly replaces Embedding's `input_length`
    # argument, which Keras 3 deprecates (it warns and ignores the value).
    # It also builds the model up front, so summary() below can report
    # output shapes and parameter counts.
    Input(shape=(max_len,), dtype="int32"),
    # input_dim is tied to the tokenizer's vocabulary cap instead of repeating
    # the literal, so the two cannot drift apart. Emitted indices are always
    # below num_words, and index 0 is the padding value.
    Embedding(input_dim=tokenizer.num_words, output_dim=128),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(1, activation="sigmoid"),
])

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

model.summary()


In [58]:
# Train model
# The returned History object is kept so the per-epoch loss/accuracy curves can
# be inspected or plotted afterwards; assigning it also stops the cell from
# echoing a bare `<...History at 0x...>` repr as its output.
# validation_split=0.2 makes Keras set aside the last 20% of the training rows
# (taken before any shuffling) for the val_* metrics.
history = model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)


Epoch 1/5
449/449 ━━━━━━━━━━━━━━━━━━━━ 95s 206ms/step - accuracy: 0.9443 - loss: 0.1500 - val_accuracy: 0.9788 - val_loss: 0.0712
Epoch 2/5
449/449 ━━━━━━━━━━━━━━━━━━━━ 99s 221ms/step - accuracy: 0.9683 - loss: 0.0914 - val_accuracy: 0.9294 - val_loss: 0.2000
Epoch 3/5
449/449 ━━━━━━━━━━━━━━━━━━━━ 100s 222ms/step - accuracy: 0.9745 - loss: 0.0763 - val_accuracy: 0.9254 - val_loss: 0.1793
Epoch 4/5
449/449 ━━━━━━━━━━━━━━━━━━━━ 101s 224ms/step - accuracy: 0.9837 - loss: 0.0497 - val_accuracy: 0.9852 - val_loss: 0.0524
Epoch 5/5
449/449 ━━━━━━━━━━━━━━━━━━━━ 92s 205ms/step - accuracy: 0.9928 - loss: 0.0254 - val_accuracy: 0.9830 - val_loss: 0.0577


<keras.src.callbacks.history.History at 0x1fcc17d31d0>

In [None]:
# The sigmoid output is a probability per row; threshold it at 0.5 to get hard 0/1 labels
y_pred_lstm = np.greater(model.predict(X_test_pad), 0.5).astype("int32")

# Same three metrics as the baseline report: accuracy, per-class report, confusion matrix
print("LSTM RESULTS")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_lstm))
print(classification_report(y_true=y_test, y_pred=y_pred_lstm))
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_lstm))



281/281 ━━━━━━━━━━━━━━━━━━━━ 12s 41ms/step
LSTM RESULTS
Accuracy: 0.9835189309576837
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      4657
           1       0.98      0.98      0.98      4323

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980

Confusion Matrix:
 [[4574   83]
 [  65 4258]]
