Reae vs. Real news headlines.

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 1. Create a Fake News dataset
data = {
    "text": [
        "Breaking: Scientists discover a cure for COVID-19!",  # Fake
        "NASA confirms water on the moon, groundbreaking discovery!",  # Real
        "Experts warn of stock market collapse due to secret government policies.",  # Fake
        "Government announces new tax relief for small businesses.",  # Real
        "Shocking: Aliens found living in Area 51!",  # Fake
        "New study finds link between exercise and improved mental health.",  # Real
        "Politician caught hiding millions in offshore accounts.",  # Fake
        "Medical researchers develop breakthrough cancer treatment.",  # Real
        "Secret messages found in ancient pyramids predict end of the world!",  # Fake
        "Tech company launches revolutionary AI that changes programming forever.",  # Real,
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1 = Fake, 0 = Real
}

df = pd.DataFrame(data)

# 2. Preprocess the text (lowercase, remove special characters, stopwords)
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

df['clean_text'] = df['text'].apply(preprocess)

2025-03-21 21:37:41.613582: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742593061.632448 1758432 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742593061.637997 1758432 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742593061.651616 1758432 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742593061.651645 1758432 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742593061.651648 1758432 computation_placer.cc:177] computation placer alr

Tokenizer: Tensorflow tokenizer


The Tokenizer from tensorflow.keras.preprocessing.text converts a list of texts into sequences of integers, where each unique word is mapped to a unique index based on frequency. Formally, for a text t = [w₁, w₂, ..., wₙ], the tokenizer creates a sequence s = [i₁, i₂, ..., iₙ] such that iⱼ = word_index(wⱼ), where word_index(w) assigns an integer ID to word w based on its frequency rank.


In [3]:
# 3. Tokenization and Padding
max_words = 5000  # Max vocabulary size
max_len = 20  # Max length of a sequence

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['clean_text'])

X = tokenizer.texts_to_sequences(df['clean_text'])
X = pad_sequences(X, maxlen=max_len)
y = np.array(df['label'])

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. Build a Simple Neural Network with Embedding and LSTM
model = Sequential([
    Embedding(input_dim=max_words, output_dim=32, input_length=max_len),
    LSTM(32, return_sequences=False),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 6. Train the Model
model.fit(X_train, y_train, epochs=1000, batch_size=2, validation_data=(X_test, y_test))

# 7. Evaluate the Model
y_pred = (model.predict(X_test) > 0.5).astype("int32").flatten()
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# 8. Test with new headlines
new_headlines = [
    "Scientists invent teleportation technology, changing travel forever!",
    "World Health Organization announces breakthrough in malaria vaccine.",
    "Shocking discovery: Atlantis city found under the ocean!",
    "New smartphone released with groundbreaking AI features."
]

new_headlines_clean = [preprocess(news) for news in new_headlines]
new_sequences = tokenizer.texts_to_sequences(new_headlines_clean)
new_sequences = pad_sequences(new_sequences, maxlen=max_len)

predictions = (model.predict(new_sequences) > 0.5).astype("int32").flatten()

# Print Predictions
for headline, pred in zip(new_headlines, predictions):
    category = "Fake News" if pred == 1 else "Real News"
    print(f"'{headline}' → {category}")


Epoch 1/1000




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 109ms/step - accuracy: 0.3881 - loss: 0.6939 - val_accuracy: 0.3333 - val_loss: 0.6967
Epoch 2/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.7119 - loss: 0.6887 - val_accuracy: 0.3333 - val_loss: 0.6982
Epoch 3/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.7690 - loss: 0.6857 - val_accuracy: 0.3333 - val_loss: 0.6989
Epoch 4/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.8524 - loss: 0.6789 - val_accuracy: 0.3333 - val_loss: 0.7002
Epoch 5/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.8190 - loss: 0.6761 - val_accuracy: 0.3333 - val_loss: 0.7003
Epoch 6/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 1.0000 - loss: 0.6669 - val_accuracy: 0.3333 - val_loss: 0.7007
Epoch 7/1000
[1m4/4[0m [32m━━━━━━━━━━━━━━━━