In [1]:
!pip install transformers
!pip install scikit-learn
!pip install pandas
!pip install streamlit


Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hI

In [2]:
import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import TextClassificationPipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [3]:
from google.colab import drive
drive.mount('/content/drive')

true_df = pd.read_csv("/content/drive/MyDrive/True.csv").head(100)
fake_df = pd.read_csv("/content/drive/MyDrive/Fake.csv").head(100)


# Add labels: 1 for True, 0 for Fake
true_df['label'] = 1
fake_df['label'] = 0

# Combine datasets
df = pd.concat([true_df, fake_df]).reset_index(drop=True)

# Shuffle dataset
df = df.sample(frac=1).reset_index(drop=True)

# Display a few samples
df.head()


Mounted at /content/drive


Unnamed: 0,title,text,subject,date,label
0,U.S. lawmakers question businessman at 2016 Tr...,WASHINGTON (Reuters) - A Georgian-American bus...,politicsNews,"December 27, 2017",1
1,Trump: Market has not fully digested tax cut c...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"December 20, 2017",1
2,Liberal Group Trolls Trump At Roy Moore Rally...,Donald Trump held a rally for Alabama Senate c...,News,"December 9, 2017",0
3,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
4,Don Jr. Tries To Mock Al Franken’s Resignatio...,When Sen. Al Franken (D-MN) announced his plan...,News,"December 7, 2017",0


In [4]:
def clean_text(text):
    # Remove URLs, special characters, numbers, punctuations
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    return text


df['text'] = df['title'].apply(clean_text)  # or 'text' column if exists


In [5]:
train_texts, temp_texts, train_labels, temp_labels = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)


In [6]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize text data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=512)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [7]:
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [8]:
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)
test_dataset = NewsDataset(test_encodings, test_labels)


In [23]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


In [24]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [30]:
# Define metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    report_to="none", # Disable Weights & Biases logging
)

# Trainer setup
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate on test set
metrics = trainer.evaluate(test_dataset)

# Print metrics nicely
print("📊 Model Performance on Test Set:")
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")

# Save the model and tokenizer
model.save_pretrained("/content/drive/MyDrive/fake_news_model")
tokenizer.save_pretrained("/content/drive/MyDrive/fake_news_model")

# Predict function for user input
def predict_news(text):
    text = clean_text(text)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "True News" if prediction == 1 else "Fake News"

# Simple chatbox-style loop
print("\n📰 Welcome to the Fake News Detector Chatbox!")
print("Type 'exit' to quit.\n")

while True:
    user_input = input("📝 Enter your news text: ")
    if user_input.lower() == 'exit':
        print("👋 Chat ended. Stay informed!")
        break
    result = predict_news(user_input)
    print(f"🤖 Prediction: {result}\n")


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2354,0.089562,0.95,0.956522,1.0,0.916667
2,0.0247,0.108459,0.95,0.956522,1.0,0.916667
3,0.0057,0.14663,0.95,0.956522,1.0,0.916667


📊 Model Performance on Test Set:
eval_loss: 0.2966
eval_accuracy: 0.9000
eval_f1: 0.8889
eval_precision: 0.8000
eval_recall: 1.0000
eval_runtime: 0.8223
eval_samples_per_second: 24.3230
eval_steps_per_second: 3.6480
epoch: 3.0000

📰 Welcome to the Fake News Detector Chatbox!
Type 'exit' to quit.

📝 Enter your news text: trump is the new president
🤖 Prediction: True News

📝 Enter your news text: trump is killed 
🤖 Prediction: Fake News

📝 Enter your news text: cricket is not a sport but a song
🤖 Prediction: Fake News



KeyboardInterrupt: Interrupted by user

In [2]:
pip install streamlit transformers torch




In [6]:
import streamlit as st
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import re
import nltk
from nltk.corpus import stopwords
import numpy as np

# Download stopwords if not already
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/fake_news_model"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Text cleaning function
def clean_text(text):
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Prediction function with confidence score
def predict_news(text):
    text = clean_text(text)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        confidence, prediction = torch.max(probs, dim=1)
        confidence_percent = confidence.item() * 100
        label = "True News ✅" if prediction.item() == 1 else "Fake News ❌"
        return label, confidence_percent

# Streamlit Interface
st.title("📰 Fake News Detector Chatbot (DistilBERT)")

st.write("Type or paste a news article/headline below to check if it’s true or fake 👇")

user_input = st.text_area("📝 Enter News Text Here")

if st.button("Check News"):
    if user_input.strip() == "":
        st.warning("Please enter some text.")
    else:
        result, confidence = predict_news(user_input)
        st.success(f"🤖 Prediction: **{result}**\n\n📊 Confidence: **{confidence:.2f}%**")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2025-07-08 09:16:10.185 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-07-08 09:16:10.209 Session state does not function when running a script without `streamlit run`
