In [1]:
!pip install datasets evaluate transformers

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW
from datasets import load_dataset

In [2]:
# Fetch the fake-news corpus by its Hub identifier.
DATASET_ID = "GonzaloA/fake_news"
dataset = load_dataset(DATASET_ID)

Found cached dataset parquet (C:/Users/anujb/.cache/huggingface/datasets/GonzaloA___parquet/GonzaloA--fake_news-1fe2b42e1fa111c8/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Seed every random number generator the notebook touches with the same value
SEED = 42
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

In [4]:
# Show which splits the loaded dataset provides.
split_names = dataset.keys()
print(split_names)

dict_keys(['train', 'validation', 'test'])


In [5]:
# Bind each predefined split to its own name.
train_data, val_data, test_data = (
    dataset[split] for split in ("train", "validation", "test")
)

In [6]:
# Count the rows in the training split and report the total.
num_examples = len(train_data)
print(f"Number of examples: {num_examples}")

Number of examples: 24353


In [7]:
# List the column names of the validation split.
column_names = val_data.column_names
print("Column names/features:", column_names)

Column names/features: ['Unnamed: 0', 'title', 'text', 'label']


In [8]:
# Define a custom dataset class
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one article per lookup.

    Args:
        data: Indexable, sized collection whose items expose ``"text"`` and
            ``"label"`` keys.
        tokenizer: Object providing ``encode_plus``; it is called with the
            keyword arguments shown in ``__getitem__``.
        max_length: Length every sequence is truncated/padded to. Defaults to
            512, the value that used to be hard-coded.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with ``"input_ids"``, ``"attention_mask"`` and
            ``"token_type_ids"`` (the tokenizer outputs with their leading
            batch axis removed) plus the untouched ``"label"`` value.
        """
        # Fetch the row once; indexing the backing store twice repeats the lookup.
        row = self.data[idx]
        encoding = self.tokenizer.encode_plus(
            row["text"],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors="pt"
        )

        # squeeze(0) removes only the batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse a length-1 sequence axis.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "token_type_ids": encoding["token_type_ids"].squeeze(0),
            "label": row["label"]
        }

In [9]:
# Instantiate the BERT tokenizer and model
# One checkpoint name for both, so the vocabulary and the encoder weights cannot drift apart.
PRETRAINED_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
# BertModel is the bare encoder; the 2-way classification layer is added by
# FakeNewsClassifier, so no num_labels argument is passed here.
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Define the custom fake news classification model
class FakeNewsClassifier(nn.Module):
    """Encoder followed by dropout and a linear classification layer.

    Args:
        bert_model: Encoder module. Its forward call must accept ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` keywords and return an
            object with a ``pooler_output`` attribute.
        num_labels: Number of output classes (default 2).
        dropout: Dropout probability applied to the pooled output (default 0.3).
    """

    def __init__(self, bert_model, num_labels=2, dropout=0.3):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout)
        # Size the linear layer from the encoder's own config so other encoder
        # widths work; fall back to 768 when no config is exposed.
        hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 768)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class scores, one row per input sequence."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = self.dropout(outputs.pooler_output)
        logits = self.classifier(pooled_output)

        return logits

In [11]:
# Wrap the pretrained encoder in the classification model
classifier_model = FakeNewsClassifier(bert_model=model)

In [12]:
# Pick the compute device: CUDA when present, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the classifier's parameters onto that device
classifier_model = classifier_model.to(device)


In [13]:
# Create data loaders
# Wrap every split in the tokenizing dataset first
train_dataset, val_dataset, test_dataset = (
    FakeNewsDataset(split, tokenizer) for split in (train_data, val_data, test_data)
)

# Only the training loader shuffles; the evaluation loaders keep dataset order
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_dataloader, test_dataloader = (
    DataLoader(eval_dataset, batch_size=32) for eval_dataset in (val_dataset, test_dataset)
)

In [14]:
# Set up optimizer and learning rate scheduler
NUM_EPOCHS = 10  # must equal the number of epochs the training loop runs
MAX_LR = 5e-5    # peak learning rate of the one-cycle schedule

# NOTE: OneCycleLR takes control of the learning rate as soon as it is built
# (starting from max_lr / div_factor), so the lr given here is only a placeholder.
optimizer = AdamW(classifier_model.parameters(), lr=2e-5)

# The scheduler is stepped once per batch, so its budget is batches-per-epoch
# times epochs; stepping past total_steps raises an error.
total_steps = len(train_dataloader) * NUM_EPOCHS
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=MAX_LR, total_steps=total_steps)

# Define loss function
criterion = nn.CrossEntropyLoss()



In [16]:
# Training loop
classifier_model.train()
batches_per_epoch = len(train_dataloader)
for epoch in range(10):
    batch_losses = []
    for batch in train_dataloader:
        # Move the four batch fields to the compute device
        input_ids, attention_mask, token_type_ids, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "token_type_ids", "label")
        )

        # Forward pass on fresh gradients
        optimizer.zero_grad()
        logits = classifier_model(input_ids, attention_mask, token_type_ids)
        loss = criterion(logits, labels)
        batch_losses.append(loss.item())

        # Backward pass with the gradient norm clipped to 1.0, then one
        # optimizer step and one scheduler step per batch
        loss.backward()
        torch.nn.utils.clip_grad_norm_(classifier_model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    total_loss = sum(batch_losses)
    avg_loss = total_loss / batches_per_epoch
    print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")


Epoch 1 - Average Loss: 0.1136
Epoch 2 - Average Loss: 0.0382
Epoch 3 - Average Loss: 0.0336
Epoch 4 - Average Loss: 0.0295
Epoch 5 - Average Loss: 0.0219
Epoch 6 - Average Loss: 0.0097
Epoch 7 - Average Loss: 0.0087
Epoch 8 - Average Loss: 0.0027
Epoch 9 - Average Loss: 0.0008
Epoch 10 - Average Loss: 0.0003


In [17]:
# Evaluation loop
def compute_accuracy(model, dataloader, device):
    """Return the fraction of examples in ``dataloader`` that ``model`` labels correctly.

    Args:
        model: Callable as ``model(input_ids, attention_mask, token_type_ids)``
            returning one row of class scores per example.
        dataloader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"``, ``"token_type_ids"`` and
            ``"label"`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Accuracy as a float in [0, 1].

    Raises:
        ValueError: If the dataloader yields no examples.
    """
    model.eval()  # switch the model to evaluation mode before scoring
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids, attention_mask, token_type_ids)
            predicted_labels = logits.argmax(dim=1)

            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    if total_predictions == 0:
        raise ValueError("dataloader yielded no examples; accuracy is undefined")
    return correct_predictions / total_predictions


val_accuracy = compute_accuracy(classifier_model, val_dataloader, device)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# from sklearn.metrics import f1_score

# classifier_model.eval()
# true_labels = []
# predicted_labels = []

# with torch.no_grad():
#     for batch in val_dataloader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         token_type_ids = batch["token_type_ids"].to(device)
#         labels = batch["label"].to(device)

#         logits = classifier_model(input_ids, attention_mask, token_type_ids)
#         _, batch_predicted_labels = torch.max(logits, dim=1)

#         true_labels.extend(labels.tolist())
#         predicted_labels.extend(batch_predicted_labels.tolist())

# val_f1_score = f1_score(true_labels, predicted_labels)
# print(f"Validation F1 Score: {val_f1_score:.4f}")

Validation Accuracy: 0.9893


In [None]:
# Testing loop
classifier_model.eval()
correct_predictions, total_predictions = 0, 0
with torch.no_grad():
    for batch in test_dataloader:
        # Move the four batch fields to the compute device
        input_ids, attention_mask, token_type_ids, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "token_type_ids", "label")
        )

        # Predicted class = index of the largest score in each row
        logits = classifier_model(input_ids, attention_mask, token_type_ids)
        predicted_labels = logits.max(dim=1).indices

        hits = (predicted_labels == labels).sum().item()
        correct_predictions += hits
        total_predictions += labels.size(0)

test_accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {test_accuracy:.4f}")




# from sklearn.metrics import f1_score

# classifier_model.eval()
# true_labels = []
# predicted_labels = []

# with torch.no_grad():
#     for batch in test_dataloader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         token_type_ids = batch["token_type_ids"].to(device)
#         labels = batch["label"].to(device)

#         logits = classifier_model(input_ids, attention_mask, token_type_ids)
#         _, batch_predicted_labels = torch.max(logits, dim=1)

#         true_labels.extend(labels.tolist())
#         predicted_labels.extend(batch_predicted_labels.tolist())

# test_f1_score = f1_score(true_labels, predicted_labels)
# print(f"Test F1 Score: {test_f1_score:.4f}")

In [None]:
# Save model and metrics

save_path = "model_checkpoint.pth"

# Bundle the training state and both accuracy figures into one file
metrics = {
    "validation_accuracy": val_accuracy,
    "test_accuracy": test_accuracy
}
checkpoint_payload = {
    "model_state_dict": classifier_model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "scheduler_state_dict": scheduler.state_dict(),
    "metrics": metrics
}
torch.save(checkpoint_payload, save_path)

print("Model and metrics saved successfully!")

In [15]:
load_path = "model_checkpoint.pth"
# map_location remaps stored tensors onto the current device, so a checkpoint
# written on a GPU machine also loads on a CPU-only one.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
checkpoint = torch.load(load_path, map_location=device)

# Load model state dict
classifier_model.load_state_dict(checkpoint["model_state_dict"])
classifier_model.eval()

# Load optimizer and scheduler state dicts
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
scheduler.load_state_dict(checkpoint["scheduler_state_dict"])

# Load metrics
# Restored under both names: validation_accuracy (used here originally) and
# val_accuracy (the name the evaluation and save cells use).
val_accuracy = validation_accuracy = checkpoint["metrics"]["validation_accuracy"]
test_accuracy = checkpoint["metrics"]["test_accuracy"]

print("Model and metrics loaded successfully!")

Model and metrics loaded successfully!


In [16]:
# User input
user_text = input("Enter the text to classify: ")

# Tokenize and preprocess the user input
# Same settings as FakeNewsDataset; return_tensors="pt" already adds the batch axis.
encoded_input = tokenizer.encode_plus(
    user_text,
    max_length=512,
    truncation=True,
    padding="max_length",
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors="pt"
)
input_ids = encoded_input["input_ids"].to(device)
attention_mask = encoded_input["attention_mask"].to(device)
token_type_ids = encoded_input["token_type_ids"].to(device)

# Make predictions
# Force evaluation mode here rather than relying on an earlier cell: with
# dropout active the prediction for the same text could vary between runs.
classifier_model.eval()
with torch.no_grad():
    logits = classifier_model(input_ids, attention_mask, token_type_ids)
    probabilities = nn.functional.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()

# Map predicted class to label
label_map = {1: "Real News", 0: "Fake News"}
predicted_label = label_map[predicted_class]

# Print the predicted label
print("Predicted Label:", predicted_label)

Enter the text to classify: North Korea blew up the moon.
Predicted Label: Fake News
