In [1]:
# Cell 1: Import Libraries and Load Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_scheduler
from datasets import load_dataset
import pandas as pd
import wandb
from torch.cuda.amp import autocast, GradScaler
import gc
from tqdm import tqdm

# Initialize WandB in offline mode
wandb.init(project="DeBERTa-Fake-News", config={"epochs": 3, "batch_size": 32, "lr": 1e-5}, mode="offline")

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Free GPU cache
torch.cuda.empty_cache()
gc.collect()

# Load DeBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [2]:
# Cell 2: Load Datasets
# Load dataset from Hugging Face for pre-training
ds = load_dataset("Salesforce/wikitext", "wikitext-103-v1", split="train")
pretrain_texts = ds["text"][:int(0.05 * len(ds["text"]))]  # Use 10% of Wikitext-103
pretrain_df = pd.DataFrame({"text": pretrain_texts, "label": [0] * len(pretrain_texts)})

# Load dataset from CSV (WELFake) for fine-tuning
welfake_df = pd.read_csv("/kaggle/input/welfake-dataset-for-fake-news/WELFake_Dataset.csv", usecols=["text", "label"], dtype={"label": str})

# Ensure labels are numeric & clean
welfake_df["label"] = welfake_df["label"].str.replace(r"[^\d]", "", regex=True).str.strip()
welfake_df = welfake_df[welfake_df["label"] != ""]  # Remove empty values
welfake_df["label"] = welfake_df["label"].astype(int)  # Convert to int for PyTorch
print("Data loading complete!")

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/722k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/655k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Data loading complete!


In [3]:
# Cell 3: Define Dataset Classes
class MLMDataset(Dataset):
    def __init__(self, texts, max_len=128):
        self.encodings = tokenizer(texts, padding="max_length", truncation=True, max_length=max_len, return_tensors="pt")
        self.labels = self.encodings.input_ids.clone()
        rand = torch.rand(self.labels.shape)
        mask_arr = (rand < 0.15) * (self.labels != tokenizer.pad_token_id) * (self.labels != tokenizer.cls_token_id) * (self.labels != tokenizer.sep_token_id)
        self.encodings.input_ids[mask_arr] = tokenizer.mask_token_id
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}, self.labels[idx]

class FakeNewsDataset(Dataset):
    def __init__(self, df, max_len=256):
        self.labels = torch.tensor(df["label"].values, dtype=torch.long)
        self.encodings = tokenizer(df["text"].astype(str).tolist(), padding="max_length", truncation=True, 
                                   max_length=max_len, return_tensors="pt")
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items() if key != "token_type_ids"}, self.labels[idx]

In [4]:
# Cell 4: Define DeBERTa Model
class DeBERTaClassifier(nn.Module):
    def __init__(self, model_name="microsoft/deberta-v3-base", num_labels=2):
        super(DeBERTaClassifier, self).__init__()
        self.deberta = AutoModel.from_pretrained(model_name)
        self.deberta.gradient_checkpointing_enable()
        self.classifier = nn.Linear(self.deberta.config.hidden_size, num_labels)
        self.mlm_head = nn.Linear(self.deberta.config.hidden_size, self.deberta.config.vocab_size)  # MLM head for pretraining
    
    def forward(self, input_ids, attention_mask, mlm=False):
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        if mlm:
            return self.mlm_head(outputs.last_hidden_state)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        return logits

In [5]:
# Cell 5: Initialize Model, Optimizer, and Scheduler
model = DeBERTaClassifier().to(device)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

# Learning rate scheduler
num_training_steps = 3 * len(welfake_df) // 32  # Adjusted for batch size 32
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

scaler = GradScaler()

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

  scaler = GradScaler()


In [6]:
# Cell 6: Pre-training Loop
def pretrain_model(model, train_loader, epochs=1):
    model.train()
    mlm_criterion = nn.CrossEntropyLoss(ignore_index=-100)
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Pretraining Epoch {epoch+1}")
        for batch in progress_bar:
            inputs, labels = batch
            inputs = {key: val.to(device) for key, val in inputs.items()}
            labels = labels.to(device)
            optimizer.zero_grad()
            with torch.amp.autocast("cuda", dtype=torch.bfloat16):
                outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], mlm=True)
                loss = mlm_criterion(outputs.view(-1, model.deberta.config.vocab_size), labels.view(-1))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss / len(train_loader))

In [None]:
# Cell 7: Run Pre-training
pretrain_dataset = MLMDataset(pretrain_texts)
pretrain_loader = DataLoader(pretrain_dataset, batch_size=32, shuffle=True)
pretrain_model(model, pretrain_loader)

Pretraining Epoch 1:   9%|▉         | 267/2815 [12:45<2:04:50,  2.94s/it, loss=0.434]

In [None]:
# Cell 8: Run Fine-tuning
train_dataset = FakeNewsDataset(welfake_df)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
train_model(model, train_loader)

In [None]:
# Cell 9: Save and Evaluate Model
torch.save(model.state_dict(), "deberta_fakenews.pth")
print("Model saved successfully!")