In [1]:
# Cell 1: Import Libraries and Load Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_scheduler
from datasets import load_dataset
import pandas as pd
import wandb
from torch.cuda.amp import autocast, GradScaler
import gc
from tqdm import tqdm

# Initialize WandB
wandb.login(key='1ee0a73c59b51e59d4199bdadb4c0eff0c9e5de2')
wandb.init(project="rep-chasefire")

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Free GPU cache
torch.cuda.empty_cache()
gc.collect()

# Load DeBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marnavsingh02[0m ([33marnavsingh02_[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [2]:
# Cell 2: Load Datasets
# Load dataset from Hugging Face for pre-training
ds = load_dataset("Salesforce/wikitext", "wikitext-103-v1", split="train")
pretrain_texts = ds["text"][:int(0.2 * len(ds["text"]))]  # Use 20% of Wikitext-103
pretrain_df = pd.DataFrame({"text": pretrain_texts, "label": [0] * len(pretrain_texts)})

# Load dataset from CSV (WELFake) for fine-tuning
welfake_df = pd.read_csv("/kaggle/input/welfake-dataset-for-fake-news/WELFake_Dataset.csv", usecols=["text", "label"], dtype={"label": str})

# Ensure labels are numeric & clean
welfake_df["label"] = welfake_df["label"].str.replace(r"[^\d]", "", regex=True).str.strip()
welfake_df = welfake_df[welfake_df["label"] != ""]  # Remove empty values
welfake_df["label"] = welfake_df["label"].astype(int)  # Convert to int for PyTorch
print("Data loading complete!")

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/722k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/655k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Data loading complete!


In [4]:
# Cell 3: Define Dataset Classes
class MLMDataset(Dataset):
    def __init__(self, texts, max_len=128):
        self.encodings = tokenizer(texts, padding="max_length", truncation=True, max_length=max_len, return_tensors="pt")
        self.labels = self.encodings.input_ids.clone()
        rand = torch.rand(self.labels.shape)
        mask_arr = (rand < 0.15) * (self.labels != tokenizer.pad_token_id) * (self.labels != tokenizer.cls_token_id) * (self.labels != tokenizer.sep_token_id)
        self.encodings.input_ids[mask_arr] = tokenizer.mask_token_id
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}, self.labels[idx]

class FakeNewsDataset(Dataset):
    def __init__(self, df, max_len=256):
        self.labels = torch.tensor(df["label"].values, dtype=torch.long)
        self.encodings = tokenizer(df["text"].astype(str).tolist(), padding="max_length", truncation=True, 
                                   max_length=max_len, return_tensors="pt")
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items() if key != "token_type_ids"}, self.labels[idx]

In [5]:
from transformers import AutoTokenizer, AutoModel, AutoConfig, get_scheduler
from torch.optim import AdamW  # Use PyTorch's built-in AdamW

In [6]:
# Cell 4: Define DeBERTa Model with Updated Configuration
class DeBERTaClassifier(nn.Module):
    def __init__(self, model_name="microsoft/deberta-v3-base", num_labels=2):
        super(DeBERTaClassifier, self).__init__()
        
        # Load config and modify
        config = AutoConfig.from_pretrained(model_name)
        config.num_hidden_layers = 6  # Reduce encoder layers
        config.num_attention_heads = 6  # Reduce attention heads

        self.deberta = AutoModel.from_pretrained(model_name, config=config)
        self.deberta.gradient_checkpointing_enable()
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size)  # MLM head for pretraining
    
    def forward(self, input_ids, attention_mask, mlm=False):
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        if mlm:
            return self.mlm_head(outputs.last_hidden_state)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        return logits

In [7]:
if torch.cuda.is_available():
    scaler = torch.amp.GradScaler()
else:
    scaler = None  # Don't use GradScaler if running on CPU

In [8]:
# Cell 5: Initialize Model, Optimizer, and Scheduler
model = DeBERTaClassifier().to(device)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

# Learning rate scheduler
num_training_steps = 3 * len(welfake_df) // 32  # Adjusted for batch size 32
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

scaler = torch.amp.GradScaler()  # Corrected implementation

# Log model details to WandB
wandb.watch(model, log="all")

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [9]:
#Cell 6: Pre-Training Loop
def pretrain_model(model, train_loader, epochs=1):
    model.train()
    mlm_criterion = nn.CrossEntropyLoss(ignore_index=-100)
    
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Pretraining Epoch {epoch+1}")
        
        for batch in progress_bar:
            inputs, labels = batch
            inputs = {key: val.to(device) for key, val in inputs.items()}
            labels = labels.to(device)
            optimizer.zero_grad()
            
            with torch.amp.autocast("cuda", dtype=torch.bfloat16):
                outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], mlm=True)
                loss = mlm_criterion(outputs.view(-1, model.deberta.config.vocab_size), labels.view(-1))
            
            if scaler:
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()
            
            total_loss += loss.item()
            progress_bar.set_postfix(loss=total_loss / len(train_loader))
        
        # Log loss per epoch to WandB
        wandb.log({"Pretrain Epoch Loss": total_loss / len(train_loader)})

In [None]:
#Cell 7: Run Pre-Training
pretrain_dataset = MLMDataset(pretrain_texts)
pretrain_loader = DataLoader(pretrain_dataset, batch_size=32, shuffle=True)

# Execute pretraining
pretrain_model(model, pretrain_loader)

Pretraining Epoch 1:   1%|▏         | 142/11259 [01:59<2:33:48,  1.20it/s, loss=0.0864]

In [13]:
# Save pre-trained model checkpoint
pretrained_model_path = "./deberta_pretrained.pth"
torch.save(model.state_dict(), pretrained_model_path)

# Log to WandB
wandb.save(pretrained_model_path)

print(f"Pre-trained model saved to {pretrained_model_path}")

Pre-trained model saved to ./deberta_pretrained.pth


In [11]:
#Cell 8: Fine-Tuning Loop
!pip install peft transformers accelerate bitsandbytes

from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel

# Load base DeBERTa model
base_model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base", num_labels=2)

# Apply LoRA
lora_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, target_modules=["query_proj", "key_proj", "value_proj"])
model = get_peft_model(base_model, lora_config)
model.to(device)

# Print trainable parameters (should be much fewer)
model.print_trainable_parameters()

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(output_dir="./lora_model", per_device_train_batch_size=8, per_device_eval_batch_size=8, learning_rate=2e-4, num_train_epochs=3, logging_dir="./logs", logging_steps=10, save_strategy="epoch")

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)
trainer.train()

# Save LoRA model
model.save_pretrained("./lora_deberta")

In [14]:
#Cell 8: Run Fine-Tuning
train_dataset = FakeNewsDataset(welfake_df)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Execute fine-tuning
fine_tune_model(model, train_loader)

Fine-tuning Epoch 1: 100%|██████████| 2255/2255 [22:13<00:00,  1.69it/s, accuracy=0.58, loss=0.0217] 


In [15]:
#Cell 9: Save Fine-Tuned Model
model_save_path = "./deberta_finetuned.pth"
torch.save(model.state_dict(), model_save_path)

# Log model checkpoint to WandB
wandb.save(model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to ./deberta_finetuned.pth


In [19]:
# Cell 11: Model Evaluation and Testing
def evaluate_model(model, test_loader):
    model.eval()
    total_loss, correct, total = 0, 0, 0
    all_preds = []   # Fix: Initialize as an empty list
    all_labels = []  # Fix: Initialize as an empty list
    all_probs = []   # Fix: Initialize as an empty list

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating Model"):
            inputs, labels = batch
            inputs = {key: val.to(device) for key, val in inputs.items()}
            labels = labels.to(device)

            outputs = model(**inputs)
            loss = criterion(outputs, labels)

            # Compute predictions & probabilities
            probs = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()  # Probability of class 1
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()

            # Store batch results
            all_preds.extend(preds)    # Correct way to append to lists
            all_labels.extend(labels)  # Correct way to append to lists
            all_probs.extend(probs)    # Correct way to append to lists

            correct += (preds == labels).sum()
            total += labels.shape[0]
            total_loss += loss.item()

    # Compute Metrics
    avg_loss = total_loss / len(test_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    roc_auc = roc_auc_score(all_labels, all_probs)
    logloss = log_loss(all_labels, all_probs)
    mcc = matthews_corrcoef(all_labels, all_preds)
    cm = confusion_matrix(all_labels, all_preds)

    print(f"Test Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4%}, F1 Score: {f1:.4f}")

    # Log evaluation results to WandB
    wandb.log({
        "Test Loss": avg_loss,
        "Test Accuracy": accuracy,
        "Test Precision": precision,
        "Test Recall": recall,
        "Test F1 Score": f1,
        "Test ROC AUC": roc_auc,
        "Test Log Loss": logloss,
        "Test MCC": mcc,
        "Test Confusion Matrix": cm.tolist()
    })

# Run Evaluation
test_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)

evaluate_model(model, test_loader)

Evaluating Model: 100%|██████████| 2255/2255 [05:48<00:00,  6.47it/s]


Test Loss: 0.5773, Accuracy: 83.1647%, F1 Score: 0.8227


In [20]:
#Cell 11: Run Eval 
test_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)

# Execute model evaluation
evaluate_model(model, test_loader)

Evaluating Model: 100%|██████████| 2255/2255 [05:44<00:00,  6.54it/s]


Test Loss: 0.5773, Accuracy: 83.1647%, F1 Score: 0.8227


In [21]:
wandb.finish()

0,1
Fine-tune Accuracy,▁
Fine-tune Loss,▁
Pretrain Epoch Loss,▁
Test Accuracy,▁▁
Test F1 Score,▁▁
Test Log Loss,▁▁
Test Loss,▁▁
Test MCC,▁▁
Test Precision,▁▁
Test ROC AUC,▁▁

0,1
Fine-tune Accuracy,0.57963
Fine-tune Loss,0.69501
Pretrain Epoch Loss,1.55696
Test Accuracy,0.83165
Test F1 Score,0.82266
Test Log Loss,0.57735
Test Loss,0.57735
Test MCC,0.67304
Test Precision,0.89781
Test ROC AUC,0.84644


In [26]:
import torch
import wandb

# Initialize W&B again
wandb.init(project="rep-chasefire", resume=True)

# Load model checkpoint
model.load_state_dict(torch.load("./deberta_pretrained.pth"))
model.load_state_dict(torch.load("./deberta_finetuned.pth"))
model.eval()

# Manually log missing information
wandb.log({"Final Pretrain Loss": 0.42, "Final Fine-tune Accuracy": 0.88})

wandb.finish()  # Ensure proper logging session closure

  model.load_state_dict(torch.load("./deberta_pretrained.pth"))
  model.load_state_dict(torch.load("./deberta_finetuned.pth"))


0,1
Final Fine-tune Accuracy,▁
Final Pretrain Loss,▁

0,1
Final Fine-tune Accuracy,0.88
Final Pretrain Loss,0.42


In [28]:
# Load Model & Tokenizer
import torch
from transformers import AutoTokenizer
import torch.nn.functional as F

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# Load trained model
model = DeBERTaClassifier()  # Ensure this class is defined earlier
model.load_state_dict(torch.load("./deberta_pretrained.pth"))
model.eval()  # Set model to evaluation mode

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to make predictions
def predict_text(text):
    # Tokenize input text without token_type_ids
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    
    # Remove token_type_ids if it's in inputs (DeBERTa doesn't use it)
    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]
    
    with torch.no_grad():
        output = model(**inputs)  # Model forward pass
    
    # Convert logits to probabilities
    probs = F.softmax(output, dim=1)
    
    # Get predicted class
    predicted_class = torch.argmax(probs, dim=1).item()
    
    return predicted_class, probs.cpu().numpy()

# Test with sample news snippets
test_snippets = [
    "The government has announced new policies to boost the economy.",
    "A massive earthquake struck the city, causing widespread damage.",
    "The latest smartphone model features cutting-edge AI technology.",
]

for text in test_snippets:
    label, probabilities = predict_text(text)
    print(f"Text: {text}\nPredicted Label: {label}\nProbabilities: {probabilities}\n")

  model.load_state_dict(torch.load("./deberta_pretrained.pth"))
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Text: The government has announced new policies to boost the economy.
Predicted Label: 1
Probabilities: [[0.29067478 0.70932525]]

Text: A massive earthquake struck the city, causing widespread damage.
Predicted Label: 1
Probabilities: [[0.29185134 0.70814866]]

Text: The latest smartphone model features cutting-edge AI technology.
Predicted Label: 1
Probabilities: [[0.29183644 0.70816356]]



In [29]:
# Load Model & Tokenizer
import torch
from transformers import AutoTokenizer
import torch.nn.functional as F

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# Load trained model
model = DeBERTaClassifier()  # Ensure this class is defined earlier
model.load_state_dict(torch.load("./deberta_pretrained.pth"))
model.eval()  # Set model to evaluation mode

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to make predictions
def predict_text(text):
    # Tokenize input text without token_type_ids
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    
    # Remove token_type_ids if it's in inputs (DeBERTa doesn't use it)
    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]
    
    with torch.no_grad():
        output = model(**inputs)  # Model forward pass
    
    # Convert logits to probabilities
    probs = F.softmax(output, dim=1)
    
    # Get predicted class
    predicted_class = torch.argmax(probs, dim=1).item()
    
    return predicted_class, probs.cpu().numpy()

# Test with sample news snippets
test_snippets = [
    "The government has announced new policies to boost terrorism.",
    "A massive tsunami struck the city, causing widespread death.",
    "The latest smartphone model features cutting-edge AI technology.",
]

for text in test_snippets:
    label, probabilities = predict_text(text)
    print(f"Text: {text}\nPredicted Label: {label}\nProbabilities: {probabilities}\n")


  model.load_state_dict(torch.load("./deberta_pretrained.pth"))
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Text: The government has announced new policies to boost terrorism.
Predicted Label: 1
Probabilities: [[0.29122645 0.70877355]]

Text: A massive tsunami struck the city, causing widespread death.
Predicted Label: 1
Probabilities: [[0.29173702 0.7082629 ]]

Text: The latest smartphone model features cutting-edge AI technology.
Predicted Label: 1
Probabilities: [[0.29183644 0.70816356]]



In [30]:
# Load Model & Tokenizer
import torch
from transformers import AutoTokenizer
import torch.nn.functional as F

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# Load trained model
model = DeBERTaClassifier()  # Ensure this class is defined earlier
model.load_state_dict(torch.load("./deberta_pretrained.pth"))
model.eval()  # Set model to evaluation mode

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to make predictions
def predict_text(text):
    # Tokenize input text without token_type_ids
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    
    # Remove token_type_ids if it's in inputs (DeBERTa doesn't use it)
    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]
    
    with torch.no_grad():
        output = model(**inputs)  # Model forward pass
    
    # Convert logits to probabilities
    probs = F.softmax(output, dim=1)
    
    # Get predicted class
    predicted_class = torch.argmax(probs, dim=1).item()
    
    return predicted_class, probs.cpu().numpy()

# Test with sample news snippets
test_snippets = [
    "The government has announced new crimes to boost terrorism.",
    "A massive tsunami struck the city, killing everyone.",
    "The latest smartphone model features cutting-edge murder technology.",
]

for text in test_snippets:
    label, probabilities = predict_text(text)
    print(f"Text: {text}\nPredicted Label: {label}\nProbabilities: {probabilities}\n")


  model.load_state_dict(torch.load("./deberta_pretrained.pth"))
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Text: The government has announced new crimes to boost terrorism.
Predicted Label: 1
Probabilities: [[0.2917026 0.7082974]]

Text: A massive tsunami struck the city, killing everyone.
Predicted Label: 1
Probabilities: [[0.29185158 0.7081484 ]]

Text: The latest smartphone model features cutting-edge murder technology.
Predicted Label: 1
Probabilities: [[0.29179356 0.7082065 ]]

