In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from datasets import Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from google.colab import drive
import os

# 1. Mount Google Drive (the dataset and the saved models are addressed under /content/drive)
drive.mount('/content/drive')

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda


In [None]:
file_path = "/content/drive/MyDrive/combined_llmfake.csv"

# Fail fast when the CSV is missing. Every later cell needs df / train_ds / val_ds /
# test_ds, so merely printing a message here would only resurface as a NameError
# further down the notebook.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"⚠️ FILE NOT FOUND at: {file_path}. Please check the path in your Google Drive."
    )

print(f"Loading file from: {file_path}")
df = pd.read_csv(file_path)
# Labels are cast to int; the same column is the stratification key for both splits.
df["label"] = df["label"].astype(int)

# 1. Split off 10% for Test
train_val_df, test_df = train_test_split(
    df, test_size=0.10, stratify=df["label"], random_state=42
)

# 2. Split off 10% (of total) for Validation
# (0.10 total / 0.90 remaining = 0.1111)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.11111, stratify=train_val_df["label"], random_state=42
)

print(f"✅ Data Loaded & Split Successfully:")
print(f"   Train Size: {len(train_df)} (80%)")
print(f"   Val Size:   {len(val_df)} (10%)")
print(f"   Test Size:  {len(test_df)} (10%)")

# Convert to Hugging Face Datasets (index reset so the old pandas index is not carried along)
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))
test_ds = Dataset.from_pandas(test_df.reset_index(drop=True))

Loading file from: /content/drive/MyDrive/combined_llmfake.csv
✅ Data Loaded & Split Successfully:
   Train Size: 32992 (80%)
   Val Size:   4124 (10%)
   Test Size:  4125 (10%)


In [None]:
# Checkpoint name used for the tokenizer here and for the model in a later cell.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``text`` entry of a batch with the module-level ``tokenizer``.

    Sequences are truncated to at most 512 tokens and are left unpadded here;
    padding is deferred to the data collator (dynamic padding).

    Args:
        examples: Batch mapping that provides a ``"text"`` entry.

    Returns:
        The result of calling ``tokenizer`` on the batch.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": False,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)

print("Tokenizing datasets... (this may take 1-2 minutes)")
# Tokenize each split in batches, in train -> val -> test order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_ds, val_ds, test_ds)
)

# Collator that pads at batch-assembly time (the tokenizer above leaves sequences unpadded).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("✅ Tokenization Complete.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Tokenizing datasets... (this may take 1-2 minutes)


Map:   0%|          | 0/32992 [00:00<?, ? examples/s]

Map:   0%|          | 0/4124 [00:00<?, ? examples/s]

Map:   0%|          | 0/4125 [00:00<?, ? examples/s]

✅ Tokenization Complete.


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, F1 and ROC-AUC from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Pair ``(logits, labels)``. After softmax the logits are
            indexed with ``[:, 1]``, so at least two class columns are required.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"`` and ``"auc"``. ``"auc"`` is
        0.5 when ``roc_auc_score`` rejects the inputs with a ``ValueError``.
    """
    import warnings

    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Calculate Probabilities for AUC
    # Softmax over the class axis; column 1 is used as the class-1 probability.
    probs = F.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError as err:
        # Only the "AUC is undefined for these inputs" case is tolerated (presumably
        # e.g. a single class in labels — confirm against the sklearn version in use).
        # Anything else is a real bug and must propagate instead of becoming 0.5.
        warnings.warn(f"ROC-AUC could not be computed ({err}); reporting 0.5 as a fallback.")
        auc = 0.5

    return {"accuracy": acc, "f1": f1, "auc": auc}

In [None]:
# Load the pretrained encoder with a 2-label classification head and move it to the device.
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

training_args = TrainingArguments(
    output_dir="./roberta_results",
    learning_rate=2e-5,            # Low LR for stability
    per_device_train_batch_size=8, # Small batch size to fit 512 tokens
    gradient_accumulation_steps=2, # Accumulate gradients to simulate batch=16
    per_device_eval_batch_size=16,
    num_train_epochs=3,            # 3 Epochs for better convergence
    weight_decay=0.01,
    eval_strategy="epoch",         # Evaluate and checkpoint on the same schedule so that
    save_strategy="epoch",         # load_best_model_at_end has a checkpoint per evaluation
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # "accuracy" key returned by compute_metrics
    fp16=True,                     # Use Mixed Precision
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases (it emits
    # a FutureWarning); `processing_class=` is the supported replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("Model & Trainer Initialized.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model & Trainer Initialized.


  trainer = Trainer(


In [None]:
print(" Starting Training...")
trainer.train()

# Score the held-out test split; the metric keys come back prefixed with "eval_".
print("\n📊 Evaluating on TEST SET...")
results = trainer.evaluate(tokenized_test)

divider = "="*40
print("\n" + divider)
print("FINAL RESULTS (ROBERTA-BASE)")
print(divider)
print(f"Accuracy: {results['eval_accuracy']:.4f}")
print(f"F1 Score: {results['eval_f1']:.4f}")
print(f"ROC-AUC:  {results['eval_auc']:.4f}")
print(divider)

 Starting Training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,0.5609,0.525962,0.676285,0.5728,0.75711
2,0.5257,0.526667,0.682832,0.562542,0.769888
3,0.5048,0.503883,0.670951,0.584634,0.767636



📊 Evaluating on TEST SET...



FINAL RESULTS (ROBERTA-BASE)
Accuracy: 0.6657
F1 Score: 0.5330
ROC-AUC:  0.7573


In [None]:
save_path = "/content/drive/MyDrive/RoBERTa_Final_Model"

print(f"💾 Saving best model to {save_path}...")

# Write the model first, then the tokenizer, into the same Drive folder.
for _persist in (trainer.save_model, tokenizer.save_pretrained):
    _persist(save_path)

print("✅ Model successfully saved to Google Drive!")
print("You can now safely restart the runtime without losing your progress.")

💾 Saving best model to /content/drive/MyDrive/RoBERTa_Final_Model...
✅ Model successfully saved to Google Drive!
You can now safely restart the runtime without losing your progress.


In [None]:
# ==========================================
# MODEL 2 (DeBERTa Standard)
# ==========================================
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import torch.nn.functional as F
import os

# 1. Configuration
model_name = "microsoft/deberta-v3-base"
drive_save_path = "/content/drive/MyDrive/DeBERTa_Standard_Model"
file_path = "/content/drive/MyDrive/combined_llmfake.csv"

# 2. Re-Load Data (rebuilds the same stratified 80/10/10 split with the same seed)
print(f"🚀 MODEL 2: Loading Data for {model_name}...")
if not os.path.exists(file_path):
    raise FileNotFoundError("Could not find combined_llmfake.csv in Drive!")

df = pd.read_csv(file_path)
df["label"] = df["label"].astype(int)

# 10% test first, then 0.11111 of the remainder for validation.
train_val_df, test_df = train_test_split(
    df, test_size=0.10, stratify=df["label"], random_state=42
)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.11111, stratify=train_val_df["label"], random_state=42
)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))
test_ds = Dataset.from_pandas(test_df.reset_index(drop=True))

# 3. Tokenization (DeBERTa Specific)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``text`` entry of a batch with the module-level ``tokenizer``.

    Truncates to at most 512 tokens and applies no padding at this stage.

    Args:
        examples: Batch mapping that provides a ``"text"`` entry.

    Returns:
        The result of calling ``tokenizer`` on the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding=False, max_length=512)
    return encoded

print("Tokenizing...")
# Tokenize each split in batches, in train -> val -> test order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_ds, val_ds, test_ds)
)

# Padding is handled by the collator, since tokenize_function leaves sequences unpadded.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 4. Metrics
def compute_metrics(eval_pred):
    """Compute accuracy, F1 and ROC-AUC from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Pair ``(logits, labels)``. After softmax the logits are
            indexed with ``[:, 1]``, so at least two class columns are required.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"`` and ``"auc"``. ``"auc"`` is
        0.5 when ``roc_auc_score`` rejects the inputs with a ``ValueError``.
    """
    import warnings

    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Softmax over the class axis; column 1 is used as the class-1 probability for AUC.
    probs = F.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError as err:
        # Tolerate only the "AUC is undefined for these inputs" case (presumably e.g.
        # a single class in labels — confirm against the sklearn version in use);
        # any other exception is a real bug and must not be masked as 0.5.
        warnings.warn(f"ROC-AUC could not be computed ({err}); reporting 0.5 as a fallback.")
        auc = 0.5
    return {"accuracy": acc, "f1": f1, "auc": auc}

# 5. Training Setup
device = "cuda" if torch.cuda.is_available() else "cpu"
# Pretrained encoder with a 2-label classification head, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

training_args = TrainingArguments(
    output_dir="./deberta_standard_results", # Local temporary folder
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,           # 8 x 2 accumulation steps per optimizer update
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",                   # Evaluate and checkpoint on the same schedule so
    save_strategy="epoch",                   # load_best_model_at_end has a checkpoint per eval
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",        # "accuracy" key returned by compute_metrics
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases (it emits
    # a FutureWarning); `processing_class=` is the supported replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 6. Train & Evaluate
print(f"Starting Training for {model_name}...")
trainer.train()

# Score the held-out test split and print the raw metrics dict.
print("Evaluating...")
results = trainer.evaluate(tokenized_test)
print(f"✅ MODEL 2 RESULTS: {results}")

# 7. Save to Drive (CRITICAL STEP)
print(f"💾 Saving Model 2 to {drive_save_path}...")
# Model first, then tokenizer, both into the same Drive folder.
for _persist in (trainer.save_model, tokenizer.save_pretrained):
    _persist(drive_save_path)
print("Saved!")

🚀 MODEL 2: Loading Data for microsoft/deberta-v3-base...


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Tokenizing...


Map:   0%|          | 0/32992 [00:00<?, ? examples/s]

Map:   0%|          | 0/4124 [00:00<?, ? examples/s]

Map:   0%|          | 0/4125 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Starting Training for microsoft/deberta-v3-base...


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,0.5503,0.533172,0.681377,0.55123,0.767281
2,0.5298,0.528683,0.67774,0.629289,0.771751
3,0.5031,0.506266,0.680892,0.608333,0.778463


Evaluating...


✅ MODEL 2 RESULTS: {'eval_loss': 0.5450383424758911, 'eval_accuracy': 0.6627878787878788, 'eval_f1': 0.5134662469394893, 'eval_auc': 0.7478122459687637, 'eval_runtime': 84.2062, 'eval_samples_per_second': 48.987, 'eval_steps_per_second': 3.064, 'epoch': 3.0}
💾 Saving Model 2 to /content/drive/MyDrive/DeBERTa_Standard_Model...
Saved!


In [None]:
# ==========================================
# MODEL 3 (DeBERTa Augmented)
# ==========================================
from transformers import AutoModelForSeq2SeqLM
from tqdm.auto import tqdm

# 1. Configuration
drive_save_path = "/content/drive/MyDrive/DeBERTa_Augmented_Model"
# Intended cache location for the generated paraphrases.
# NOTE(review): nothing in this cell reads or writes this path — confirm whether the cache was meant to be wired in.
augmented_file_path = "/content/drive/MyDrive/augmented_data_cache.csv"

# 2. Data Augmentation (T5)
print("\n MODEL 3: Starting Data Augmentation...")

# Load the T5 tokenizer and seq2seq model, then move the model to the active device
t5_tokenizer = AutoTokenizer.from_pretrained("t5-small")
t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
t5_model = t5_model.to(device)

def paraphrase_batch(texts):
    """Generate one T5 output per input text using the ``"paraphrase: "`` prefix.

    Inputs are truncated to 128 tokens before generation, so text beyond that
    limit does not contribute to the output.

    Args:
        texts: List of strings to paraphrase. An empty list returns ``[]``
            without calling the model.

    Returns:
        List of decoded strings from ``t5_tokenizer.batch_decode``.
    """
    # NOTE(review): "t5-small" is loaded as the stock checkpoint; confirm it actually
    # paraphrases for this prefix rather than echoing/garbling the input.
    if not texts:
        return []
    inputs = ["paraphrase: " + text for text in texts]
    encoding = t5_tokenizer(inputs, padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)
    with torch.no_grad():
        # The batch is padded, so pass the attention mask explicitly instead of
        # relying on generate() to infer which positions are padding.
        outputs = t5_model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            max_length=128,
            num_beams=2,
            early_stopping=True,
        )
    return t5_tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Select subset to augment (2000 samples)
# train_df / val_df / test_df normally persist from the previous cell. All three are
# used below, so rebuild the splits if ANY of them is missing (not just train_df).
if any(name not in globals() for name in ("train_df", "val_df", "test_df")):
    print("Reloading train_df for augmentation...")
    df = pd.read_csv("/content/drive/MyDrive/combined_llmfake.csv")
    df["label"] = df["label"].astype(int)
    # One stratified split yields both halves; same sizes and seed as the earlier cells.
    train_val_df, test_df = train_test_split(df, test_size=0.10, stratify=df["label"], random_state=42)
    # Re-create train split
    train_df, val_df = train_test_split(train_val_df, test_size=0.11111, stratify=train_val_df["label"], random_state=42)

subset = train_df.sample(n=2000, random_state=42)
print(f"Generating paraphrases for {len(subset)} samples...")

new_texts = []
new_labels = []
raw_texts = subset["text"].tolist()
raw_labels = subset["label"].tolist()
batch_size = 32

for i in tqdm(range(0, len(raw_texts), batch_size)):
    batch_texts = raw_texts[i : i + batch_size]
    try:
        paraphrased = paraphrase_batch(batch_texts)
    except Exception as e:
        # Best effort: drop the whole batch (texts AND labels) and keep going.
        print(f"Skipped batch due to error: {e}")
        continue
    new_texts.extend(paraphrased)
    # Collect labels only for batches that succeeded, so texts and labels stay aligned;
    # using every subset label would give mismatched lengths after a skipped batch.
    new_labels.extend(raw_labels[i : i + batch_size])

# Create Augmented Dataset
aug_df = pd.DataFrame({"text": new_texts, "label": new_labels})
# Append the paraphrases to the original training rows and shuffle with a fixed seed.
combined_train_df = pd.concat([train_df, aug_df]).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Original Train: {len(train_df)} | Augmented: {len(aug_df)} | Combined: {len(combined_train_df)}")

# Convert to Datasets
# Only the training split contains augmented rows; validation and test are the original splits.
train_ds_aug = Dataset.from_pandas(combined_train_df)
val_ds_aug = Dataset.from_pandas(
    val_df.reset_index(drop=True)
)
test_ds_aug = Dataset.from_pandas(
    test_df.reset_index(drop=True)
)

# 3. Prepare DeBERTa Model (Fresh Load)
# Drop the T5 model and release cached CUDA memory before loading the classifier.
print("Freeing GPU memory...")
del t5_model
torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base", num_labels=2
)
model = model.to(device)


def tokenize_func(examples):
    """Tokenize the ``text`` entry of a batch with the module-level ``tokenizer``.

    Truncates to at most 512 tokens and applies no padding at this stage.

    Args:
        examples: Batch mapping that provides a ``"text"`` entry.

    Returns:
        The result of calling ``tokenizer`` on the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding=False, max_length=512)
    return encoded


print("Tokenizing Augmented Data...")
# Tokenize each split in batches, in train -> val -> test order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_func, batched=True)
    for split in (train_ds_aug, val_ds_aug, test_ds_aug)
)

# 4. Train
training_args = TrainingArguments(
    output_dir="./deberta_aug_results", # DIFFERENT OUTPUT FOLDER
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,      # 8 x 2 accumulation steps per optimizer update
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",              # Evaluate and checkpoint on the same schedule so
    save_strategy="epoch",              # load_best_model_at_end has a checkpoint per eval
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",   # "accuracy" key returned by compute_metrics
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases (it emits
    # a FutureWarning); `processing_class=` is the supported replacement.
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting Training for Model 3 (Augmented)...")
trainer.train()

# Score the held-out test split and print the raw metrics dict.
print("Evaluating...")
results = trainer.evaluate(tokenized_test)
print(f"✅ MODEL 3 RESULTS: {results}")

# 5. Save
print(f"💾 Saving Model 3 to {drive_save_path}...")
# Model first, then tokenizer, both into the same Drive folder.
for _persist in (trainer.save_model, tokenizer.save_pretrained):
    _persist(drive_save_path)
print("Saved! All tasks complete.")


 MODEL 3: Starting Data Augmentation...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Generating paraphrases for 2000 samples...


  0%|          | 0/63 [00:00<?, ?it/s]

Original Train: 32992 | Augmented: 2000 | Combined: 34992
Freeing GPU memory...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing Augmented Data...


Map:   0%|          | 0/34992 [00:00<?, ? examples/s]

Map:   0%|          | 0/4124 [00:00<?, ? examples/s]

Map:   0%|          | 0/4125 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Starting Training for Model 3 (Augmented)...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,0.5479,0.615232,0.635306,0.672045,0.727723
2,0.5258,0.517378,0.671435,0.670396,0.76916
3,0.512,0.518946,0.660039,0.688167,0.764728


Evaluating...


✅ MODEL 3 RESULTS: {'eval_loss': 0.5219699144363403, 'eval_accuracy': 0.6673939393939394, 'eval_f1': 0.6610671936758893, 'eval_auc': 0.7602467003267115, 'eval_runtime': 84.2171, 'eval_samples_per_second': 48.981, 'eval_steps_per_second': 3.064, 'epoch': 3.0}
💾 Saving Model 3 to /content/drive/MyDrive/DeBERTa_Augmented_Model...
Saved! All tasks complete.
