In [None]:
# Install dependencies
!pip install transformers torch datasets scikit-learn -q


# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


# Imports
import os
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import joblib

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
from datasets import Dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# CONFIGURATION
# Input CSV and output directory, both on the mounted Google Drive.
TRAIN_PATH = "/content/drive/MyDrive/train_none.csv"
MODEL_DIR = "/content/drive/MyDrive"

# Generator names to classify; rows whose "model" column is not in this list
# are dropped after the train/test split.
MODELS = ["cohere-chat", "gpt4", "mistral-chat", "mpt-chat", "llama-chat"]
# Seed handed to train_test_split so the split is repeatable.
RANDOM_STATE = 5

# DistilBERT Configuration
# Checkpoint used for both the tokenizer and the classification model.
BERT_MODEL_NAME = "distilbert-base-uncased"
# Token length every text is truncated/padded to.
BERT_MAX_LENGTH = 256
# Per-device batch size for training and evaluation, also used by the manual
# prediction loop.
BERT_BATCH_SIZE = 32
# Upper bound on training epochs (the Trainer also uses early stopping).
BERT_EPOCHS = 4
# Learning rate passed to TrainingArguments.
BERT_LEARNING_RATE = 2e-5

# Make sure the output directory exists before anything is written to it.
os.makedirs(MODEL_DIR, exist_ok=True)

print("Setup complete!")
# Report whether a CUDA device is visible and, if so, which one.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


# Load and prepare data
print(f"Loading dataset from: {TRAIN_PATH}")
full_df = pd.read_csv(TRAIN_PATH)
print(f"Loaded {len(full_df)} rows")

# Stratified 80/20 split over every value of the "model" column in the CSV ...
train_df, test_df = train_test_split(
    full_df, test_size=0.2, random_state=RANDOM_STATE, stratify=full_df["model"]
)

# ... and only afterwards keep the rows produced by the generators in MODELS.
train_df = train_df.loc[train_df["model"].isin(MODELS)]
test_df = test_df.loc[test_df["model"].isin(MODELS)]

# Inputs are the generated texts (forced to str); targets are the generator names.
train_texts, train_labels = train_df["generation"].astype(str), train_df["model"]
test_texts, test_labels = test_df["generation"].astype(str), test_df["model"]

print(f"Train size: {len(train_texts)}")
print(f"Test size: {len(test_texts)}")
print(f"\nLabel distribution:\n{train_labels.value_counts()}")

# Label encoding: fit on the training labels only, then map both splits to ints.
le = LabelEncoder().fit(train_labels)
y_train = le.transform(train_labels)
y_test = le.transform(test_labels)

print(f"\nClasses: {list(le.classes_)}")

# Persist the fitted encoder so integer predictions can be mapped back to names.
joblib.dump(le, os.path.join(MODEL_DIR, "label_encoder.pkl"))

# Persist the held-out split (texts, encoded and raw labels, full rows) for
# later comparison.
joblib.dump(
    dict(
        test_texts=test_texts,
        y_test=y_test,
        test_labels=test_labels,
        test_df=test_df,
    ),
    os.path.join(MODEL_DIR, "test_data.pkl"),
)


# Fine-tune
# NOTE(review): DataLoader is not referenced by any of the visible cells.
from torch.utils.data import DataLoader

# Directory used as the Trainer output_dir and as the target of the final
# save_pretrained calls.
bert_model_path = os.path.join(MODEL_DIR, "distilbert_model")

# Echo the run configuration so it is recorded in the cell output.
print("Fine-tuning DistilBERT...")
print(f"  Model: {BERT_MODEL_NAME}")
print(f"  Max Length: {BERT_MAX_LENGTH}")
print(f"  Batch Size: {BERT_BATCH_SIZE}")
print(f"  Epochs: {BERT_EPOCHS}")

# Load the pretrained tokenizer and a sequence-classification model with one
# output label per class known to the fitted LabelEncoder.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL_NAME,
    num_labels=len(le.classes_)
)

# Tokenize in batches to avoid RAM crash
print("\nTokenizing texts in batches...")

def tokenize_in_batches(texts, labels, tokenizer, max_length, batch_size=1000, log_every=10):
    """Tokenize ``texts`` chunk by chunk and wrap the result in a ``Dataset``.

    Every text is truncated and padded to exactly ``max_length`` tokens, so the
    per-chunk tensors can be concatenated into one tensor per field.

    Args:
        texts: pandas Series (sliced positionally via ``.iloc``) or any other
            sequence supporting ``len()`` and slicing, e.g. a list of str.
        labels: One label per text, in the same order; stored unchanged in
            the ``labels`` column.
        tokenizer: Callable accepting a list of str plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` and returning a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Sequence length used for truncation and padding.
        batch_size: Number of texts handed to the tokenizer per call.
        log_every: Print progress on every ``log_every``-th chunk; ``None``,
            zero or a negative value disables progress output.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``
        columns.

    Raises:
        ValueError: If ``batch_size`` is not positive, ``texts`` is empty, or
            ``texts`` and ``labels`` differ in length.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    total = len(texts)
    if total == 0:
        # torch.cat on an empty list would otherwise fail with an opaque error.
        raise ValueError("texts is empty; nothing to tokenize")
    if len(labels) != total:
        raise ValueError(
            f"texts and labels differ in length: {total} vs {len(labels)}"
        )

    # A pandas Series must be sliced by position, not by index label.
    use_iloc = hasattr(texts, "iloc")

    all_input_ids = []
    all_attention_mask = []

    for start in range(0, total, batch_size):
        stop = start + batch_size
        chunk = texts.iloc[start:stop] if use_iloc else texts[start:stop]
        batch_texts = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)

        encodings = tokenizer(
            batch_texts,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )
        all_input_ids.append(encodings['input_ids'])
        all_attention_mask.append(encodings['attention_mask'])

        if log_every and log_every > 0 and (start // batch_size) % log_every == 0:
            print(f"  Tokenized {min(stop, total)}/{total}")

    return Dataset.from_dict({
        'input_ids': torch.cat(all_input_ids),
        'attention_mask': torch.cat(all_attention_mask),
        'labels': labels
    })

# Tokenize the training split.
train_dataset = tokenize_in_batches(train_texts, y_train, tokenizer, BERT_MAX_LENGTH)
print("Train tokenization complete!")

# The evaluation set is built from the test split.
eval_dataset = tokenize_in_batches(test_texts, y_test, tokenizer, BERT_MAX_LENGTH)
print("Eval tokenization complete!")

# Training arguments
training_args = TrainingArguments(
    output_dir=bert_model_path,
    num_train_epochs=BERT_EPOCHS,
    per_device_train_batch_size=BERT_BATCH_SIZE,
    per_device_eval_batch_size=BERT_BATCH_SIZE,
    learning_rate=BERT_LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    warmup_steps=500,
    # Mixed precision requires a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class
    of each row is the index of its largest value along axis 1.

    Returns:
        dict with ``accuracy`` and macro-averaged ``f1``.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(labels, predicted)
    metrics['f1'] = f1_score(labels, predicted, average='macro')
    return metrics

# Wire the model, datasets and metric function into the Trainer.
# NOTE(review): eval_dataset is built from the test split, and it drives both
# load_best_model_at_end and early stopping, so scores later reported on
# y_test come from data that also guided checkpoint selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

Setup complete!
CUDA available: True
GPU: Tesla T4
Loading dataset from: /content/drive/MyDrive/train_none.csv
Loaded 467985 rows
Train size: 171149
Test size: 42787

Label distribution:
model
mpt-chat        42787
mistral-chat    42787
llama-chat      42787
gpt4            21394
cohere-chat     21394
Name: count, dtype: int64

Classes: ['cohere-chat', 'gpt4', 'llama-chat', 'mistral-chat', 'mpt-chat']
Fine-tuning DistilBERT...
  Model: distilbert-base-uncased
  Max Length: 256
  Batch Size: 32
  Epochs: 4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Tokenizing texts in batches...
  Tokenized 1000/171149
  Tokenized 11000/171149
  Tokenized 21000/171149
  Tokenized 31000/171149
  Tokenized 41000/171149
  Tokenized 51000/171149
  Tokenized 61000/171149
  Tokenized 71000/171149
  Tokenized 81000/171149
  Tokenized 91000/171149
  Tokenized 101000/171149
  Tokenized 111000/171149
  Tokenized 121000/171149
  Tokenized 131000/171149
  Tokenized 141000/171149
  Tokenized 151000/171149
  Tokenized 161000/171149
  Tokenized 171000/171149
Train tokenization complete!
  Tokenized 1000/42787
  Tokenized 11000/42787
  Tokenized 21000/42787
  Tokenized 31000/42787
  Tokenized 41000/42787
Eval tokenization complete!


In [None]:
# Train
print("\nStarting training...")
trainer.train()

# Save model and tokenizer side by side so the directory can be reloaded
# with from_pretrained.
model.save_pretrained(bert_model_path)
tokenizer.save_pretrained(bert_model_path)
print(f"\n✅ Model saved to: {bert_model_path}")


# Evaluate and save predictions
print("\nEvaluating model...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()
model.to(device)

bert_preds, bert_probs = [], []

# Walk the test texts in fixed-size slices; each slice is padded only to its
# own longest sequence (capped at BERT_MAX_LENGTH).
for start in range(0, len(test_texts), BERT_BATCH_SIZE):
    batch_texts = test_texts.iloc[start:start + BERT_BATCH_SIZE].tolist()

    encodings = tokenizer(
        batch_texts,
        truncation=True,
        padding=True,
        max_length=BERT_MAX_LENGTH,
        return_tensors='pt',
    )
    encodings = {name: tensor.to(device) for name, tensor in encodings.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**encodings)
        probs = outputs.logits.softmax(dim=1)
        preds = outputs.logits.argmax(dim=1)

    bert_preds.extend(preds.cpu().numpy())
    bert_probs.extend(probs.cpu().numpy())

bert_preds = np.array(bert_preds)
bert_probs = np.array(bert_probs)

# Save predictions
joblib.dump(
    dict(predictions=bert_preds, probabilities=bert_probs),
    os.path.join(MODEL_DIR, "bert_predictions.pkl"),
)

print(f"\n{'='*50}")
print("RESULTS")
print(f"{'='*50}")
print(f"Accuracy: {accuracy_score(y_test, bert_preds):.4f}")
print(f"Macro F1: {f1_score(y_test, bert_preds, average='macro'):.4f}")
print(f"\n✅ Predictions saved to: {MODEL_DIR}/bert_predictions.pkl")


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3927,0.362469,0.862996,0.868745
2,0.2811,0.328622,0.880852,0.884814
3,0.1719,0.331171,0.892771,0.896715
4,0.1003,0.359046,0.897796,0.901741



✅ Model saved to: /content/drive/MyDrive/distilbert_model

Evaluating model...

RESULTS
Accuracy: 0.8978
Macro F1: 0.9017

✅ Predictions saved to: /content/drive/MyDrive/bert_predictions.pkl


In [None]:
# Continue fine-tuning for 4 more epochs

# Paths
LOAD_MODEL_PATH = "/content/drive/MyDrive/distilbert_model/4_epochs"
SAVE_MODEL_PATH = "/content/drive/MyDrive/distilbert_model/8_epochs"
# These reassign the notebook-level hyperparameters for the continued run.
BERT_MAX_LENGTH = 256
BERT_BATCH_SIZE = 32
BERT_EPOCHS = 4  # 4 more epochs
BERT_LEARNING_RATE = 1e-5  # Slightly lower LR for continued training

# None of the visible cells writes LOAD_MODEL_PATH (the training cell saves to
# the parent "distilbert_model" directory), so fail fast with a clear message
# before creating any output directory or attempting to load.
if not os.path.isdir(LOAD_MODEL_PATH):
    raise FileNotFoundError(
        f"Checkpoint directory not found: {LOAD_MODEL_PATH}. "
        "Place the saved 4-epoch model and tokenizer there before continuing training."
    )

os.makedirs(SAVE_MODEL_PATH, exist_ok=True)

print(f"Loading model from: {LOAD_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(LOAD_MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(LOAD_MODEL_PATH)
print("✅ Model loaded!")

# Tokenize in batches
print("\nTokenizing texts in batches...")

def tokenize_in_batches(texts, labels, tokenizer, max_length, batch_size=1000, log_every=20):
    """Tokenize ``texts`` chunk by chunk and wrap the result in a ``Dataset``.

    This redefinition replaces the function of the same name defined in the
    earlier fine-tuning cell; it reports progress every 20th chunk by default.

    Every text is truncated and padded to exactly ``max_length`` tokens, so the
    per-chunk tensors can be concatenated into one tensor per field.

    Args:
        texts: pandas Series (sliced positionally via ``.iloc``) or any other
            sequence supporting ``len()`` and slicing, e.g. a list of str.
        labels: One label per text, in the same order; stored unchanged in
            the ``labels`` column.
        tokenizer: Callable accepting a list of str plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` and returning a
            mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Sequence length used for truncation and padding.
        batch_size: Number of texts handed to the tokenizer per call.
        log_every: Print progress on every ``log_every``-th chunk; ``None``,
            zero or a negative value disables progress output.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``
        columns.

    Raises:
        ValueError: If ``batch_size`` is not positive, ``texts`` is empty, or
            ``texts`` and ``labels`` differ in length.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    total = len(texts)
    if total == 0:
        # torch.cat on an empty list would otherwise fail with an opaque error.
        raise ValueError("texts is empty; nothing to tokenize")
    if len(labels) != total:
        raise ValueError(
            f"texts and labels differ in length: {total} vs {len(labels)}"
        )

    # A pandas Series must be sliced by position, not by index label.
    use_iloc = hasattr(texts, "iloc")

    all_input_ids = []
    all_attention_mask = []

    for start in range(0, total, batch_size):
        stop = start + batch_size
        chunk = texts.iloc[start:stop] if use_iloc else texts[start:stop]
        batch_texts = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)

        encodings = tokenizer(
            batch_texts,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )
        all_input_ids.append(encodings['input_ids'])
        all_attention_mask.append(encodings['attention_mask'])

        if log_every and log_every > 0 and (start // batch_size) % log_every == 0:
            print(f"  Tokenized {min(stop, total)}/{total}")

    return Dataset.from_dict({
        'input_ids': torch.cat(all_input_ids),
        'attention_mask': torch.cat(all_attention_mask),
        'labels': labels
    })

# Tokenize the training split with the reloaded tokenizer.
train_dataset = tokenize_in_batches(train_texts, y_train, tokenizer, BERT_MAX_LENGTH)
print("Train tokenization complete!")

# The evaluation set is built from the test split.
eval_dataset = tokenize_in_batches(test_texts, y_test, tokenizer, BERT_MAX_LENGTH)
print("Eval tokenization complete!")

# Training arguments
training_args = TrainingArguments(
    # Checkpoints of the continued run go to their own directory.
    output_dir=SAVE_MODEL_PATH,
    num_train_epochs=BERT_EPOCHS,
    per_device_train_batch_size=BERT_BATCH_SIZE,
    per_device_eval_batch_size=BERT_BATCH_SIZE,
    learning_rate=BERT_LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    warmup_steps=200,
    # Mixed precision requires a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

def compute_metrics(eval_pred):
    """Return accuracy and macro F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The class with the
    largest value along axis 1 is taken as the prediction for each row.
    """
    raw_outputs, true_labels = eval_pred
    predicted_classes = np.argmax(raw_outputs, axis=1)

    accuracy = accuracy_score(true_labels, predicted_classes)
    macro_f1 = f1_score(true_labels, predicted_classes, average='macro')
    return {'accuracy': accuracy, 'f1': macro_f1}

# Trainer for the continued run: reloaded model, new arguments, same datasets
# and metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

print("\nContinuing training for 4 more epochs...")
trainer.train()

# Save to new location
model.save_pretrained(SAVE_MODEL_PATH)
tokenizer.save_pretrained(SAVE_MODEL_PATH)

# Evaluate
print("\nEvaluating...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()
model.to(device)

bert_preds, bert_probs = [], []

# Walk the test texts in fixed-size slices; each slice is padded only to its
# own longest sequence (capped at BERT_MAX_LENGTH).
for start in range(0, len(test_texts), BERT_BATCH_SIZE):
    batch_texts = test_texts.iloc[start:start + BERT_BATCH_SIZE].tolist()

    encodings = tokenizer(
        batch_texts,
        truncation=True,
        padding=True,
        max_length=BERT_MAX_LENGTH,
        return_tensors='pt',
    )
    encodings = {name: tensor.to(device) for name, tensor in encodings.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**encodings)
        probs = outputs.logits.softmax(dim=1)
        preds = outputs.logits.argmax(dim=1)

    bert_preds.extend(preds.cpu().numpy())
    bert_probs.extend(probs.cpu().numpy())

bert_preds = np.array(bert_preds)
bert_probs = np.array(bert_probs)

# Save predictions
joblib.dump(
    dict(predictions=bert_preds, probabilities=bert_probs),
    os.path.join(SAVE_MODEL_PATH, "predictions.pkl"),
)

print(f"\n{'='*50}")
print("RESULTS (8 epochs total)")
print(f"{'='*50}")
print(f"Accuracy: {accuracy_score(y_test, bert_preds):.4f}")
print(f"Macro F1: {f1_score(y_test, bert_preds, average='macro'):.4f}")
print(f"\n✅ Model saved to: {SAVE_MODEL_PATH}")
print(f"✅ Original 4-epoch model unchanged at: {LOAD_MODEL_PATH}")

Loading model from: /content/drive/MyDrive/distilbert_model/4_epochs
✅ Model loaded!

Tokenizing texts in batches...
  Tokenized 1000/171149
  Tokenized 21000/171149
  Tokenized 41000/171149
  Tokenized 61000/171149
  Tokenized 81000/171149
  Tokenized 101000/171149
  Tokenized 121000/171149
  Tokenized 141000/171149
  Tokenized 161000/171149
Train tokenization complete!
  Tokenized 1000/42787
  Tokenized 21000/42787
  Tokenized 41000/42787
Eval tokenization complete!

Continuing training for 4 more epochs...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.087,0.509059,0.882932,0.88578


Training was aborted before the 4 additional epochs were completed because of a sharp increase in validation loss in the first epoch (0.359 at the end of the initial run vs. 0.509 here).