In [1]:
# -------------------------
# Teacher Training (4 epochs, save to results_teacher_4epoch)
# Paste & run as a single cell in Colab
# -------------------------

# 0) Mount Google Drive
# The input CSVs and every output of this cell live under /content/drive,
# so the mount has to succeed before any of those paths are touched.
from google.colab import drive
# force_remount=True requests a fresh mount even if one already exists
# (presumably to make re-running this cell in the same session safe).
drive.mount('/content/drive', force_remount=True)

# 1) Install dependencies
!pip install -q transformers datasets evaluate accelerate

# 2) Imports
import os, gc, warnings
import numpy as np
import pandas as pd
import torch
import evaluate
from datasets import Dataset, DatasetDict, Value
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding, set_seed
)
warnings.filterwarnings("ignore")

# 3) Paths & settings
# Every file this cell reads or writes sits under a single Drive folder.
DRIVE_BASE = "/content/drive/MyDrive/Colab Notebooks/CodeMix"
train_path, val_path, test_path = (
    os.path.join(DRIVE_BASE, csv_name)
    for csv_name in ("train.csv", "val.csv", "test.csv")
)

# Dedicated output folder so results of earlier teacher runs are not overwritten.
RESULTS_DIR = os.path.join(DRIVE_BASE, "results_teacher_4epoch")
os.makedirs(RESULTS_DIR, exist_ok=True)

# Seed is fixed here, before the model is instantiated further down.
SEED = 42
set_seed(SEED)

CHECKPOINT = "distilbert-base-multilingual-cased"
MAX_LEN = 64  # token length used for truncation and padding

# OOM-safe defaults (tune later); gradients are accumulated over GRAD_ACCUM
# batches of PER_DEVICE_BATCH examples.
PER_DEVICE_BATCH, GRAD_ACCUM = 4, 2
EPOCHS = 4
LR = 2e-5

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# 4) Load CSVs (from data_prep notebook)
# Fail fast and name exactly which split file is absent, instead of only
# reporting that "something" under DRIVE_BASE is missing.
missing_paths = [p for p in (train_path, val_path, test_path) if not os.path.exists(p)]
if missing_paths:
    raise FileNotFoundError(
        "Expected train/val/test CSVs under DRIVE_BASE. Run data_prep notebook first. "
        "Missing: " + ", ".join(missing_paths)
    )

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

# The rest of the cell indexes the "review" and "label" columns of every
# split. Checking for them here reports a schema problem against the
# offending file rather than as an error deep inside tokenization.
for csv_path, split_df in ((train_path, train_df), (val_path, val_df), (test_path, test_df)):
    absent_columns = {"review", "label"} - set(split_df.columns)
    if absent_columns:
        raise ValueError(
            f"{csv_path} is missing required column(s): {sorted(absent_columns)}"
        )

print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))

# 5) Hugging Face DatasetDict
# One Dataset per split; each frame's index is reset (and dropped) before
# it is converted.
_split_frames = {"train": train_df, "validation": val_df, "test": test_df}
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in _split_frames.items()
})

# 6) Tokenizer + tokenization
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

def tokenize_fn(batch):
    """Encode the ``review`` texts of ``batch`` to exactly ``MAX_LEN`` tokens.

    Longer texts are truncated and shorter ones are padded, so every encoded
    sequence comes out with the same length.
    """
    reviews = batch["review"]
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LEN,
    }
    return tokenizer(reviews, **encode_options)

# Tokenize every split. All original columns are dropped in the same pass,
# so only the tokenizer outputs remain afterwards.
source_columns = dataset["train"].column_names
dataset = dataset.map(tokenize_fn, batched=True, remove_columns=source_columns)

# 7) Put the labels back from the source DataFrames and force them to int64
label_frames = {"train": train_df, "validation": val_df, "test": test_df}
for split in ("train", "validation", "test"):
    int_labels = label_frames[split]["label"].astype(int).tolist()
    with_labels = dataset[split].add_column("label", int_labels)
    dataset[split] = with_labels.cast_column("label", Value("int64"))

# 8) Torch format
# Expose only the columns fed to the model; token_type_ids is slotted in
# right after input_ids when the tokenizer produced it.
has_token_types = "token_type_ids" in dataset["train"].column_names
cols = [
    "input_ids",
    *(["token_type_ids"] if has_token_types else []),
    "attention_mask",
    "label",
]
dataset.set_format(type="torch", columns=cols)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Check one sample
sample = dataset["train"][0]
print(
    "Sample keys:", list(sample.keys()),
    "len(input_ids):", len(sample["input_ids"]),
    "label:", sample["label"],
)

# 9) Teacher model
# Two-class sequence classifier initialised from CHECKPOINT. The config also
# switches on hidden-state and attention outputs for the model.
num_labels = 2
teacher_config = AutoConfig.from_pretrained(
    CHECKPOINT,
    num_labels=num_labels,
    output_hidden_states=True,
    output_attentions=True
)
teacher = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, config=teacher_config).to(device)

# Gradient checkpointing is an optional memory saving. Training must still go
# ahead when it cannot be enabled, so the failure stays non-fatal; the reason
# is printed rather than discarded so it is clear whether it is active.
try:
    teacher.gradient_checkpointing_enable()
except Exception as err:  # best-effort on purpose: any failure here is non-fatal
    print("Gradient checkpointing not enabled:", repr(err))

# 10) Metrics
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy and macro-F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. When the first item is a
    tuple, its first element is taken as the logits. Predictions are the
    argmax over the last axis.
    """
    logits, labels = eval_pred
    scores = logits[0] if isinstance(logits, tuple) else logits
    preds = np.argmax(scores, axis=-1)

    accuracy_result = accuracy_metric.compute(predictions=preds, references=labels)
    f1_result = f1_metric.compute(predictions=preds, references=labels, average="macro")
    return {
        "accuracy": accuracy_result["accuracy"],
        "macro_f1": f1_result["f1"],
    }

# 11) TrainingArguments helper (compatibility)
def make_train_args(output_dir, **kwargs):
    """Build ``TrainingArguments`` whichever way the eval-strategy option is spelled.

    Callers may pass either ``eval_strategy`` or ``evaluation_strategy``; the
    value is forwarded under the name the installed ``TrainingArguments``
    accepts. When both names are accepted, ``eval_strategy`` is used, because
    releases that accept both treat ``evaluation_strategy`` as the deprecated
    alias.

    Args:
        output_dir: Passed through as ``TrainingArguments(output_dir=...)``.
        **kwargs: Any other ``TrainingArguments`` keyword arguments.

    Returns:
        The constructed ``TrainingArguments`` instance.
    """
    import inspect

    ta_kwargs = dict(kwargs)
    # Look at the real parameter list. ``__code__.co_varnames`` also contains
    # local variables and reflects the wrapper when ``__init__`` is decorated.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    if "eval_strategy" in accepted:
        target_name, other_name = "eval_strategy", "evaluation_strategy"
    else:
        target_name, other_name = "evaluation_strategy", "eval_strategy"
    # Move the value over only when the caller used the other spelling.
    if other_name in ta_kwargs:
        ta_kwargs[target_name] = ta_kwargs.pop(other_name)
    return TrainingArguments(output_dir=output_dir, **ta_kwargs)

# Teacher run configuration: evaluate and checkpoint once per epoch, log
# every 50 steps, no external reporting, fp16 only when CUDA is available.
# NOTE(review): load_best_model_at_end is not set here, so the weights left
# in the Trainer after training are presumably the last-epoch ones - confirm
# that is intended.
use_fp16 = torch.cuda.is_available()
teacher_run_options = {
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "per_device_train_batch_size": PER_DEVICE_BATCH,
    "per_device_eval_batch_size": PER_DEVICE_BATCH,
    "gradient_accumulation_steps": GRAD_ACCUM,
    "num_train_epochs": EPOCHS,
    "learning_rate": LR,
    "logging_steps": 50,
    "report_to": "none",
    "fp16": use_fp16,
}
train_args = make_train_args(output_dir=RESULTS_DIR, **teacher_run_options)

# 12) Trainer
import inspect

# transformers is installed unpinned in this cell, so the Trainer signature
# depends on the release that gets pulled in. Newer releases take the
# tokenizer as `processing_class` and deprecate the `tokenizer` keyword;
# older ones only know `tokenizer`. Pick whichever the installed Trainer
# accepts, mirroring what make_train_args does for TrainingArguments.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer_teacher = Trainer(
    model=teacher,
    args=train_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# 13) Train
# The teacher config enables hidden-state and attention outputs, and the
# Trainer's evaluation loop collects every model output it is not told to
# ignore for the whole evaluation set. compute_metrics only uses the logits,
# so the extra outputs are excluded to keep evaluation memory small. The
# reported metrics are unaffected.
EVAL_IGNORE_KEYS = ["hidden_states", "attentions"]

# Report the configured epoch count rather than a hard-coded number.
print(f"Starting teacher training ({EPOCHS} epochs)...")
trainer_teacher.train(ignore_keys_for_eval=EVAL_IGNORE_KEYS)

# 14) Evaluate
teacher_eval = trainer_teacher.evaluate(dataset["test"], ignore_keys=EVAL_IGNORE_KEYS)
print("Teacher test results:", teacher_eval)

# 15) Save model + tokenizer into the results_teacher_4epoch folder
# Model and tokenizer go into sibling sub-folders of RESULTS_DIR.
teacher_save_dir, tokenizer_save_dir = (
    os.path.join(RESULTS_DIR, leaf) for leaf in ("model", "tokenizer")
)
for out_dir in (teacher_save_dir, tokenizer_save_dir):
    os.makedirs(out_dir, exist_ok=True)

trained_model = trainer_teacher.model
trained_model.save_pretrained(teacher_save_dir)
tokenizer.save_pretrained(tokenizer_save_dir)
print("Saved teacher model ->", teacher_save_dir)
print("Saved tokenizer ->", tokenizer_save_dir)

# Free GPU memory for student runs: move the weights to the CPU, then
# run the garbage collector and empty the CUDA cache.
trained_model.to("cpu")
gc.collect()
torch.cuda.empty_cache()


Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda
Train/Val/Test sizes: 3000 800 800


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Sample keys: ['input_ids', 'attention_mask', 'label'] len(input_ids): 64 label: tensor(0)


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Starting teacher training (4 epochs)...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.5563,0.580764,0.72875,0.728342
2,0.4219,0.539543,0.7575,0.757494
3,0.3425,0.659168,0.75875,0.758514
4,0.2007,0.826626,0.75625,0.75625


Teacher test results: {'eval_loss': 0.859175443649292, 'eval_accuracy': 0.7525, 'eval_macro_f1': 0.7523126864691423, 'eval_runtime': 5.2476, 'eval_samples_per_second': 152.45, 'eval_steps_per_second': 38.113, 'epoch': 4.0}
Saved teacher model -> /content/drive/MyDrive/Colab Notebooks/CodeMix/results_teacher_4epoch/model
Saved tokenizer -> /content/drive/MyDrive/Colab Notebooks/CodeMix/results_teacher_4epoch/tokenizer
