In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
!pip install protobuf==3.20.3
# for kaggle

In [None]:
from torch.utils.data import Dataset
import json

# Read only the first JSONL record; `ex` serves as the sample example for the
# exploratory cells that follow.
path = "/kaggle/input/code-review-model-train-test-val-jsnol-files/val_data.jsonl"

with open(path, mode="r", encoding="utf-8") as jsonl_file:
    first_line = jsonl_file.readline()
ex = json.loads(first_line)

In [None]:
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM
# One checkpoint name drives both the tokenizer and the seq2seq model.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Truncation limits (in tokens) for the prompt and the completion of the
# sample example tokenized in the next cell.
max_input_tok, max_target_tok = 256, 128

In [None]:
# Tokenize the sample record: the prompt becomes the encoder input and the
# completion becomes the target, each truncated to its own token budget.
enc_inputs = tokenizer(
    ex["prompt"],
    max_length=max_input_tok,
    truncation=True,
)
enc_target = tokenizer(
    ex["completion"],
    max_length=max_target_tok,
    truncation=True,
)

In [None]:
import torch

# Turn the token-id lists of the sample example into tensors.
input_ids, attention_mask = (
    torch.tensor(enc_inputs[key]) for key in ("input_ids", "attention_mask")
)
labels = torch.tensor(enc_target["input_ids"])

In [None]:
# Prepend a batch dimension of size 1 to every tensor, forming a
# one-example batch.
batch = {
    name: tensor.unsqueeze(0)
    for name, tensor in (
        ("input_ids", input_ids),
        ("attention_mask", attention_mask),
        ("labels", labels),
    )
}

In [None]:
def collate_fn(batch):
    """Pad a list of tokenized examples into a single batch of tensors.

    Relies on the module-level ``tokenizer`` for padding.

    Args:
        batch: list of dicts holding ``"input_ids"`` and ``"labels"`` (lists
            of token ids) and, optionally, ``"attention_mask"``.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
        tensors. Label positions introduced by padding are set to -100 so
        the loss ignores them.
    """
    inputs = []
    for example in batch:
        features = {"input_ids": example["input_ids"]}
        # Forward the mask only when it exists: a None entry makes
        # tokenizer.pad fail, whereas an absent key lets it build the mask.
        if example.get("attention_mask") is not None:
            features["attention_mask"] = example["attention_mask"]
        inputs.append(features)
    padded = tokenizer.pad(inputs, return_attention_mask=True, return_tensors="pt")

    label_batch = tokenizer.pad(
        {"input_ids": [example["labels"] for example in batch]},
        return_attention_mask=True,
        return_tensors="pt",
    )
    # Mask by padding *position* instead of by token id. Comparing against
    # pad_token_id (or the EOS id as a fallback) would also blank out genuine
    # label tokens that share that id, e.g. every real EOS token.
    padded_labels = label_batch["input_ids"].masked_fill(
        label_batch["attention_mask"] == 0, -100
    )

    return {
        "input_ids": padded["input_ids"],
        "attention_mask": padded["attention_mask"],
        "labels": padded_labels
    }

In [None]:
class PRDataset(Dataset):
    """Map-style dataset over a JSONL file of prompt/completion records.

    The constructor scans the file once and remembers the byte offset of
    every non-blank line, so records are read and tokenized lazily and the
    file never has to be held in memory.

    Args:
        path: path to a UTF-8 JSONL file whose records contain the keys
            ``"prompt"`` and ``"completion"``.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, max_length=...)`` that returns
            a mapping with ``"input_ids"`` (and ``"attention_mask"`` for the
            prompt).
        max_input_len: truncation length for the prompt.
        max_target_len: truncation length for the completion.
    """

    def __init__(self, path, tokenizer,
                 max_input_len=256, max_target_len=128):

        self.path = path
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

        # Binary mode gives plain byte offsets (text-mode tell() returns
        # opaque cookies). Blank lines, such as a trailing empty line, are
        # skipped so they can never reach json.loads in __getitem__.
        self._offsets = []
        with open(self.path, "rb") as f:
            offset = 0
            for line in iter(f.readline, b""):
                if line.strip():
                    self._offsets.append(offset)
                offset += len(line)

    def __len__(self):
        """Return the number of indexed (non-blank) records."""
        return len(self._offsets)

    def __getitem__(self, idx):
        """Read record ``idx`` from disk and tokenize it.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` for the prompt
            and ``"labels"`` holding the completion's token ids.
        """
        # The file is reopened on every access, so no handle is shared
        # between DataLoader worker processes.
        with open(self.path, "rb") as f:
            f.seek(self._offsets[idx])
            rec = json.loads(f.readline().decode("utf-8"))

        enc = self.tokenizer(
            rec["prompt"],
            truncation=True,
            max_length=self.max_input_len,
            return_attention_mask=True
        )
        tgt = self.tokenizer(
            rec["completion"],
            truncation=True,
            max_length=self.max_target_len
        )

        return {
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
            "labels": tgt["input_ids"]
        }



import warnings

# The original cell built the *training* dataset from val_data.jsonl, so the
# model was trained and validated on the same records and the validation loss
# could not indicate generalization.
# NOTE(review): the training split is assumed to be "train_data.jsonl" in the
# same Kaggle dataset directory - confirm the actual file name.
data_dir = "/kaggle/input/code-review-model-train-test-val-jsnol-files"
path = os.path.join(data_dir, "train_data.jsonl")
if not os.path.exists(path):
    # Keep the notebook runnable, but make the data leakage impossible to miss.
    warnings.warn(
        f"{path} not found; falling back to the validation file for training. "
        "Validation loss will not measure generalization."
    )
    path = "/kaggle/input/code-review-model-train-test-val-jsnol-files/val_data.jsonl"

train_dataset = PRDataset(
    path,
    tokenizer,
    max_input_len=256,
    max_target_len=128
)

In [None]:
from torch.utils.data import DataLoader
# Training batches are drawn in a new random order every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
    collate_fn=collate_fn,
    persistent_workers=True
)

val_dataset = PRDataset(
    "/kaggle/input/code-review-model-train-test-val-jsnol-files/val_data.jsonl",
    tokenizer,
    max_input_len=256,
    max_target_len=128
)

# Validation is not shuffled: the reported validation loss is an average over
# batches, so a fixed batch composition keeps the number comparable from one
# epoch to the next.
val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    collate_fn=collate_fn,
    persistent_workers=True
)


In [None]:
# Smoke test: pull one collated batch, check that input_ids is 2-D and show
# the shapes of the three tensors.
batch = next(iter(train_loader))
assert batch["input_ids"].ndim == 2
print(*(batch[key].shape for key in ("input_ids", "attention_mask", "labels")))

In [None]:
import torch
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader
import torch.nn.utils as nn_utils

torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Mixed precision and loss scaling are CUDA features; on CPU both are switched
# off so the same cell runs without AMP warnings.
# NOTE(review): T5 checkpoints are reported to overflow under fp16 autocast -
# if the loss turns into NaN, try bf16 or full precision.
use_amp = device.type == "cuda"
model.config.use_cache = False
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

ACCUM_STEPS = 8
scaler = GradScaler(enabled=use_amp)
optimizer.zero_grad()

num_train_batches = len(train_loader)
# Size of the last, possibly shorter, accumulation group of every epoch
# (0 when the batch count is an exact multiple of ACCUM_STEPS).
tail_size = num_train_batches % ACCUM_STEPS

for epoch in range(1, 6):
    model.train()
    total_loss = 0.0
    step_count = 0

    for step, batch in enumerate(train_loader):
        # move tensors to device
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['labels'].to(device, non_blocking=True)

        # Batches in the short tail group are averaged over the tail size so
        # that its update is weighted like a full group.
        in_tail = step >= num_train_batches - tail_size
        group_size = tail_size if in_tail else ACCUM_STEPS

        with autocast(enabled=use_amp):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                use_cache=False
            )
            batch_loss = outputs.loss
            loss = batch_loss / group_size

        scaler.scale(loss).backward()

        # Step at every full group AND on the last batch of the epoch;
        # otherwise the leftover gradients are never applied and bleed into
        # the first update of the next epoch.
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % ACCUM_STEPS == 0 or is_last_batch:
            # Gradients must be unscaled before clipping so that max_norm
            # applies to their true magnitude.
            scaler.unscale_(optimizer)
            nn_utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += batch_loss.item()  # un-normalized per-batch loss
        step_count += 1

    avg_train_loss = total_loss / max(1, step_count)
    print(f"Epoch {epoch} train avg loss: {avg_train_loss:.4f}")

    model.eval()
    val_loss = 0.0
    val_steps = 0
    with torch.no_grad():
        for vbatch in val_loader:
            v_input_ids = vbatch['input_ids'].to(device, non_blocking=True)
            v_attention_mask = vbatch['attention_mask'].to(device, non_blocking=True)
            v_labels = vbatch['labels'].to(device, non_blocking=True)
            with autocast(enabled=use_amp):
                vout = model(
                    input_ids=v_input_ids,
                    attention_mask=v_attention_mask,
                    labels=v_labels,
                    use_cache=False
                )
            val_loss += vout.loss.item()
            val_steps += 1
    print(f"Epoch {epoch} val avg loss: {val_loss / max(1, val_steps):.4f}")

# Persist the fine-tuned model together with its tokenizer after the last epoch.
model.save_pretrained("/kaggle/working/")
tokenizer.save_pretrained("/kaggle/working/")

In [None]:
# Saves the model to /kaggle/working/ once more. The training cell already
# ends with this exact call, so this cell is redundant unless the model was
# modified in between.
model.save_pretrained("/kaggle/working/")