<a href="https://colab.research.google.com/github/adititadkod15-tech/HinglishLID1/blob/main/Google_T5_ITN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# 1. Fetch the pretrained tokenizer and weights for the chosen checkpoint.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 2. Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)  # moves the module's parameters onto `device`
print("Model loaded on: " + device)

# 3. Define the inference function
def run_itn(text, max_length=128, num_beams=4):
    """Run inverse text normalization (ITN) on a single spoken-form string.

    The text is prefixed with the ``"itn: "`` task tag, encoded with the
    module-level ``tokenizer``, decoded by the module-level ``model`` on
    ``device``, and returned as plain text with special tokens removed.

    Args:
        text: Spoken-form input, e.g. ``"may fifth twenty twenty six"``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Beam width forwarded to ``model.generate``; ``1`` disables
            beam search.

    Returns:
        The decoded text of the first returned sequence.
    """
    input_text = "itn: " + text
    encoded = tokenizer(input_text, return_tensors="pt").to(device)

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        # Hand over the tokenizer's mask instead of leaving generate() to
        # infer one, so the call stays correct if padding is ever enabled.
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        # early_stopping is a beam-search option; only request it when
        # beam search is actually in use.
        early_stopping=num_beams > 1,
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 4. Smoke test on one spoken-form date. The checkpoint loaded above is the
#    stock "t5-small"; the recorded output shows the input echoed back rather
#    than normalized.
sample = "the date is may fifth twenty twenty six"
print("Input:", sample)
print("Output:", run_itn(sample))

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/131 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model loaded on: cuda
Input: the date is may fifth twenty twenty six
Output: date: the date is may fifth twenty twenty six


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# 1. Load the base checkpoint and tokenizer (t5-small keeps iterations fast).
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 2. Data collator: handles dynamic padding of batches for seq2seq training.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# NOTE: `tokenized_dataset` is not created in this cell. It comes from the
# data-preparation cell (the `dataset.map(preprocess_function, ...)` call),
# which has to be run first on a fresh kernel.
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset.get("validation")  # None when the split is absent

# Periodic evaluation is only meaningful with a validation split; asking for
# it while eval_dataset is None makes the Trainer fail.
eval_strategy = "steps" if eval_dataset is not None else "no"

# fp16 mixed precision needs a CUDA device; fall back to fp32 elsewhere.
use_fp16 = torch.cuda.is_available()

# 3. Training arguments, tuned for a single Colab GPU.
training_args = Seq2SeqTrainingArguments(
    output_dir="./itn_t5_results",
    eval_strategy=eval_strategy,   # current name of the former `evaluation_strategy`
    eval_steps=1000,               # Evaluate every 1000 steps (when enabled)
    learning_rate=3e-4,            # Learning rate used for T5 fine-tuning here
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,            # 1 epoch is massive with 10M rows
    predict_with_generate=True,    # Use generation for eval metrics
    fp16=use_fp16,
    logging_dir='./logs',
    logging_steps=100,
    push_to_hub=False,
)

# 4. Initialize the Trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,    # current name of the former `tokenizer=` argument
    data_collator=data_collator,
)

# 5. Start training.
print("🚀 Launching Neural ITN Training...")
trainer.train()

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# 1. Locations on the training server -- adjust these to your own layout.
MODEL_PATH, DATA_PATH, OUTPUT_DIR = (
    "./t5-small-local",     # directory holding the downloaded T5 checkpoint
    "./data/t5_train.csv",  # training CSV
    "./itn_final_results",  # root directory for training output
)

# 2. Restore the tokenizer and the model from the local checkpoint directory.
print("Loading model from local storage...")
tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)

# 3. Read the CSV as a single "train" split.
#    With ~10M rows this step can take a while.
dataset = load_dataset("csv", data_files=dict(train=DATA_PATH))

def preprocess_function(examples, max_length=128):
    """Tokenize one batch of source/target pairs for seq2seq training.

    Args:
        examples: Batch mapping with an ``"input_text"`` and a
            ``"target_text"`` column, each holding one entry per row.
        max_length: Token cap applied, with truncation, to both the sources
            and the targets.

    Returns:
        The tokenizer output for the prefixed sources, with the target token
        ids stored under ``"labels"``.
    """
    # We add the "itn: " prefix so the model knows the task.
    sources = ["itn: " + str(doc) for doc in examples["input_text"]]
    # Coerce targets the same way as the sources: a cell that was parsed as a
    # non-string value (e.g. a bare number) would otherwise be rejected by
    # the tokenizer.
    targets = [str(doc) for doc in examples["target_text"]]

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager; it puts the target ids under
    # "labels". No padding here -- the training cell's collator pads batches.
    model_inputs = tokenizer(
        sources,
        text_target=targets,
        max_length=max_length,
        truncation=True,
        padding=False,
    )
    return model_inputs

print("Tokenizing data...")
# Batched map over every split using 4 worker processes; the raw CSV columns
# are dropped so only the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset["train"].column_names,
)

# 4. V100-oriented training arguments.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=32,   # V100 can easily handle 32-64 for t5-small
    gradient_accumulation_steps=2,    # Effective batch size per device = 32 * 2 = 64
    learning_rate=3e-4,
    num_train_epochs=1,               # 10M rows usually only needs 1 pass
    # Mixed precision requires a CUDA device; requesting it without one makes
    # the arguments object fail, so enable it only when a GPU is visible.
    fp16=torch.cuda.is_available(),
    logging_steps=100,
    save_steps=5000,                  # Save every 5k steps (don't fill up disk)
    save_total_limit=2,
    dataloader_num_workers=4,         # Keeps the GPU fed with data
    predict_with_generate=True,
    report_to="none"
)

# 5. Initialize the Trainer (train split only; no evaluation set is passed).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

# 6. Start training.
print("🚀 Training started on V100...")
trainer.train()

# 7. Save the final model and its tokenizer side by side, so the directory
#    can be loaded on its own with from_pretrained().
FINAL_MODEL_DIR = f"{OUTPUT_DIR}/itn_model_v100_final"
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)
print("✅ Done! Model saved locally.")

In [None]:
# Reference: t5-small model files on the Hugging Face Hub:
# https://huggingface.co/google-t5/t5-small/tree/main