In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
!pip install transformers datasets sacrebleu sentencepiece




In [4]:
from datasets import load_dataset, Dataset

dataset = load_dataset("juancavallotti/multilingual-gec")

print("datasetdict looks like:",dataset)

# Keep only relevant columns for eng
train_dataset = dataset["train"].filter(lambda example: example["lang"] == "en")
train_dataset = train_dataset.remove_columns(["transformation", "sec_transformation", "__index_level_0__"])

print(train_dataset[0:5])

datasetdict looks like: DatasetDict({
    train: Dataset({
        features: ['lang', 'sentence', 'modified', 'transformation', 'sec_transformation', '__index_level_0__'],
        num_rows: 216318
    })
    test: Dataset({
        features: ['lang', 'sentence', 'modified', 'transformation', 'sec_transformation', '__index_level_0__'],
        num_rows: 2186
    })
})
{'lang': ['en', 'en', 'en', 'en', 'en'], 'sentence': ['Plants, obviously, cannot move after they have put down roots.', 'I looked at the schedule.', "It's very hard to get rid of bad habits.", "Anyway, I think I've said enough.", 'Technologies allow you to do more things.'], 'modified': ["fix grammar: Plants, obviously, cannot moved after they hadn't put down roots.", 'fix grammar: I looked at schedule.', 'fix grammar: It am very hard to get rid of bad habits.', "fix grammar: Anyway, think I've said enough.", 'fix grammar: Technologies allow you to do most things.']}


In [5]:
processed_data = {
    "input_text": train_dataset["modified"],
    "target_text": train_dataset["sentence"]
}


# Convert to Hugging Face Dataset format
formatted_dataset = Dataset.from_dict(processed_data)


for i in range(5):  
    print(f"Example {i+1}:")
    print(f"  Input: {formatted_dataset['input_text'][i]}")
    print(f"  Target: {formatted_dataset['target_text'][i]}")
    print("-" * 50)

Example 1:
  Input: fix grammar: Plants, obviously, cannot moved after they hadn't put down roots.
  Target: Plants, obviously, cannot move after they have put down roots.
--------------------------------------------------
Example 2:
  Input: fix grammar: I looked at schedule.
  Target: I looked at the schedule.
--------------------------------------------------
Example 3:
  Input: fix grammar: It am very hard to get rid of bad habits.
  Target: It's very hard to get rid of bad habits.
--------------------------------------------------
Example 4:
  Input: fix grammar: Anyway, think I've said enough.
  Target: Anyway, I think I've said enough.
--------------------------------------------------
Example 5:
  Input: fix grammar: Technologies allow you to do most things.
  Target: Technologies allow you to do more things.
--------------------------------------------------


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

In [7]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Tokenize input and target text
def tokenize_function(examples):
    inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples["target_text"], padding="max_length", truncation=True, max_length=128)
    
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": targets["input_ids"]
    }


tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)

print(tokenized_dataset[0])

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/50862 [00:00<?, ? examples/s]

{'input_text': "fix grammar: Plants, obviously, cannot moved after they hadn't put down roots.", 'target_text': 'Plants, obviously, cannot move after they have put down roots.', 'input_ids': [2210, 19519, 10, 6041, 7, 6, 6865, 6, 1178, 2301, 227, 79, 12381, 31, 17, 474, 323, 8523, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [8]:
from sklearn.model_selection import train_test_split

train_size = int(0.8 * len(tokenized_dataset))
train_dataset = tokenized_dataset.select(range(train_size))
eval_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))

In [10]:
from transformers import  TrainerCallback

model = T5ForConditionalGeneration.from_pretrained("t5-base")

training_args = TrainingArguments(
    output_dir="/kaggle/working/t5-grammar-corrector",  
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_steps=50, 
    save_steps=500,  
    evaluation_strategy="steps",  
    eval_steps=500,  
    save_total_limit=2, 
    fp16=True,
    report_to="none", 
)


class ProgressCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            print(f"Step {state.global_step}: Loss = {logs.get('loss', 'N/A')}, LR = {logs.get('learning_rate', 'N/A')}", flush=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[ProgressCallback()],
)


trainer.train()


model.save_pretrained("/kaggle/working/t5-grammar-corrector")


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,0.027,0.018151
1000,0.0178,0.015229
1500,0.0172,0.013195
2000,0.0154,0.012304
2500,0.014,0.011244
3000,0.0124,0.011037
3500,0.0111,0.010668
4000,0.0107,0.010229
4500,0.0095,0.010147
5000,0.0115,0.009619


Step 50: Loss = 3.093, LR = 4.9672431865828094e-05
Step 100: Loss = 0.1017, LR = 4.9344863731656185e-05
Step 150: Loss = 0.043, LR = 4.9017295597484283e-05
Step 200: Loss = 0.0337, LR = 4.8689727463312375e-05
Step 250: Loss = 0.0284, LR = 4.8362159329140466e-05
Step 300: Loss = 0.0292, LR = 4.803459119496855e-05
Step 350: Loss = 0.0258, LR = 4.770702306079665e-05
Step 400: Loss = 0.0287, LR = 4.737945492662474e-05
Step 450: Loss = 0.0257, LR = 4.705188679245283e-05
Step 500: Loss = 0.027, LR = 4.672431865828092e-05
Step 500: Loss = N/A, LR = N/A




Step 550: Loss = 0.024, LR = 4.6396750524109015e-05
Step 600: Loss = 0.0212, LR = 4.606918238993711e-05
Step 650: Loss = 0.023, LR = 4.5741614255765204e-05
Step 700: Loss = 0.0205, LR = 4.5414046121593296e-05
Step 750: Loss = 0.0193, LR = 4.508647798742139e-05
Step 800: Loss = 0.021, LR = 4.475890985324948e-05
Step 850: Loss = 0.0179, LR = 4.443134171907757e-05
Step 900: Loss = 0.0194, LR = 4.410377358490566e-05
Step 950: Loss = 0.0185, LR = 4.377620545073375e-05
Step 1000: Loss = 0.0178, LR = 4.3448637316561844e-05
Step 1000: Loss = N/A, LR = N/A




Step 1050: Loss = 0.019, LR = 4.312106918238994e-05
Step 1100: Loss = 0.0185, LR = 4.2793501048218034e-05
Step 1150: Loss = 0.0175, LR = 4.2465932914046125e-05
Step 1200: Loss = 0.018, LR = 4.213836477987422e-05
Step 1250: Loss = 0.0176, LR = 4.181079664570231e-05
Step 1300: Loss = 0.0163, LR = 4.14832285115304e-05
Step 1350: Loss = 0.0153, LR = 4.115566037735849e-05
Step 1400: Loss = 0.0159, LR = 4.082809224318658e-05
Step 1450: Loss = 0.0191, LR = 4.0500524109014674e-05
Step 1500: Loss = 0.0172, LR = 4.017295597484277e-05
Step 1500: Loss = N/A, LR = N/A




Step 1550: Loss = 0.0164, LR = 3.984538784067086e-05
Step 1600: Loss = 0.0144, LR = 3.9517819706498955e-05
Step 1650: Loss = 0.015, LR = 3.9190251572327046e-05
Step 1700: Loss = 0.0152, LR = 3.886268343815514e-05
Step 1750: Loss = 0.0157, LR = 3.8535115303983236e-05
Step 1800: Loss = 0.0144, LR = 3.820754716981133e-05
Step 1850: Loss = 0.0159, LR = 3.787997903563941e-05
Step 1900: Loss = 0.0154, LR = 3.75524109014675e-05
Step 1950: Loss = 0.0163, LR = 3.7224842767295595e-05
Step 2000: Loss = 0.0154, LR = 3.689727463312369e-05
Step 2000: Loss = N/A, LR = N/A




Step 2200: Loss = 0.0152, LR = 3.558700209643606e-05
Step 2250: Loss = 0.0157, LR = 3.525943396226416e-05
Step 2300: Loss = 0.0145, LR = 3.493186582809225e-05
Step 2350: Loss = 0.0149, LR = 3.460429769392033e-05
Step 2400: Loss = 0.0155, LR = 3.4276729559748424e-05
Step 2450: Loss = 0.0152, LR = 3.394916142557652e-05
Step 2500: Loss = 0.014, LR = 3.3621593291404614e-05
Step 2500: Loss = N/A, LR = N/A




Step 2550: Loss = 0.0149, LR = 3.3294025157232705e-05
Step 2600: Loss = 0.0116, LR = 3.2966457023060796e-05
Step 2650: Loss = 0.0115, LR = 3.263888888888889e-05
Step 2700: Loss = 0.0129, LR = 3.2311320754716986e-05
Step 2750: Loss = 0.0136, LR = 3.198375262054508e-05
Step 2800: Loss = 0.0119, LR = 3.165618448637317e-05
Step 2850: Loss = 0.0115, LR = 3.132861635220126e-05
Step 2900: Loss = 0.0101, LR = 3.100104821802935e-05
Step 2950: Loss = 0.0103, LR = 3.067348008385744e-05
Step 3000: Loss = 0.0124, LR = 3.0345911949685535e-05
Step 3000: Loss = N/A, LR = N/A




Step 3050: Loss = 0.0121, LR = 3.0018343815513626e-05
Step 3100: Loss = 0.0117, LR = 2.969077568134172e-05
Step 3150: Loss = 0.0117, LR = 2.9363207547169812e-05
Step 3200: Loss = 0.0118, LR = 2.9035639412997907e-05
Step 3250: Loss = 0.0111, LR = 2.8708071278826e-05
Step 3300: Loss = 0.0111, LR = 2.838050314465409e-05
Step 3350: Loss = 0.0116, LR = 2.8052935010482185e-05
Step 3400: Loss = 0.0124, LR = 2.7725366876310273e-05
Step 3450: Loss = 0.0115, LR = 2.7397798742138364e-05
Step 3500: Loss = 0.0111, LR = 2.7070230607966455e-05
Step 3500: Loss = N/A, LR = N/A




Step 3550: Loss = 0.0108, LR = 2.674266247379455e-05
Step 3600: Loss = 0.0104, LR = 2.641509433962264e-05
Step 3650: Loss = 0.0107, LR = 2.6087526205450736e-05
Step 3700: Loss = 0.0113, LR = 2.5759958071278828e-05
Step 3750: Loss = 0.0116, LR = 2.543238993710692e-05
Step 3800: Loss = 0.0102, LR = 2.5104821802935014e-05
Step 3850: Loss = 0.0098, LR = 2.4777253668763102e-05
Step 3900: Loss = 0.0111, LR = 2.4449685534591197e-05
Step 3950: Loss = 0.0111, LR = 2.412211740041929e-05
Step 4000: Loss = 0.0107, LR = 2.3794549266247383e-05
Step 4000: Loss = N/A, LR = N/A




Step 4050: Loss = 0.0121, LR = 2.346698113207547e-05
Step 4100: Loss = 0.009, LR = 2.3139412997903566e-05
Step 4150: Loss = 0.0104, LR = 2.2811844863731657e-05
Step 4200: Loss = 0.0108, LR = 2.248427672955975e-05
Step 4250: Loss = 0.0105, LR = 2.2156708595387844e-05
Step 4300: Loss = 0.0115, LR = 2.182914046121593e-05
Step 4350: Loss = 0.0096, LR = 2.1501572327044026e-05
Step 4400: Loss = 0.0096, LR = 2.1174004192872118e-05
Step 4450: Loss = 0.0103, LR = 2.0846436058700213e-05
Step 4500: Loss = 0.0095, LR = 2.0518867924528304e-05
Step 4500: Loss = N/A, LR = N/A




Step 4550: Loss = 0.0109, LR = 2.0191299790356395e-05
Step 4600: Loss = 0.0102, LR = 1.9863731656184487e-05
Step 4650: Loss = 0.0099, LR = 1.9536163522012578e-05
Step 4700: Loss = 0.0111, LR = 1.9208595387840673e-05
Step 4750: Loss = 0.0106, LR = 1.8881027253668765e-05
Step 4800: Loss = 0.0094, LR = 1.8553459119496856e-05
Step 4850: Loss = 0.0097, LR = 1.8225890985324947e-05
Step 4900: Loss = 0.0089, LR = 1.789832285115304e-05
Step 4950: Loss = 0.0112, LR = 1.7570754716981134e-05
Step 5000: Loss = 0.0115, LR = 1.7243186582809225e-05
Step 5000: Loss = N/A, LR = N/A




Step 5050: Loss = 0.0094, LR = 1.691561844863732e-05
Step 5100: Loss = 0.0089, LR = 1.6588050314465408e-05
Step 5150: Loss = 0.0087, LR = 1.6260482180293503e-05
Step 5200: Loss = 0.0088, LR = 1.5932914046121594e-05
Step 5250: Loss = 0.0088, LR = 1.5605345911949685e-05
Step 5300: Loss = 0.008, LR = 1.527777777777778e-05
Step 5350: Loss = 0.009, LR = 1.495020964360587e-05
Step 5400: Loss = 0.0087, LR = 1.4622641509433963e-05
Step 5450: Loss = 0.0095, LR = 1.4295073375262054e-05
Step 5500: Loss = 0.0088, LR = 1.3967505241090148e-05
Step 5500: Loss = N/A, LR = N/A




Step 5550: Loss = 0.0081, LR = 1.363993710691824e-05
Step 5600: Loss = 0.0075, LR = 1.331236897274633e-05
Step 5650: Loss = 0.0088, LR = 1.2984800838574423e-05
Step 5700: Loss = 0.0091, LR = 1.2657232704402517e-05
Step 5750: Loss = 0.0076, LR = 1.232966457023061e-05
Step 5800: Loss = 0.0091, LR = 1.2002096436058701e-05
Step 5850: Loss = 0.0084, LR = 1.1674528301886793e-05
Step 5900: Loss = 0.0087, LR = 1.1346960167714884e-05
Step 5950: Loss = 0.0086, LR = 1.1019392033542977e-05
Step 6000: Loss = 0.009, LR = 1.069182389937107e-05
Step 6000: Loss = N/A, LR = N/A




Step 6050: Loss = 0.0078, LR = 1.0364255765199162e-05
Step 6100: Loss = 0.0084, LR = 1.0036687631027255e-05
Step 6150: Loss = 0.0071, LR = 9.709119496855348e-06
Step 6200: Loss = 0.0084, LR = 9.38155136268344e-06
Step 6250: Loss = 0.0088, LR = 9.05398322851153e-06
Step 6300: Loss = 0.0085, LR = 8.726415094339622e-06
Step 6350: Loss = 0.0089, LR = 8.398846960167715e-06
Step 6400: Loss = 0.0093, LR = 8.071278825995808e-06
Step 6450: Loss = 0.0088, LR = 7.7437106918239e-06
Step 6500: Loss = 0.0083, LR = 7.416142557651992e-06
Step 6500: Loss = N/A, LR = N/A




Step 6550: Loss = 0.0074, LR = 7.088574423480083e-06
Step 6600: Loss = 0.0082, LR = 6.761006289308176e-06
Step 6650: Loss = 0.0077, LR = 6.433438155136269e-06
Step 6700: Loss = 0.0076, LR = 6.105870020964361e-06
Step 6750: Loss = 0.0087, LR = 5.778301886792453e-06
Step 6800: Loss = 0.0082, LR = 5.4507337526205454e-06
Step 6850: Loss = 0.008, LR = 5.123165618448638e-06
Step 6900: Loss = 0.0087, LR = 4.79559748427673e-06
Step 6950: Loss = 0.0076, LR = 4.468029350104822e-06
Step 7000: Loss = 0.0083, LR = 4.1404612159329145e-06
Step 7000: Loss = N/A, LR = N/A




Step 7050: Loss = 0.0082, LR = 3.8128930817610063e-06
Step 7100: Loss = 0.0075, LR = 3.4853249475890986e-06
Step 7150: Loss = 0.009, LR = 3.1577568134171913e-06
Step 7200: Loss = 0.0084, LR = 2.830188679245283e-06
Step 7250: Loss = 0.0075, LR = 2.5026205450733754e-06
Step 7300: Loss = 0.008, LR = 2.1750524109014676e-06
Step 7350: Loss = 0.0082, LR = 1.8474842767295599e-06
Step 7400: Loss = 0.0078, LR = 1.519916142557652e-06
Step 7450: Loss = 0.0072, LR = 1.1923480083857442e-06
Step 7500: Loss = 0.0089, LR = 8.647798742138365e-07
Step 7500: Loss = N/A, LR = N/A




Step 7550: Loss = 0.0091, LR = 5.372117400419287e-07
Step 7600: Loss = 0.0092, LR = 2.09643605870021e-07
Step 7632: Loss = N/A, LR = N/A


In [9]:
pip install sacrebleu


Note: you may need to restart the kernel to use updated packages.


In [10]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer


device = "cuda" if torch.cuda.is_available() else "cpu"


model_path = "/kaggle/working/t5-grammar-corrector"
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)


tokenizer = T5Tokenizer.from_pretrained("t5-base")

print("Model loaded")


Model loaded


In [11]:
import torch

sample = eval_dataset[0]
input_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
true_output = tokenizer.decode(sample["labels"], skip_special_tokens=True)


model.eval()
with torch.no_grad():
    pred_ids = model.generate(torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device))
    predicted_text = tokenizer.decode(pred_ids[0], skip_special_tokens=True)

print("Input Text:", input_text)
print("Expected Output:", true_output)
print("Model Prediction:", predicted_text)


Input Text: fix grammar: Things is improving in Algeria.
Expected Output: Things are improving in Algeria.
Model Prediction: Things are improving in Algeria.


In [None]:
import evaluate
import numpy as np
import torch
from transformers import GenerationConfig, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load BLEU metric
bleu = evaluate.load("sacrebleu")

# Define Generation Configuration
generation_config = GenerationConfig(
    max_length=128,
    num_beams=5,
    early_stopping=True
)

# ✅ Define compute_metrics BEFORE using it
def compute_metrics(eval_preds):
    predictions, labels = eval_preds

    predictions = np.argmax(predictions, axis=-1)

    decoded_preds = [
        tokenizer.decode(model.generate(torch.tensor(pred).unsqueeze(0), **generation_config.to_dict())[0],
                         skip_special_tokens=True)
        for pred in predictions
    ]

    decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

    bleu_score = bleu.compute(predictions=decoded_preds, references=[[lbl] for lbl in decoded_labels])["score"]

    exact_matches = sum(1 for pred, gt in zip(decoded_preds, decoded_labels) if pred.strip().lower() == gt.strip().lower())
    accuracy = (exact_matches / len(decoded_labels)) * 100

    return {"exact_match": accuracy, "bleu_score": bleu_score}

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/t5-grammar-corrector",
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_steps=50, 
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    fp16=False,  
    no_cuda=True,  
    report_to="none",
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model.to("cpu"),  
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,  # No more "not defined" error
    tokenizer=tokenizer  
)

# Run Evaluation
results = trainer.evaluate()

# Print Scores
print(f"Exact Match Accuracy: {results.get('eval_exact_match', 0):.2f}%")
print(f"BLEU Score: {results.get('eval_bleu_score', 0):.2f}")


  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
