In [1]:
import json
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset

In [2]:
def preprocess_data(data, output_path):
    formatted_data = []
    for sample in data:
        source_text = sample["source"]
        target_text = sample["target"]
        entities = sample.get("enriched_entities", [])

        entity_annotations = [f"{ent['entity_name']['en']} [{ent['entity_type']}]" for ent in entities]
        entity_text = ", ".join(entity_annotations) if entity_annotations else "None"

        # Reduce NER examples to avoid overfitting
        if len(formatted_data) % 3 == 0:  # Keep only 1/3 NER examples
            formatted_data.append({
                "task": "NER",
                "input": f"Recognize entities: {source_text}",
                "output": entity_text
            })

        # Keep more translation examples
        formatted_data.append({
            "task": "Entity-aware MT",
            "input": f"Entity translate (EN→FR): {source_text}",
            "output": target_text
        })

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(formatted_data, f, indent=4, ensure_ascii=False)

    print(f"Processed data saved to {output_path}")


In [5]:
with open("train_data.json", "r", encoding="utf-8") as f:
    data_train = json.load(f)

In [6]:
preprocess_data(data_train, "train_processed_data.json")

Processed data saved to train_processed_data.json


In [10]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [12]:
def preprocess_function(samples):
    # Tokenize inputs and targets
    inputs = tokenizer(samples["input"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(samples["output"], padding="max_length", truncation=True, max_length=128)

    # Set the labels for the inputs
    inputs["labels"] = targets["input_ids"]

    # Move tensors to CUDA (GPU) or CPU
    inputs = {key: torch.tensor(value).to(device) for key, value in inputs.items()}
    
    return inputs


In [13]:
def load_and_prepare_data(input_path):
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return Dataset.from_list(data)

data_path = "train_processed_data.json"
dataset = load_and_prepare_data(data_path)

In [14]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Map:   0%|          | 0/5807 [00:00<?, ? examples/s]

In [16]:
output_dir = "t5_finetuned"

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0957,0.077598


Model saved to t5_finetuned


In [24]:
# !pip install evaluate

In [25]:
# !pip install sacrebleu

In [17]:
data_test[400]

{'id': '307ce37f',
 'source_locale': 'en',
 'target_locale': 'fr',
 'source': 'Which country is bigger in size, USA or China?',
 'target': 'Entre les USA et la Chine : quel pays a la plus grande superficie ?',
 'entities': ['Q148'],
 'from': 'mintaka',
 'enriched_entities': [{'qid': 'Q148',
   'entity_name': {'en': "People's Republic of China",
    'fr': 'république populaire de Chine'},
   'entity_type': 'sovereign state'}]}

In [19]:
import torch  

# Move model to CUDA  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  
model.to(device)  

# Prepare input  
sample_input = data_test[400]["source"]  
print("Sample Input:", sample_input)  

inputs = tokenizer(sample_input, return_tensors="pt", padding=True, truncation=True).to(device)  

# Generate output  
outputs = model.generate(**inputs)  

# Decode and print output  
print("Sample Output:", tokenizer.decode(outputs[0], skip_special_tokens=True))  


Sample Input: Which country is bigger in size, USA or China?
Sample Output: People's Republic of China [sovereign state], United States [sovereign


In [None]:
preprocess_data(data_test,"test_split.json")

In [16]:
import json
import torch
import evaluate
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

# Load model and tokenizer
model_path = "t5_finetuned"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

def load_test_data(input_path):
    with open(input_path, "r", encoding="utf-8") as f:
        return json.load(f)

# # Load test set (limit to 10 samples for debugging)
test_data_path = "test_split.json"
test_data = load_test_data(test_data_path)[:50]

# Quick test to check model inference
sample_input = test_data[0]["input"]
print("Sample Input:", sample_input)
inputs = tokenizer(sample_input, return_tensors="pt", padding=True, truncation=True)
outputs = model.generate(**inputs)
print("Sample Output:", tokenizer.decode(outputs[0], skip_special_tokens=True))

bleu = evaluate.load("sacrebleu")

def restore_entities(text, entity_mapping):
    """Replace placeholders in translated text with original entity names."""
    for placeholder, original in entity_mapping.items():
        text = text.replace(placeholder, original)
    return text

predictions, references = [], []

for sample in tqdm(test_data, desc="Evaluating Translations"):
    input_text = f"Entity translate (EN→FR): {sample['input']}"
    expected_output = sample["output"]
    entity_mapping = sample.get("enriched_entities", {})

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Restore entities before BLEU evaluation
    final_output = restore_entities(decoded_output, entity_mapping)
    final_reference = restore_entities(expected_output, entity_mapping)

    predictions.append(final_output)
    references.append([final_reference])

# Compute BLEU score on restored translations
bleu_score = bleu.compute(predictions=predictions, references=references)
print(f"Updated BLEU Score for Translation: {bleu_score['score']:.2f}")


Sample Input: Recognize entities: How may states touch Lake Michigan?
Sample Output: Lake Michigan [lake]


Evaluating Translations: 100%|█████████████████████████████████████████████████████████| 50/50 [00:24<00:00,  2.07it/s]

Updated BLEU Score for Translation: 31.44





In [36]:
test_data[1]

{'task': 'Entity-aware MT',
 'input': 'Entity translate (EN→FR): How may states touch Lake Michigan?',
 'output': 'Combien d’États touchent le lac Michigan ?'}

In [32]:
for i in range(len(predictions)):
    print (i,references[i],"\t",predictions[i])

0 ['Lake Michigan [lake]'] 	 Combien de États a-t-il touché le lac Michigan ?
1 ['Combien d’États touchent le lac Michigan ?'] 	 Combien d’États touchent le lac Michigan ?
2 ['Combien d’États touchent le lac Michigan ?'] 	 Combien d’États touchent-on le lac Michigan ?
3 ["People's Republic of China [sovereign state]"] 	 Quel pays a le plus peuplé, le Canada ou le Chine ?
4 ['Quel est le pays le plus peuplé, le Canada ou la Chine ?'] 	 Quel pays est le plus peuplé, le Canada ou le Chine ?
5 ['Quel est le pays le plus peuplé, le Canada ou la Chine ?'] 	 Quel pays est le plus peuplé, le Canada ou le Chine ?
6 ['The Lord of the Rings [novel series]'] 	 Qui a réalisé le film qui est inspiré du deuxième livre de la série Le Seigneur
7 ['Qui a réalisé le film inspiré du deuxième livre de la série Le Seigneur des anneaux ?'] 	 Qui a réalisé le film inspiré du deuxième livre de la série Le Seigneur des
8 ['Qui a réalisé le film inspiré du deuxième livre de la série Le Seigneur des anneaux ?'] 	

# Main 

In [17]:
from transformers import Trainer
import torch.nn.functional as F
import torch

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):  # ✅ Fix: Accept extra arguments
        """Custom loss function to prioritize translation over NER."""
        labels = inputs.pop("labels")  # Extract target labels
        outputs = model(**inputs)
        logits = outputs.logits  # Get logits

        # Compute CrossEntropy loss
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)

        # Assign higher weight to translation (80%) and lower weight to NER (20%)
        ner_weight = 0.2
        translation_weight = 0.8

        # Get task type (default to Translation)
        task_type = inputs.get("task_type", ["Translation"] * logits.shape[0])

        # Convert task type to weight tensor
        task_weights = torch.tensor(
            [ner_weight if "NER" in task else translation_weight for task in task_type],
            device=logits.device,
            dtype=torch.float,
        )

        # Scale loss by task weights
        weighted_loss = loss * task_weights.mean()

        return (weighted_loss, outputs) if return_outputs else weighted_loss


In [25]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = CustomTrainer(


Epoch,Training Loss,Validation Loss
1,0.0442,0.053552


TrainOutput(global_step=1452, training_loss=0.04828366563339864, metrics={'train_runtime': 281.9405, 'train_samples_per_second': 41.19, 'train_steps_per_second': 5.15, 'total_flos': 539686607781888.0, 'train_loss': 0.04828366563339864, 'epoch': 1.0})

In [18]:
def load_test_data(input_path):
    with open(input_path, "r", encoding="utf-8") as f:
        return json.load(f)
test_data_path = "test_split.json"
test_data = load_test_data(test_data_path)[:50]

In [19]:
# Load a sample from test data
sample_input = test_data[1]["input"]
print("Sample Input:", sample_input)
print("Expected Input:",test_data[1]["output"])
# Tokenize input
inputs = tokenizer(sample_input, return_tensors="pt", padding=True, truncation=True)

# Move to GPU if available
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate output
outputs = model.generate(**inputs)

# Decode the output
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Sample Output:", decoded_output)



Sample Input: Entity translate (EN→FR): How may states touch Lake Michigan?
Expected Input: Combien d’États touchent le lac Michigan ?
Sample Output: Combien d’États touchent le lac Michigan ?


### Using only for ea-mt tasks

In [43]:
import json
import torch
import evaluate
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

# Load model and tokenizer
model_path = "t5_finetuned"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

def load_test_data(input_path):
    """Load test data and filter only 'Entity-aware MT' tasks."""
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # ✅ Keep only samples where task == "Entity-aware MT"
    filtered_data = [sample for sample in data if sample.get("task") == "Entity-aware MT"]
    return filtered_data

# Load filtered test data
test_data_path = "test_split.json"
test_data = load_test_data(test_data_path) # Limit to first 50 samples for quick testing

# ✅ Quick test on one sample
sample_input = test_data[0]["input"]
print("Sample Input:", sample_input)

inputs = tokenizer(sample_input, return_tensors="pt", padding=True, truncation=True)
inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Move to GPU if available

outputs = model.generate(**inputs)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Sample Output:", decoded_output)

# ✅ Function to restore entities in translated text
def restore_entities(text, entity_mapping):
    """Replace placeholders in translated text with original entity names."""
    for placeholder, original in entity_mapping.items():
        text = text.replace(placeholder, original)
    return text

# Initialize BLEU evaluation
bleu = evaluate.load("sacrebleu")

predictions, references = [], []

# ✅ Process all test samples
for sample in tqdm(test_data, desc="Evaluating Translations"):
    input_text = sample["input"]
    expected_output = sample["output"]
    entity_mapping = sample.get("enriched_entities", {})

    # Tokenize and generate translation
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Move to GPU

    outputs = model.generate(**inputs)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # ✅ Restore entity names before BLEU evaluation
    final_output = restore_entities(decoded_output, entity_mapping)
    final_reference = restore_entities(expected_output, entity_mapping)

    predictions.append(final_output)
    references.append([final_reference])  # BLEU expects a list of references

# ✅ Compute BLEU score on restored translations
bleu_score = bleu.compute(predictions=predictions, references=references)
print(f"Updated BLEU Score for Entity-Aware MT: {bleu_score['score']:.2f}")


Sample Input: Entity translate (EN→FR): How may states touch Lake Michigan?
Sample Output: Combien d’États touchent le lac Michigan ?


Evaluating Translations: 100%|███████████████████████████████████████████████████| 1660/1660 [1:41:19<00:00,  3.66s/it]


Updated BLEU Score for Entity-Aware MT: 34.96


### Trying with different models

In [20]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [32]:
def load_test_data(input_path):
    """Load test data and filter only 'Entity-aware MT' tasks."""
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # ✅ Keep only samples where task == "Entity-aware MT"
    filtered_data = [sample for sample in data if sample.get("task") == "Entity-aware MT"]
    return filtered_data

# Load filtered test data
test_data_path = "test_split.json"
test_data = load_test_data(test_data_path) # Limit to first 50 samples for quick testing

# ✅ Quick test on one sample
sample_input = test_data[0]["input"]
print("Sample Input:", sample_input)

inputs = tokenizer(sample_input, return_tensors="pt", padding=True, truncation=True)
inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Move to GPU if available

outputs = model.generate(**inputs)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Sample Output:", decoded_output)

# ✅ Function to restore entities in translated text
def restore_entities(text, entity_mapping):
    """Replace placeholders in translated text with original entity names."""
    for placeholder, original in entity_mapping.items():
        text = text.replace(placeholder, original)
    return text

Sample Input: Entity translate (EN→FR): How may states touch Lake Michigan?
Sample Output: Combien d’États touchent le lac Michigan ?


In [31]:
# ✅ Quick test on one sample
sample_input = test_data[400]["input"]
print("Sample Input:", sample_input)

inputs = tokenizer(sample_input, return_tensors="pt", padding=True, truncation=True)
inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Move to GPU if available

outputs = model.generate(**inputs)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Sample Output:", decoded_output)

Sample Input: Entity translate (EN→FR): Which country is bigger in size, USA or China?
Sample Output: Quel pays est plus grand, les États-Unis ou la Chine ?


In [22]:
output_dir = "t5_large_finetuned"

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.672,0.081121
2,0.1239,0.0515
3,0.075,0.035651
4,0.0651,0.025517
5,0.0455,0.018231
6,0.0385,0.013639
7,0.0297,0.009977
8,0.0253,0.00771
9,0.0211,0.00615
10,0.0189,0.005607


Model saved to t5_large_finetuned


In [15]:
test_data[:5]

NameError: name 'test_data' is not defined

In [34]:
predictions, references = [], []

# ✅ Process all test samples
for sample in tqdm(test_data, desc="Evaluating Translations"):
    input_text = sample["input"]
    expected_output = sample["output"]
    entity_mapping = sample.get("enriched_entities", {})

    # Tokenize and generate translation
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Move to GPU

    outputs = model.generate(**inputs)
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # ✅ Restore entity names before BLEU evaluation
    final_output = restore_entities(decoded_output, entity_mapping)
    final_reference = restore_entities(expected_output, entity_mapping)

    predictions.append(final_output)
    references.append([final_reference])  # BLEU expects a list of references

# ✅ Compute BLEU score on restored translations
bleu_score = bleu.compute(predictions=predictions, references=references)
print(f"Updated BLEU Score for Entity-Aware MT: {bleu_score['score']:.2f}")

Evaluating Translations: 100%|█████████████████████████████████████████████████████| 1660/1660 [16:37<00:00,  1.66it/s]


Updated BLEU Score for Entity-Aware MT: 42.98


In [35]:
predictions[:50]

['Combien d’États touchent le lac Michigan ?',
 'Quel pays est le plus peuplé, le Canada ou la Chine ?',
 'Qui a réalisé le film inspiré du deuxième livre de la série Le Seigneur des',
 'Quel pays n’a pas ratifié le traité de Versailles ?',
 'Qui a écrit Neuromancien ?',
 'Quand est diplômé l’auteur du livre Les Baby-sitters ?',
 'Qui est le distributeur de la série télévisée à l’adaptation des livres',
 'Quand Guillermo del Toro a-t-il remporté le dernier Oscar',
 'Quel dirigeant français était à l’origine un nationalisme corsican',
 'Quel pays a impliqué le plus de personnes dans l’espace ?',
 'Quel pays a obtenu son indépendance en premier en Afrique ?',
 'L’album 19 d’Adèle a-t-elle remporté le prix',
 'Beijing est-elle la capitale de l’Italie ?',
 'L’Allemagne a-t-elle attaqué Pearl Harbor le 7 décembre 1941 ?',
 'Qui était le dirigeant d’Angleterre lors des bombardements du Japon ?',
 'Combien de livres y a-t-il dans Le Seigneur des',
 'Quand est sorti « Super Smash Bros » ?',
 '

In [49]:
!pip install transformers sentencepiece



## Inference one sentence

In [74]:
model_path = "t5_large_finetuned1"  # Replace with the actual path to your saved model
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Move the model to the appropriate device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Example input text
input_text = "Entity translate (EN→FR): "+ input("Enter the english sentence:\n")

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)

# Generate the output using the model
outputs = model.generate(**inputs)

# Decode the generated output
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the decoded output
print("Decoded Output:", decoded_output)

Enter the english sentence:
 Let me know if you'd like to dive deeper into one of these


Decoded Output: Comment s’appelle-t-on d’abord une étude d’une


# Testing for few samples and getting BLEU score

## Creating the train test_split.json

In [50]:
with open("test_data.json", "r", encoding="utf-8") as f:
    data_test = json.load(f)

In [64]:
def load_test_data(input_path):
    """Load test data and filter only 'Entity-aware MT' tasks."""
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

In [69]:
def preprocess_data(data, output_path):
    formatted_data = []
    for sample in data:
        source_text = sample["source"]
        target_text = sample["target"]
        entities = sample.get("enriched_entities", [])

        entity_annotations = [f"{ent['entity_name']['en']} [{ent['entity_type']}]" for ent in entities]
        entity_text = ", ".join(entity_annotations) if entity_annotations else "None"
        #Keep more translation examples
        formatted_data.append({
            "task": "Entity-aware MT",
            "input": f"Entity translate (EN→FR): {source_text}",
            "output": target_text
        })

    return formatted_data


In [72]:
test_data = preprocess_data(data_test,"test_split1.json")

In [46]:
import evaluate
import random
from tqdm import tqdm

In [43]:
def restore_entities(text, entity_mapping):
    """Replace placeholders in translated text with original entity names."""
    for placeholder, original in entity_mapping.items():
        text = text.replace(placeholder, original)
    return text

In [56]:
def evaluate_bleu_on_random_subset(test_data, num_samples):
    random_samples = random.sample(test_data, num_samples)
    predictions, references = [], []

    for sample in tqdm(random_samples, desc="Evaluating Translations"):
        input_text = sample["input"]
        expected_output = sample["output"]
        entity_mapping = sample.get("enriched_entities", {})

        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        outputs = model.generate(**inputs)
        decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

        final_output = restore_entities(decoded_output, entity_mapping)
        final_reference = restore_entities(expected_output, entity_mapping)

        predictions.append(final_output)
        references.append([final_reference])

    bleu_score = bleu.compute(predictions=predictions, references=references)
    return bleu_score

In [58]:
num_test_samples = 100  
bleu_score = evaluate_bleu_on_random_subset(test_data, num_test_samples)

Evaluating Translations: 100%|███████████████████████████████████████████████████████| 100/100 [01:00<00:00,  1.65it/s]


In [59]:
print(f"BLEU Score: {bleu_score['score']:.2f}")

BLEU Score: 40.15
