In [4]:
import json
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

In [13]:
df = pd.read_csv("df_5000_en_ge.csv")

In [14]:
df.head()

Unnamed: 0,English,German
0,Resumption of the session,Wiederaufnahme der Sitzungsperiode
1,I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u..."
2,"Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte..."
3,You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...
4,"In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...


In [15]:
df.isna().sum()

English    17
German     11
dtype: int64

In [16]:
df.dropna(inplace=True)

In [27]:
# !pip install spacy

In [28]:
# !python -m spacy download en_core_web_sm

In [30]:
from tqdm import tqdm

In [24]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [31]:
entities_list = []
for text in tqdm(df["English"], desc="Extracting Entities", unit="sentence"):
    entities_list.append(extract_entities(text))

df["entities"] = entities_list

Extracting Entities: 100%|█████████████████████████████████████████| 4972/4972 [00:45<00:00, 109.23sentence/s]


In [32]:
df.head()

Unnamed: 0,English,German,entities
0,Resumption of the session,Wiederaufnahme der Sitzungsperiode,
1,I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u...","[{'entity_name': 'the European Parliament', 'e..."
2,"Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte...",
3,You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...,"[{'entity_name': 'the next few days', 'entity_..."
4,"In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...,"[{'entity_name': 'the European Union', 'entity..."


In [35]:
df['entities'][1]

[{'entity_name': 'the European Parliament', 'entity_type': 'ORG'},
 {'entity_name': 'Friday 17 December 1999', 'entity_type': 'DATE'},
 {'entity_name': 'new year', 'entity_type': 'DATE'}]

In [36]:
df = df[df["entities"].notna()].reset_index(drop=True)

In [37]:
df.shape

(3316, 3)

In [38]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [39]:
def preprocess_data(df):
    processed_data = []
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing Data", unit="row"):
        source_text = row["English"]
        target_text = row["German"]
        entities = eval(row["entities"]) if isinstance(row["entities"], str) else row["entities"]

        # Create entity annotation text
        entity_annotations = [f"{ent['entity_name']} [{ent['entity_type']}]" for ent in entities]
        entity_text = ", ".join(entity_annotations) if entity_annotations else "None"

        # Create NER example (1/3rd of the time to balance)
        if len(processed_data) % 3 == 0:
            processed_data.append({
                "task": "NER",
                "input": f"Recognize entities: {source_text}",
                "output": entity_text
            })

        # Create Entity-aware MT example
        processed_data.append({
            "task": "Entity-aware MT",
            "input": f"Entity translate (EN→DE): {source_text}",
            "output": target_text
        })

    return pd.DataFrame(processed_data)

In [40]:
train_data = preprocess_data(train_df)
test_data = preprocess_data(test_df)

Processing Data: 100%|████████████████████████████████████████████████| 2652/2652 [00:00<00:00, 16974.04row/s]
Processing Data: 100%|██████████████████████████████████████████████████| 664/664 [00:00<00:00, 14388.02row/s]


In [41]:
def tokenize_function(samples):
    inputs = tokenizer(samples["input"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(samples["output"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

In [42]:
train_data = train_data.apply(tokenize_function, axis=1)
test_data = test_data.apply(tokenize_function, axis=1)

In [43]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {key: torch.tensor(value) for key, value in self.data.iloc[idx].items()}

In [44]:
train_dataset = CustomDataset(train_data)
test_dataset = CustomDataset(test_data)

In [45]:
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to("cuda")

In [46]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [64]:
training_args = TrainingArguments(
    output_dir="t5_finetuned_de",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=False,
)

In [65]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Custom loss function to prioritize translation over NER."""
        labels = inputs.pop("labels")  # Extract target labels
        outputs = model(**inputs)
        logits = outputs.logits  # Get logits

        # Compute CrossEntropy loss
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), ignore_index=-100)

        # Assign higher weight to translation (80%) and lower weight to NER (20%)
        ner_weight = 0.2
        translation_weight = 0.8

        # Get task type (default to Translation)
        task_type = inputs.get("task_type", ["Translation"] * logits.shape[0])

        # Convert task type to weight tensor
        task_weights = torch.tensor(
            [ner_weight if "NER" in task else translation_weight for task in task_type],
            device=logits.device,
            dtype=torch.float,
        )

        # Scale loss by task weights
        weighted_loss = loss * task_weights.mean()

        return (weighted_loss, outputs) if return_outputs else weighted_loss

In [66]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = CustomTrainer(


In [67]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.317336
2,0.378800,0.254828
3,0.400400,0.209418
4,0.337700,0.176456
5,0.293400,0.144211
6,0.258000,0.121422
7,0.226500,0.101785
8,0.203300,0.088808
9,0.184400,0.079777
10,0.170200,0.075887


TrainOutput(global_step=4980, training_loss=0.26171860752335513, metrics={'train_runtime': 2616.9965, 'train_samples_per_second': 15.201, 'train_steps_per_second': 1.903, 'total_flos': 6809911870095360.0, 'train_loss': 0.26171860752335513, 'epoch': 10.0})

In [68]:
model.save_pretrained("t5_finetuned_de")
tokenizer.save_pretrained("t5_finetuned_de")
print("Model saved successfully!")

Model saved successfully!


In [58]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   --------------------------- ------------ 1.0/1.5 MB 5.0 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 6.6 MB/s eta 0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.9.1


In [69]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
import torch
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\CSE IIT
[nltk_data]     BHILAI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [70]:
from tqdm import tqdm

# Function to calculate BLEU score for the test set with progress tracking
def calculate_bleu(model, tokenizer, test_df, num_samples=None):
    references = []  # Ground truth German translations
    hypotheses = []  # Model-generated translations

    num_samples = num_samples if num_samples else len(test_df)

    for i in tqdm(range(num_samples), desc="Calculating BLEU Score"):
        input_text = test_df.iloc[i]["English"]
        expected_translation = test_df.iloc[i]["German"]
        
        # Generate translation using model
        inputs = tokenizer(f"Entity translate (EN→DE): {input_text}", return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
        with torch.no_grad():
            output_tokens = model.generate(**inputs, max_length=128)
        
        predicted_translation = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

        # Tokenize expected and predicted sentences
        reference_tokens = nltk.word_tokenize(expected_translation.lower())
        hypothesis_tokens = nltk.word_tokenize(predicted_translation.lower())

        references.append([reference_tokens])  # BLEU expects a list of reference lists
        hypotheses.append(hypothesis_tokens)

    # Compute BLEU score
    bleu_score = corpus_bleu(references, hypotheses)
    print(f"\n🔹 **BLEU Score:** {bleu_score:.4f}")

    return bleu_score

# Call function to compute BLEU with tqdm
calculate_bleu(model, tokenizer, test_df, num_samples=len(test_df))  # Change sample size as needed



Calculating BLEU Score: 100%|███████████████████████████████████████████████| 664/664 [16:05<00:00,  1.45s/it]


🔹 **BLEU Score:** 0.1695





0.16952580660031294