<a href="https://colab.research.google.com/github/aleenakjames/German-Translation-using-LLMs/blob/main/german_translation_in_huggingface_wiki_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing all the required libraries**

In [None]:
!pip install langchain_community nltk==3.5 sacrebleu sacremoses

# **Libraries and Imports**

In [None]:
from langchain_text_splitters import (Language, RecursiveCharacterTextSplitter)
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_metric, load_dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import torch

# **Loading preprocessed Wikipedia data from Hugging Face**

In [None]:
# Fetch the German ("de") Wikipedia dataset for the 20220301 dump.
dataset = load_dataset("wikipedia", language="de", date="20220301")

# Convert the train split in one bulk step. Building the frame with
# pd.DataFrame(dataset['train']) walks the split example by example in Python,
# which is needlessly slow and memory hungry for a corpus of this size.
df = dataset['train'].to_pandas()

In [None]:
# Peek at the first five rows to check which columns the dump provides.
df.head(n=5)

# **Data Preprocessing**

In [None]:
# Number of articles translated by the cells below. Kept small because every
# article is translated chunk by chunk by each model; raise it for a larger run.
NUM_DOCS = 10

# All article bodies as a plain Python list, and the working sample taken from
# the front of it.
doc = df['text'].tolist()
docs = doc[:NUM_DOCS]

In [None]:
# Flatten every article onto a single line. Line breaks are turned into spaces
# rather than removed, otherwise the last word of one line and the first word
# of the next are fused into a single token ("...endet.\nGeschichte" would
# become "...endet.Geschichte"). None entries are passed through unchanged.
docs = [
    text if text is None else str(text).replace("\n", " ")
    for text in docs
]

# **Inference using different models**
* **mBART:** The model can translate directly between any pair of 50 languages. To translate into a target language, the target language id is forced as the first generated token. To force the target language id as the first generated token, pass the forced_bos_token_id parameter to the generate method.
* **MarianMT:** The model "Helsinki-NLP/opus-mt-de-en" is a neural machine translation model designed specifically for translating text from German (de) to English (en), and it is part of the OPUS-MT project developed by the Helsinki-NLP group.
* **Tsmall100:** A compact and fast massively multilingual machine translation model that covers more than 10K language pairs and achieves competitive results while being much smaller and faster.

In [None]:
# Use the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Registry of translation systems:
#   display name -> (checkpoint id, model class, tokenizer class)
# The insertion order is the order in which the models are iterated later.
models = dict(
    MarianMT=("Helsinki-NLP/opus-mt-de-en", MarianMTModel, MarianTokenizer),
    mBART=(
        "facebook/mbart-large-50-many-to-many-mmt",
        MBartForConditionalGeneration,
        MBart50TokenizerFast,
    ),
    Tsmall100=("alirezamsh/small100", AutoModelForSeq2SeqLM, AutoTokenizer),
)

In [None]:
# Containers filled in by the translation cell.
dfdata = pd.DataFrame(data={})   # receives one column of translations per model
inferenceTime = dict()           # model name -> elapsed seconds
# One list of translated documents per model.
mbart_translated, tsmall_translated, marianmt_translated = [], [], []

In [None]:
# Splitter for natural-language text. The generic recursive splitter is used
# instead of the Language.PYTHON preset, whose leading separators target Python
# source code and never apply to Wikipedia prose.
# chunk_overlap is 0 because the translated chunks are concatenated again
# afterwards: any overlapping span would be translated twice and show up
# duplicated in the joined translation.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

In [None]:
def _translate_document(text, model, tokenizer, generate_kwargs):
    """Translate one document chunk by chunk and return the joined result.

    The text is cut into chunks with the notebook-level ``text_splitter``; each
    chunk is tokenized, moved to ``device``, passed through ``model.generate``
    and decoded. The decoded chunks are joined with single spaces.

    Args:
        text: Source document, or None / empty string.
        model: Loaded seq2seq model, already on ``device``.
        tokenizer: Tokenizer matching ``model``.
        generate_kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        The translated document as one string ("" for empty or None input).
    """
    if not text:
        # Nothing to translate; also keeps None away from the splitter.
        return ""
    pieces = []
    for chunk in text_splitter.create_documents([text]):
        # truncation=True caps over-long chunks at the tokenizer's maximum
        # length instead of feeding the model more tokens than it accepts.
        encoded_de = tokenizer(
            chunk.page_content, return_tensors="pt", truncation=True
        ).to(device)
        generated_tokens = model.generate(**encoded_de, **generate_kwargs)
        translated_chunk = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        pieces.append(translated_chunk[0])
    return " ".join(pieces)


for model_name, (model_checkpoint, model_class, tokenizer_class) in models.items():
    # Load model and tokenizer
    print(model_name)
    model = model_class.from_pretrained(model_checkpoint).to(device)
    tokenizer = tokenizer_class.from_pretrained(model_checkpoint)

    # Only mBART needs extra setup: the source language is declared on the
    # tokenizer and the English language id is forced as first generated token.
    generate_kwargs = {}
    if model_name == "mBART":
        tokenizer.src_lang = "de_DE"
        generate_kwargs["forced_bos_token_id"] = tokenizer.lang_code_to_id["en_XX"]

    start_time = time.time()
    translated = [
        _translate_document(text, model, tokenizer, generate_kwargs)
        for text in docs
    ]

    # Publish into the per-model module-level list. Any name other than
    # "mBART" / "Tsmall100" goes to the MarianMT list, as before.
    if model_name == "mBART":
        translated_text = mbart_translated
    elif model_name == "Tsmall100":
        translated_text = tsmall_translated
    else:
        translated_text = marianmt_translated
    # Replace the contents rather than append, so re-running this cell does not
    # grow the lists beyond len(docs).
    translated_text[:] = translated

    end_time = time.time()
    inference_time = end_time - start_time
    inferenceTime[model_name] = inference_time
    dfdata[model_name] = translated_text
    # Rewritten after every model so finished translations survive a later failure.
    dfdata.to_csv("translation.csv", index=False)

    # Drop the references before the next checkpoint is loaded so two models do
    # not have to be resident at the same time.
    del model, tokenizer

In [None]:
# Keep only the first five translated documents for the evaluation below.
# Note: this rebinds `dfdata`, so the full table is only available again after
# re-running the translation cell (or reading translation.csv).
dfdata = dfdata.head(5)
dfdata

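# **Calculating Scores**

No human reference translations are available in this notebook, so each model's output is scored against the output of every other model. The scores below therefore measure how closely two models agree with each other, not absolute translation quality.

**BLEU:** The score evaluates the quality of machine-generated translations by comparing them to reference translations. Weights define the importance of different n-grams in the BLEU score calculation.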

**TER:** The Translation Edit Rate score measures the number of edits required to change a machine-translated output into one of the reference translations, with a lower score indicating a higher quality translation.

**ChrF:** The Character F-score metric is another evaluation metric for machine translation quality, which computes precision and recall over character n-grams, not word n-grams.

In [None]:
# Load the three scorers in one pass; each is fetched with trust_remote_code=True.
bleu_metric, ter_metric, chrf_metric = (
    load_metric(metric_id, trust_remote_code=True)
    for metric_id in ('sacrebleu', 'ter', 'chrf')
)

In [None]:
# Functions to evaluate translations
def evaluate_bleu(predictions, references):
    """Score ``predictions`` against ``references`` with ``bleu_metric``.

    Every reference is wrapped in its own one-element list, i.e. exactly one
    reference per prediction. Returns whatever ``bleu_metric.compute`` returns.
    """
    wrapped_refs = []
    for ref in references:
        wrapped_refs.append([ref])
    return bleu_metric.compute(predictions=predictions, references=wrapped_refs)
def evaluate_chrf(predictions, references):
    """Score ``predictions`` against ``references`` with ``chrf_metric``.

    The texts are handed over as whole strings, one single-item reference list
    per prediction, the same layout ``evaluate_bleu`` uses. Pre-splitting the
    texts into token lists is avoided: the metric takes strings, and chrF works
    on character n-grams of the sentence itself.

    Args:
        predictions: Iterable of hypothesis strings.
        references: Iterable of reference strings, aligned with ``predictions``.

    Returns:
        Whatever ``chrf_metric.compute`` returns.
    """
    return chrf_metric.compute(
        predictions=list(predictions),
        references=[[ref] for ref in references],
    )
def evaluate_ter(predictions, references):
    """Score ``predictions`` against ``references`` with ``ter_metric``.

    Each prediction/reference pair is first cut to the character length of the
    shorter of the two. The cut is applied to local copies only: the caller's
    sequences are never modified, so columns of a DataFrame passed in here stay
    intact for later comparisons. The texts are passed to the metric as whole
    strings, one single-item reference list per prediction.

    Args:
        predictions: Iterable of hypothesis strings.
        references: Iterable of reference strings, aligned with ``predictions``.

    Returns:
        Whatever ``ter_metric.compute`` returns.
    """
    trimmed_predictions = []
    trimmed_references = []
    for pred, ref in zip(predictions, references):
        length = min(len(pred), len(ref))
        trimmed_predictions.append(pred[:length])
        trimmed_references.append([ref[:length]])
    return ter_metric.compute(
        predictions=trimmed_predictions,
        references=trimmed_references,
    )

In [None]:
results = {}
keys = list(models.keys())

# Calculate all pairwise comparisons.
# Every ordered pair is scored: for (A, B) the output of A is the prediction
# and the output of B is the reference.
for hyp_name in keys:
    for ref_name in keys:
        if hyp_name == ref_name:
            continue
        print(hyp_name, ref_name)
        # Each scorer gets its own fresh lists instead of the live DataFrame
        # columns, so nothing a scorer does to its arguments can change
        # `dfdata` and leak into the comparisons that follow.
        bleu = evaluate_bleu(dfdata[hyp_name].tolist(), dfdata[ref_name].tolist())
        chrf = evaluate_chrf(dfdata[hyp_name].tolist(), dfdata[ref_name].tolist())
        ter = evaluate_ter(dfdata[hyp_name].tolist(), dfdata[ref_name].tolist())
        results[(hyp_name, ref_name)] = {
            "BLEU": bleu,
            "ChrF": chrf,
            "TER": ter
        }

In [None]:
# Time recorded for each model by the translation cell.
# The loop variable is not called `model`, so it cannot be confused with the
# translation model object of the same name.
for name, time_s in inferenceTime.items():
    print(f"Inference time of {name}: {time_s}")

# Full metric output for every ordered (prediction model, reference model) pair.
for (model1, model2), result in results.items():
    print(f"Model: {model1} to {model2}")
    for metric, score in result.items():
        print(f"  {metric}: {score}")

# **Visualising the results**

In [None]:
# One square model-by-model table per metric, initially empty.
metrics = ['BLEU', 'TER', 'ChrF']
metric_dfs = {}
for metric in metrics:
    metric_dfs[metric] = pd.DataFrame(index=keys, columns=keys)

# Copy the 'score' entry of every metric result into the matching cell
# (row = first model of the pair, column = second model).
for pair, scores in results.items():
    row_model, col_model = pair
    for metric in metrics:
        metric_dfs[metric].at[row_model, col_model] = scores[metric]['score']

In [None]:
def plot_heatmap(df, title):
    """Draw ``df`` as an annotated heatmap with ``title`` and show it.

    The frame is cast to float before plotting; cells are annotated with two
    decimals on the 'coolwarm' colour map in an 8x6 inch figure.
    """
    _fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(df.astype(float), annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
# One heatmap per metric table, titled "<metric> Scores".
for metric_name, score_table in metric_dfs.items():
    plot_heatmap(score_table, f'{metric_name} Scores')