In [17]:
from tqdm import tqdm
from datasets import load_dataset

In [18]:
dataset = load_dataset('iwslt2017', 'iwslt2017-en-fr')

train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

In [19]:
def get_en_fr_sentences(data):
    english = [item['en'] for item in tqdm(data['translation'])]
    french = [item['fr'] for item in tqdm(data['translation'])]

    return english, french

train_en, train_fr = get_en_fr_sentences(train_data)
val_en, val_fr = get_en_fr_sentences(val_data)
test_en, test_fr = get_en_fr_sentences(test_data)

100%|██████████| 232825/232825 [00:00<00:00, 4175455.49it/s]
100%|██████████| 232825/232825 [00:00<00:00, 4155254.51it/s]
100%|██████████| 890/890 [00:00<00:00, 252891.44it/s]
100%|██████████| 890/890 [00:00<00:00, 125831.95it/s]
100%|██████████| 8597/8597 [00:00<00:00, 1562256.03it/s]
100%|██████████| 8597/8597 [00:00<00:00, 32780392.26it/s]


In [20]:
# generate 100 random indices and select English and French sentence pairs
import random
random.seed(627)

num_sentences = 100
random_indices = random.sample(range(len(train_en)), num_sentences)

# get English and French sentences
# English: 'sentence_eng_Latn' 
# French: 'sentence_fra_Latn' 
english_sentences = [train_en[i] for i in random_indices]
french_sentences = [train_fr[i] for i in random_indices]

In [21]:
from transformers import AutoTokenizer, MarianMTModel

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

In [22]:
from transformers import AutoTokenizer, MarianMTModel

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

# store generated sentences of OPUS-MT
opus_mt_sentences = []

# loop through each english sentence
for sentence in english_sentences:
    tokenizer.src_lang = "en" 
    encoded_en = tokenizer(sentence, return_tensors="pt")
    generated_tokens = model.generate(**encoded_en)
    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    opus_mt_sentences.append(out)

In [23]:
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

In [24]:
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

# store generated sentences of M2M-100
m2m_100_sentences = []

# loop through each english sentence
for sentence in english_sentences:
    tokenizer.src_lang = "en"
    encoded_en = tokenizer(sentence, return_tensors="pt")
    generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.get_lang_id("fr"))
    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    m2m_100_sentences.append(out)

In [25]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [26]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# store generated sentences from MBART-50
mbart_50_sentences = []

# loop through each english sentence
for sentence in english_sentences:
    
    tokenizer.src_lang = "en_XX"
    encoded_en = tokenizer(sentence, return_tensors="pt")
    generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"])
    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    mbart_50_sentences.append(out)

In [27]:
# using OPUS-MT and AutoTokenizer as it had highest BLEU score
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
tokenized_inputs = tokenizer(english_sentences, french_sentences, return_tensors="pt", padding=True, truncation=True)

# Get the input IDs
input_ids = tokenized_inputs['input_ids']

# compute english and french sentence lengths
english_sentence_lengths = [len(sentence.split()) for sentence in english_sentences]
french_sentence_lengths = [len(sentence.split()) for sentence in french_sentences]

# basic min, max and avg
min_length_english = min(english_sentence_lengths)
avg_length_english = sum(english_sentence_lengths) / len(english_sentence_lengths)
max_length_english = max(english_sentence_lengths)

min_length_french = min(french_sentence_lengths)
avg_length_french = sum(french_sentence_lengths) / len(french_sentence_lengths)
max_length_french = max(french_sentence_lengths)

print("English")
print(f"Minimum sentence length: {min_length_english} \nAverage sentence length: {avg_length_english} \nMaximum sentence length: {max_length_english}")
print("=" * 50)
print("French")
print(f"French - Minimum sentence length: {min_length_french} \nAverage sentence length: {avg_length_french} \nMaximum sentence length: {max_length_french}")

English
Minimum sentence length: 4 
Average sentence length: 20.02 
Maximum sentence length: 60
French
French - Minimum sentence length: 2 
Average sentence length: 20.81 
Maximum sentence length: 58


In [28]:
import sacrebleu

models = ["OPUS-MT", "M2M-100", "MBART-50"]
output_sentences = [opus_mt_sentences, m2m_100_sentences, mbart_50_sentences]

for i, j in zip(models, output_sentences):
    k = [str(sentence) for sentence in j]

    # bleu score
    bleu = sacrebleu.corpus_bleu(k, [french_sentences])

    print(f"BLEU score for {i}: {bleu.score}")

BLEU score for OPUS-MT: 31.859311215529004
BLEU score for M2M-100: 26.592755777695388
BLEU score for MBART-50: 29.52344400211202


In [29]:
idx = [6, 27]

for k, v in enumerate(idx):
    print(f"Sentence {k+1}")
    print(f"Ground Truth: {french_sentences[v]}")
    print(f"OPUS-MT output: {opus_mt_sentences[v]}")
    print(f"M2M-100 output: {m2m_100_sentences[v]}")
    print(f"MBART-50 output: {mbart_50_sentences[v]}")
    print("=" * 50)

Sentence 1
Ground Truth: Notre rêve est donc de rassembler les adolescents, pour qu'ils vivent une expérience collective d'entraide ainsi qu'une expérience interculturelle, en instruisant les enfants de ces régions et en les aidant à construire leurs moyens de communication.
OPUS-MT output: ["Notre rêve est donc de réunir les adolescents, afin qu'ils aient une expérience de service communautaire ainsi qu'une expérience interculturelle, car ils enseignent aux enfants dans ces domaines et les aident à construire leur infrastructure de communication."]
M2M-100 output: ['Ainsi, notre rêve est de rassembler les adolescents, de sorte qu’ils auront une expérience de service communautaire ainsi qu’une expérience interculturelle, car ils enseignent aux enfants dans ces domaines et les aident à construire leur infrastructure de communication.']
MBART-50 output: ["Notre rêve est donc de rassembler les adolescents afin qu'ils aient une expérience de services communautaires ainsi qu'une expérience 