In [None]:
### Epsilon Sampling Datasets
### Generate candidate translations for each example (1 via beam search, 128 per setting via epsilon sampling)

# Temperature: 0.5 , 1 , 2
# Epsilon: 0.01 , 0.02

In [None]:
### Install Required Packages
# jsonlines is imported below for reading/writing .jsonl files; sentencepiece is
# presumably required by the M2M100 tokenizer (it downloads sentencepiece.bpe.model) - TODO confirm.
!pip install jsonlines sentencepiece # nltk evaluate unbabel-comet accelerate

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, jsonlines
Successfully installed jsonlines-4.0.0 sentencepiece-0.1.99


In [None]:
# Mount Google Drive; every dataset and output path below is built from drive_PATH.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Absolute path under the mount point used above. The previous relative form
# ('../content/drive/...') only pointed at the mount while the working directory
# happened to be a direct child of '/', such as '/content'.
drive_PATH = '/content/drive/MyDrive/Colab Notebooks/l101.experiments.1'

Mounted at /content/drive


In [None]:
# General
import numpy as np
import tqdm
import random

# Data
import jsonlines
import torch

# Model
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
# Load the M2M100 checkpoint and its tokenizer, then place the model on the GPU
# when one is available (CPU otherwise).
MODEL_NAME = "facebook/m2m100_418M"  # one name shared by model and tokenizer so they cannot drift apart
model = M2M100ForConditionalGeneration.from_pretrained(MODEL_NAME) # Load Model
tokenizer = M2M100Tokenizer.from_pretrained(MODEL_NAME) # Load Tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Assign the result so the cell does not end on a bare expression; otherwise the
# notebook prints the full module tree of the model as the cell output.
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): Embedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): Embedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm

In [None]:
### Dataset
# Paths of the two test sets (JSON Lines files) inside the Drive project folder.
test_tr_PATH = f'{drive_PATH}/data/datasets/test_tr.jsonl'
test_de_PATH = f'{drive_PATH}/data/datasets/test_de.jsonl'

# Read each file completely: one parsed record per line.
with jsonlines.open(test_tr_PATH) as tr_reader:
    test_tr = [record for record in tr_reader.iter()]
with jsonlines.open(test_de_PATH) as de_reader:
    test_de = [record for record in de_reader.iter()]

# Stays empty unless the optional loader below is re-enabled.
test_de2 = []

# with jsonlines.open(drive_PATH+'/data-hyperparameter-selection/n128-t1-e0.02.jsonl') as f:
#     for line in f.iter():
#         test_de2.append(line)

In [None]:
# Translation direction: source DE -> target EN.
# These codes are used both as record keys and as tokenizer language codes.
src_lang, tgt_lang = "de", "en"

In [None]:
# Mirror the generic 'src' / 'ref' fields under the language-code keys 'de' / 'en'
# so these records can be indexed with src_lang / tgt_lang like the test sets.
for example in tqdm.tqdm(test_de2):
  example['de'] = example['src']
  example['en'] = example['ref']

100%|██████████| 200/200 [00:00<00:00, 495195.28it/s]


In [None]:
### Setting the Language Configurations
# Pick a fixed-size random subset of the German test set and tell the tokenizer
# which language the inputs are written in.
num_examples = 200
subset_seed = 0  # fixed seed: the same examples are selected on every run

# Shuffle a copy with a private, seeded RNG. Shuffling `test_de` directly (as
# before) reordered the loaded data in place and produced a different subset
# each time the cell was executed.
dataset = list(test_de)
random.Random(subset_seed).shuffle(dataset)
dataset = dataset[:num_examples]

tokenizer.src_lang = src_lang

In [None]:
### Decoding: Beam Search

res = []
# temperature = 0.5
# epsilon_cutoff = 0.02
# num_candidates = 128

for sent in tqdm.tqdm(dataset):
  #print(sent['en'])
  src_sent = sent[src_lang]
  encoded_src_sent = tokenizer(src_sent, return_tensors="pt").input_ids.to("cuda")

  candidates = []
  tgt_tokens = model.generate(encoded_src_sent, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang) , num_beams=5, early_stopping=False, num_return_sequences=1)
  tgt_sent = tokenizer.batch_decode(tgt_tokens, skip_special_tokens=True)
  candidates.append(tgt_sent)

  res_sent = {'src': sent[src_lang], 'candidates': candidates, 'ref':sent[tgt_lang]}
  res.append(res_sent)

cand_de_PATH = drive_PATH+ f'/n1-beamsearchdecode.jsonl'
with jsonlines.open(cand_de_PATH, mode='w') as writer:
    for item in res:
        writer.write(item)

100%|██████████| 200/200 [01:54<00:00,  1.74it/s]


In [None]:
### Decoding: Epsilon Sampling

res = []
temperature = 0.5
epsilon_cutoff = 0.02
num_candidates = 128

for sent in tqdm.tqdm(dataset):
  #print(sent['en'])
  src_sent = sent[src_lang]
  encoded_src_sent = tokenizer(src_sent, return_tensors="pt").input_ids.to("cuda")

  candidates = []
  for idxc in range(num_candidates):
    tgt_tokens = model.generate(encoded_src_sent, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang), do_sample=True, temperature=temperature, epsilon_cutoff=0.02, num_beams=1, early_stopping=False, num_return_sequences=1)
    tgt_sent = tokenizer.batch_decode(tgt_tokens, skip_special_tokens=True)
    candidates.append(tgt_sent)

  res_sent = {'src': sent[src_lang], 'candidates': candidates, 'ref':sent[tgt_lang]}
  res.append(res_sent)

cand_de_PATH = drive_PATH+ f'/n{num_candidates}-t{temperature}-e{epsilon_cutoff}.jsonl'
with jsonlines.open(cand_de_PATH, mode='w') as writer:
    for item in res:
        writer.write(item)

In [None]:
### Decoding: Epsilon Sampling

res = []
temperature = 1
epsilon_cutoff = 0.02
num_candidates = 128

for sent in tqdm.tqdm(dataset):
  #print(sent['en'])
  src_sent = sent[src_lang]
  encoded_src_sent = tokenizer(src_sent, return_tensors="pt").input_ids.to("cuda")

  candidates = []
  for idxc in range(num_candidates):
    tgt_tokens = model.generate(encoded_src_sent, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang), do_sample=True, temperature=temperature, epsilon_cutoff=0.02, num_beams=1, early_stopping=False, num_return_sequences=1)
    tgt_sent = tokenizer.batch_decode(tgt_tokens, skip_special_tokens=True)
    candidates.append(tgt_sent)

  res_sent = {'src': sent[src_lang], 'candidates': candidates, 'ref':sent[tgt_lang]}
  res.append(res_sent)

cand_de_PATH = drive_PATH+ f'/n{num_candidates}-t{temperature}-e{epsilon_cutoff}.jsonl'
with jsonlines.open(cand_de_PATH, mode='w') as writer:
    for item in res:
        writer.write(item)

In [None]:
### Decoding: Epsilon Sampling

res = []
temperature = 2.0
epsilon_cutoff = 0.02
num_candidates = 128

for sent in tqdm.tqdm(dataset):
  #print(sent['en'])
  src_sent = sent[src_lang]
  encoded_src_sent = tokenizer(src_sent, return_tensors="pt").input_ids.to("cuda")

  candidates = []
  for idxc in range(num_candidates):
    tgt_tokens = model.generate(encoded_src_sent, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang), do_sample=True, temperature=temperature, epsilon_cutoff=0.02, num_beams=1, early_stopping=False, num_return_sequences=1)
    tgt_sent = tokenizer.batch_decode(tgt_tokens, skip_special_tokens=True)
    candidates.append(tgt_sent)

  res_sent = {'src': sent[src_lang], 'candidates': candidates, 'ref':sent[tgt_lang]}
  res.append(res_sent)

cand_de_PATH = drive_PATH+ f'/n{num_candidates}-t{temperature}-e{epsilon_cutoff}.jsonl'
with jsonlines.open(cand_de_PATH, mode='w') as writer:
    for item in res:
        writer.write(item)

In [None]:
### Decoding: Epsilon Sampling

res = []
temperature = 0.5
epsilon_cutoff = 0.01
num_candidates = 128

for sent in tqdm.tqdm(dataset):
  #print(sent['en'])
  src_sent = sent[src_lang]
  encoded_src_sent = tokenizer(src_sent, return_tensors="pt").input_ids.to("cuda")

  candidates = []
  for idxc in range(num_candidates):
    tgt_tokens = model.generate(encoded_src_sent, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang), do_sample=True, temperature=temperature, epsilon_cutoff=0.02, num_beams=1, early_stopping=False, num_return_sequences=1)
    tgt_sent = tokenizer.batch_decode(tgt_tokens, skip_special_tokens=True)
    candidates.append(tgt_sent)

  res_sent = {'src': sent[src_lang], 'candidates': candidates, 'ref':sent[tgt_lang]}
  res.append(res_sent)

cand_de_PATH = drive_PATH+ f'/n{num_candidates}-t{temperature}-e{epsilon_cutoff}.jsonl'
with jsonlines.open(cand_de_PATH, mode='w') as writer:
    for item in res:
        writer.write(item)

In [None]:
### Decoding: Epsilon Sampling

res = []
temperature = 1
epsilon_cutoff = 0.01
num_candidates = 128

for sent in tqdm.tqdm(dataset):
  #print(sent['en'])
  src_sent = sent[src_lang]
  encoded_src_sent = tokenizer(src_sent, return_tensors="pt").input_ids.to("cuda")

  candidates = []
  for idxc in range(num_candidates):
    tgt_tokens = model.generate(encoded_src_sent, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang), do_sample=True, temperature=temperature, epsilon_cutoff=0.02, num_beams=1, early_stopping=False, num_return_sequences=1)
    tgt_sent = tokenizer.batch_decode(tgt_tokens, skip_special_tokens=True)
    candidates.append(tgt_sent)

  res_sent = {'src': sent[src_lang], 'candidates': candidates, 'ref':sent[tgt_lang]}
  res.append(res_sent)

cand_de_PATH = drive_PATH+ f'/n{num_candidates}-t{temperature}-e{epsilon_cutoff}.jsonl'
with jsonlines.open(cand_de_PATH, mode='w') as writer:
    for item in res:
        writer.write(item)

In [None]:
### Decoding: Epsilon Sampling

res = []
temperature = 2.0
epsilon_cutoff = 0.01
num_candidates = 128

for sent in tqdm.tqdm(dataset):
  #print(sent['en'])
  src_sent = sent[src_lang]
  encoded_src_sent = tokenizer(src_sent, return_tensors="pt").input_ids.to("cuda")

  candidates = []
  for idxc in range(num_candidates):
    tgt_tokens = model.generate(encoded_src_sent, forced_bos_token_id=tokenizer.get_lang_id(tgt_lang), do_sample=True, temperature=temperature, epsilon_cutoff=0.02, num_beams=1, early_stopping=False, num_return_sequences=1)
    tgt_sent = tokenizer.batch_decode(tgt_tokens, skip_special_tokens=True)
    candidates.append(tgt_sent)

  res_sent = {'src': sent[src_lang], 'candidates': candidates, 'ref':sent[tgt_lang]}
  res.append(res_sent)

cand_de_PATH = drive_PATH+ f'/n{num_candidates}-t{temperature}-e{epsilon_cutoff}.jsonl'
with jsonlines.open(cand_de_PATH, mode='w') as writer:
    for item in res:
        writer.write(item)

100%|██████████| 200/200 [3:11:05<00:00, 57.33s/it]
