## Load The modules

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from comet import download_model, load_from_checkpoint
from huggingface_hub import login
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  warn(


## Load The model to evaluate

In [2]:
model_name_or_path = "./models/dekel-cp-124998-and-fauda-teheran-inss/"

src_lang = "heb_Hebr"
col_src = 'he'
dst_lang = "eng_Latn"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, src_lang=src_lang)
#model = model.to_bettertransformer()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Load the data to Test on

In [3]:
NUMBER_OF_SAMPLES = 100

DATA_TO_LOAD_INSS = Path('./tr_data/inss/validation.parquet')
DATA_TO_LOAD_TEHERAN = Path('./tr_data/teheran/validation.parquet')
DATA_TO_LOAD_FAUDA = Path('./tr_data/fauda/validation.parquet')


df_teheran = pd.read_parquet(DATA_TO_LOAD_TEHERAN)
df_inss = pd.read_parquet(DATA_TO_LOAD_INSS)
df_fauda = pd.read_parquet(DATA_TO_LOAD_FAUDA)

df_samp = pd.concat([df_teheran, df_inss, df_fauda])

In [4]:
df_samp

Unnamed: 0,translation
1394,"{'en': ' Give her a few hours, she'll be fine...."
353,{'en': ' The Zionists must be planning somethi...
1334,"{'en': ' Who sent you here, the guards?', 'he'..."
906,"{'en': ' The door is stuck, but don't worry.',..."
1290,"{'en': ' We have no way of knowing that.', 'he..."
...,...
7475,"{'en': ' You are our family, Doron.', 'he': 'א..."
2236,"{'en': ' Abu Ahmed sent me.', 'he': 'אבו אחמד ..."
8932,"{'en': ' We'll talk about that later.', 'he': ..."
400,"{'en': ' I'm sure they will be happy.', 'he': ..."


## Translate 

In [5]:
def predict(tokenizer, model, df_samp, col_src='he', dst_lang="eng_Latn", batch_size=500):
    src_texts = []
    for index, row in df_samp.iterrows():
        src_texts.append(row['translation'][col_src])

    # Split the texts into batches  
    translated_texts = []
    batches = [src_texts[i:i + batch_size] for i in range(0, len(src_texts), batch_size)]
    for batch in batches:
        inputs = tokenizer.batch_encode_plus(batch, return_tensors="pt", padding=True).to("cuda")
        translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id[dst_lang])
        translated_texts += tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[:]
    return translated_texts


translated_texts = predict(tokenizer, model, df_samp, col_src=col_src, dst_lang=dst_lang)
df_samp['pred'] = translated_texts

In [6]:
translated_texts

["Give her a few hours, she'll be fine.",
 'The grades must be planning something big.',
 'Who sent you here, the shifts?',
 "The door is stuck, but don't worry.",
 'We have no ability to know that.',
 'So we are operating from route A-301.',
 'You took the goods and disappeared.',
 'How soon do they unload?',
 "Well, that's a big load.",
 'But from now on, you work for me.',
 'I have to go do something in the field.',
 "But I don't want you to die of disease.",
 'let me talk to her',
 'It started as an artist commune, but now all kinds of guys live here.',
 'You owe it to my mother.',
 'No need to talk about it.',
 'Let my family go.',
 'We begin boarding, please proceed to the gate, thank you.',
 "That's why this is our main goal.",
 'If one of us talks to Farham, it will only scare him away.',
 'Maybe one of my employees had a meeting at the electric company that evening.',
 'I worked at the electric company but left.',
 "Just ask him what's going on.",
 'He is an important man who 

## DOWNLOAD COMET MODEL

In [7]:
comet_model_path = download_model("Unbabel/wmt22-cometkiwi-da")
comet_model = load_from_checkpoint(comet_model_path)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.2 to v2.0.9. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../huggingface_cache/hub/models--Unbabel--wmt22-cometkiwi-da/snapshots/b3a8aea5a5fc22db68a554b92b3d96eb6ea75cc9/checkpoints/model.ckpt`
Encoder model frozen.
  rank_zero_warn(


In [8]:
data = []
for index, row in df_samp.iterrows():
    data.append({
        'src': row['translation'][col_src],
        'mt': row['pred']  # TODO:HIGH:Restore: row[col_dst] row['translation']['en']  
    })
data

[{'src': 'תן לה כמה שעות, היא תהיה בסדר.',
  'mt': "Give her a few hours, she'll be fine."},
 {'src': 'הציונים ודאי מתכננים משהו גדול.',
  'mt': 'The grades must be planning something big.'},
 {'src': 'מי שלח אותך לכאן, המשמרות?', 'mt': 'Who sent you here, the shifts?'},
 {'src': 'הדלת תקועה, אבל אל תדאג.',
  'mt': "The door is stuck, but don't worry."},
 {'src': 'אין לנו יכולת לדעת את זה.',
  'mt': 'We have no ability to know that.'},
 {'src': 'אז אנחנו פועלים מנתיב איי-301.',
  'mt': 'So we are operating from route A-301.'},
 {'src': 'לקחתם את הסחורה ונעלמתם.',
  'mt': 'You took the goods and disappeared.'},
 {'src': 'תוך כמה זמן הם פורקים?', 'mt': 'How soon do they unload?'},
 {'src': 'טוב, זאת חמולה גדולה.', 'mt': "Well, that's a big load."},
 {'src': 'אבל מעתה והלאה, אתה עובד בשבילי.',
  'mt': 'But from now on, you work for me.'},
 {'src': 'אני צריך ללכת לעשות משהו בשדה.',
  'mt': 'I have to go do something in the field.'},
 {'src': 'אבל אני לא רוצה שתמות לי ממחלה.',
  'mt': "But 

In [9]:
model_output = comet_model.predict(data, batch_size=8, gpus=1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Predicting DataLoader 0: 100%|█████████████████████| 2103/2103 [01:00<00:00, 34.61it/s]


In [10]:
df_samp['comet_score'] = model_output[0]

In [11]:
Path('./temp').mkdir(exist_ok=True)
df_samp.to_parquet('./temp/test_commet.parquet')

df_samp[df_samp.comet_score < 0.5][:200].to_html('./temp/bad_lt_0_5.html')
df_samp[df_samp.comet_score > 0.8][:200].to_html('./temp/good_gt_0_8.html')

print('Bad translation ratio', df_samp[df_samp.comet_score < 0.50].shape[0] / df_samp.shape[0])
descrb = df_samp[['comet_score']].describe(percentiles=np.arange(0, 1, 0.1))
print(descrb)

descrb.to_html('./temp/commet_score_quantiles.html')

Bad translation ratio 0.0015458707414233903
        comet_score
count  16819.000000
mean       0.840549
std        0.054742
min        0.312532
0%         0.312532
10%        0.781993
20%        0.820015
30%        0.838762
40%        0.849653
50%        0.857665
60%        0.864106
70%        0.869710
80%        0.874933
90%        0.881219
max        0.902790


In [14]:
df_samp[:100].to_html('./temp/100_random.html')


In [13]:
for x in range(100):
    df_samp['translation']

1394    {'en': ' Give her a few hours, she'll be fine....
353     {'en': ' The Zionists must be planning somethi...
1334    {'en': ' Who sent you here, the guards?', 'he'...
906     {'en': ' The door is stuck, but don't worry.',...
1290    {'en': ' We have no way of knowing that.', 'he...
                              ...                        
7475    {'en': ' You are our family, Doron.', 'he': 'א...
2236    {'en': ' Abu Ahmed sent me.', 'he': 'אבו אחמד ...
8932    {'en': ' We'll talk about that later.', 'he': ...
400     {'en': ' I'm sure they will be happy.', 'he': ...
3919    {'en': ' thought it would fix her, straighten ...
Name: translation, Length: 16819, dtype: object