In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (remounting if it is already mounted).
drive.mount('/content/drive', force_remount=True)
# Base directory for this experiment's data. Written as an absolute path under
# the mount point so it does not depend on the notebook's current working
# directory (the previous '../content/...' form only resolved correctly when
# the working directory was a direct child of '/').
drive_PATH = '/content/drive/MyDrive/Colab Notebooks/l101.experiments.1'

In [None]:
!pip install jsonlines

In [None]:
from transformers import AutoTokenizer, RemBertModel
import torch

# Tokenizer and encoder are loaded from the same pretrained checkpoint; the
# identifier is kept in one place so the two cannot drift apart.
REMBERT_CHECKPOINT = "google/rembert"
tokenizer = AutoTokenizer.from_pretrained(REMBERT_CHECKPOINT)
model = RemBertModel.from_pretrained(REMBERT_CHECKPOINT)

In [None]:
def encode(sent):
    """Encode ``sent`` with the module-level RemBERT ``tokenizer`` and ``model``.

    Args:
        sent: Text handed directly to ``tokenizer``.

    Returns:
        A NumPy array containing the model's last hidden state for the first
        batch element (presumably shaped ``(num_tokens, hidden_size)`` --
        TODO confirm against the model output).
    """
    inputs = tokenizer(sent, return_tensors="pt")
    # Pure inference: disable autograd so no computation graph is recorded for
    # every encoded sentence (saves memory and time; values are unchanged).
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    # Index 0 selects the first element along the leading (batch) dimension.
    return last_hidden_states.detach().numpy()[0]

def encode_sentence(sent):
    """Return one pooled vector for ``sent`` by averaging its token embeddings.

    Args:
        sent: Text forwarded to ``encode``.

    Returns:
        The element-wise mean of the rows of ``encode(sent)``, i.e. an average
        over axis 0 of the array that ``encode`` returns.
    """
    enc = encode(sent)
    # Vectorised mean over axis 0 replaces a Python-level ``sum()`` over rows
    # followed by a manual division by the row count.
    pooled_enc = enc.mean(axis=0)
    return pooled_enc

def encode_candidates(cands):
    """Encode each candidate with ``encode_sentence``.

    Args:
        cands: Iterable of candidate texts.

    Returns:
        A list with one pooled encoding per candidate, in input order.
    """
    encoded = []
    for candidate in cands:
        encoded.append(encode_sentence(candidate))
    return encoded

In [None]:
import jsonlines


def _load_jsonl(path):
    """Read every record of the JSON Lines file at ``path`` into a list."""
    with jsonlines.open(path) as reader:
        return list(reader.iter())


# BLEURT
bleurt_ENTR_PATH = drive_PATH + '/data-main/bleurt-ENTR.jsonl'
bleurt_ENTR = _load_jsonl(bleurt_ENTR_PATH)
# chrF++
chrf_ENTR_PATH = drive_PATH + '/data-main/chrf-ENTR.jsonl'
chrf_ENTR = _load_jsonl(chrf_ENTR_PATH)

In [None]:
# Number of examples both score files are expected to contain.
num_examples = 1000
assert len(chrf_ENTR) == num_examples, (
    f'expected {num_examples} chrF++ examples, found {len(chrf_ENTR)}')
assert len(bleurt_ENTR) == num_examples, (
    f'expected {num_examples} BLEURT examples, found {len(bleurt_ENTR)}')

# The two files must describe the same examples in the same order, because the
# records are merged by position afterwards. Report which field and which
# example disagree so a mismatch can be located.
for i in range(num_examples):
  for key in ('src', 'ref', 'candidates'):
    assert chrf_ENTR[i][key] == bleurt_ENTR[i][key], (
        f'chrF++ and BLEURT files disagree on {key!r} at example {i}')

In [None]:
import tqdm

# Fields copied unchanged from the BLEURT records, in output order:
# candidate scores, MBR candidate scores, MBR score matrix, MBR winner index,
# MBR winner.
BLEURT_KEYS = (
    'BLEURT_cand_scores',
    'MBR_BLEURT_cand_scores',
    'MBR_BLEURT_score_matrix',
    'MBR_BLEURT_winner_index',
    'MBR_BLEURT_winner',
)
# The same five fields, copied unchanged from the chrF++ records.
CHRF_KEYS = (
    'CHRF_cand_scores',
    'MBR_CHRF_cand_scores',
    'MBR_CHRF_score_matrix',
    'MBR_CHRF_winner_index',
    'MBR_CHRF_winner',
)

# Merge the two score files by position and attach an encoding per candidate.
ENTR = []
for i in tqdm.tqdm(range(num_examples)):
  chrf_row = chrf_ENTR[i]
  ENTR_dict = {
      'src': chrf_row['src'],
      'ref': chrf_row['ref'],
      'candidates': chrf_row['candidates'],
      'enc_candidates': encode_candidates(chrf_row['candidates']),
  }
  bleurt_row = bleurt_ENTR[i]
  for key in BLEURT_KEYS:
    ENTR_dict[key] = bleurt_row[key]
  for key in CHRF_KEYS:
    ENTR_dict[key] = chrf_row[key]
  ENTR.append(ENTR_dict)

In [None]:
import numpy as np

# Replace each example's candidate encodings with nested Python lists of
# floats (presumably so the records can be serialised as JSON afterwards).
for i in tqdm.tqdm(range(num_examples)):
  example = ENTR[i]
  enc_matrix = np.array(example['enc_candidates'], dtype=float)
  example['enc_candidates'] = enc_matrix.tolist()

In [None]:
# Write the merged records to Drive, one JSON object per line.
ENTR_PATH = drive_PATH + '/data-main/ENTR.jsonl'
with jsonlines.open(ENTR_PATH, mode='w') as writer:
    writer.write_all(ENTR)

In [None]:
# Preview only the leading values of the first few candidate vectors of the
# first example; displaying the full nested list prints every float of every
# candidate encoding into the cell output.
[vec[:5] for vec in ENTR[0]['enc_candidates'][:3]]