In [1]:
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import pickle
from scipy.stats import pearsonr
from sentence_transformers import SentenceTransformer
from itertools import chain, combinations

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# utils

In [7]:
# def read_excel_dataset(path):
#   raw_df =  pd.read_excel(path)
#   xs1, xs2 = list(raw_df['Text1 Translation']), list(raw_df['Text2 Translation'])
#   scores = list(raw_df['Score']) if 'Score' in raw_df.columns else [None]*len(xs1)
#   pairIDs = list(raw_df['PairID'])
#   return [(xs1[i],xs2[i],scores[i],pairIDs[i]) for i in range(len(xs1))]
# sentences = read_excel_dataset(f'spa2eng_dev_trackD.xlsx')

In [10]:
languages = ["afr", "amh", "arb", "arq", "ary", "eng", "esp", "hau", "hin", "ind", "kin", "pan"]
target_lang = "hau2eng"

df = pd.read_csv(f'translations/{target_lang}_test.csv')
len(df)

375

In [57]:
target_lang = 'kin2eng'
sentences_trans = pd.read_csv(f'Data/translations/{target_lang}_train.csv')
sentences_orig = pd.read_csv(f'Data/original/{target_lang[0:3]}_train.csv')
assert len(sentences_orig['Score']) == len(sentences_trans)
sentences_trans.insert(5, "Score",sentences_orig['Score'], True)
sentences_trans.head()
sentences_trans.to_csv(f'Data/tmp/{target_lang}_train.csv', index=False)

In [3]:
def read_csv_dataset(path):
  raw_df = pd.read_csv(path)
  xs1 = list(raw_df['Text1 Translation'])
  xs2 = list(raw_df['Text2 Translation'])
  scores = list(raw_df['Score']) if 'Score' in raw_df.columns else [None]*len(xs1)
  pairIDs = list(raw_df['PairID'])
  return [(xs1[i],xs2[i],scores[i],pairIDs[i]) for i in range(len(xs1))]

FileNotFoundError: [Errno 2] No such file or directory: 'translations/tel2eng_test.csv'

# get E5 embeddings

In [181]:
def average_pool(last_hidden_states: torch.Tensor,
                 attention_mask: torch.Tensor) -> torch.Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def get_e5_embeddings(sentences, path_save):
  tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
  model = AutoModel.from_pretrained('intfloat/multilingual-e5-large').to(device)
  embedding_dict = {}
  scores = []
  for text1,text2,score,pairID in tqdm(sentences):
    input_texts = [f'query: {text1}', f'query: {text2}']
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
      outputs = model(**batch_dict)
    embedding = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    embedding = F.normalize(embedding, p=2, dim=1)
    scores.append((embedding[0,:] @ embedding[1,:]).item())
    embedding_dict[pairID] = (embedding, text1, text2, score)
  with open(path_save, 'wb') as handle:
    pickle.dump(embedding_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
  return [(sentences[i][3],scores[i]) for i in range(len(scores))]

In [182]:
# sentences = read_csv_dataset(f'translations/{target_lang}_dev.csv')
# pair_scores = get_e5_embeddings(sentences, f'Embeddings/{target_lang}_dev_e5.pickle')

sentences = read_csv_dataset(f'translations/{target_lang}_test.csv')
pair_scores = get_e5_embeddings(sentences, f'NewEmbeddings/{target_lang}_test_e5.pickle')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 297/297 [00:04<00:00, 64.69it/s]


In [183]:
sentences[70]

('"Shubham Gill scored 10 runs with two fours and a double in the ninth over bowled by Gopal."',
 '"Royals bowlers Varun Aaron, Thomas, Shreyas Gopal and Unadkat took 2 wickets each."',
 None,
 'TEL-test-00071')

# multilingual sentence bert v1

In [184]:
model_name = 'sentence-transformers/distiluse-base-multilingual-cased-v1'

In [185]:
def get_sentence_transformer_embedding(sentences, model_name, path_save):
  model = SentenceTransformer(model_name).to(device)
  embedding_dict = {}
  scores = []
  for text1,text2,score,pairID in tqdm(sentences):
    input_texts = [text1.lower(), text2.lower()]
    with torch.no_grad():
      embedding = torch.tensor(model.encode(input_texts))
    embedding = F.normalize(embedding, p=2, dim=1)
    scores.append((embedding[0,:] @ embedding[1,:]).item())
    embedding_dict[pairID] = (torch.tensor(embedding), text1, text2, score)
  with open(path_save, 'wb') as handle:
    pickle.dump(embedding_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
  return [(sentences[i][3],scores[i]) for i in range(len(scores))]

In [186]:
# sentences = read_csv_dataset(f'translations/{target_lang}_dev.csv')
# pair_scores = get_sentence_transformer_embedding(sentences, model_name, f'Embeddings/{target_lang}_dev_mbertv1.pickle')

sentences = read_csv_dataset(f'translations/{target_lang}_test.csv')
pair_scores = get_sentence_transformer_embedding(sentences, model_name, f'NewEmbeddings/{target_lang}_test_mbertv1.pickle')

  embedding_dict[pairID] = (torch.tensor(embedding), text1, text2, score)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 297/297 [00:01<00:00, 187.22it/s]


# multilingual sentence bert v2

In [187]:
model_name = 'sentence-transformers/distiluse-base-multilingual-cased-v2'

In [188]:
# sentences = read_csv_dataset(f'translations/{target_lang}_dev.csv')
# pair_scores = get_sentence_transformer_embedding(sentences, model_name, f'Embeddings/{target_lang}_dev_mbertv2.pickle')

sentences = read_csv_dataset(f'translations/{target_lang}_test.csv')
pair_scores = get_sentence_transformer_embedding(sentences, model_name, f'NewEmbeddings/{target_lang}_test_mbertv2.pickle')

  embedding_dict[pairID] = (torch.tensor(embedding), text1, text2, score)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 297/297 [00:01<00:00, 191.90it/s]


# paraphrase-multilingual-mpnet-base-v2

In [189]:
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'

In [190]:
# sentences = read_csv_dataset(f'translations/{target_lang}_dev.csv')
# pair_scores = get_sentence_transformer_embedding(sentences, model_name, f'Embeddings/{target_lang}_dev_mpnetv2.pickle')

sentences = read_csv_dataset(f'translations/{target_lang}_test.csv')
pair_scores = get_sentence_transformer_embedding(sentences, model_name, f'NewEmbeddings/{target_lang}_test_mpnetv2.pickle')

  embedding_dict[pairID] = (torch.tensor(embedding), text1, text2, score)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 297/297 [00:02<00:00, 99.12it/s]


# paraphrase-multilingual-MiniLM-L12-v2

In [191]:
model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

In [192]:
# sentences = read_csv_dataset(f'translations/{target_lang}_dev.csv')
# pair_scores = get_sentence_transformer_embedding(sentences, model_name, f'Embeddings/{target_lang}_dev_minilm.pickle')

sentences = read_csv_dataset(f'translations/{target_lang}_test.csv')
pair_scores = get_sentence_transformer_embedding(sentences, model_name, f'NewEmbeddings/{target_lang}_test_minilm.pickle')

  embedding_dict[pairID] = (torch.tensor(embedding), text1, text2, score)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 297/297 [00:02<00:00, 105.80it/s]


# all-mpnet-base-v2

In [193]:
model_name = 'sentence-transformers/all-mpnet-base-v2'

In [194]:
# sentences = read_csv_dataset(f'translations/{target_lang}_dev.csv')
# pair_scores = get_sentence_transformer_embedding(sentences, model_name, f'Embeddings/{target_lang}_dev_mpnet.pickle')

sentences = read_csv_dataset(f'translations/{target_lang}_test.csv')
pair_scores = get_sentence_transformer_embedding(sentences, model_name, f'NewEmbeddings/{target_lang}_test_mpnet.pickle')

  embedding_dict[pairID] = (torch.tensor(embedding), text1, text2, score)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 297/297 [00:02<00:00, 99.53it/s]
