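Install the packages the code needs. Note that `editdistance`, which is imported below, is not installed by this cell and must already be available in the environment.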

In [17]:
!pip install transformers -q
!pip install sentencepiece -q

[K     |████████████████████████████████| 3.1 MB 8.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 72.8 MB/s 
[K     |████████████████████████████████| 895 kB 18.7 MB/s 
[K     |████████████████████████████████| 59 kB 8.7 MB/s 
[K     |████████████████████████████████| 596 kB 56.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 6.7 MB/s 
[?25h

In [20]:
# Import necessary libraries
import pandas as pd
import numpy as np
import editdistance
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import copy

In [7]:
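# Leaving this as an example of how the input should look.
# The functions in this notebook read the `target_sentence`, `bart_translation` and
# `indictrans_translation` columns, so those column names are required;
# the `language_task` column is optional, but helpful.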
#df.head()

Unnamed: 0,language_task,target_sentence,bart_translation,bart_paraphrase_score,indictrans_translation,indictrans_paraphrase_score
0,translate English to Hindi,प्रधानमंत्री ने कहा कि बाबा साहेब अम्बेडकर की ...,Prime Minister said Babasaheb Ambedkar has a k...,0.296427,प्रधानमंत्री ने कहा कि करोड़ों लोगों के दिलों ...,0.96405
1,translate English to Hindi,इस समारोह को आज बीजापुर में आयोजित करने के महत...,आज बीजापुर में इस समारोह को आयोजित करने के महत...,0.961452,आज बीजापुर में इस कार्यक्रम के आयोजन के महत्व ...,0.975653
2,translate English to Hindi,उन्होंने कहा कि इस कार्य को 2022 तक पूरा कर ले...,उन्होंने कहा कि लक्ष्य 2022 तक इस कार्य को पूर...,0.984172,उन्होंने कहा कि इस कार्य को 2022 तक पूरा करने ...,0.987264
3,translate English to Hindi,प्रधानमंत्री ने कहा कि सरकार स्पष्ट लक्ष्यों औ...,प्रधानमंत्री ने कहा कि सरकार स्पष्ट लक्ष्यों औ...,0.987658,प्रधानमंत्री ने कहा कि सरकार स्पष्ट लक्ष्यों औ...,0.989116
4,translate English to Hindi,"उन्होंने इस संदर्भ में जन धन खाता खोलने, गरीबो...","इस संदर्भ में उन्होंने जनधन खाते खोलने, गरीबों...",0.979594,"इस संदर्भ में उन्होंने जन धन खाते खोलने, गरीबो...",0.986208


The mean_pooling and sentence_embeddings functions below are used to get the sentence embeddings for the cosine similarity scores.

In [8]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Parameters:
    model_output: indexable model output whose first element holds the
      token embeddings (assumed shape: batch x tokens x hidden - TODO confirm)
    attention_mask: mask with one entry per token; positions holding 0
      contribute nothing to the average

    Returns:
    The mask-weighted sum of the token embeddings over dimension 1 divided by
    the mask total over the same dimension.
    """
    hidden_states = model_output[0]

    # Give the mask a trailing axis and stretch it to the embedding shape so
    # it can be multiplied element-wise with the embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    masked_total = (hidden_states * mask).sum(dim=1)

    # The lower bound keeps the division finite when a sequence is fully masked.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)

    return masked_total / token_counts

def sentence_embeddings(device, tokenizer, model, sentences, max_length=128):
  """Create contextualized sentence embeddings by mean-pooling model outputs.

  Parameters:
  device: torch device that the encoded inputs and the model are moved to
  tokenizer: transformers tokenizer for creating word tokens
  model: model called on the tokenized input; its output is handed to
    mean_pooling (presumably a transformers encoder such as the one loaded
    in get_metrics)
  sentences: sentence (or sentences) to generate sentence embeddings for
  max_length: maximum number of tokens kept per sentence; longer inputs are
    truncated. Defaults to 128, the value that used to be hard-coded.

  Returns:
  The tensor produced by mean_pooling for the tokenized sentences.
  """

  # Tokenize sentences and return PyTorch tensors
  encoded_input = tokenizer(sentences, padding=True, truncation=True,
                            max_length=max_length, return_tensors='pt')
  encoded_input = encoded_input.to(device)

  # The model has to live on the same device as its inputs
  model.to(device)

  # Pass the tokenized input to the model; gradients are not needed here
  with torch.no_grad():
    model_output = model(**encoded_input)

  # Perform mean pooling to get total sentence embeddings. The local name is
  # deliberately different from the function name to avoid shadowing it.
  pooled_embeddings = mean_pooling(model_output,
                                   encoded_input['attention_mask'])

  return pooled_embeddings

The check_cosine_similarity function is used to get cosine similarity scores between the target and the translation - I've used the following column names: target_sentence, bart_translation, and indictrans_translation.

In [28]:
# Get the sentence embeddings and cosine similarity scores between each pair of sentences in the dataset
# This is to check if low scoring sentences have exact similarity scores or if they are of low quality
def check_cosine_similarity(df, device, tokenizer, model):
  """Score both translations of every row against the row's target sentence.

  Each row's 'target_sentence', 'bart_translation' and
  'indictrans_translation' values are embedded with sentence_embeddings, and
  the cosine similarity between the target embedding and each translation
  embedding is recorded.

  Parameters:
  df: dataframe holding the three columns named above
  device, tokenizer, model: forwarded unchanged to sentence_embeddings

  Returns:
  The same dataframe object, with the scores stored in the new columns
  'bart_cosine_score' and 'indictrans_cosine_score'. Note that `df` itself
  is modified.
  """

  def embed(text):
    return sentence_embeddings(device, tokenizer, model, text)

  def similarity(first, second):
    # squeeze(0) drops the leading dimension when it has size 1, so the two
    # embeddings are compared along dimension 0 and .item() yields a float.
    return F.cosine_similarity(first.squeeze(0), second.squeeze(0), dim=0).item()

  # One list of scores per output column, filled row by row
  scores = {'bart_cosine_score': [], 'indictrans_cosine_score': []}

  for _, row in df.iterrows():
    target_vec = embed(row['target_sentence'])
    bart_vec = embed(row['bart_translation'])
    indictrans_vec = embed(row['indictrans_translation'])

    scores['bart_cosine_score'].append(similarity(target_vec, bart_vec))
    scores['indictrans_cosine_score'].append(similarity(target_vec, indictrans_vec))

  # Attach the collected scores to the dataframe as new columns
  for column, values in scores.items():
    df[column] = values

  return df

In [12]:
def get_metrics(df, model_name='ai4bharat/indic-bert'):
  """Add cosine-similarity and edit-distance columns for bart and indictrans.

  Parameters:
  df: dataframe with the columns 'target_sentence', 'bart_translation' and
    'indictrans_translation'
  model_name: checkpoint name handed to AutoTokenizer.from_pretrained and
    AutoModel.from_pretrained. Defaults to 'ai4bharat/indic-bert', the
    checkpoint that used to be hard-coded, so existing calls are unaffected.

  Returns:
  The dataframe returned by check_cosine_similarity, extended with the
  columns 'indictrans_edit' and 'bart_edit' (edit distance between the
  target sentence and the respective translation).
  """

  # Set up the device to run on GPU if available
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  # Load the tokenizer and model for the requested checkpoint.
  # keep_accents=True is carried over from the original configuration
  # (presumably so accents/diacritics are not stripped - TODO confirm).
  tokenizer = AutoTokenizer.from_pretrained(model_name, keep_accents=True)
  model = AutoModel.from_pretrained(model_name)

  # Get the cosine similarity scores for bart and indictrans
  final = check_cosine_similarity(df, device, tokenizer, model)

  # Get the edit distances for bart and indictrans. Iterating the columns
  # directly avoids building a row object per record as apply(axis=1) does.
  targets = final['target_sentence']
  final['indictrans_edit'] = [
      editdistance.eval(target, translation)
      for target, translation in zip(targets, final['indictrans_translation'])
  ]
  final['bart_edit'] = [
      editdistance.eval(target, translation)
      for target, translation in zip(targets, final['bart_translation'])
  ]

  return final
  

In [29]:
# Leaving this here so that you can see the function call
# final = get_metrics(df)

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'sop_classifier.classifier.weight', 'predictions.decoder.weight', 'sop_classifier.classifier.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.decoder.bias', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [30]:
# Leaving this here so that you can see what the output should look like
#final.head()

Unnamed: 0,language_task,target_sentence,bart_translation,bart_paraphrase_score,indictrans_translation,indictrans_paraphrase_score,bart_cosine_score,indictrans_cosine_score,indictrans_edit,bart_edit
0,translate English to Hindi,प्रधानमंत्री ने कहा कि बाबा साहेब अम्बेडकर की ...,Prime Minister said Babasaheb Ambedkar has a k...,0.296427,प्रधानमंत्री ने कहा कि करोड़ों लोगों के दिलों ...,0.96405,0.770237,0.948379,67,126
1,translate English to Hindi,इस समारोह को आज बीजापुर में आयोजित करने के महत...,आज बीजापुर में इस समारोह को आयोजित करने के महत...,0.961452,आज बीजापुर में इस कार्यक्रम के आयोजन के महत्व ...,0.975653,0.946777,0.973164,56,43
2,translate English to Hindi,उन्होंने कहा कि इस कार्य को 2022 तक पूरा कर ले...,उन्होंने कहा कि लक्ष्य 2022 तक इस कार्य को पूर...,0.984172,उन्होंने कहा कि इस कार्य को 2022 तक पूरा करने ...,0.987264,0.956433,0.985965,3,26
3,translate English to Hindi,प्रधानमंत्री ने कहा कि सरकार स्पष्ट लक्ष्यों औ...,प्रधानमंत्री ने कहा कि सरकार स्पष्ट लक्ष्यों औ...,0.987658,प्रधानमंत्री ने कहा कि सरकार स्पष्ट लक्ष्यों औ...,0.989116,0.965325,0.986373,3,3
4,translate English to Hindi,"उन्होंने इस संदर्भ में जन धन खाता खोलने, गरीबो...","इस संदर्भ में उन्होंने जनधन खाते खोलने, गरीबों...",0.979594,"इस संदर्भ में उन्होंने जन धन खाते खोलने, गरीबो...",0.986208,0.979276,0.992806,38,58
