In [1]:
!pip install transformers -q
!pip install sentencepiece

[K     |████████████████████████████████| 3.1 MB 4.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 61.8 MB/s 
[K     |████████████████████████████████| 895 kB 79.1 MB/s 
[K     |████████████████████████████████| 596 kB 68.3 MB/s 
[K     |████████████████████████████████| 59 kB 8.8 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.0 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [2]:
!pip install sentence_transformers -q

[?25l[K     |████▏                           | 10 kB 37.8 MB/s eta 0:00:01[K     |████████▍                       | 20 kB 9.3 MB/s eta 0:00:01[K     |████████████▌                   | 30 kB 8.1 MB/s eta 0:00:01[K     |████████████████▊               | 40 kB 7.6 MB/s eta 0:00:01[K     |████████████████████▉           | 51 kB 4.1 MB/s eta 0:00:01[K     |█████████████████████████       | 61 kB 4.3 MB/s eta 0:00:01[K     |█████████████████████████████▏  | 71 kB 4.4 MB/s eta 0:00:01[K     |████████████████████████████████| 78 kB 3.3 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [3]:
# Import libraries
from transformers import AutoModel, AutoTokenizer
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [4]:
# Check if cuda is available to run on GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')# Set the model to run on GPU in place
#model.cuda()

In [5]:
# Create an instance of the tokenizer and model using Indic Bert
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', keep_accents=True)
model = AutoModel.from_pretrained('ai4bharat/indic-bert')

Downloading:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.38M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/129M [00:00<?, ?B/s]

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'sop_classifier.classifier.bias', 'sop_classifier.classifier.weight', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Create sentence embeddings from Indic Bert

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

def sentence_embeddings(device, tokenizer, model, sentences):
  """Create contextualized sentence embeddings
  
  Parameters:
  tokenizer: transformers tokenizer for creating word tokens
  model:
  sentences: sentences to generate sentence embeddings for
  """

  # Tokenize sentences and return PyTorch tensors
  encoded_input = tokenizer(sentences, padding=True, truncation=True, 
                            max_length=128, return_tensors='pt')
  encoded_input = encoded_input.to(device)
  
  # Retrieve token embeddings for sentences
  with torch.no_grad():
    model.to(device)
    model_output = model(**encoded_input) 
  
  # Perform mean pooling to get total sentence embeddings
  sentence_embeddings = mean_pooling(model_output, 
                                     encoded_input['attention_mask'])
  
  return sentence_embeddings


In [None]:
# Mount Google drive to upload datasets
from google.colab import drive
drive.mount('/content/drive')

In [8]:
# The path to the data on my drive
D = '/content/drive/My Drive/W266_Project_Data/pmi_data'

In [9]:
# Load the dataframe - example using sample code
translation_1 = pd.read_csv(D+"/predicted_text/shuffled.csv")
translation_2 =pd.read_csv(D+"/predicted_text/unshuffled.csv")

In [10]:
def best_translation(device, tokenizer, model, trans_1, trans_2, sim_metric=util.cos_sim):
  """Find the best translation between two translations by paraphrase similarity
  
  Parameters:
  tokenizer: A tokenizer using a multi-lingual checkpoint
  model: A multi-lingual paraphrase model from a checkpoint
  trans_1: A dataframe of target_text and predicted_text for 1 translator
  trans_2: A dataframe of target_text and predicted_text for a 2nd translator
  sim_metric: method for computing similarity; defauly is cosine similarity

  Returns:
  best_match: A dataframe of target text, translations, best translator, and best paraphrase score 
  """

  # Extract the target text and predicted_text fields from the 1st translation
  df1 = translation_1[["target_text", "predicted_text"]]
  # Rename the predicted text field
  df1.rename(columns={'predicted_text': 'translation1'}, inplace=True)

  # Extract the target text and predicted_text fields from the 2nd translation
  df2 = translation_2[["target_text", "predicted_text"]]
  # Rename the predicted text field
  df2.rename(columns={'predicted_text': 'translation2'}, inplace=True)

  # Get the main dataframe
  df = pd.merge(left=df1, right=df2, how='outer', 
                left_on='target_text', right_on='target_text')

  # Drop rows with a missing value in any column
  final = df.dropna()
  
  # Create two empty lists to hold the best translator and best paraphrase score
  best_translator = []
  paraphrase_similarity_score = []

  # Retrieve the embeddings of all target text
  query_embeddings = sentence_embeddings(device, tokenizer, model, df.target_text.tolist())
  
  # Iterate over each 
  for index, rows in final.iterrows():
    # Retrieve the query embeddig
    query_embedding = query_embeddings[index]
    # Create the corpus of translations
    corpus = [rows.translation1, rows.translation2]
    
    # Get the corpus embeddings
    corpus_embeddings = sentence_embeddings(device, tokenizer, model, corpus)

    # Get the closest translation in the corpus embedding
    hits = util.semantic_search(query_embedding, corpus_embeddings, 
                              top_k=1, score_function=sim_metric)

    # Get the index and cosine similarity score of the best translation
    hit = hits[0][0]

    # Add the index of the best translator to the translator list
    best_translator.append(hit['corpus_id'])

    # Add the paraphrase similarity score of the best translator to the score list
    paraphrase_similarity_score.append(hit['score'])

  # Create a dataframe of the best translator and its corresponding score and save the scores to a dataframe
  scores = pd.DataFrame({'target': final.target_text.tolist(), 
                         'translator_0': final.translation1.tolist(),
                         'translator_1': final.translation2.tolist(), 
                         'best_translator': best_translator, 
                         'paraphrase_score': paraphrase_similarity_score})
  return scores


In [11]:
def main():
  """Call the best translator function and save the output"""

  translation_accuracy = best_translation(device, tokenizer, model, translation_1, translation_2, sim_metric=util.cos_sim)
  
  # Save the output
  translation_accuracy.to_csv(D+'/translation_accuracy/accuracy.csv')

In [12]:
if __name__ == '__main__':
  main()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
