In [1]:
!pip install sentence_transformers -q

[K     |████████████████████████████████| 78 kB 3.2 MB/s 
[K     |████████████████████████████████| 3.1 MB 22.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 56.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 60.2 MB/s 
[K     |████████████████████████████████| 59 kB 6.9 MB/s 
[K     |████████████████████████████████| 895 kB 59.5 MB/s 
[K     |████████████████████████████████| 596 kB 64.9 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [2]:
# Import the necessary libraries
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd

In [3]:
# Mount Google drive to upload datasets
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# The path to the data on my drive
D = '/content/drive/My Drive/W266_Project_Data/pmi_data'

In [5]:
# Load the dataframe - example using sample code
translation_1 = pd.read_csv(D+"/predicted_text/shuffled.csv")
translation_2 =pd.read_csv(D+"/predicted_text/unshuffled.csv")

In [6]:
# Load the model from a multilingual checkpoint tuned for paraphrase tasks
# Can also use paraphrase-multilingual-mpnet-base-v2 as the checkpoint
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')


Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
def best_translation(model, trans_1, trans_2, sim_metric=util.cos_sim):
  """Find the best translation between two translations by paraphrase similarity
  
  Parameters:
  model: A multi-lingual paraphrase model from a checkpoint
  trans_1: A dataframe of target_text and predicted_text for 1 translator
  trans_2: A dataframe of target_text and predicted_text for a 2nd translator
  sim_metric: method for computing similarity; defauly is cosine similarity

  Returns:
  best_match: A dataframe of target text, translations, best translator, and best paraphrase score 
  """
  
  # Define the embedder as the model passed in
  embedder = model

  # Extract the target text and predicted_text fields from the 1st translation
  df1 = translation_1[["target_text", "predicted_text"]]
  # Rename the predicted text field
  df1.rename(columns={'predicted_text': 'translation1'}, inplace=True)

  # Extract the target text and predicted_text fields from the 2nd translation
  df2 = translation_2[["target_text", "predicted_text"]]
  # Rename the predicted text field
  df2.rename(columns={'predicted_text': 'translation2'}, inplace=True)

  # Get the main dataframe
  df = pd.merge(left=df1, right=df2, how='outer', 
                left_on='target_text', right_on='target_text')

  # Drop rows with a missing value in any column
  final = df.dropna()
  
  # Create two empty lists to hold the best translator and best paraphrase score
  best_translator = []
  paraphrase_similarity_score = []
  
  # Iterate over each 
  for index, rows in final.iterrows():
    # Extract the query and corpus
    query = rows.target_text
    corpus = [rows.translation1, rows.translation2]

    # Get the embedding of the query vector
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    # Get the corpus embeddings
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

    # Get the closest translation in the corpus embedding
    hits = util.semantic_search(query_embedding, corpus_embeddings, 
                              top_k=1, score_function=sim_metric)

    # Get the index and cosine similarity score of the best translation
    hit = hits[0][0]

    # Add the index of the best translator to the translator list
    best_translator.append(hit['corpus_id'])

    # Add the paraphrase similarity score of the best translator to the score list
    paraphrase_similarity_score.append(hit['score'])

  # Create a dataframe of the best translator and its corresponding score and save the scores to a dataframe
  scores = pd.DataFrame({'target': final.target_text.tolist(), 
                         'translator_0': final.translation1.tolist(),
                         'translator_1': final.translation2.tolist(), 
                         'best_translator': best_translator, 
                         'paraphrase_score': paraphrase_similarity_score})
  return scores


In [12]:
def main():
  """Call the best translator function and save the output"""

  translation_accuracy = best_translation(model, translation_1, 
                                         translation_2, sim_metric=util.cos_sim)
  
  # Save the output
  translation_accuracy.to_csv(D+'/translation_accuracy/accuracy.csv')

In [13]:
if __name__ == '__main__':
  main()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
