In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# datasets stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Root folder of the project data on the mounted Google Drive.
# File paths used when loading data are built by appending to this prefix.
D = '/content/drive/My Drive/W266_Project_Data/pmi_data'

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the record-level comparison of both models' translations and discard
# the leftover 'Unnamed: 0' column that comes along with the CSV.
comparison_csv = D + "/predicted_text/siamese_evaluations_of_translations/record_comparison.csv"
final = pd.read_csv(comparison_csv).drop(columns=['Unnamed: 0'])

In [5]:
# Preview the first few records of the loaded comparison table
final.head()

Unnamed: 0,language_task,target_sentence,bart_translation,bart_paraphrase_score,bart_cosine_score,indictrans_translation,indictrans_paraphrase_score,indictrans_cosine_score,best_trans
0,translate English to Hindi,प्रधानमंत्री ने कहा कि बाबा साहेब अम्बेडकर की ...,Prime Minister said Babasaheb Ambedkar has a k...,0.088514,0.770237,प्रधानमंत्री ने कहा कि करोड़ों लोगों के दिलों ...,0.974685,0.948379,प्रधानमंत्री ने कहा कि करोड़ों लोगों के दिलों ...
1,translate English to Hindi,इस समारोह को आज बीजापुर में आयोजित करने के महत...,आज बीजापुर में इस समारोह को आयोजित करने के महत...,0.979257,0.946777,आज बीजापुर में इस कार्यक्रम के आयोजन के महत्व ...,0.978843,0.973164,आज बीजापुर में इस समारोह को आयोजित करने के महत...
2,translate English to Hindi,उन्होंने कहा कि इस कार्य को 2022 तक पूरा कर ले...,उन्होंने कहा कि लक्ष्य 2022 तक इस कार्य को पूर...,0.9865,0.956433,उन्होंने कहा कि इस कार्य को 2022 तक पूरा करने ...,0.987696,0.985965,उन्होंने कहा कि इस कार्य को 2022 तक पूरा करने ...
3,translate English to Hindi,प्रधानमंत्री ने कहा कि सरकार स्पष्ट लक्ष्यों औ...,प्रधानमंत्री ने कहा कि सरकार स्पष्ट लक्ष्यों औ...,0.98067,0.965325,प्रधानमंत्री ने कहा कि सरकार स्पष्ट लक्ष्यों औ...,0.986279,0.986373,प्रधानमंत्री ने कहा कि सरकार स्पष्ट लक्ष्यों औ...
4,translate English to Hindi,"उन्होंने इस संदर्भ में जन धन खाता खोलने, गरीबो...","इस संदर्भ में उन्होंने जनधन खाते खोलने, गरीबों...",0.984052,0.979276,"इस संदर्भ में उन्होंने जन धन खाते खोलने, गरीबो...",0.985711,0.992806,"इस संदर्भ में उन्होंने जन धन खाते खोलने, गरीबो..."


In [7]:
# Record which model produced the chosen translation: rows whose best
# translation equals the IndicTrans output are tagged "indictrans", every
# other row is tagged "mbart".
picked_indictrans = final['best_trans'].eq(final['indictrans_translation'])
final['best_trans_source'] = np.where(picked_indictrans, "indictrans", "mbart")

In [11]:
# Add a new column describing how the cosine similarity score of the chosen
# translation compares with the score of the translation that was NOT chosen:
#   'score increased' - chosen translation has the higher cosine score
#   'score decreased' - chosen translation has the lower cosine score
#   'no change'       - both translations have the same cosine score
#   'undetermined'    - no comparison possible (e.g. a missing score)
indic_score = final['indictrans_cosine_score']
mbart_score = final['bart_cosine_score']
chose_indic = final['best_trans_source'] == 'indictrans'
chose_mbart = final['best_trans_source'] == 'mbart'

conditions = [
    chose_indic & (indic_score > mbart_score),
    chose_indic & (indic_score < mbart_score),
    chose_indic & (indic_score == mbart_score),
    chose_mbart & (mbart_score > indic_score),
    chose_mbart & (mbart_score < indic_score),
    # Compare against the IndicTrans score. Comparing the mBART score with
    # itself would be true for every non-missing row, not just for ties.
    chose_mbart & (mbart_score == indic_score),
    ]
values = ['score increased', 'score decreased', 'no change', 'score increased', 'score decreased', 'no change']

# Use an explicit string default: np.select's implicit default is the integer
# 0, which cannot be combined with string labels on recent NumPy versions and
# would otherwise leave a meaningless value in unmatched rows.
final['cos_change'] = np.select(conditions, values, default='undetermined')

In [14]:
# Overall tally across all 3,000 records:
# - in 2,243 cases (~75%) the sentence chosen by the paraphrase evaluator had a
#   higher cosine similarity score than the non-chosen sentence
# - in 724 cases the chosen sentence had the lower cosine similarity score
final['cos_change'].value_counts()

score increased    2243
score decreased     724
no change            33
Name: cos_change, dtype: int64

In [21]:
# Split the records by which model supplied the chosen translation.
# (IndicTrans initially had the higher Sacre Bleu scores for Hindi and Malayalam.)
indic_chosen = final.loc[final['best_trans_source'] == 'indictrans']
mbart_chosen = final.loc[final['best_trans_source'] == 'mbart']

In [22]:
# When the IndicTrans translation was chosen, its cosine similarity was higher
# than that of the Bart translation 1,489 times out of 1,921 (78% of the time)
indic_chosen['cos_change'].value_counts()

score increased    1489
score decreased     399
no change            33
Name: cos_change, dtype: int64

In [24]:
# The mBART translation was chosen as the better translation 1,079 times.
# In 70% of those cases the replacement increased sentence similarity as
# measured by the cosine similarity score.
mbart_chosen['cos_change'].value_counts()

score increased    754
score decreased    325
Name: cos_change, dtype: int64

In [25]:
# List the columns of the comparison table, including the derived
# best_trans_source and cos_change columns
final.columns

Index(['language_task', 'target_sentence', 'bart_translation',
       'bart_paraphrase_score', 'bart_cosine_score', 'indictrans_translation',
       'indictrans_paraphrase_score', 'indictrans_cosine_score', 'best_trans',
       'best_trans_source', 'cos_change'],
      dtype='object')

In [26]:
# Look at language-specific changes in cosine sentence similarity, then by
# language + translator used.
# Initial Sacre Bleu scores were highest for Hindi from IndicTrans, MBart for
# Tamil, and IndicTrans for Malayalam.
hi_cos_change = final.loc[final['language_task'] == 'translate English to Hindi']
ta_cos_change = final.loc[final['language_task'] == 'translate English to Tamil']
ml_cos_change = final.loc[final['language_task'] == 'translate English to Malayalam']

In [27]:
# Hindi: the cosine similarity score increased in 793 of 1,000 records,
# and in a further 24 records there was no change
hi_cos_change['cos_change'].value_counts()

score increased    793
score decreased    183
no change           24
Name: cos_change, dtype: int64

In [29]:
# Hindi, broken down by which model supplied the chosen translation
hi_changes = hi_cos_change['cos_change']
hi_sources = hi_cos_change['best_trans_source']
print(hi_changes[hi_sources == 'indictrans'].value_counts(), "\n")
print(hi_changes[hi_sources == 'mbart'].value_counts())

score increased    555
score decreased     81
no change           24
Name: cos_change, dtype: int64 

score increased    238
score decreased    102
Name: cos_change, dtype: int64


In [32]:
# Tamil: overall tally first, then broken down by which model supplied the
# chosen translation
ta_changes = ta_cos_change['cos_change']
ta_sources = ta_cos_change['best_trans_source']
print(ta_changes.value_counts(), "\n")
print(ta_changes[ta_sources == 'indictrans'].value_counts(), "\n")
print(ta_changes[ta_sources == 'mbart'].value_counts())

score increased    723
score decreased    268
no change            9
Name: cos_change, dtype: int64 

score increased    408
score decreased    124
no change            9
Name: cos_change, dtype: int64 

score increased    315
score decreased    144
Name: cos_change, dtype: int64


In [33]:
# Malayalam: overall tally first, then broken down by which model supplied the
# chosen translation
ml_changes = ml_cos_change['cos_change']
ml_sources = ml_cos_change['best_trans_source']
print(ml_changes.value_counts(), "\n")
print(ml_changes[ml_sources == 'indictrans'].value_counts(), "\n")
print(ml_changes[ml_sources == 'mbart'].value_counts())

score increased    727
score decreased    273
Name: cos_change, dtype: int64 

score increased    526
score decreased    194
Name: cos_change, dtype: int64 

score increased    201
score decreased     79
Name: cos_change, dtype: int64
