In [None]:
%cd indic-bert

In [None]:
import glob
import os

In [None]:
from fine_tune.modules import get_modules
from fine_tune.data import load_dataset
from fine_tune.data.examples import *
from fine_tune.modules import base

In [None]:
import pandas as pd
import numpy as np
np.set_printoptions(precision=5)

In [None]:
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

In [None]:
def _softmax(x1, x2, tog=0):
    a = np.exp([x1,x2])/np.sum(np.exp([x1,x2]))
    if tog == 1:
        return a[0]
    else:
        return a[1]

def apply_softmax(df, tog):
    """Compute a per-row softmax probability from columns ``0`` and ``1`` of ``df``.

    Args:
        df: DataFrame whose columns labelled ``0`` and ``1`` hold the two
            values passed to ``_softmax``.
        tog: Selector forwarded to ``_softmax``: ``1`` yields the probability
            of column ``0``, anything else the probability of column ``1``.

    Returns:
        A Series with one probability per row of ``df``.
    """
    def _row_probability(row):
        return _softmax(row[0], row[1], tog)

    return df.apply(_row_probability, axis=1)

In [None]:
# Run configuration passed to the text-classification module and the trainer.
hparams = dict(
    # Task and pretrained model
    dataset='amrita-paraphrase-exact',
    model='ai4bharat/indic-bert',
    config_name='',  # empty: the config lookup falls back to model_name_or_path
    tokenizer_name='',
    max_seq_length=256,
    # Data and cache locations
    iglue_dir='../indic-glue',
    overwrite_cache=True,
    cache_dir='../ib-cache',
    # Hardware / precision
    fp16=False,
    fp16_opt_level='O1',
    n_gpu=1,
    n_tpu_cores=0,
    # Run control and optimisation
    max_grad_norm=1.0,
    do_train=True,
    do_predict=True,
    gradient_accumulation_steps=1,
    seed=8,
    learning_rate=2e-05,
    weight_decay=0.0,
    adam_epsilon=1e-08,
    warmup_steps=0,
    num_train_epochs=1,
    train_batch_size=16,
    eval_batch_size=32,
    # Task head and data layout
    labels='',
    model_name_or_path='ai4bharat/indic-bert',
    data_dir='../indic-glue/amrita-paraphrase-exact',
    mode='sequence-classification',
    output_mode='regression',  # alternative noted by the author: 'classification'
    example_type='text',
    num_labels=2,
)

In [None]:
### Set up parameters for Malayalam
# The same language code is used for the train, dev and test splits.
hparams.update({
    'out_dir': '../outputs/5/',
    'lang': 'ma',
    'train_lang': 'ma',
    'dev_lang': 'ma',
    'test_lang': 'ma',
})
# Directory of this run's model outputs, derived from out_dir and lang.
hparams['output_dir'] = '{}amrita-paraphrase-exact/train-{}/model-ai4bharat-indic-bert'.format(
    hparams['out_dir'],
    hparams['lang'],
)

In [None]:
# Use the explicit config name when one is set; otherwise fall back to the model path.
config_name = hparams['config_name'] if hparams['config_name'] else hparams['model_name_or_path']
# Extra keyword arguments forwarded to the config constructor.
args = dict(num_labels=hparams['num_labels'])
config = AutoConfig.from_pretrained(config_name, cache_dir=hparams['cache_dir'], **args)

## Score mBART Malayalam

In [None]:
! rm ../indic-glue/amrita-paraphrase-exact/cached*

In [None]:
!cp ../indic-glue/amrita-paraphrase-exact/ma/ma-bart-test.csv ../indic-glue/amrita-paraphrase-exact/ma/ma-test.csv 

In [None]:
## Check if you have the right file
mbart_test = pd.read_csv('../indic-glue/amrita-paraphrase-exact/ma/ma-test.csv', header=None)
mbart_test.shape

In [None]:
# module_name = 'text_classification'

text_classification_trainer = get_modules('text_classification')
text_classification_trainer

In [None]:
# Build the module returned by get_modules('text_classification') from the run configuration.
tc_trainer_instance = text_classification_trainer(hparams)
# Collect the checkpoints saved in this run's output directory.
# NOTE(review): names are sorted lexicographically, so "epoch=10" would sort
# before "epoch=2"; confirm the ordering if num_train_epochs is raised.
checkpoint_pattern = os.path.join(hparams['output_dir'], 'checkpointepoch=*.ckpt')
checkpoints = sorted(glob.glob(checkpoint_pattern, recursive=True))
if not checkpoints:
    # Fail with an actionable message instead of a bare IndexError on checkpoints[-1].
    raise FileNotFoundError(
        'No checkpoint found matching {!r}; train the model before scoring.'.format(checkpoint_pattern)
    )
# Restore the weights from the last checkpoint in sorted order.
trained_model = tc_trainer_instance.load_from_checkpoint(checkpoints[-1])

In [None]:
trainer = base.create_trainer(trained_model, hparams) #BaseModule(hparams)

In [None]:
trainer.test(trained_model)

## Load the scores for mBART translations.

In [None]:
hparams['results_file'] = '{}{}/train-{}/model-ai4bharat-indic-bert/test_results_{}.csv'.format(hparams['out_dir'],  hparams['dataset'] , hparams['lang'] , hparams['lang']) 
hparams['results_file']

In [None]:
mbart = pd.read_csv(hparams['results_file'], header=None)

In [None]:
mbart.shape

In [None]:
mbart.head(5)

## Score IndicTrans Malayalam

In [None]:
! rm ../indic-glue/amrita-paraphrase-exact/cached*

In [None]:
!cp ../indic-glue/amrita-paraphrase-exact/ma/ma-indictrans-test.csv ../indic-glue/amrita-paraphrase-exact/ma/ma-test.csv 


In [None]:
## Check if you have the right file
# ma-test.csv now holds the IndicTrans translations (copied in the previous cell),
# so the frame is named accordingly rather than reusing the mBART name.
indictrans_test = pd.read_csv('../indic-glue/amrita-paraphrase-exact/ma/ma-test.csv', header=None)
indictrans_test.shape

In [None]:
# module_name = 'text_classification'

text_classification_trainer = get_modules('text_classification')
text_classification_trainer

In [None]:
# Build the module returned by get_modules('text_classification') from the run configuration.
tc_trainer_instance = text_classification_trainer(hparams)
# Collect the checkpoints saved in this run's output directory.
# NOTE(review): names are sorted lexicographically, so "epoch=10" would sort
# before "epoch=2"; confirm the ordering if num_train_epochs is raised.
checkpoint_pattern = os.path.join(hparams['output_dir'], 'checkpointepoch=*.ckpt')
checkpoints = sorted(glob.glob(checkpoint_pattern, recursive=True))
if not checkpoints:
    # Fail with an actionable message instead of a bare IndexError on checkpoints[-1].
    raise FileNotFoundError(
        'No checkpoint found matching {!r}; train the model before scoring.'.format(checkpoint_pattern)
    )
# Restore the weights from the last checkpoint in sorted order.
trained_model = tc_trainer_instance.load_from_checkpoint(checkpoints[-1])

In [None]:
trainer = base.create_trainer(trained_model, hparams) #BaseModule(hparams)

In [None]:
trainer.test(trained_model)

## Load the scores for IndicTrans translations.

In [None]:
hparams['results_file'] = '{}{}/train-{}/model-ai4bharat-indic-bert/test_results_{}.csv'.format(hparams['out_dir'],  hparams['dataset'] , hparams['lang'] , hparams['lang']) 
hparams['results_file']

In [None]:
indictrans = pd.read_csv(hparams['results_file'], header=None)

In [None]:
indictrans.shape

In [None]:
indictrans.head(5)

## Combine logits, original sentence, and translations

In [None]:
indic_orig = pd.read_csv('../indic-glue/amrita-paraphrase-exact/ma/ma-indictrans-test.csv', header=None)

In [None]:
indic_orig.head()

In [None]:
bart_orig = pd.read_csv('../indic-glue/amrita-paraphrase-exact/ma/ma-bart-test.csv', header=None)

In [None]:
bart_orig.head()

In [None]:
# Start the combined table from the first two columns of the mBART file.
# Take an explicit copy: later cells rename and add columns on this frame, and
# doing that on a bare column subset raises SettingWithCopyWarning.
combined = bart_orig[[0, 1]].copy()

In [None]:
combined.columns =['Original_Sentence', 'mBART_Translated', ]

In [None]:
# Column 1 of the IndicTrans file goes in next to the mBART translation.
combined['IndicTrans_Translated'] = indic_orig[1]

In [None]:
combined.head()

In [None]:
# combined = pd.concat([bart_orig['sentence_1'], b_trans['sentence_2']], axis=1, keys=['sentence_1', 'sentence_2'])

In [None]:
# Raw scores for the mBART translations: columns 0 and 1 of the results file.
# Despite the "_prob" suffix these are the pre-softmax values.
combined['mBART_NP_prob'] = mbart[0]
combined['mBART_P_prob'] = mbart[1]
# Column 2 is the predicted label: 0 maps to 'NP', anything else to 'P'.
combined['model_pred_mbart'] = np.where(mbart[2] == 0, 'NP', 'P')
# The selector is inverted: 0 returns the softmax weight of column 1 (P),
# 1 returns the softmax weight of column 0 (NP).
combined['mbart_P'] = apply_softmax(mbart, 0)
combined['mbart_NP'] = apply_softmax(mbart, 1)

In [None]:
combined.head()

In [None]:
# Raw scores for the IndicTrans translations: columns 0 and 1 of the results file.
# Despite the "_prob" suffix these are the pre-softmax values.
combined['IndicT_NP_prob'] = indictrans[0]
combined['IndicT_P_prob'] = indictrans[1]
# Column 2 is the predicted label: 0 maps to 'NP', anything else to 'P'.
combined['model_pred_IndicT'] = np.where(indictrans[2] == 0, 'NP', 'P')
# The selector is inverted: 0 returns the softmax weight of column 1 (P),
# 1 returns the softmax weight of column 0 (NP).
combined['indicT_P'] = apply_softmax(indictrans, 0)
combined['indicT_NP'] = apply_softmax(indictrans, 1)

In [None]:
combined.head()

In [None]:
def select_trasnlation(row):
    """Return the translation whose ``*_P`` score is higher for this row.

    Compares ``row['indicT_P']`` with ``row['mbart_P']``. The IndicTrans
    translation is returned when its score is greater than or equal to the
    mBART score (so ties go to IndicTrans); otherwise the mBART translation
    is returned.
    """
    prefer_indictrans = row['indicT_P'] >= row['mbart_P']
    chosen_column = 'IndicTrans_Translated' if prefer_indictrans else 'mBART_Translated'
    return row[chosen_column]

# Choose, row by row, between the IndicTrans and mBART translations.
combined['selected_translation'] = combined.apply(select_trasnlation, axis=1)

In [None]:
# Row-wise mean of the two raw mBART score columns. They are selected by name
# so the result does not depend on the current column order of `combined`.
combined['mBART_avg'] = combined[['mBART_NP_prob', 'mBART_P_prob']].mean(axis=1)

In [None]:
combined.head()

In [None]:
# Row-wise mean of the two raw IndicTrans score columns. Select them by name
# rather than by position: positions 5:7 of `combined` are 'model_pred_mbart'
# (strings) and 'mbart_P', not the IndicTrans scores.
combined['IT_avg'] = combined[['IndicT_NP_prob', 'IndicT_P_prob']].mean(axis=1)

In [None]:
combined.head()

In [None]:
combined.to_csv('../combined-ma.csv', sep='\t', encoding='utf-8')

In [None]:
# Write each sentence column to its own text file, one sentence per line.
# The encoding is set explicitly to UTF-8, matching the combined CSV export, so
# the non-ASCII text is written the same way regardless of the system locale.
np.savetxt('../original_sentence-ma.txt', combined['Original_Sentence'], fmt='%s', encoding='utf-8')
np.savetxt('../mBART_translated-ma.txt', combined['mBART_Translated'], fmt='%s', encoding='utf-8')
np.savetxt('../indictrans_translated-ma.txt', combined['IndicTrans_Translated'], fmt='%s', encoding='utf-8')
np.savetxt('../selected_translation-ma.txt', combined['selected_translation'], fmt='%s', encoding='utf-8')

In [None]:
!cp ../original_sentence-ma.txt ../../bleu_test/testing/indicTrans/
!cp ../mBART_translated-ma.txt  ../../bleu_test/testing/indicTrans/
!cp ../indictrans_translated-ma.txt  ../../bleu_test/testing/indicTrans/
!cp ../selected_translation-ma.txt  ../../bleu_test/testing/indicTrans/

In [None]:
!ls -lt ../../bleu_test/testing/indicTrans/*.txt 

# Measure the semantic similarity

In [None]:
# # Install the necessary libraries
# !pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library
# # Install fairseq from source
# !git clone https://github.com/pytorch/fairseq.git
# %cd fairseq
# # !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d
# !pip install --editable ./
# %cd ..

In [None]:
%cd ../../bleu_test/testing/indicTrans/

In [None]:
!sacrebleu original_sentence-ma.txt -i mBART_translated-ma.txt | jq -r .score

In [None]:
!sacrebleu original_sentence-ma.txt -i indictrans_translated-ma.txt | jq -r .score

In [None]:
!sacrebleu original_sentence-ma.txt -i selected_translation-ma.txt   | jq -r .score