In [1]:
# Install the Hugging Face transformers library and sentencepiece, quietly (-q)
# NOTE(review): sentencepiece is presumably required by the mBART-50 tokenizer — confirm
!pip install transformers -q
!pip install sentencepiece -q

[K     |████████████████████████████████| 3.1 MB 6.4 MB/s 
[K     |████████████████████████████████| 596 kB 20.3 MB/s 
[K     |████████████████████████████████| 895 kB 12.4 MB/s 
[K     |████████████████████████████████| 59 kB 4.1 MB/s 
[K     |████████████████████████████████| 3.3 MB 36.4 MB/s 
[K     |████████████████████████████████| 1.2 MB 10.0 MB/s 
[?25h

In [2]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import pandas as pd
import csv
import torch

In [3]:
# Prefer the CUDA device when PyTorch can see one; otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Load the pretrained mBART-50 many-to-many checkpoint: a tokenizer configured
# for English source text, and the model weights moved onto the selected device
MBART_CHECKPOINT = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(MBART_CHECKPOINT, src_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(MBART_CHECKPOINT)
model = model.to(device)

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/649 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/529 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

In [5]:
# Mount Google Drive at /content/drive; the dataset paths used below live
# under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Root folder of the project data on the mounted Drive; the cells below build
# their input (dev/) and output (predicted_text/MBart/) paths by appending to it
D = '/content/drive/My Drive/W266_Project_Data/pmi_data'

In [7]:
# Copy the English dev set to a .txt file, making sure it ends with a newline
src_path = D + "/dev/dev.en"
dst_path = D + "/predicted_text/MBart/en_dev.txt"

# Both files are managed by one `with`, so the output handle is flushed and
# closed before later cells read en_dev.txt back
with open(src_path, "r", encoding="utf-8") as f, \
     open(dst_path, "w", encoding="utf-8") as en_dev:
  text = f.read()
  en_dev.write(text)
  # Append a newline only when it is missing: an unconditional "\n" would add
  # a blank extra line, which the translation loops would treat as a sentence
  if not text.endswith("\n"):
    en_dev.write("\n")

In [8]:
# Load the txt file into pandas.
# Quoting is disabled so that a stray double quote inside a sentence is kept as
# ordinary text and cannot merge several lines into a single field.
english_df = pd.read_csv(
    D+"/predicted_text/MBart/en_dev.txt",
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)

In [9]:
# Preview the first five rows of the loaded sentences
english_df.head(n=5)

Unnamed: 0,0
0,The Prime Minister said Babasaheb Ambedkar had...
1,Explaining the significance of holding this ev...
2,He said the aim is to complete this task by 2022.
3,The Prime Minister said that the Government is...
4,"In this context, he mentioned the progress mad..."


In [10]:
# Translate each English dev sentence into Tamil, writing one translation per line
# The forced BOS token id is looked up once; it is the same for every sentence
ta_lang_id = tokenizer.lang_code_to_id["ta_IN"]

# Both files are managed by the `with`, so the output is flushed and closed even
# if generation fails part-way. UTF-8 is explicit because the output is Tamil
# text and must not depend on the platform's default encoding.
with open(D+"/predicted_text/MBart/en_dev.txt", "r", encoding="utf-8") as f, \
     open(D+"/predicted_text/MBart/dev_bart_ta.txt", "w", encoding="utf-8") as dev_bart_ta:
  for line in f:
    model_inputs = tokenizer(line, return_tensors="pt")
    generated_tokens = model.generate(**model_inputs.to(device), forced_bos_token_id=ta_lang_id)
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    # One input line is decoded per call, so only the first entry is written
    dev_bart_ta.write(translation[0])
    dev_bart_ta.write("\n")

In [11]:
# Count the lines in the Tamil output file; 1k lines are expected
with open(D+"/predicted_text/MBart/dev_bart_ta.txt", "r") as f:
  x = sum(1 for _ in f)
print('Total lines:', x)

Total lines: 1000


In [12]:
# Translate each English dev sentence into Malayalam, writing one translation per line
# The forced BOS token id is looked up once; it is the same for every sentence
ml_lang_id = tokenizer.lang_code_to_id["ml_IN"]

# Both files are managed by the `with`, so the output is flushed and closed even
# if generation fails part-way. UTF-8 is explicit because the output is
# Malayalam text and must not depend on the platform's default encoding.
with open(D+"/predicted_text/MBart/en_dev.txt", "r", encoding="utf-8") as f, \
     open(D+"/predicted_text/MBart/dev_bart_ml.txt", "w", encoding="utf-8") as dev_bart_ml:
  for line in f:
    model_inputs = tokenizer(line, return_tensors="pt")
    generated_tokens = model.generate(**model_inputs.to(device), forced_bos_token_id=ml_lang_id)
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    # One input line is decoded per call, so only the first entry is written
    dev_bart_ml.write(translation[0])
    dev_bart_ml.write("\n")

In [13]:
# Count the lines in the Malayalam output file; 1k lines are expected
with open(D+"/predicted_text/MBart/dev_bart_ml.txt", "r") as f:
  x = sum(1 for _ in f)
print('Total lines:', x)

Total lines: 1000


In [14]:
# Translate each English dev sentence into Hindi, writing one translation per line
# The forced BOS token id is looked up once; it is the same for every sentence
hi_lang_id = tokenizer.lang_code_to_id["hi_IN"]

# Both files are managed by the `with`, so the output is flushed and closed even
# if generation fails part-way. UTF-8 is explicit because the output is Hindi
# text and must not depend on the platform's default encoding.
with open(D+"/predicted_text/MBart/en_dev.txt", "r", encoding="utf-8") as f, \
     open(D+"/predicted_text/MBart/dev_bart_hi.txt", "w", encoding="utf-8") as dev_bart_hi:
  for line in f:
    model_inputs = tokenizer(line, return_tensors="pt")
    generated_tokens = model.generate(**model_inputs.to(device), forced_bos_token_id=hi_lang_id)
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    # One input line is decoded per call, so only the first entry is written
    dev_bart_hi.write(translation[0])
    dev_bart_hi.write("\n")

In [15]:
# Count the lines in the Hindi output file; 1k lines are expected
with open(D+"/predicted_text/MBart/dev_bart_hi.txt", "r") as f:
  x = sum(1 for _ in f)
print('Total lines:', x)

Total lines: 1000


In [16]:
!pip install sacrebleu -q

[?25l[K     |███▋                            | 10 kB 25.3 MB/s eta 0:00:01[K     |███████▏                        | 20 kB 28.2 MB/s eta 0:00:01[K     |██████████▉                     | 30 kB 22.6 MB/s eta 0:00:01[K     |██████████████▍                 | 40 kB 17.1 MB/s eta 0:00:01[K     |██████████████████              | 51 kB 10.6 MB/s eta 0:00:01[K     |█████████████████████▋          | 61 kB 10.4 MB/s eta 0:00:01[K     |█████████████████████████▎      | 71 kB 10.5 MB/s eta 0:00:01[K     |████████████████████████████▉   | 81 kB 11.5 MB/s eta 0:00:01[K     |████████████████████████████████| 90 kB 4.7 MB/s 
[?25h

In [17]:
# Switch the working directory to the MBart output folder so the commands
# below can refer to the files by bare name
%cd /content/drive/My Drive/W266_Project_Data/pmi_data/predicted_text/MBart

/content/drive/My Drive/W266_Project_Data/pmi_data/predicted_text/MBart


In [18]:
# List the files in the current working directory
# (a bare `ls` with no `%` or `!` prefix relies on IPython resolving it as a magic)
ls

 dev_bart_hi.txt               dev.ml
 dev_bart_ml.txt               dev_ml_sacrebleu.txt
 dev_bart_myhitrans.gdoc       dev_ml.txt
 dev_bart_myhitrans.txt        dev_pa.txt
 dev_bart_ta.txt               dev.ta
 dev_bart_translated_hi        dev_ta_sacrebleu.txt
'dev_bart_translated_hi (1)'   dev_ta.txt
 dev_bart_translated_hi.zip    en_dev.gdoc
 dev_bart_translated_ml        en_dev.txt
 dev_bart_translated_ta        new_dev_bart_translated_hi
 dev.hi                        new_dev_bart_translated_ml
 dev_hi_sacrebleu.txt          new_dev_bart_translated_ta
 dev_hi.txt


In [19]:
# Get the Hindi translation Sacre Bleu scores
!sacrebleu dev_bart_hi.txt -i dev_hi.txt -l en-hi --tokenize intl

{
 "name": "BLEU",
 "score": 28.1,
 "signature": "nrefs:1|case:mixed|eff:no|tok:intl|smooth:exp|version:2.0.0",
 "verbose_score": "59.5/35.0/21.7/13.7 (BP = 1.000 ratio = 1.023 hyp_len = 19557 ref_len = 19117)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "intl",
 "smooth": "exp",
 "version": "2.0.0"
}
[0m

In [20]:
# Get the Tamil translation SacreBleu scores
!sacrebleu dev_bart_ta.txt -i dev_ta.txt -l en-ta --tokenize intl

{
 "name": "BLEU",
 "score": 13.0,
 "signature": "nrefs:1|case:mixed|eff:no|tok:intl|smooth:exp|version:2.0.0",
 "verbose_score": "42.3/17.6/8.6/4.4 (BP = 1.000 ratio = 1.053 hyp_len = 14425 ref_len = 13705)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "intl",
 "smooth": "exp",
 "version": "2.0.0"
}
[0m

In [21]:
# Get the Malayalam translation Sacre Bleu scores
!sacrebleu dev_bart_ml.txt -i dev_ml.txt -l en-ml --tokenize intl

{
 "name": "BLEU",
 "score": 1.3,
 "signature": "nrefs:1|case:mixed|eff:no|tok:intl|smooth:exp|version:2.0.0",
 "verbose_score": "23.4/3.0/0.5/0.2 (BP = 0.811 ratio = 0.827 hyp_len = 11929 ref_len = 14429)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "intl",
 "smooth": "exp",
 "version": "2.0.0"
}
[0m

In [22]:
# Get the Hindi translation Sacre Bleu scores
!sacrebleu new_dev_bart_translated_hi -i dev_hi.txt -l en-hi --tokenize intl

{
 "name": "BLEU",
 "score": 28.1,
 "signature": "nrefs:1|case:mixed|eff:no|tok:intl|smooth:exp|version:2.0.0",
 "verbose_score": "59.5/35.0/21.7/13.7 (BP = 1.000 ratio = 1.023 hyp_len = 19557 ref_len = 19117)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "intl",
 "smooth": "exp",
 "version": "2.0.0"
}
[0m

In [23]:
# Get the Tamil translation SacreBleu scores
!sacrebleu new_dev_bart_translated_ta -i dev_ta.txt -l en-ta --tokenize intl

{
 "name": "BLEU",
 "score": 13.0,
 "signature": "nrefs:1|case:mixed|eff:no|tok:intl|smooth:exp|version:2.0.0",
 "verbose_score": "42.3/17.6/8.6/4.4 (BP = 1.000 ratio = 1.053 hyp_len = 14425 ref_len = 13705)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "intl",
 "smooth": "exp",
 "version": "2.0.0"
}
[0m

In [26]:
# Get the Malayalam translation Sacre Bleu scores
!sacrebleu new_dev_bart_translated_ml -i dev_ml.txt -l en-ml --tokenize intl

{
 "name": "BLEU",
 "score": 1.3,
 "signature": "nrefs:1|case:mixed|eff:no|tok:intl|smooth:exp|version:2.0.0",
 "verbose_score": "23.4/3.0/0.5/0.2 (BP = 0.811 ratio = 0.827 hyp_len = 11929 ref_len = 14429)",
 "nrefs": "1",
 "case": "mixed",
 "eff": "no",
 "tok": "intl",
 "smooth": "exp",
 "version": "2.0.0"
}
[0m