In [None]:
!rm -rf sample_data/
!pip install OpenNMT-tf
!pip install gdown
!pip install sacremoses
!pip install transformers
!pip install sentencepiece
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import opennmt
import os
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import numpy as np
import sacrebleu
import pyonmttok
from sacremoses import MosesDetokenizer
from opennmt.utils import checkpoint as checkpoint_util
from pyonmttok import SentencePieceTokenizer
from google.colab import drive
# Mount Google Drive under /content/gdrive. force_remount=True is passed so the
# call asks for a fresh mount (presumably even when the drive is already
# mounted from an earlier run of this cell — confirm against google.colab docs).
drive.mount('/content/gdrive',force_remount=True)

In [None]:
def split_data(data, train_size, val_size, test_size):
  """Split a sequence into contiguous train, validation and test parts.

  The split is positional: the first ``train_size`` fraction of ``data`` goes
  to the training part, the next ``val_size`` fraction to the validation part,
  and every remaining item to the test part. Nothing is shuffled.

  Args:
    data: Any object supporting ``len()`` and slicing (e.g. a list of lines).
    train_size: Fraction of ``data`` used for training.
    val_size: Fraction of ``data`` used for validation.
    test_size: Fraction of ``data`` used for testing. It is only used to
      validate the sum; the test part always receives the remainder.

  Returns:
    A tuple ``(train_data, val_data, test_data)`` of slices of ``data``.

  Raises:
    ValueError: If the three fractions do not add up to 1 (within floating
      point tolerance).
  """
  import math  # local import so the helper does not depend on the import cell

  # Compare with a tolerance: 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999,
  # which an exact ``!= 1.0`` check would wrongly reject.
  total = train_size + val_size + test_size
  if not math.isclose(total, 1.0, rel_tol=1e-9):
    raise ValueError("Train, validation, and test sizes must add up to 1.")

  # Boundaries are truncated towards zero, so rounding leftovers end up in the
  # test part.
  train_mark = int(len(data) * train_size)
  val_mark = train_mark + int(len(data) * val_size)

  train_data = data[0:train_mark]
  val_data = data[train_mark:val_mark]
  test_data = data[val_mark:]

  return train_data, val_data, test_data

def save_data(data, data_folder_name, filename):
  """Write the non-blank entries of ``data`` to ``data_folder_name/filename``.

  Entries are written verbatim (no newline is appended), entries that are
  empty or whitespace-only are dropped, and the target file is opened in
  write mode, so existing content is replaced.

  Args:
    data: Iterable of strings, typically lines that already end in a newline.
    data_folder_name: Directory that receives the file.
    filename: Name of the file inside ``data_folder_name``.
  """
  target_path = os.path.join(data_folder_name, filename)
  with open(target_path, mode="w") as out_file:
    out_file.writelines(entry for entry in data if entry.strip())

def count_weights(model):
  """Print the total, trainable and non-trainable parameter counts of a model.

  Args:
    model: Object exposing ``trainable_weights`` and ``non_trainable_weights``
      collections whose items are accepted by ``K.count_params``.
  """
  def _param_total(weight_list):
    # Sum of the per-variable parameter counts reported by the Keras backend.
    return np.sum([K.count_params(weight) for weight in weight_list])

  n_trainable = _param_total(model.trainable_weights)
  n_frozen = _param_total(model.non_trainable_weights)

  print('Total params: {:,}'.format(n_trainable + n_frozen))
  print('Trainable params: {:,}'.format(n_trainable))
  print('Non-trainable params: {:,}'.format(n_frozen))

def display_weights(model):
  """Print the first two weight arrays of every layer in ``model.encoder``.

  For each layer a header with the layer name is printed, followed by the
  first array returned by ``layer.get_weights()`` labelled "weights" and the
  second labelled "biases". Layers without any weights print an empty list.
  Layers that hold a single array (for example a layer without a bias term)
  print that array and an empty "biases" entry instead of raising
  ``IndexError``.

  NOTE(review): the "weights"/"biases" labels assume the usual kernel-then-bias
  ordering; layers with a different layout will be labelled accordingly
  loosely.

  Args:
    model: Object with an ``encoder`` attribute exposing ``layers``; each layer
      must provide ``name`` and ``get_weights()``.
  """
  for layer in model.encoder.layers:
    print(f"===== LAYER: {layer.name} =====")
    # Fetch once: get_weights() was previously called up to three times per
    # layer.
    params = layer.get_weights()
    if not params:
      print("weights: ", [])
      continue

    print("weights:")
    print(params[0])
    if len(params) > 1:
      print("biases:")
      print(params[1])
    else:
      # Single-array layer: there is no second entry to show.
      print("biases: ", [])

def compute_scores(runner, features_filename, labels_filename, pred_filename, include_ppl=False, include_ter=False):
  """Run inference, detokenize predictions and labels, and score them.

  Steps: ``runner.infer`` is called with ``features_filename`` and
  ``pred_filename``; both the prediction file and the label file are then
  detokenized with ``detokenize_data`` (which reads ``<base>.tok`` and writes
  ``<base>.txt``); finally corpus-level BLEU (and optionally TER) is computed
  with sacrebleu on the detokenized text.

  The label file's base name is also passed to ``detokenize_data`` as the
  SentencePiece model name, so ``labels_filename`` is expected to be named
  after that model.

  Args:
    runner: Object providing ``infer(features, predictions)`` and
      ``evaluate(features_file=..., labels_file=...)`` (presumably an
      OpenNMT-tf runner — confirm against the caller).
    features_filename: Path of the tokenized source file.
    labels_filename: Path of the tokenized reference file.
    pred_filename: Path handed to ``runner.infer`` for the predictions.
    include_ppl: When true, start from the result of ``runner.evaluate``
      (which must support ``update``) instead of an empty dict.
    include_ter: When true, also add a ``'ter'`` entry.

  Returns:
    A dict containing ``'bleu'``, optionally ``'ter'``, plus whatever
    ``runner.evaluate`` returned when ``include_ppl`` is set.
  """
  runner.infer(features_filename, pred_filename)

  # Strip only the final extension. Cutting at the first '.' breaks paths such
  # as "./pred.tok" or "runs/v1.2/pred.tok", whose first dot is not the
  # extension separator.
  base_pred_name, _ = os.path.splitext(pred_filename)
  base_model_name, _ = os.path.splitext(labels_filename)

  detokenized_pred_filename = detokenize_data(base_pred_name, base_model_name)
  detokenized_labels_filename = detokenize_data(base_model_name, base_model_name)

  with open(detokenized_pred_filename) as f:
    preds = f.readlines()

  with open(detokenized_labels_filename) as f:
    truth = f.readlines()

  scores = dict()
  if include_ppl:
    scores = runner.evaluate(
        features_file=features_filename,
        labels_file=labels_filename)

  # sacrebleu takes the hypotheses plus a list of reference streams; there is
  # a single reference stream here.
  bleu = sacrebleu.corpus_bleu(preds, [truth])
  scores.update({'bleu': bleu.score})
  if include_ter:
    ter = sacrebleu.corpus_ter(preds, [truth])
    scores.update({'ter': ter.score})

  return scores

def tokenize_data(save_folder_name, basename):
  """Tokenize the train, test and validation files of ``basename``.

  Delegates to ``tokenize_sub_data`` once per split, in the order
  train, test, val.

  Args:
    save_folder_name: Folder that receives the tokenized files.
    basename: Shared base name of the data files and SentencePiece model.
  """
  for split_name in ("train", "test", "val"):
    tokenize_sub_data(save_folder_name, basename, split_name)

def tokenize_sub_data(save_folder_name, basename, set_type):
  """Tokenize one raw split with the SentencePiece model named ``basename``.

  Reads ``<save_folder_name>_raw/<basename>_<set_type>.raw`` and writes
  ``<save_folder_name>/<basename>_<set_type>.tok``, one space-joined token
  sequence per line. Blank input lines are skipped.

  Args:
    save_folder_name: Output folder; the raw input folder is this name with a
      ``_raw`` suffix.
    basename: Base name of the data files and of the ``.model`` / ``.vocab``
      files inside ``sentencepiece_models``.
    set_type: Split label used in the file names (e.g. ``"train"``).
  """
  sp_dir = "sentencepiece_models"
  tokenizer = SentencePieceTokenizer(
      model_path=os.path.join(sp_dir, f"{basename}.model"),
      vocabulary_path=os.path.join(sp_dir, f"{basename}.vocab"),
  )

  raw_path = os.path.join(f"{save_folder_name}_raw", f"{basename}_{set_type}.raw")
  tok_path = os.path.join(save_folder_name, f"{basename}_{set_type}.tok")

  with open(raw_path) as raw_file, open(tok_path, mode="w") as tok_file:
    for raw_line in raw_file.readlines():
      if not raw_line.strip():
        continue
      # tokenize() returns a tuple; its first item is the token list.
      tokens = tokenizer.tokenize(raw_line)[0]
      tok_file.write(" ".join(tokens) + "\n")


def detokenize_data(tokenized_basename, model_basename):
  """Detokenize ``<tokenized_basename>.tok`` into ``<tokenized_basename>.txt``.

  Each input line is stripped, split on single spaces and passed to the
  SentencePiece tokenizer's ``detokenize``; the result is written followed by
  a newline.

  Args:
    tokenized_basename: Path of the tokenized file without its ``.tok``
      extension.
    model_basename: Base name of the ``.model`` / ``.vocab`` files inside
      ``sentencepiece_models``.

  Returns:
    The path of the detokenized file, ``<tokenized_basename>.txt``.
  """
  sp_dir = "sentencepiece_models"
  tokenizer = SentencePieceTokenizer(
      model_path=os.path.join(sp_dir, model_basename + ".model"),
      vocabulary_path=os.path.join(sp_dir, f"{model_basename}.vocab"),
  )

  detok_path = f"{tokenized_basename}.txt"
  with open(f"{tokenized_basename}.tok") as tok_file, open(detok_path, mode="w") as detok_file:
    for tok_line in tok_file.readlines():
      pieces = tok_line.strip().split(" ")
      detok_file.write(tokenizer.detokenize(pieces) + "\n")

  return detok_path

In [None]:
# Load the pretrained mBART-50 many-to-many translation checkpoint by its hub
# name (presumably downloaded on first use and cached afterwards).
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [None]:
# Load the tokenizer for the same checkpoint name as the model.
# NOTE(review): `use_fast` is normally an `AutoTokenizer` option; passed to the
# concrete `MBart50TokenizerFast` class it presumably has no effect — confirm,
# and drop it or switch classes if the slow tokenizer was intended.
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", use_fast=False)

In [None]:
# Translate the first 1000 lines of pred.txt from the "es_XX" source language
# to the "it_IT" target language, one line per generate() call.
with open("pred.txt") as pred_file:
  src_d = pred_file.readlines()[:1000]

tokenizer.src_lang = "es_XX"

out = []
for sentence in src_d:
  encoded_es = tokenizer(
    sentence,
    return_tensors="pt",
    truncation=True,
    padding=True,
  )
  generated_tokens = model.generate(
    **encoded_es,
    forced_bos_token_id=tokenizer.lang_code_to_id["it_IT"]
  )
  # batch_decode returns a list, so every element of `out` is itself a list
  # holding the decoded text for this line.
  decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
  out.append(decoded)

In [None]:
# Write one translation per line. Each entry of `out` is a list whose first
# element is the decoded sentence.
with open("final.out", 'w') as result_file:
    result_file.writelines(decoded[0] + '\n' for decoded in out)