In [14]:
import ctranslate2
import sentencepiece as spm
import pandas as pd
import time
from functools import lru_cache
import threading
pd.set_option('display.max_colwidth', None)
import os

In [15]:
def time_logger(func):
  """Decorator that prints the current local time before each call to ``func``.

  The timestamp is printed as HH:MM:SS. The wrapped function's return value
  is passed through to the caller, and its name/docstring are preserved.

  Args:
    func: the callable to wrap.

  Returns:
    A wrapper accepting the same positional and keyword arguments as ``func``.
  """
  import functools  # local import: the notebook only imports lru_cache from functools

  @functools.wraps(func)
  def wrapper(*args, **kwargs):
    print(time.strftime("%H:%M:%S",time.localtime()))
    # Return the result so decorated functions keep their return value
    # instead of always yielding None.
    return func(*args, **kwargs)
  return wrapper

In [16]:
def get_progress(filepath):
  """Return the number of lines in the UTF-8 text file at ``filepath``.

  Args:
    filepath: path of the file to count.

  Returns:
    The line count as an int, or 0 when the file cannot be opened or
    decoded (for example, it does not exist yet).
  """
  try:
    with open(filepath,'r', encoding='utf-8') as current_file:
      # Stream the file rather than loading every line into memory.
      return sum(1 for _ in current_file)
  except (OSError, ValueError):
    # OSError: missing/unreadable file. ValueError: undecodable content
    # (UnicodeDecodeError) or an invalid path string. Unlike a bare
    # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
    return 0

In [17]:
# Append the buffered items to the output file, one item per line
@time_logger
def write_list_to_file(list_to_write, out_filepath):
  """Append each item of ``list_to_write`` to ``out_filepath`` as its own line.

  Items are converted with ``str()`` and terminated with a newline. The file
  is opened in append mode with UTF-8 encoding, so existing content is kept.
  """
  with open(out_filepath, mode='a', encoding='utf-8') as out_file:
    rendered_lines = [str(item) + "\n" for item in list_to_write]
    out_file.writelines(rendered_lines)

In [18]:
def encode(encoder, input_text):
  """Encode ``input_text`` by calling ``encoder.encode`` with ``out_type=str``.

  ``encoder`` is presumably a SentencePiece processor (that is what
  ``buffer_translate`` passes in); whatever it returns is handed back as-is.
  """
  encoded = encoder.encode(input_text, out_type=str)
  return encoded

def decode(decoder, output_tokens):
  """Return ``decoder.decode(output_tokens)`` unchanged.

  ``decoder`` is presumably a SentencePiece processor (that is what
  ``buffer_translate`` passes in).
  """
  decoded = decoder.decode(output_tokens)
  return decoded


def translate(model,encoder, decoder, input_text):
  """Translate one piece of text: encode, run the model, decode.

  The text is encoded with ``encoder``, sent to ``model.translate_batch`` as
  a batch of one, and the first hypothesis of the first result is decoded
  with ``decoder``.
  """
  source_tokens = encode(encoder, input_text)
  batch_results = model.translate_batch([source_tokens])
  # Batch of one: take its only result and that result's first hypothesis.
  first_hypothesis = batch_results[0].hypotheses[0]
  return decode(decoder, first_hypothesis)

In [19]:
# All inputs and outputs live under one local working directory. The trailing
# separator is part of the value, so plain concatenation yields the full paths.
_BLEU_WORK_DIR = 'C:\\Users\\bijgu\\Desktop\\Test\\bleu_selftrained\\'

# Model directory handed to ctranslate2.Translator.
OPENNMT_MODEL_PATH = _BLEU_WORK_DIR + 'en_mt_trans_ct2_20k\\'
# SentencePiece models for the source and target sides.
SOURCE_VOCAB_PATH = _BLEU_WORK_DIR + 'source.model'
TARGET_VOCAB_PATH = _BLEU_WORK_DIR + 'target.model'

# English input file, read one sentence per line.
BLEU_ENG_FILEPATH = _BLEU_WORK_DIR + 'common.en'
# Directory that receives the per-batch translation files.
BLEU_TRANS_FILEPATH = _BLEU_WORK_DIR

In [20]:
@time_logger
def buffer_translate(input_list, buffer_size, out_filepath):
  """Translate every sentence of ``input_list`` and append the output to a file.

  Translations are collected in memory and appended to ``out_filepath`` each
  time ``buffer_size`` of them have accumulated; a progress percentage is
  printed after each such flush. Whatever remains is written at the end.

  Args:
    input_list: sentences to translate, in order.
    buffer_size: number of translations to accumulate before writing.
    out_filepath: file the translations are appended to (must be a str,
      since it is concatenated into the progress messages).
  """
  sentence_count = len(input_list)
  # Every call loads its own CPU translator and its own tokenizer pair.
  model = ctranslate2.Translator(OPENNMT_MODEL_PATH, device="cpu")
  source_sp = spm.SentencePieceProcessor(SOURCE_VOCAB_PATH)
  target_sp = spm.SentencePieceProcessor(TARGET_VOCAB_PATH)

  pending = []
  flush_count = 0

  for sentence in input_list:
    pending.append(translate(model, source_sp, target_sp, sentence))
    if len(pending) >= buffer_size:
      write_list_to_file(pending, out_filepath)
      pending.clear()
      flush_count += 1
      percent_done = int((100 * flush_count * buffer_size) / sentence_count)
      print(out_filepath + ": " + str(percent_done) + "% complete")

  # Final flush of the leftover translations. It runs unconditionally, even
  # when nothing is pending, because the loop above never breaks out early.
  write_list_to_file(pending, out_filepath)
  print(out_filepath + " DONE")

In [21]:
# Read the English input, one sentence per line, dropping only the trailing
# newline of each line.
eng_text = []
with open(BLEU_ENG_FILEPATH, mode='r', encoding='utf-8') as eng_file:
  eng_text = [line.rstrip("\n") for line in eng_file]

In [22]:
# Split the input into six consecutive slices: five of 2000 sentences each,
# then everything from index 10000 onward (None means "to the end").
_batch_bounds = [0, 2000, 4000, 6000, 8000, 10000, None]
batch_1, batch_2, batch_3, batch_4, batch_5, batch_6 = (
  eng_text[start:stop]
  for start, stop in zip(_batch_bounds, _batch_bounds[1:])
)


In [23]:
def _make_worker(batch, out_name):
  """Build (without starting) a thread that translates ``batch`` into ``out_name``.

  The output file is placed under BLEU_TRANS_FILEPATH and translations are
  flushed to it every 200 sentences.
  """
  destination = os.path.join(BLEU_TRANS_FILEPATH, out_name)
  return threading.Thread(target=buffer_translate, args=(batch, 200, destination))

# One worker thread per batch, each writing to its own output file.
t1 = _make_worker(batch_1, 'batch_1.txt')
t2 = _make_worker(batch_2, 'batch_2.txt')
t3 = _make_worker(batch_3, 'batch_3.txt')
t4 = _make_worker(batch_4, 'batch_4.txt')
t5 = _make_worker(batch_5, 'batch_5.txt')
t6 = _make_worker(batch_6, 'batch_6.txt')

In [None]:
# Launch every worker first, then wait for all of them, in the same order.
_workers = (t1, t2, t3, t4, t5, t6)

for _worker in _workers:
  _worker.start()

for _worker in _workers:
  _worker.join()

In [29]:
def merge_file_content(current_list, file):
  """Append the lines of the UTF-8 text file ``file`` to ``current_list``.

  Newline characters are stripped from both ends of each line. The list is
  extended in place and also returned. The file is read completely before
  the list is touched.
  """
  with open(file, encoding='utf-8') as handle:
    stripped_lines = [line.strip("\n") for line in handle]
  current_list.extend(stripped_lines)
  return current_list

# Concatenate every per-batch .txt file into a single translation file.
list_orig = []

# Plain lexicographic sort: batch_1..batch_6 come out in the right order, but
# a batch_10.txt would sort before batch_2.txt.
file_list = sorted(x for x in os.listdir(BLEU_TRANS_FILEPATH) if x.endswith('.txt'))

for txt_file in file_list:
  # os.listdir returns bare file names, so the directory must be joined back
  # on; otherwise open() resolves them against the current working directory.
  merge_file_content(list_orig, os.path.join(BLEU_TRANS_FILEPATH, txt_file))

# The merged output is written relative to the current working directory.
with open('translated_opennmt.mt','w',encoding='utf-8') as o_file:
  o_file.writelines([x+"\n" for x in list_orig])

# Displayed as the cell output: total number of merged lines.
len(list_orig)

12322