<a href="https://colab.research.google.com/github/aaubs/ds-master/blob/main/notebooks/M3_HyggeBERT_translation_en_da.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installs
!pip install simpletransformers sacremoses datasets -q

In [None]:
# Imports

import pandas as pd
from tqdm import tqdm
import logging

# datasets library
from datasets import load_dataset

# transformer models for translation
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

In [None]:
# Logging setup: quieten the "transformers" logger to warnings and above,
# then configure the root logger at INFO level.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [None]:
# Translation batch size and the output locations on disk
chunksize = 1000                   # number of texts handed to the model per chunk
path_data = 'data.json.gz'         # final DataFrame, including the translation
path_translate = 'translated.txt'  # translated sentences, one per line

In [None]:
# Generation settings handed to the translation model
model_args = Seq2SeqArgs()
model_args.use_multiprocessed_decoding = True
model_args.max_length = 50  # presumably a cap on generated length in tokens; confirm
model_args.num_beams = 10   # presumably the beam-search width; confirm

In [None]:
# Initialize a Seq2SeqModel for English to Danish translation.
# The same model identifier is passed for the encoder-decoder weights and
# for the `tokenizer` keyword, so it is named once here.
opus_mt_en_da = "Helsinki-NLP/opus-mt-en-da"
model = Seq2SeqModel(
    encoder_decoder_type="marian",
    encoder_decoder_name=opus_mt_en_da,
    tokenizer=opus_mt_en_da,
    args=model_args,
)

In [None]:
# Download and prepare the "raw" configuration of the "go_emotions"
# dataset via the datasets library
emotions = load_dataset(path="go_emotions", name="raw")

In [None]:
# Convert the 'train' split into a pandas DataFrame
train_split = emotions['train']
data = train_split.to_pandas()

In [None]:
# Take a 1000-row sample for quick prototyping.
# The fixed seed makes the sample reproducible: re-running the notebook
# (e.g. after a Colab disconnect) selects the same rows again instead of a
# different random subset.
data = data.sample(n=1000, random_state=42)

Because this runs on Colab, I decided to translate in chunks and append the translated lines to a file on disk/Google Drive, so that a disconnect does not force restarting a 4-hour job from scratch.

In [None]:
# handy split function for chunking
def split(list_a, chunk_size):
  """Yield consecutive slices of `list_a`, each at most `chunk_size` long.

  Slices are produced lazily and in order. The final slice is shorter when
  len(list_a) is not a multiple of chunk_size; an empty input yields nothing.
  """
  starts = range(0, len(list_a), chunk_size)
  yield from (list_a[start:start + chunk_size] for start in starts)

In [None]:
# Lazily cut the first DataFrame column into lists of up to `chunksize`
# items (the last list may be shorter)
first_column = data.iloc[:, 0].to_list()
text_gen = split(first_column, chunksize)

In [None]:
# progress bar sized to the number of rows in `data`
pbar = tqdm(total=len(data))

In [None]:
# Translate chunk by chunk and append every finished chunk to the text file,
# so work that is already done survives an interrupted session.
# A plain `for` loop ends when the generator is exhausted; this replaces the
# always-true `while text_gen:` test and keeps a StopIteration raised inside
# the model call from being mistaken for "no more chunks".
for chunk in text_gen:
  translated_chunk = model.predict(chunk)
  # Open the file only once the translation exists; `with` guarantees the
  # handle is closed even if the write fails.
  with open(path_translate, 'a') as f:
    f.write('\n'.join(translated_chunk) + '\n')
  pbar.update(len(chunk))
pbar.close()

In [None]:
# Read the translated lines back from disk; the `with` block closes the
# file handle as soon as the lines are loaded instead of leaking it.
with open(path_translate, 'r') as f:
  translated_f = f.readlines()

In [None]:
# Strip surrounding whitespace (including the trailing newline) from every
# line that was read back, then attach the result as the 'text_dk' column
dk_text = list(map(str.strip, translated_f))
data['text_dk'] = dk_text

In [None]:
# Persist the DataFrame, now including the translation column, as JSON
data.to_json(path_or_buf=path_data)