# Translator Project: preprocess data

This notebook prepares the data for loading and testing the model. It is based on code provided by Dekel.

## Load Dependencies

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, load_metric, Dataset, DatasetDict, load_from_disk
import os

# Path of an 'output' folder inside the current working directory (the folder is not created here).
OUTPUT_DIRECTORY = os.path.join(os.getcwd(), 'output')

2023-11-23 18:19:55.340527: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-23 18:20:00.179667: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-23 18:20:06.291653: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/intel/compilers_and_libraries_2018.1.163/linux/tbb/lib/intel64_lin/gcc4.7:/opt/intel/compilers_and_libraries_2018.1.163/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl/

## Organize the data

In [42]:
# Maximum sequence lengths passed as `max_length` when tokenizing source and target texts.
max_input_length = 200
max_target_length = 200
dataset_name = "wiki"

# Root folder holding one sub-folder per dataset. It defaults to the original
# location and can be overridden with the TRANSLATOR_DATA_ROOT environment
# variable, so the notebook also runs on machines with a different layout.
data_root = os.environ.get("TRANSLATOR_DATA_ROOT", "/home/azureuser/translator/tr_data")

# Kept as a plain string; later cells derive the data folder from it via Path(df_path).parent.
df_path = os.path.join(data_root, dataset_name, f"translated_{dataset_name}_final.parquet")
df = pd.read_parquet(df_path)
df

Unnamed: 0,translation
0,{'en': ' Local government leaders and the Unit...
1,{'en': ' The flag was officially adopted on Ap...
2,{'en': ' The flag was raised for the first tim...
3,"{'en': ' The previous year, the flag design wo..."
4,{'en': ' A draft was sent to the Institute of ...
...,...
1463243,"{'en': ' However, this did not last for years,..."
1463245,"{'en': ' Despite quite a few losses, the Scots..."
1463247,{'en': ' After the Second World War Scottish i...
1463249,{'en': ' Scotland is part of the four nations ...


## Split into training and validation sets

In [43]:
# Fraction of rows held out for validation, and the seed that makes the
# shuffled split reproducible.
test_size = 0.25
random_state = 42

train_df, val_df = train_test_split(
    df,
    random_state=random_state,
    test_size=test_size,
)

# To cap the amount of training data, slice train_df here, e.g.:
# if train_size > 0:
#     train_df = train_df.iloc[:train_size]

train_df

Unnamed: 0,translation
265691,"{'en': ' During the Gulf War, US Air Force pla..."
385531,{'en': ' The association observes that a menta...
80583,{'en': ' In 1977 he developed a DNA labeling m...
906164,"{'en': ' What information do they give?', 'he'..."
352722,"{'en': ' ""Allah is our Lord and your Lord, but..."
...,...
123023,{'en': ' 1939 - Billie Holiday records the son...
288712,"{'en': ' B, an opinion is presented that the J..."
147045,"{'en': ' In 2003, the new Rolls-Royce from BMW..."
746666,{'en': ' Given an electric circuit with two vo...


In [44]:
# Folder name of the saved Hugging Face dataset. The suffix is appended only
# when it is missing, so re-running this cell does not keep growing the name
# (e.g. "wiki_he_en_he_en").
if not dataset_name.endswith("_he_en"):
    dataset_name = f"{dataset_name}_he_en"

# preserve_index=False drops the pandas index during conversion, so no
# '__index_level_0__' column is created and nothing has to be removed afterwards.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

split_datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
})

# Write both splits next to the source parquet file: once as plain parquet
# files and once as a DatasetDict folder.
data_folder = Path(df_path).parent
train_df.to_parquet(data_folder / 'train.parquet')
val_df.to_parquet(data_folder / 'validation.parquet')
split_datasets.save_to_disk(data_folder / dataset_name)

Saving the dataset (0/1 shards):   0%|          | 0/984119 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/328040 [00:00<?, ? examples/s]

## Tokenize (prepare input for model)

In [45]:
# Language codes handed to the tokenizer: source "heb_Hebr", target "eng_Latn".
src_lang, tgt_lang = "heb_Hebr", "eng_Latn"
model_checkpoint = "facebook/nllb-200-distilled-1.3B"

tokenizer = NllbTokenizer.from_pretrained(
    model_checkpoint,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
)

In [None]:
def preprocess_function(examples):
    """Tokenize one batch of translation pairs for seq2seq training.

    Args:
        examples: A batch whose "translation" entry is a list of dicts, each
            holding the source text under "he" and the target text under "en".

    Returns:
        The tokenizer output for the source texts, with the token ids of the
        target texts stored under "labels".
    """
    inputs = [ex["he"] for ex in examples["translation"]]
    targets = [ex["en"] for ex in examples["translation"]]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager. The targets get their own call so that max_target_length can
    # differ from max_input_length.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocess_function to every split in batches. The original columns are
# dropped so that only the values returned by preprocess_function remain.
#
# If a tokenized copy was already saved by the cell below, it can be reloaded
# instead of re-running the map:
# tokenized_datasets = load_from_disk(f"{data_folder}/prepared_dataset")
source_columns = split_datasets["train"].column_names
tokenized_datasets = split_datasets.map(
    preprocess_function,
    remove_columns=source_columns,
    batched=True,
)

tokenized_datasets

Map:   0%|          | 0/984119 [00:00<?, ? examples/s]



In [41]:
# Store the tokenized splits in a 'prepared_dataset' folder next to the source parquet file.
data_folder = Path(df_path).parent
prepared_dataset_path = f"{data_folder}/prepared_dataset"
tokenized_datasets.save_to_disk(prepared_dataset_path)

Saving the dataset (0/1 shards):   0%|          | 0/8231 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2744 [00:00<?, ? examples/s]