In [11]:
import pandas as pd
from Models.AutoModel import get_model
from Training.Trainer import Trainer
from Training.TrainingArguments import TrainingArguments
from Tokenizers.Tokenizers import Callable_tokenizer
from Models.ModelArgs import ModelArgs

from utils import MT_Dataset, MYCollate, compute_bleu, get_parameters_info

In [2]:
# Parallel-corpus CSVs; both are read with `pd.read_csv` in the data-loading cell.
train_csv_path, valid_csv_path = "out/data/df_train.csv", "out/data/df_valid.csv"

# Tokenizer model files, one per language side; each is passed to `Callable_tokenizer`.
src_tokenizer_path, trg_tokenizer_path = (
    "out/tokenizers/src_tokenizer.model",
    "out/tokenizers/trg_tokenizer.model",
)

# JSON configuration files handed to `ModelArgs` and `TrainingArguments` respectively.
model_config_path, training_config_path = (
    "Configurations/model_config.json",
    "Configurations/training_config.json",
)

In [3]:
# Load one tokenizer per language side (source first, then target) from the
# model files configured above.
src_tokenizer, trg_tokenizer = (
    Callable_tokenizer(src_tokenizer_path),
    Callable_tokenizer(trg_tokenizer_path),
)

# `len(tokenizer)` is used as the vocabulary size; both sizes are later passed
# to `get_model` when the network is built.
src_vocab_size, trg_vocab_size = len(src_tokenizer), len(trg_tokenizer)
print(f"Source tokenizer length {src_vocab_size}, Target tokenizer length {trg_vocab_size}")

Source tokenizer length 4096, Target tokenizer length 6144


In [4]:
# Number of leading rows taken from each CSV for this run; set either value to
# None to use every row of that split.
TRAIN_SUBSET_SIZE = 100
VALID_SUBSET_SIZE = 50
# Index of the training example whose tensor shapes are printed as a sanity check.
PREVIEW_INDEX = 3


def _build_mt_dataset(pairs_df, n_rows=None):
    """Wrap the first ``n_rows`` sentence pairs of ``pairs_df`` in an ``MT_Dataset``.

    Args:
        pairs_df: DataFrame whose first column is used as the source sentences
            and whose second column is used as the target sentences. Columns
            are selected by position, not by name.
        n_rows: Number of leading rows to keep; ``None`` keeps all rows.

    Returns:
        An ``MT_Dataset`` built with the notebook-level ``src_tokenizer`` and
        ``trg_tokenizer``.
    """
    # Positional slicing, so the result does not depend on the frame's index labels.
    subset = pairs_df.iloc[:n_rows]
    return MT_Dataset(src_sentences_list=subset.iloc[:, 0].to_list(),
                      trg_sentences_list=subset.iloc[:, 1].to_list(),
                      src_tokenizer=src_tokenizer, trg_tokenizer=trg_tokenizer)


train_df = pd.read_csv(train_csv_path)
valid_df = pd.read_csv(valid_csv_path)

train_ds = _build_mt_dataset(train_df, TRAIN_SUBSET_SIZE)
valid_ds = _build_mt_dataset(valid_df, VALID_SUBSET_SIZE)

# NOTE(review): the collate function receives only the *target* tokenizer's
# '<pad>' id. If it also pads source batches, this presumably relies on both
# tokenizers sharing the same pad id — confirm against MYCollate.
mycollate = MYCollate(batch_first=True, pad_value=trg_tokenizer.get_tokenId('<pad>'))
print(f"Training data length {len(train_ds)}, Validation data length {len(valid_ds)}")

# Index the dataset once and reuse the item instead of calling `train_ds[...]`
# three times for a single print. Items 0/1/2 are reported as source tokens,
# target-forward and target-loss tensors.
preview = train_ds[PREVIEW_INDEX]
print(f"Source tokens shape: {preview[0].shape}, Target_forward shape {preview[1].shape}, Target_loss shape {preview[2].shape}")
print("Data Loading Done.")

Training data length 100, Validation data length 50
Source tokens shape: torch.Size([281]), Target_forward shape torch.Size([183]), Target_loss shape torch.Size([183])
Data Loading Done.


In [5]:
# Build the model hyper-parameter object from the JSON file at `model_config_path`
# and print it so the settings used for this run are recorded in the cell output.
model_args = ModelArgs(config_path=model_config_path)
print(model_args)

ModelArgs(
model_type=s2s,
dim_embed=16,
dim_model=16,
dim_feedforward=64,
num_layers=2,
dropout=0.3,
maxlen=512,
flash_attention=False
)


In [6]:
# Instantiate the network described by `model_args`, sized to both vocabularies.
model = get_model(model_args, src_vocab_size, trg_vocab_size)

# `get_parameters_info` yields three parallel sequences: a name per row, and the
# trainable / non-trainable parameter counts that belong to it.
names, tr, nontr = get_parameters_info(model=model)

# Fixed-width table: left-aligned name column, two right-aligned count columns
# (counts are rendered with thousands separators).
header_template = "{:<15}{:>15}{:>15}"
row_template = "{:<15}{:>15,}{:>15,}"

print(header_template.format("Module", "Trainable", "Non-Trainable"))
for module_name, trainable_count, frozen_count in zip(names, tr, nontr):
    print(row_template.format(module_name, trainable_count, frozen_count))

Module               Trainable  Non-Trainable
encoder                 76,752              0
decoder                101,840              0
classifier             104,448              0
TotalParams            283,040              0


In [7]:
# Build the training settings object from the JSON file at `training_config_path`
# and print it so the settings used for this run are recorded in the cell output.
training_args = TrainingArguments(training_config_path)
print(training_args)

TrainingArguments(
  save_models_dir='./out/models',
  save_plots_dir='./out/plots',
  learning_rate=0.0001,
  max_steps=20,
  seed=123,
  precision='high',
  device='cpu',
  batch_size=64,
  cpu_num_workers=4,
  weight_decay=0.01,
  onnx=False,
  run_name='experiment_01',
  pin_memory=True,
  warmup_steps=2,
  save_steps=5,
  eval_steps=5,
  torch_compile=False
)


In [None]:
# Wire together everything prepared in the cells above: training settings, the
# model, both dataset splits, the padding collate function, and `compute_bleu`
# as the metrics callback.
trainer = Trainer(
    args=training_args,
    model=model,
    train_ds=train_ds,
    valid_ds=valid_ds,
    collator=mycollate,
    compute_metrics_func=compute_bleu,
)

# Run training; `train()` hands back two values, kept here as the training and
# validation loss histories.
train_losses, valid_losses = trainer.train()