### I) define hyper_parameters, datasets, and IO operations

In [41]:
# Convention: every new input parameter passed to the notebook through papermill
# is added with a default value equal to the one that would be used in
# order to obtain the same results as the scripts.

# Two types of inputs:
# inputs that influence the training (e.g. hyper-parameters, dataset and splitting)
# inputs that control the state of training (reset it or load it from a backup)
import torch
from pathlib import Path


# Pick the compute device once: use the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [42]:
# Location of the checkpoint holding the model, its training state and the results.
path_model_and_dependencies = paths.path_model_and_dependencies

if train_state_control.load_from_backup:
    assert Path(path_model_and_dependencies).exists(),f"back_up at path : {path_model_and_dependencies} doesn't exists"
    # map_location=device remaps the stored tensors onto the device selected for this
    # run, so a checkpoint written on a GPU machine can still be restored on a CPU-only one.
    # NOTE(review): torch.load unpickles arbitrary objects -- only load trusted backups.
    back_up = torch.load(path_model_and_dependencies, map_location=device)
    back_up_train_state = back_up["model_training_state"]
    # TODO: add routines here that check whether the given parameters are coherent with
    # the values stored in the backup, maybe through the notebookRunner class, which
    # would take over this responsibility.
    print("loaded backup")

Initialize or load the model

In [43]:
from translation_machine.models import transformer_mod
from translation_machine import sentence_mod

from pathlib import Path
import torch

# Constructor arguments: taken from the backup when restoring, otherwise built from
# the hyper-parameters and the vocabularies attached to the sentence classes.
model_inputs = (
    back_up_train_state["model_inputs"]
    if train_state_control.load_from_backup
    else {
        "d_model": simple_hp.d_model,
        "vocab_src": sentence_mod.EnglishSentence.vocab,
        "vocab_tgt": sentence_mod.FrenchSentence.vocab,
    }
)

model = transformer_mod.TransformerForSeq2Seq(**model_inputs)

# When restoring, the saved parameters always overwrite the fresh initialization.
if train_state_control.load_from_backup:
    model.load_state_dict(back_up_train_state["model_params"])
    print("loaded model")

# Transfer the model to the selected device.
model = model.to(device)

In [44]:
# Build the optimizer over the model parameters, then the scheduler on top of it.
# NOTE(review): the "unlinked_*" attributes look like factories still waiting for
# their target (parameters / optimizer) -- confirm against opt_params.
optimizer = opt_params.unlinked_optimizer(model.parameters())
scheduler = opt_params.unlinked_scheduler(optimizer)

# Optimizer and scheduler states are restored only when a backup is loaded AND the
# caller explicitly asked to restore them.
if train_state_control.load_from_backup:
    if train_state_control.restore_optimizer:
        optimizer.load_state_dict(back_up_train_state["optimizer"])
        scheduler.load_state_dict(back_up_train_state["scheduler"])

In [45]:
import numpy as np
# Training history: resumed from the backup when loading, otherwise started empty.
if train_state_control.load_from_backup:
    losses = back_up["results"]["losses"]
    metrics = back_up["results"]["metrics"]
    # A backup saved before any validation pass has an empty "val" history; np.min
    # would raise ValueError on it, so fall back to the "no best loss yet" value.
    best_loss_val_mean = np.min(losses["val"]) if len(losses["val"]) > 0 else np.inf
else:
    losses = {"train":[],"val":[]}
    metrics = {"train":[],"val":[]}
    best_loss_val_mean = np.inf
    


# II) Load the dataset (as pairs of plain text)

In [46]:
from translation_machine import dataset_mod

# Full dataset read from the text file.
whole_dataset_raw = dataset_mod.DatasetFromTxt(paths.path_dataset)

# Keep only the first `limit_length` samples. The limit is clamped to the dataset size
# so that an oversized limit cannot create a Subset with out-of-range indices (which
# would report a wrong length and raise IndexError on access).
idxs_whole = np.arange(min(dset_truncation.limit_length, len(whole_dataset_raw)))
dataset_raw = torch.utils.data.Subset(whole_dataset_raw,idxs_whole)
len(dataset_raw),len(whole_dataset_raw)

(10, 177210)

In [47]:
# Remark: splitting the dataset is the responsibility of code outside this notebook; only the saved index files are loaded here
from pathlib import Path

# Build the train / val / test views of the (possibly truncated) raw dataset.
if dset_truncation.use_splitting:
    path_dataset_splitting = paths.path_dataset_splitting
    path_idxs_train = str(Path(path_dataset_splitting).joinpath("idxs_train.npy"))
    path_idxs_val = str(Path(path_dataset_splitting).joinpath("idxs_val.npy"))
    path_idxs_test = str(Path(path_dataset_splitting).joinpath("idxs_test.npy"))

    # Indices usable in this run: kept by the truncation AND present in the full
    # dataset (the original code referenced an undefined `whole_dataset` here).
    idxs_valid = idxs_whole[idxs_whole < len(whole_dataset_raw)]

    # Restrict every precomputed split to the usable indices. np.intersect1d returns
    # sorted unique values, so the resulting order is deterministic.
    idxs_train,idxs_val,idxs_test = [
        np.intersect1d(idxs_valid, np.load(path_idxs)).tolist()
        for path_idxs in (path_idxs_train, path_idxs_val, path_idxs_test)
    ]

    # Train+val is the UNION of both splits (their intersection is empty for a
    # disjoint split), matching the non-splitting branch where it covers every sample.
    idxs_train_val = sorted(set(idxs_train).union(idxs_val))

    # dataset_raw is indexed by arange(limit), so a position in dataset_raw equals the
    # index in the full dataset and the split indices can be used directly.
    train_dataset_raw = torch.utils.data.Subset(dataset_raw,idxs_train)
    val_dataset_raw = torch.utils.data.Subset(dataset_raw,idxs_val)
    test_dataset_raw = torch.utils.data.Subset(dataset_raw,idxs_test)
    # ChainDataset takes a single iterable of IterableDatasets, so calling it with two
    # map-style Subsets fails; a Subset over the union indices gives the combined view.
    train_val_dataset = torch.utils.data.Subset(dataset_raw,idxs_train_val)

else:
    # No splitting: every role uses the same (truncated) dataset.
    idxs_train_val = np.arange(len(dataset_raw))

    train_dataset_raw = dataset_raw
    val_dataset_raw = dataset_raw
    test_dataset_raw = dataset_raw
    train_val_dataset = dataset_raw


len(train_dataset_raw),len(val_dataset_raw),len(test_dataset_raw),len(train_val_dataset)

(10, 10, 10, 10)

# III) Load the vocabulary

In [48]:
import torch,numpy as np
from ploomber_engine.ipython import PloomberClient
from pathlib import Path
from argparse import Namespace

from translation_machine.models import transformer_mod
from translation_machine import dataset_mod,sentence_mod

        
if dset_truncation.recompute_vocabulary:
    # Run the vocabulary notebook through ploomber-engine, handing it the train+val
    # dataset and the output path. With use_splitting=False the whole dataset is fed
    # to that pipeline.
    # NOTE(review): presumably the notebook writes the language info file read below -- confirm.
    client = PloomberClient.from_path(Path("../create_vocabulary.ipynb"),"../")
    train_setup = client.get_namespace(
        {
            "train_val_dataset": train_val_dataset,
            "path_language_info": paths.path_language_info,
        }
    )

# Read the language information from disk and pull out both vocabularies.
language_info = torch.load(paths.path_language_info)
vocab_french, vocab_english = (
    language_info[language]["vocab"] for language in ("french", "english")
)


len(vocab_french),len(vocab_english)

Executing cell: 2:   0%|                                 | 0/13 [00:00<?, ?it/s]


data already downloaded


Executing cell: 11: 100%|███████████████████████| 13/13 [00:01<00:00,  6.51it/s]


(19, 12)

 convert datasets to a custom dataset, taking into account the vocabulary

In [54]:
# Wrap each raw split in a SentenceDataSet (English source, French target) and
# materialize it into a list.
dsets = [
    list(
        dataset_mod.SentenceDataSet(
            raw_split,
            sentence_type_src=sentence_mod.EnglishSentence,
            sentence_type_dst=sentence_mod.FrenchSentence,
        )
    )
    for raw_split in (train_dataset_raw, val_dataset_raw, test_dataset_raw)
]
train_dataset,val_dataset,test_dataset = dsets

In [55]:
# Maximum sentence lengths: either read from the language info file or (preferred)
# measured on the current train+val dataset.
max_length_from_file = False
if max_length_from_file:
    max_length_french, max_length_english = (
        language_info[language]["max_sentence_train_val"]
        for language in ("french", "english")
    )
else:
    import itertools
    # Single pass over the dataset: element 0 is the English side, element 1 the French side.
    lengths_english, lengths_french = zip(
        *((len(pair[0]), len(pair[1])) for pair in train_val_dataset)
    )
    max_length_english = max(lengths_english)
    max_length_french = max(lengths_french)

max_length_english,max_length_french

(5, 9)

### IV) Dataloader construction

In [56]:
from translation_machine import collate_fn_mod
from torch.utils.data import DataLoader

import torch
import numpy as np

# Collate function built from the maximum English / French sentence lengths.
collate_fn = collate_fn_mod.get_collate_fn(max_length_english,max_length_french)

# Train and validation loaders share the same settings: configured batch size,
# shuffling enabled, and the collate function above.
train_data_loader, val_data_loader = (
    DataLoader(
        split,
        batch_size=simple_hp.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
    for split in (train_dataset, val_dataset)
)

In [57]:
from torch import optim,nn


# Cross-entropy summed (not averaged) over all elements it is given.
baseline_loss = nn.CrossEntropyLoss(
    reduction="sum",
)
