In [6]:
import torch,inspect
import argparse
from pathlib import Path
from functools import partial
import numpy as np
from argparse import Namespace
from dev import namespace_tools
# Nested namespace argument containing all elements associated with the training setup.
# Each top-level field groups one family of settings; the whole object is wrapped in
# namespace_tools.NameSpaceAggregation in the next cell.

notebook_run = Namespace(
    # scalar hyper-parameters; nb_epochs, early_stop_thresh and warm_up_epochs are read
    # by the training loop of this notebook, batch_size and d_model are presumably
    # consumed by training_setup.ipynb (TODO confirm)
    simple_hp = Namespace(
        batch_size= 32,
        d_model = 64,
        # np.inf makes `epoch - best_epoch > early_stop_thresh` always false in the
        # training loop, i.e. early stopping is disabled by default
        early_stop_thresh = np.inf, # default to np.inf
        nb_epochs = 10,
        # NOTE(review): the training loop only considers early stopping when
        # `epoch > warm_up_epochs`; with nb_epochs = 10 that is never true here
        warm_up_epochs = 20,
    ),
    # parameters to limit the size of the dataset
    # (not read in this notebook; presumably consumed by training_setup.ipynb)
    dset_truncation = Namespace(
        limit_length= 15,
        use_splitting = False,
        max_length_from_file = False,
    ),
    # parameters for the optimization algorithm
    # Both entries are functools.partial objects: the optimizer still lacks the model
    # parameters and the scheduler still lacks the optimizer, hence "unlinked".
    # Presumably they are completed in training_setup.ipynb (TODO confirm).
    opt_params = Namespace(
        unlinked_optimizer = partial(torch.optim.NAdam,lr=0.01),
        # ReduceLROnPlateau: multiply the lr by `factor` once the monitored value
        # (mode='min': lower is better) has not improved for `patience` steps
        unlinked_scheduler = partial(torch.optim.lr_scheduler.ReduceLROnPlateau, mode='min', 
                                     factor=0.9, patience=20)
    ),
    # parameters to reload the model
    # (not read in this notebook; presumably consumed by training_setup.ipynb)
    train_state_control = Namespace(             
        load_from_backup = False,
        restore_optimizer = False
    ),
    # paths from root
    # `root` is two directory levels above this notebook; the other entries are
    # presumably resolved against it by namespace_tools.Paths (TODO confirm)
    paths = namespace_tools.Paths(
        path_dataset = "data/french_english_dataset/fra.txt",
        path_language_info = "models/language_info.pth",
        path_dataset_splitting = "dataset_splitting",
        path_model_and_dependencies = "models/sequence_translator_transformer_over_fitted.pth",
        root = "../.."
    )

)

In [7]:
# Wrap the plain Namespace in the project's NameSpaceAggregation helper.
# NOTE(review): this rebinds `notebook_run`, so re-running this cell on its own hands an
# already-wrapped object to NameSpaceAggregation — confirm that this is supported.
notebook_run = namespace_tools.NameSpaceAggregation(notebook_run)
# Presumably publishes the sub-namespaces (simple_hp, paths, ...) as notebook globals:
# later cells read `simple_hp` and `paths` as bare names — TODO confirm in namespace_tools.
notebook_run.diffuse(globals())

In [8]:
from ploomber_engine.ipython import PloomberClient
from ploomber import DAG
from pathlib import Path
from ploomber.products import File

# initialize client
# The client targets the sibling notebook training_setup.ipynb and is given a working
# directory two levels up, the same location as `root = "../.."` in the paths settings.
client = PloomberClient.from_path(Path("./training_setup.ipynb"),cwd=Path("../../"))
from argparse import Namespace

from translation_machine.models import transformer_mod
from translation_machine import sentence_mod

# Execute training_setup.ipynb starting from this notebook's settings, then expose every
# name of the resulting namespace as a global of this notebook.
initial_namespace_as_dict = notebook_run.diffuse()
train_setup = client.get_namespace(initial_namespace_as_dict)
globals().update(train_setup)

Executing cell: 13: 100%|███████████████████████| 21/21 [00:00<00:00, 82.72it/s]


In [9]:
# revert to train mode
# `model` is not defined in this notebook's own cells; it is presumably one of the names
# injected from training_setup.ipynb — TODO confirm.
model.train()
# Bare last expression: shows the module's `training` flag as the cell output.
model.training

True

In [10]:
from translation_machine import model_trainer_mod
# Hand the model, the optimizer, both data loaders and the baseline loss to the project's
# ModelTrainer; the training loop below drives it through train_on_epoch() and
# validate_on_epoch().
model_trainer = model_trainer_mod.ModelTrainer(model,optimizer,train_data_loader,val_data_loader,baseline_loss)

In [11]:
## import matplotlib.pyplot as plt,numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
# Training loop: runs `simple_hp.nb_epochs` epochs, writes a checkpoint every time the
# mean validation loss improves, and can stop early once no improvement has been seen
# for more than `simple_hp.early_stop_thresh` epochs (after the warm-up period).
# NOTE(review): the best loss always starts at np.inf, so when training resumes from a
# backup the first epoch overwrites the existing checkpoint — confirm this is intended.
best_loss_val_mean = np.inf
# The early-stopping test below compares against the loop's own epoch index, so the best
# epoch is tracked with that same counter. scheduler.last_epoch is advanced by every
# scheduler.step() call and may be restored from a checkpoint, so it does not share the
# loop's origin.
best_epoch = 0

# Run-configuration history stored in every checkpoint. It is built once per execution
# of this cell: `back_up` is rebound at each checkpoint below, so extending it inside
# the loop would append one duplicate of this run's configuration per save.
current_run_state = notebook_run.state_dict()
if "back_up" in globals():
    notebook_runs_history = tuple(back_up["notebook_runs"]) + (current_run_state,)
else:
    notebook_runs_history = (current_run_state,)

for epoch in tqdm(range(simple_hp.nb_epochs)):
    print(f"training for epoch {epoch}")
    print(f"for epoch {epoch} learning rate is {optimizer.param_groups[0]['lr']}" )
    print("training_step")
    loss_train,nb_words_per_batch_train,metric_train = model_trainer.train_on_epoch()
    print("validation_step")
    loss_val,nb_words_per_batch_val,metric_val = model_trainer.validate_on_epoch()

    # Epoch-level mean loss: summed losses divided by the summed
    # `nb_words_per_batch_*` counts returned by the trainer.
    sum_loss_train = torch.tensor(loss_train).sum()
    sum_loss_val = torch.tensor(loss_val).sum()
    mean_train_loss = sum_loss_train/sum(nb_words_per_batch_train)
    mean_val_loss = sum_loss_val/sum(nb_words_per_batch_val)

    # The scheduler is driven by the validation loss.
    scheduler.step(mean_val_loss)

    print(f"for epoch {epoch} mean loss on train {mean_train_loss}")
    print(f"for epoch {epoch} mean loss on val {mean_val_loss}")

    losses["train"].append(mean_train_loss)
    losses["val"].append(mean_val_loss)
    metrics["train"].append(metric_train)
    metrics["val"].append(metric_val)

    if (mean_val_loss < best_loss_val_mean):
        best_epoch = epoch
        best_loss_val_mean = mean_val_loss

        # Everything required to resume training from this point.
        model_training_state = {"model_params":model_trainer.model.state_dict(),
                                "model_inputs":model_inputs,
                                "optimizer":optimizer.state_dict(),
                                "scheduler":scheduler.state_dict(),
                                }
        # `results` references the live lists, so it keeps reflecting later epochs.
        results = {"losses":losses,
                   "metrics":metrics}
        back_up = {"notebook_runs":notebook_runs_history,
                   "results":results,
                   "model_training_state":model_training_state}
        torch.save(back_up,paths.path_model_and_dependencies)
        print(f"saving for epoch {epoch}")

        # Use a dedicated figure per checkpoint and close it afterwards, so artists do
        # not pile up on the implicit pyplot figure across saves.
        fig, ax = plt.subplots()
        ax.plot(losses["train"],"b*")
        ax.plot(losses["val"],"g*")
        ax.set_title("losses")
        fig.savefig("loss_curve")
        plt.close(fig)
    elif epoch - best_epoch > simple_hp.early_stop_thresh  and epoch > simple_hp.warm_up_epochs:
        print("Early stopped training at epoch %d" % epoch)
        break  # terminate the training loop

    # Drop the per-epoch trainer outputs before the next iteration.
    del loss_train,nb_words_per_batch_train,metric_train

    del loss_val,nb_words_per_batch_val,metric_val


  0%|                                                                              | 0/10 [00:00<?, ?it/s]

training for epoch 0
for epoch 0 learning rate is 0.01
training_step
0 8.863941192626953
validation_step
0 9.079339345296225
for epoch 0 mean loss on train 8.863941192626953
for epoch 0 mean loss on val 9.079339027404785
saving for epoch 0


 10%|███████                                                               | 1/10 [00:01<00:11,  1.25s/it]

training for epoch 1
for epoch 1 learning rate is 0.01
training_step
0 9.138506571451822
validation_step
0 8.865436553955078
for epoch 1 mean loss on train 9.138506889343262
for epoch 1 mean loss on val 8.865436553955078
saving for epoch 1


 20%|██████████████                                                        | 2/10 [00:01<00:05,  1.42it/s]

training for epoch 2
for epoch 2 learning rate is 0.01
training_step
0 9.06058438618978
validation_step
0 8.833587646484375
for epoch 2 mean loss on train 9.06058406829834
for epoch 2 mean loss on val 8.833587646484375
saving for epoch 2


 30%|█████████████████████                                                 | 3/10 [00:01<00:03,  1.88it/s]

training for epoch 3
for epoch 3 learning rate is 0.01
training_step
0 8.700008392333984
validation_step
0 9.12891960144043
for epoch 3 mean loss on train 8.700008392333984
for epoch 3 mean loss on val 9.12891960144043


 40%|████████████████████████████                                          | 4/10 [00:02<00:02,  2.70it/s]

training for epoch 4
for epoch 4 learning rate is 0.01
training_step
0 8.88258425394694
validation_step
0 8.741373697916666
for epoch 4 mean loss on train 8.882584571838379
for epoch 4 mean loss on val 8.741374015808105
saving for epoch 4


 50%|███████████████████████████████████                                   | 5/10 [00:02<00:01,  2.95it/s]

training for epoch 5
for epoch 5 learning rate is 0.01
training_step
0 9.156496047973633
validation_step
0 8.795818328857422
for epoch 5 mean loss on train 9.156496047973633
for epoch 5 mean loss on val 8.795818328857422
training for epoch 6
for epoch 6 learning rate is 0.01
training_step
0 8.810162862141928
validation_step
0 8.958353042602539
for epoch 6 mean loss on train 8.810162544250488
for epoch 6 mean loss on val 8.958353042602539


 70%|█████████████████████████████████████████████████                     | 7/10 [00:02<00:00,  4.69it/s]

training for epoch 7
for epoch 7 learning rate is 0.01
training_step
0 9.013396581013998
validation_step
0 9.0629030863444
for epoch 7 mean loss on train 9.013396263122559
for epoch 7 mean loss on val 9.06290340423584
training for epoch 8
for epoch 8 learning rate is 0.01
training_step
0 8.847869237263998
validation_step
0 9.078512191772461
for epoch 8 mean loss on train 8.847868919372559
for epoch 8 mean loss on val 9.078512191772461


 90%|███████████████████████████████████████████████████████████████       | 9/10 [00:02<00:00,  6.05it/s]

training for epoch 9
for epoch 9 learning rate is 0.01
training_step
0 8.871288299560547
validation_step
0 9.224248250325521
for epoch 9 mean loss on train 8.871288299560547
for epoch 9 mean loss on val 9.224247932434082


100%|█████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.59it/s]


In [12]:
# Sanity check: print the shape of every element of each training batch.
for batch in model_trainer.train_data_loader:
    shapes = [part.shape for part in batch]
    print(shapes)

[torch.Size([1, 2]), torch.Size([1, 4]), torch.Size([1]), torch.Size([1])]


In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
# Plot the recorded train (blue stars) and validation (green stars) losses and save
# the figure next to the notebook.
for split_name, marker in (("train", "b*"), ("val", "g*")):
    plt.plot(results["losses"][split_name], marker)
plt.title("losses")
plt.savefig('test.png', bbox_inches='tight')