In [1]:
import argparse
from pathlib import Path

# ---------------------------------------------------------------------------
# Run configuration.
# Scalar settings are declared first as module-level names (later cells read
# some of them directly), then bundled into argparse.Namespace groups.
# ---------------------------------------------------------------------------

# Settings bundled into `simple_hyp_params`.
batch_size = 32
d_model = 512
early_stopping_activated = True
early_stop_steps_per_half_clr_cycle = 10
nb_epochs = 200

# Settings bundled into `dset_truncation`.
limit_length = None
use_splitting = True
max_length_from_file = False

# Settings bundled into `opt_params`.
base_lr = 10 ** (-6)
max_lr = 0.001
momentum = 0.9
half_period_cycle = 10
gamma = 0.8

# Settings bundled into `train_state_control`.
load_from_backup = False
restore_from_backup = ("model_params", "scheduler", "optimizer", "losses", "metrics")

simple_hyp_params = argparse.Namespace(
    batch_size=batch_size,
    d_model=d_model,
    early_stopping_activated=early_stopping_activated,
    early_stop_steps_per_half_clr_cycle=early_stop_steps_per_half_clr_cycle,
    nb_epochs=nb_epochs,
)

dset_truncation = argparse.Namespace(
    limit_length=limit_length,
    use_splitting=use_splitting,
    max_length_from_file=max_length_from_file,
)

opt_params = argparse.Namespace(
    base_lr=base_lr,
    max_lr=max_lr,
    momentum=momentum,
    half_period_cycle=half_period_cycle,
    gamma=gamma,
)

train_state_control = argparse.Namespace(
    load_from_backup=load_from_backup,
    restore_from_backup=restore_from_backup,
)

# Every entry of `paths` is written relative to this root and is rewritten
# below as a string that already includes the root prefix.
path_to_root = Path("../../")

paths = argparse.Namespace(
    path_dataset="data/french_english_dataset/fra.txt",
    path_language_info="models/language_info.pth",
    path_dataset_splitting="dataset_splitting",
    path_model_and_dependencies="models/sequence_translator_transformer_new.pth",
)

# Only existing attributes are overwritten, so the underlying dict keeps its
# size and can safely be iterated while being updated.
for key, path in vars(paths).items():
    setattr(paths, key, str(path_to_root / path))



In [2]:
from ploomber_engine.ipython import PloomberClient
from ploomber import DAG
from pathlib import Path
from ploomber.products import File

# Create a ploomber-engine client bound to the setup notebook that sits next to
# this one. Nothing is requested from the setup notebook here; its namespace is
# obtained later through `client.get_namespace(...)`.
client = PloomberClient.from_path(Path("./training_setup.ipynb"))
from argparse import Namespace

from translation_machine.models import transformer_mod
from translation_machine import sentence_mod

# Bundle the configuration groups defined in the first cell; their attribute
# dict is handed to the setup notebook as its starting namespace.
initial_namespace = argparse.Namespace(
    simple_hyp_params=simple_hyp_params,
    dset_truncation=dset_truncation,
    opt_params=opt_params,
    train_state_control=train_state_control,
    paths=paths,
)

# `get_namespace` returns the names produced by the setup notebook.
train_setup = client.get_namespace(vars(initial_namespace))

# Publish every returned name as a global of this notebook so the following
# cells can use them directly (this may overwrite same-named globals).
for key, val in train_setup.items():
    globals()[key] = val

  from .autonotebook import tqdm as notebook_tqdm
Executing cell: 12: 100%|███████████████████████| 19/19 [00:03<00:00,  5.12it/s]


In [None]:
# Training loop: one training pass and one validation pass per epoch, with
# best-checkpoint saving and optional early stopping.
#
# NOTE(review): `tqdm`, `torch`, `model_trainer`, `optimizer`, `scheduler`,
# `losses`, `metrics`, `model_inputs` and `path_model_and_dependencies` are not
# defined in this notebook; presumably they are injected into globals() from
# the namespace returned by training_setup.ipynb — confirm.
import matplotlib.pyplot as plt
import numpy as np

# Lowest per-word validation loss seen during this run.
# NOTE(review): this always starts at +inf, so after restoring from a backup
# the first epoch overwrites the stored checkpoint even if it is worse.
best_loss_val_mean = np.inf
# Loop index of the last epoch that improved the validation loss. It is kept in
# the same unit as `epoch` so that `epoch - best_epoch` really counts epochs
# without improvement (the scheduler's own counter is a different clock).
best_epoch = 0

for epoch in tqdm(range(simple_hyp_params.nb_epochs)):
    print(f"training for epoch {epoch}")

    print(f"for epoch {epoch} learning rate is {optimizer.param_groups[0]['lr']}" )

    print("training_step")
    loss_train, nb_words_per_batch_train, metric_train = model_trainer.train_on_epoch()

    print("validation_step")
    loss_val, nb_words_per_batch_val, metric_val = model_trainer.validate_on_epoch()

    # Convert the per-batch losses to plain floats, then normalise the total by
    # the total number of words to get a per-word mean for the epoch.
    loss_train = np.array([float(el) for el in loss_train])
    loss_val = np.array([float(el) for el in loss_val])
    mean_train_loss = np.sum(loss_train) / sum(nb_words_per_batch_train)
    mean_val_loss = np.sum(loss_val) / sum(nb_words_per_batch_val)

    print(f"for epoch {epoch} mean loss on train {mean_train_loss}")
    print(f"for epoch {epoch} mean loss on val {mean_val_loss}")

    losses["train"].append(mean_train_loss)
    losses["val"].append(mean_val_loss)
    metrics["train"].append(metric_train)
    metrics["val"].append(metric_val)

    # Select the best model on the same per-word validation loss that is
    # printed and recorded above, so the checkpoint matches the reported curve.
    if mean_val_loss < best_loss_val_mean:
        best_epoch = epoch
        best_loss_val_mean = mean_val_loss

        model_training_state = {
            "model_params": model_trainer.model.state_dict(),
            "model_inputs": model_inputs,
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "losses": losses,
            "metrics": metrics,
        }

        torch.save(model_training_state, path_model_and_dependencies)
        print(f"saving for epoch {epoch}")

        # Use a dedicated figure per refresh and close it afterwards so that
        # figures do not pile up in memory over a long run.
        fig, ax = plt.subplots()
        ax.plot(losses["train"], "b*", label="train")
        ax.plot(losses["val"], "g*", label="val")
        ax.set_title("losses")
        ax.set_xlabel("epoch")
        ax.set_ylabel("mean loss per word")
        ax.legend()
        plt.show()
        plt.close(fig)
    # The flag is tested first so the threshold is only read when early
    # stopping is enabled.
    # NOTE(review): `early_stop_thresh` is not set on `simple_hyp_params` in
    # this notebook; presumably the setup notebook adds it — confirm.
    elif (simple_hyp_params.early_stopping_activated
          and epoch - best_epoch > simple_hyp_params.early_stop_thresh):
        print("Early stopped training at epoch %d" % epoch)
        break  # terminate the training loop

    # Drop the per-epoch results before the next iteration.
    del loss_train, nb_words_per_batch_train, metric_train

    del loss_val, nb_words_per_batch_val, metric_val


  0%|                                                                                          | 0/200 [00:00<?, ?it/s]

training for epoch 0
for epoch 0 learning rate is 1e-06
training_step
0 6.687884852216749
100 5.323258503828899
200 5.010234102289727
300 4.5811552531978625
400 4.58891304132297
validation_step
0 4.412269701797997
for epoch 0 mean loss on train 4.938881250013578
for epoch 0 mean loss on val 4.240734629012119
saving for epoch 0
Figure(640x480)


  0%|▍                                                                               | 1/200 [00:55<3:03:53, 55.45s/it]

training for epoch 1
for epoch 1 learning rate is 8.092000000000009e-05
training_step
0 4.298705435041244
100 2.9251587553174083
200 2.329769095989189
300 2.079179349152938
400 2.0220182002498412
validation_step
0 1.9919254363529266
for epoch 1 mean loss on train 2.410864146081984
for epoch 1 mean loss on val 1.8487869126336371
saving for epoch 1
Figure(640x480)


  1%|▊                                                                               | 2/200 [01:50<3:01:47, 55.09s/it]

training for epoch 2
for epoch 2 learning rate is 0.00012887200000000014
training_step
0 1.6733774768256153
100 1.7759250779226037
200 1.6018329376870013
300 1.543948301334971
400 1.3310269725566009
validation_step
0 1.4836842346191406
for epoch 2 mean loss on train 1.5568904796649858
for epoch 2 mean loss on val 1.3689830655965014
saving for epoch 2
Figure(640x480)


  2%|█▏                                                                              | 3/200 [02:44<2:59:49, 54.77s/it]

training for epoch 3
for epoch 3 learning rate is 0.00015444639999999995
training_step
0 1.3402431120122145
100 0.8327610355397169
200 1.0389971051897322
300 0.9737141678972943
400 1.0873854312990687
validation_step
0 1.0268258163609456
for epoch 3 mean loss on train 1.1602130076571917
for epoch 3 mean loss on val 1.1519184296968707
saving for epoch 3
Figure(640x480)


  2%|█▌                                                                              | 4/200 [03:39<2:59:05, 54.83s/it]

training for epoch 4
for epoch 4 learning rate is 0.00016467616
training_step
0 1.0072544415791829
100 0.9475847746196546
200 0.7232176765562996
300 0.7673772446652676
400 0.7653746511421952
validation_step
0 0.8349850562310988
for epoch 4 mean loss on train 0.9325389186161376
for epoch 4 mean loss on val 1.012742920212476
saving for epoch 4
Figure(640x480)


  2%|██                                                                              | 5/200 [04:34<2:58:03, 54.79s/it]

training for epoch 5
for epoch 5 learning rate is 0.00016467616000000006
training_step
0 0.7249169158935547
100 0.7163392745237299
200 0.9954613905686599
300 1.133708452937579
400 0.6422796694108241
validation_step
0 0.994787166255931
for epoch 5 mean loss on train 0.8550141770445625
for epoch 5 mean loss on val 0.9507036815456655
saving for epoch 5
Figure(640x480)


  3%|██▍                                                                             | 6/200 [05:29<2:57:08, 54.78s/it]

training for epoch 6
for epoch 6 learning rate is 0.00015812911360000008
training_step
0 0.8627910614013672
100 0.677917847266564
200 0.7146240234375
300 0.7027748682165659
400 0.6744446267887038
validation_step
0 0.9198189544677734
for epoch 6 mean loss on train 0.6984789532284229
for epoch 6 mean loss on val 0.9199647950944473
saving for epoch 6
Figure(640x480)


  4%|██▊                                                                             | 7/200 [06:23<2:56:12, 54.78s/it]

training for epoch 7
for epoch 7 learning rate is 0.0001476538393600001
training_step
0 0.6755851951805321
100 0.6216876109441122
200 0.6108562313780492
300 0.7913542468138416
400 0.5036589632329252
validation_step
0 1.1906116428087705
for epoch 7 mean loss on train 0.6126266432937806
for epoch 7 mean loss on val 0.8856198243510852
saving for epoch 7
Figure(640x480)


  4%|███▏                                                                            | 8/200 [07:18<2:55:13, 54.76s/it]

training for epoch 8
for epoch 8 learning rate is 0.00013508351027200006
training_step
0 0.5600748910953861
100 0.5239521695166519
200 0.6405175052472015
300 0.6320497581995831
400 0.5926768861967942
validation_step
0 0.7073931012834821
for epoch 8 mean loss on train 0.5478366265093335
for epoch 8 mean loss on val 0.8645535040698092
saving for epoch 8
Figure(640x480)


  4%|███▌                                                                            | 9/200 [08:13<2:54:13, 54.73s/it]

training for epoch 9
for epoch 9 learning rate is 0.00012167515924480006
training_step
0 0.45079340253557476
100 0.47847571161580205
200 0.5690623515361064
300 0.6224572945639725
400 0.5460664684645795
validation_step
0 0.6642640080284233
for epoch 9 mean loss on train 0.4894368508050305
for epoch 9 mean loss on val 0.8519559142668507
saving for epoch 9
Figure(640x480)


  5%|███▉                                                                           | 10/200 [09:08<2:53:42, 54.86s/it]

training for epoch 10
for epoch 10 learning rate is 0.00010826680821760007
training_step
0 0.36259286870401375
100 0.39970842997233075
200 0.507964120784276
300 0.46983822225937116
400 0.4310407980894431
validation_step
0 0.7066648809403335
for epoch 10 mean loss on train 0.4402346180254814
for epoch 10 mean loss on val 0.8341467321146069
saving for epoch 10
Figure(640x480)


  6%|████▎                                                                          | 11/200 [10:04<2:53:50, 55.19s/it]

training for epoch 11
for epoch 11 learning rate is 7.823210191667205e-05
training_step
0 0.3724473610902444
100 0.34589830714853564
200 0.3318704907359973
300 0.29352344936794705
400 0.3649669158153045
validation_step
0 0.8151100354316907
for epoch 11 mean loss on train 0.3751612481664923
for epoch 11 mean loss on val 0.8257456756287783
saving for epoch 11
Figure(640x480)


  6%|████▋                                                                          | 12/200 [11:01<2:54:28, 55.68s/it]

training for epoch 12
for epoch 12 learning rate is 5.592060580741122e-05
training_step
0 0.29035984019123057
100 0.2965439126846638
200 0.27447339784296065
300 0.3659014250102796
400 0.3410817302724041


In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
# Plot the recorded per-epoch train/validation losses and save them to disk.
# A dedicated figure is created so the saved image contains only these two
# curves, whatever figure happened to be current before this cell ran.
fig, ax = plt.subplots()
ax.plot(losses["train"], "b*", label="train")
ax.plot(losses["val"], "g*", label="val")
ax.set_title("losses")
ax.set_xlabel("epoch")
ax.set_ylabel("loss")
ax.legend()
# The file name carries the batch size so that images from runs with different
# batch sizes get different names.
fig.savefig(f'test_{batch_size}.png', bbox_inches='tight')

In [None]:
# Smallest validation loss recorded in `losses["val"]`.
np.asarray(losses["val"]).min()