# Assignment 2

## Initialization

In [1]:
#@title Link your assignment folder & install requirements
#@markdown Enter the path to the assignment folder in your Google Drive
# If you run this notebook locally or on a cluster (i.e. not on Google Colab)
# you can delete this cell which is specific to Google Colab. You may also
# change the paths for data/logs in Arguments below.
import sys
import os
import shutil
import warnings


# Install requirements
!pip install -qr requirements.txt

# Check if CUDA is available
import torch
if not torch.cuda.is_available():
  # Warn instead of raising so the remaining cells can still be executed; note
  # that the default Arguments below use device='cuda'.
  warnings.warn('CUDA is not available.')

### Running on GPU
For this assignment, it will be necessary to run your experiments on GPU. To make sure the notebook is running on GPU, you can change the notebook settings with
* (EN) `Edit > Notebook Settings`
* (FR) `Modifier > Paramètres du notebook`


In [2]:
%matplotlib inline
import os
import urllib.request
from dataclasses import dataclass

import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from lstm_solution import LSTM
from gpt1_solution import MiniGPT1
from utils.wikitext2 import Wikitext2
from utils.torch_utils import seed_experiment, to_device
from utils.data_utils import save_logs
from run_exp import train, evaluate

EMBEDDINGS_URL = "https://ift6135-h2021.s3.us-east-2.amazonaws.com/assignment2/embeddings.npz"

## Public tests
Run the following cell to execute the public tests, which check the tensor shapes of the outputs of your functions.

In [3]:
!python -m unittest discover -s ./

..........
----------------------------------------------------------------------
Ran 10 tests in 0.765s

OK


## Experiments

Below we define a few default arguments to get you started with your experiments. You are encouraged to modify the function `main()`, as well as these arguments, to fit your needs (e.g. changing hyperparameters, the optimizer, adding regularization, adding logs).

In [3]:
@dataclass
class Arguments:
  """Settings for a single training run, consumed by `main()`.

  Every field has a default, so `Arguments()` is a usable debug setup and each
  experiment only overrides the fields it changes.
  """
  # Data
  data_folder: str = './data'  # passed to Wikitext2 when loading each split
  batch_size: int = 16  # batch size of the train/validation/test DataLoaders

  # Model
  model: str = 'lstm'  # one of [lstm, gpt1]; any other value makes main() raise ValueError
  embeddings: str = './data/embeddings.npz'  # embeddings file; main() downloads it if missing
  layers: int = 1  # forwarded as num_layers when the model is built

  # Optimization
  optimizer: str = 'adamw'  # one of [sgd, momentum, adam, adamw]
  epochs: int = 10  # number of train + validation passes run by main()
  lr: float = 1e-3  # learning rate given to whichever optimizer is selected
  momentum: float = 0.9  # only read when optimizer == 'momentum'
  weight_decay: float = 5e-4  # read by adamw, sgd and momentum; not passed to adam

  # Experiment
  exp_id: str = 'debug'  # presumably identifies the run in saved logs -- confirm in save_logs
  log: bool = True  # when True, the run cell passes main()'s results to save_logs()
  log_dir: str = './logs'  # presumably the directory save_logs writes to -- confirm in utils.data_utils
  seed: int = 42  # passed to seed_experiment() at the start of main()

  # Miscellaneous
  num_workers: int = 2  # num_workers of every DataLoader
  device: str = 'cuda'  # device the model is moved to in main()
  progress_bar: bool = False  # not read in this notebook; presumably used by train/evaluate -- confirm in run_exp.py
  print_every: int = 10  # not read in this notebook; presumably the logging interval of train/evaluate -- confirm in run_exp.py

The 12 configurations you need to run in Problem 3. Make sure there is no discrepancy between the configurations defined in `run_exp.py` and the ones below. If they differ, the versions from `run_exp.py` are the ones to run.

In [4]:
# The twelve Problem 3 experiments, one row per configuration.
# Note: if there is any discrepancy with the configurations in run_exp.py, the
# versions from run_exp.py are the ones to use in Problem 3.
_CONFIG_TABLE = (
  # (id, model, layers, optimizer)
  # 1-4: single-layer LSTM, one run per optimizer
  (1, 'lstm', 1, 'adam'),
  (2, 'lstm', 1, 'adamw'),
  (3, 'lstm', 1, 'sgd'),
  (4, 'lstm', 1, 'momentum'),
  # 5-8: single-layer GPT1, one run per optimizer
  (5, 'gpt1', 1, 'adam'),
  (6, 'gpt1', 1, 'adamw'),
  (7, 'gpt1', 1, 'sgd'),
  (8, 'gpt1', 1, 'momentum'),
  # 9-12: deeper models, AdamW only
  (9, 'lstm', 2, 'adamw'),
  (10, 'lstm', 4, 'adamw'),
  (11, 'gpt1', 2, 'adamw'),
  (12, 'gpt1', 4, 'adamw'),
)

# Batch size, logging and epoch count are the same for all twelve runs.
configs = {
  config_id: Arguments(
    model=model_name,
    layers=num_layers,
    batch_size=16,
    log=True,
    epochs=10,
    optimizer=optimizer_name,
  )
  for config_id, model_name, num_layers, optimizer_name in _CONFIG_TABLE
}

In [6]:
def _make_dataloader(args, split, shuffle):
  """Load the Wikitext2 `split` and wrap it in a DataLoader configured from `args`."""
  dataset = Wikitext2(args.data_folder, split=split)
  return DataLoader(
    dataset,
    batch_size=args.batch_size,
    shuffle=shuffle,
    num_workers=args.num_workers,
  )


def _make_optimizer(args, model):
  """Create the optimizer named by `args.optimizer` over `model.parameters()`.

  Raises:
    ValueError: if `args.optimizer` is not one of adamw, adam, sgd, momentum.
  """
  if args.optimizer == "adamw":
    return optim.AdamW(
      model.parameters(), lr=args.lr, weight_decay=args.weight_decay
    )
  if args.optimizer == "adam":
    # Plain Adam is deliberately built without weight decay.
    return optim.Adam(model.parameters(), lr=args.lr)
  if args.optimizer == "sgd":
    return optim.SGD(
      model.parameters(), lr=args.lr, weight_decay=args.weight_decay
    )
  if args.optimizer == "momentum":
    return optim.SGD(
      model.parameters(),
      lr=args.lr,
      momentum=args.momentum,
      weight_decay=args.weight_decay,
    )
  # Fail loudly here instead of leaving the optimizer undefined, which would
  # only surface later as an unrelated-looking error inside the training loop.
  raise ValueError("Unknown optimizer {0}".format(args.optimizer))


def main(args):
  """Train one configuration, validating after every epoch, then test it.

  Args:
    args: configuration object exposing the fields of `Arguments` (data
      paths, model and optimizer choice, hyperparameters, device, seed).

  Returns:
    A 9-tuple `(train_losses, train_ppls, train_times, valid_losses,
    valid_ppls, valid_times, test_loss, test_ppl, test_time)`. The first six
    are lists with one entry per epoch, holding what `train` / `evaluate`
    returned; the last three come from a single evaluation on the test split
    after the final epoch.

  Raises:
    ValueError: if `args.model` or `args.optimizer` is not a supported name.
  """
  # Seed the experiment, for repeatability
  seed_experiment(args.seed)

  # Dataloaders (only the training split is shuffled)
  train_dataloader = _make_dataloader(args, "train", shuffle=True)
  valid_dataloader = _make_dataloader(args, "validation", shuffle=False)
  test_dataloader = _make_dataloader(args, "test", shuffle=False)

  # Download the embeddings on first use
  if not os.path.isfile(args.embeddings):
    print("Downloading embeddings...")
    # Make sure the destination folder exists before writing into it.
    os.makedirs(os.path.dirname(args.embeddings) or ".", exist_ok=True)
    urllib.request.urlretrieve(EMBEDDINGS_URL, args.embeddings)

  # Model
  if args.model == "lstm":
    model = LSTM.load_embeddings_from(
      args.embeddings, hidden_size=512, num_layers=args.layers
    )
  elif args.model == "gpt1":
    model = MiniGPT1.load_embeddings_from(
      args.embeddings, num_layers=args.layers
    )
  else:
    raise ValueError("Unknown model {0}".format(args.model))
  model.to(args.device)

  # Optimizer
  optimizer = _make_optimizer(args, model)

  total_params = sum(p.numel() for p in model.parameters())
  learnable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad
  )
  print(
    f"Initialized {args.model.upper()} model with {total_params} "
    f"total parameters, of which {learnable_params} are learnable. "
    f"args are: {args}"
  )

  train_losses, valid_losses = [], []
  train_ppls, valid_ppls = [], []
  train_times, valid_times = [], []
  for epoch in range(args.epochs):

    tqdm.write(f"====== Epoch {epoch} ======>")

    loss, ppl, wall_time = train(epoch, model, train_dataloader, optimizer, args)
    train_losses.append(loss)
    train_ppls.append(ppl)
    train_times.append(wall_time)

    loss, ppl, wall_time = evaluate(epoch, model, valid_dataloader, args)
    valid_losses.append(loss)
    valid_ppls.append(ppl)
    valid_times.append(wall_time)

  # The test split is evaluated once, tagged with the index of the last epoch.
  test_loss, test_ppl, test_time = evaluate(
    epoch, model, test_dataloader, args, mode="test"
  )

  print(f"===== Best validation perplexity: {min(valid_ppls):.3f} =====>")

  return (
    train_losses,
    train_ppls,
    train_times,
    valid_losses,
    valid_ppls,
    valid_times,
    test_loss,
    test_ppl,
    test_time,
  )

In [8]:
args = configs[1]  # Run the first configuration
logs = main(args)  # tuple of per-epoch train/validation metrics plus the final test metrics
if args.log:
  # Hand the metrics to save_logs in the same order main() returned them.
  save_logs(args, *logs)

Initialized LSTM model with 34107392 total parameters, of which 3019520 are learnable.
[TRAIN] Epoch: 0, Iter: 0, Loss: 10.60695
[TRAIN] Epoch: 0, Iter: 10, Loss: 8.43617
[TRAIN] Epoch: 0, Iter: 20, Loss: 7.88891
[TRAIN] Epoch: 0, Iter: 30, Loss: 7.69509
[TRAIN] Epoch: 0, Iter: 40, Loss: 7.58836
[TRAIN] Epoch: 0, Iter: 50, Loss: 7.54249
[TRAIN] Epoch: 0, Iter: 60, Loss: 7.41036
[TRAIN] Epoch: 0, Iter: 70, Loss: 7.34132
[TRAIN] Epoch: 0, Iter: 80, Loss: 7.20208
[TRAIN] Epoch: 0, Iter: 90, Loss: 7.13149
[TRAIN] Epoch: 0, Iter: 100, Loss: 7.08324
[TRAIN] Epoch: 0, Iter: 110, Loss: 7.00574
[TRAIN] Epoch: 0, Iter: 120, Loss: 7.02443
[TRAIN] Epoch: 0, Iter: 130, Loss: 6.97269
[TRAIN] Epoch: 0, Iter: 140, Loss: 6.92367
[TRAIN] Epoch: 0, Iter: 150, Loss: 6.70746
[TRAIN] Epoch: 0, Iter: 160, Loss: 6.64843
[TRAIN] Epoch: 0, Iter: 170, Loss: 6.78570
[TRAIN] Epoch: 0, Iter: 180, Loss: 6.72047
[TRAIN] Epoch: 0, Iter: 190, Loss: 6.65904
[TRAIN] Epoch: 0, Iter: 200, Loss: 6.53216
[TRAIN] Epoch: 0, It

[TRAIN] Epoch: 2, Iter: 560, Loss: 5.43663
[TRAIN] Epoch: 2, Iter: 570, Loss: 5.14930
== [TRAIN] Epoch: 2, Perplexity: 211.882 ==>
[VAL] Epoch: 2, Iter: 0, Loss: 5.26923
[VAL] Epoch: 2, Iter: 10, Loss: 5.38992
[VAL] Epoch: 2, Iter: 20, Loss: 5.54671
[VAL] Epoch: 2, Iter: 30, Loss: 5.49916
[VAL] Epoch: 2, Iter: 40, Loss: 5.24185
[VAL] Epoch: 2, Iter: 50, Loss: 5.17168
=== [VAL] Epoch: 2, Iter: 59, Perplexity: 208.056 ===>
[TRAIN] Epoch: 3, Iter: 0, Loss: 5.33706
[TRAIN] Epoch: 3, Iter: 10, Loss: 5.23129
[TRAIN] Epoch: 3, Iter: 20, Loss: 5.04408
[TRAIN] Epoch: 3, Iter: 30, Loss: 5.40912
[TRAIN] Epoch: 3, Iter: 40, Loss: 5.16361
[TRAIN] Epoch: 3, Iter: 50, Loss: 5.01146
[TRAIN] Epoch: 3, Iter: 60, Loss: 5.38484
[TRAIN] Epoch: 3, Iter: 70, Loss: 5.11159
[TRAIN] Epoch: 3, Iter: 80, Loss: 5.13397
[TRAIN] Epoch: 3, Iter: 90, Loss: 5.24537
[TRAIN] Epoch: 3, Iter: 100, Loss: 4.93726
[TRAIN] Epoch: 3, Iter: 110, Loss: 5.18550
[TRAIN] Epoch: 3, Iter: 120, Loss: 5.12564
[TRAIN] Epoch: 3, Iter: 130

[TRAIN] Epoch: 5, Iter: 490, Loss: 4.62626
[TRAIN] Epoch: 5, Iter: 500, Loss: 4.83478
[TRAIN] Epoch: 5, Iter: 510, Loss: 4.87435
[TRAIN] Epoch: 5, Iter: 520, Loss: 4.90751
[TRAIN] Epoch: 5, Iter: 530, Loss: 4.75859
[TRAIN] Epoch: 5, Iter: 540, Loss: 4.77661
[TRAIN] Epoch: 5, Iter: 550, Loss: 4.63683
[TRAIN] Epoch: 5, Iter: 560, Loss: 4.77397
[TRAIN] Epoch: 5, Iter: 570, Loss: 4.83596
== [TRAIN] Epoch: 5, Perplexity: 121.331 ==>
[VAL] Epoch: 5, Iter: 0, Loss: 4.96142
[VAL] Epoch: 5, Iter: 10, Loss: 5.13907
[VAL] Epoch: 5, Iter: 20, Loss: 5.23252
[VAL] Epoch: 5, Iter: 30, Loss: 5.27534
[VAL] Epoch: 5, Iter: 40, Loss: 4.97841
[VAL] Epoch: 5, Iter: 50, Loss: 4.88363
=== [VAL] Epoch: 5, Iter: 59, Perplexity: 158.030 ===>
[TRAIN] Epoch: 6, Iter: 0, Loss: 4.76083
[TRAIN] Epoch: 6, Iter: 10, Loss: 4.55351
[TRAIN] Epoch: 6, Iter: 20, Loss: 4.42005
[TRAIN] Epoch: 6, Iter: 30, Loss: 4.69695
[TRAIN] Epoch: 6, Iter: 40, Loss: 4.88025
[TRAIN] Epoch: 6, Iter: 50, Loss: 4.52526
[TRAIN] Epoch: 6, Iter:

[TRAIN] Epoch: 8, Iter: 420, Loss: 4.44021
[TRAIN] Epoch: 8, Iter: 430, Loss: 4.35516
[TRAIN] Epoch: 8, Iter: 440, Loss: 4.57577
[TRAIN] Epoch: 8, Iter: 450, Loss: 4.53196
[TRAIN] Epoch: 8, Iter: 460, Loss: 4.54669
[TRAIN] Epoch: 8, Iter: 470, Loss: 4.55893
[TRAIN] Epoch: 8, Iter: 480, Loss: 4.63953
[TRAIN] Epoch: 8, Iter: 490, Loss: 4.43696
[TRAIN] Epoch: 8, Iter: 500, Loss: 4.44419
[TRAIN] Epoch: 8, Iter: 510, Loss: 4.62342
[TRAIN] Epoch: 8, Iter: 520, Loss: 4.46780
[TRAIN] Epoch: 8, Iter: 530, Loss: 4.53557
[TRAIN] Epoch: 8, Iter: 540, Loss: 4.52821
[TRAIN] Epoch: 8, Iter: 550, Loss: 4.56924
[TRAIN] Epoch: 8, Iter: 560, Loss: 4.40738
[TRAIN] Epoch: 8, Iter: 570, Loss: 4.51618
== [TRAIN] Epoch: 8, Perplexity: 88.479 ==>
[VAL] Epoch: 8, Iter: 0, Loss: 4.82449
[VAL] Epoch: 8, Iter: 10, Loss: 5.04453
[VAL] Epoch: 8, Iter: 20, Loss: 5.14719
[VAL] Epoch: 8, Iter: 30, Loss: 5.21139
[VAL] Epoch: 8, Iter: 40, Loss: 4.90471
[VAL] Epoch: 8, Iter: 50, Loss: 4.79869
=== [VAL] Epoch: 8, Iter: 59,

In [5]:
!python run_exp.py --model='gpt1' --layers=1 --batch_size=16 --epochs=10 --optimizer='adamw' --exp_id='debug' --seed=1023

Initialized GPT1 model with 38372352 total parameters, of which 7087872 are learnable.args are: Namespace(batch_size=16, data_folder='./data', device='cuda', embeddings='./data/embeddings.npz', epochs=10, exp_id='debug', layers=1, log=False, log_dir='logs', lr=0.001, model='gpt1', momentum=0.9, num_workers=2, optimizer='adamw', print_every=10, progress_bar=False, seed=1023, weight_decay=0.0005)
[TRAIN] Epoch: 0, Iter: 0, Loss: 46.82186
[TRAIN] Epoch: 0, Iter: 10, Loss: 8.78794
[TRAIN] Epoch: 0, Iter: 20, Loss: 8.34682
[TRAIN] Epoch: 0, Iter: 30, Loss: 8.05178
[TRAIN] Epoch: 0, Iter: 40, Loss: 7.87844
[TRAIN] Epoch: 0, Iter: 50, Loss: 7.76828
[TRAIN] Epoch: 0, Iter: 60, Loss: 7.75323
[TRAIN] Epoch: 0, Iter: 70, Loss: 7.56756
[TRAIN] Epoch: 0, Iter: 80, Loss: 7.78600
[TRAIN] Epoch: 0, Iter: 90, Loss: 7.54513
[TRAIN] Epoch: 0, Iter: 100, Loss: 7.61290
[TRAIN] Epoch: 0, Iter: 110, Loss: 7.33569
[TRAIN] Epoch: 0, Iter: 120, Loss: 7.25536
[TRAIN] Epoch: 0, Iter: 130, Loss: 7.24319
[TRAIN] Ep

[TRAIN] Epoch: 2, Iter: 470, Loss: 4.67787
[TRAIN] Epoch: 2, Iter: 480, Loss: 4.86989
[TRAIN] Epoch: 2, Iter: 490, Loss: 4.72952
[TRAIN] Epoch: 2, Iter: 500, Loss: 4.88216
[TRAIN] Epoch: 2, Iter: 510, Loss: 4.95346
[TRAIN] Epoch: 2, Iter: 520, Loss: 4.97153
[TRAIN] Epoch: 2, Iter: 530, Loss: 4.98050
[TRAIN] Epoch: 2, Iter: 540, Loss: 4.76043
[TRAIN] Epoch: 2, Iter: 550, Loss: 4.77585
[TRAIN] Epoch: 2, Iter: 560, Loss: 4.96940
[TRAIN] Epoch: 2, Iter: 570, Loss: 4.87598
== [TRAIN] Epoch: 2, Perplexity: 141.035 ==>
Before scheduler loss is: 4.949008449869224
[VAL] Epoch: 2, Iter: 0, Loss: 5.01061
[VAL] Epoch: 2, Iter: 10, Loss: 5.11378
[VAL] Epoch: 2, Iter: 20, Loss: 5.25649
[VAL] Epoch: 2, Iter: 30, Loss: 5.29789
[VAL] Epoch: 2, Iter: 40, Loss: 5.04418
[VAL] Epoch: 2, Iter: 50, Loss: 4.92253
=== [VAL] Epoch: 2, Iter: 59, Perplexity: 163.483 ===>
[TRAIN] Epoch: 3, Iter: 0, Loss: 4.70530
[TRAIN] Epoch: 3, Iter: 10, Loss: 4.83419
[TRAIN] Epoch: 3, Iter: 20, Loss: 4.78206
[TRAIN] Epoch: 3, I

[TRAIN] Epoch: 5, Iter: 360, Loss: 4.25611
[TRAIN] Epoch: 5, Iter: 370, Loss: 4.19754
[TRAIN] Epoch: 5, Iter: 380, Loss: 4.14839
[TRAIN] Epoch: 5, Iter: 390, Loss: 4.41251
[TRAIN] Epoch: 5, Iter: 400, Loss: 4.35064
[TRAIN] Epoch: 5, Iter: 410, Loss: 4.14501
[TRAIN] Epoch: 5, Iter: 420, Loss: 4.20295
[TRAIN] Epoch: 5, Iter: 430, Loss: 4.07626
[TRAIN] Epoch: 5, Iter: 440, Loss: 4.44517
[TRAIN] Epoch: 5, Iter: 450, Loss: 4.41573
[TRAIN] Epoch: 5, Iter: 460, Loss: 4.28294
[TRAIN] Epoch: 5, Iter: 470, Loss: 4.23762
[TRAIN] Epoch: 5, Iter: 480, Loss: 4.57286
[TRAIN] Epoch: 5, Iter: 490, Loss: 4.28980
[TRAIN] Epoch: 5, Iter: 500, Loss: 4.18241
[TRAIN] Epoch: 5, Iter: 510, Loss: 4.13809
[TRAIN] Epoch: 5, Iter: 520, Loss: 4.18911
[TRAIN] Epoch: 5, Iter: 530, Loss: 4.07259
[TRAIN] Epoch: 5, Iter: 540, Loss: 4.40731
[TRAIN] Epoch: 5, Iter: 550, Loss: 4.33334
[TRAIN] Epoch: 5, Iter: 560, Loss: 4.40086
[TRAIN] Epoch: 5, Iter: 570, Loss: 4.49303
== [TRAIN] Epoch: 5, Perplexity: 69.748 ==>
Before sch

[TRAIN] Epoch: 8, Iter: 260, Loss: 3.84803
[TRAIN] Epoch: 8, Iter: 270, Loss: 4.10501
[TRAIN] Epoch: 8, Iter: 280, Loss: 3.93836
[TRAIN] Epoch: 8, Iter: 290, Loss: 3.88387
[TRAIN] Epoch: 8, Iter: 300, Loss: 3.41988
[TRAIN] Epoch: 8, Iter: 310, Loss: 3.77794
[TRAIN] Epoch: 8, Iter: 320, Loss: 3.96793
[TRAIN] Epoch: 8, Iter: 330, Loss: 4.02576
[TRAIN] Epoch: 8, Iter: 340, Loss: 3.75260
[TRAIN] Epoch: 8, Iter: 350, Loss: 3.72587
[TRAIN] Epoch: 8, Iter: 360, Loss: 3.92819
[TRAIN] Epoch: 8, Iter: 370, Loss: 3.75709
[TRAIN] Epoch: 8, Iter: 380, Loss: 3.87826
[TRAIN] Epoch: 8, Iter: 390, Loss: 3.78302
[TRAIN] Epoch: 8, Iter: 400, Loss: 3.75932
[TRAIN] Epoch: 8, Iter: 410, Loss: 3.76144
[TRAIN] Epoch: 8, Iter: 420, Loss: 3.93945
[TRAIN] Epoch: 8, Iter: 430, Loss: 4.01685
[TRAIN] Epoch: 8, Iter: 440, Loss: 4.08608
[TRAIN] Epoch: 8, Iter: 450, Loss: 3.91574
[TRAIN] Epoch: 8, Iter: 460, Loss: 3.97182
[TRAIN] Epoch: 8, Iter: 470, Loss: 3.59409
[TRAIN] Epoch: 8, Iter: 480, Loss: 3.90689
[TRAIN] Epo

In [14]:
!python run_exp.py --model='gpt1' --layers=1 --batch_size=16 --epochs=10 --optimizer='adamw' --exp_id='14' --seed=1023

Initialized GPT1 model with 38372352 total parameters, of which 7087872 are learnable.args are: Namespace(batch_size=16, data_folder='./data', device='cuda', embeddings='./data/embeddings.npz', epochs=10, exp_id='14', layers=1, log=False, log_dir='logs', lr=0.001, model='gpt1', momentum=0.9, num_workers=2, optimizer='adamw', print_every=10, progress_bar=False, seed=1023, weight_decay=0.0005)
[TRAIN] Epoch: 0, Iter: 0, Loss: 46.82186
[TRAIN] Epoch: 0, Iter: 10, Loss: 8.78794
[TRAIN] Epoch: 0, Iter: 20, Loss: 8.34682
[TRAIN] Epoch: 0, Iter: 30, Loss: 8.05178
[TRAIN] Epoch: 0, Iter: 40, Loss: 7.87844
[TRAIN] Epoch: 0, Iter: 50, Loss: 7.76828
[TRAIN] Epoch: 0, Iter: 60, Loss: 7.75323
[TRAIN] Epoch: 0, Iter: 70, Loss: 7.56756
[TRAIN] Epoch: 0, Iter: 80, Loss: 7.78600
[TRAIN] Epoch: 0, Iter: 90, Loss: 7.54513
[TRAIN] Epoch: 0, Iter: 100, Loss: 7.61290
[TRAIN] Epoch: 0, Iter: 110, Loss: 7.33569
[TRAIN] Epoch: 0, Iter: 120, Loss: 7.25536
[TRAIN] Epoch: 0, Iter: 130, Loss: 7.24319
[TRAIN] Epoch

[TRAIN] Epoch: 2, Iter: 470, Loss: 4.67787
[TRAIN] Epoch: 2, Iter: 480, Loss: 4.86989
[TRAIN] Epoch: 2, Iter: 490, Loss: 4.72952
[TRAIN] Epoch: 2, Iter: 500, Loss: 4.88216
[TRAIN] Epoch: 2, Iter: 510, Loss: 4.95346
[TRAIN] Epoch: 2, Iter: 520, Loss: 4.97153
[TRAIN] Epoch: 2, Iter: 530, Loss: 4.98050
[TRAIN] Epoch: 2, Iter: 540, Loss: 4.76043
[TRAIN] Epoch: 2, Iter: 550, Loss: 4.77585
[TRAIN] Epoch: 2, Iter: 560, Loss: 4.96940
[TRAIN] Epoch: 2, Iter: 570, Loss: 4.87598
== [TRAIN] Epoch: 2, Perplexity: 141.035 ==>
Before scheduler loss is: 4.949008449869224
[VAL] Epoch: 2, Iter: 0, Loss: 5.01061
[VAL] Epoch: 2, Iter: 10, Loss: 5.11378
[VAL] Epoch: 2, Iter: 20, Loss: 5.25649
[VAL] Epoch: 2, Iter: 30, Loss: 5.29789
[VAL] Epoch: 2, Iter: 40, Loss: 5.04418
[VAL] Epoch: 2, Iter: 50, Loss: 4.92253
=== [VAL] Epoch: 2, Iter: 59, Perplexity: 163.483 ===>
[TRAIN] Epoch: 3, Iter: 0, Loss: 4.70530
[TRAIN] Epoch: 3, Iter: 10, Loss: 4.83419
[TRAIN] Epoch: 3, Iter: 20, Loss: 4.78206
[TRAIN] Epoch: 3, I

[TRAIN] Epoch: 5, Iter: 360, Loss: 4.08648
[TRAIN] Epoch: 5, Iter: 370, Loss: 4.02147
[TRAIN] Epoch: 5, Iter: 380, Loss: 3.99231
[TRAIN] Epoch: 5, Iter: 390, Loss: 4.25779
[TRAIN] Epoch: 5, Iter: 400, Loss: 4.16451
[TRAIN] Epoch: 5, Iter: 410, Loss: 3.97354
[TRAIN] Epoch: 5, Iter: 420, Loss: 4.05477
[TRAIN] Epoch: 5, Iter: 430, Loss: 3.91704
[TRAIN] Epoch: 5, Iter: 440, Loss: 4.28636
[TRAIN] Epoch: 5, Iter: 450, Loss: 4.27324
[TRAIN] Epoch: 5, Iter: 460, Loss: 4.12977
[TRAIN] Epoch: 5, Iter: 470, Loss: 4.07615
[TRAIN] Epoch: 5, Iter: 480, Loss: 4.43358
[TRAIN] Epoch: 5, Iter: 490, Loss: 4.13068
[TRAIN] Epoch: 5, Iter: 500, Loss: 4.01960
[TRAIN] Epoch: 5, Iter: 510, Loss: 3.98169
[TRAIN] Epoch: 5, Iter: 520, Loss: 4.00735
[TRAIN] Epoch: 5, Iter: 530, Loss: 3.91017
[TRAIN] Epoch: 5, Iter: 540, Loss: 4.24141
[TRAIN] Epoch: 5, Iter: 550, Loss: 4.17427
[TRAIN] Epoch: 5, Iter: 560, Loss: 4.23285
[TRAIN] Epoch: 5, Iter: 570, Loss: 4.33557
== [TRAIN] Epoch: 5, Perplexity: 60.502 ==>
Before sch

[TRAIN] Epoch: 8, Iter: 260, Loss: 3.67517
[TRAIN] Epoch: 8, Iter: 270, Loss: 3.91416
[TRAIN] Epoch: 8, Iter: 280, Loss: 3.76780
[TRAIN] Epoch: 8, Iter: 290, Loss: 3.68484
[TRAIN] Epoch: 8, Iter: 300, Loss: 3.25367
[TRAIN] Epoch: 8, Iter: 310, Loss: 3.61306
[TRAIN] Epoch: 8, Iter: 320, Loss: 3.75710
[TRAIN] Epoch: 8, Iter: 330, Loss: 3.83169
[TRAIN] Epoch: 8, Iter: 340, Loss: 3.57013
[TRAIN] Epoch: 8, Iter: 350, Loss: 3.55956
[TRAIN] Epoch: 8, Iter: 360, Loss: 3.75187
[TRAIN] Epoch: 8, Iter: 370, Loss: 3.59154
[TRAIN] Epoch: 8, Iter: 380, Loss: 3.64999
[TRAIN] Epoch: 8, Iter: 390, Loss: 3.59570
[TRAIN] Epoch: 8, Iter: 400, Loss: 3.57535
[TRAIN] Epoch: 8, Iter: 410, Loss: 3.59383
[TRAIN] Epoch: 8, Iter: 420, Loss: 3.76861
[TRAIN] Epoch: 8, Iter: 430, Loss: 3.82177
[TRAIN] Epoch: 8, Iter: 440, Loss: 3.89195
[TRAIN] Epoch: 8, Iter: 450, Loss: 3.70080
[TRAIN] Epoch: 8, Iter: 460, Loss: 3.75740
[TRAIN] Epoch: 8, Iter: 470, Loss: 3.39137
[TRAIN] Epoch: 8, Iter: 480, Loss: 3.69711
[TRAIN] Epo