In [1]:
import pandas as pd
import time

# Project-local modules
from configs.config import DatasetConfig, HP
from data.DataLoader import build_dataloader
from utils.Errors import loss_estimation
from Procedures import Procedure
from model.lm import LanguageModel
from torch.utils.tensorboard import SummaryWriter

# Instantiate the project configuration objects used throughout the notebook:
# ds_config supplies the data paths and loader settings read by the
# build_dataloader cells, and hp is handed to LanguageModel further down.
ds_config = DatasetConfig()
hp = HP()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import preprocess_data as preprocess

In [None]:
# Run the project's preprocessing routine on the training and validation CSVs.
# NOTE(review): what preprocess() writes or returns is not visible from this
# notebook; presumably it prepares the files consumed by the dataloaders - confirm.
preprocess.preprocess("./data/processed/train.min.csv")
preprocess.preprocess("./data/processed/valid.min.csv")

## Build DataLoaders

In [2]:
# Instantiate the training dataloader. No vocabulary is passed in (vocab=None);
# the vocabulary returned here is the one handed to the validation and test
# loaders in the following cells.
train_iter, vocab, _ = build_dataloader(
    # --- data source / mode ---
    file_path=ds_config.train_data,
    is_train=True,
    refs_path=None,
    # --- vocabulary settings ---
    vocab=None,
    vocab_size=ds_config.vocab_size,
    vocab_min_freq=ds_config.min_freq,
    # --- size limits ---
    max_num_reviews=ds_config.max_num_reviews,
    max_len_rev=ds_config.max_len_rev,
    # --- batching / loader plumbing ---
    batch_size=ds_config.batch_size,
    shuffle_batch=False,  # NOTE(review): training batches are not shuffled - confirm this is intended
    num_workers=ds_config.workers,
    pin_memory=ds_config.pin_memory,
    device=ds_config.device,
)

# Length of the training loader, shown as the cell output.
train_size = len(train_iter)
train_size

Loading data: 100%|█████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 691.13item/s]
Build vocabulary: 100%|███████████████████████████████████████████████████████████| 3218/3218 [00:31<00:00, 103.54it/s]

Vocabulary size: 11310





433

In [3]:
# Validation dataloader: reuses the vocabulary returned by the training loader
# and keeps the third return value as valid_references.
valid_iter, _, valid_references = build_dataloader(
    # --- data source / mode ---
    file_path=ds_config.valid_data,
    is_train=False,
    refs_path=None,
    # --- vocabulary settings ---
    vocab=vocab,
    vocab_size=ds_config.vocab_size,
    vocab_min_freq=ds_config.min_freq,
    # --- size limits ---
    max_num_reviews=ds_config.max_num_reviews,
    max_len_rev=ds_config.max_len_rev,
    # --- batching / loader plumbing ---
    batch_size=ds_config.batch_size,
    shuffle_batch=False,
    num_workers=ds_config.workers,
    pin_memory=ds_config.pin_memory,
    device=ds_config.device,
)

# Length of the validation loader, shown as the cell output.
valid_size = len(valid_iter)
valid_size

Loading data: 100%|█████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 354.40item/s]


85

In [4]:
# Test dataloader: reuses the vocabulary returned by the training loader
# and keeps the third return value as test_references.
test_iter, _, test_references = build_dataloader(
    # --- data source / mode ---
    file_path=ds_config.test_data,
    is_train=False,
    refs_path=None,
    # --- vocabulary settings ---
    vocab=vocab,
    vocab_size=ds_config.vocab_size,
    vocab_min_freq=ds_config.min_freq,
    # --- size limits ---
    max_num_reviews=ds_config.max_num_reviews,
    max_len_rev=ds_config.max_len_rev,
    # --- batching / loader plumbing ---
    batch_size=ds_config.batch_size,
    shuffle_batch=False,
    num_workers=ds_config.workers,
    pin_memory=ds_config.pin_memory,
    device=ds_config.device,
)

# Length of the test loader, shown as the cell output.
test_size = len(test_iter)
test_size

Loading data: 100%|█████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 947.22item/s]


20

## Model Definition

In [5]:
import torch
import torch.nn as nn
import random
import math

In [6]:
from utils.model_utils import postprocess, is_special, clean_up_tokenization

In [7]:
class Encoder(nn.Module):
    """Single-layer GRU encoder that reduces a token sequence to its final hidden state.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: width of each token embedding.
        hid_dim: width of the GRU hidden state.
        dropout: dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # The GRU has a single layer, so nn.GRU's own (inter-layer) dropout is left off.
        self.rnn = nn.GRU(emb_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode `src` and return only the GRU's final hidden state.

        Args:
            src: token ids, [src len, batch size].

        Returns:
            Final hidden state, [n layers * n directions, batch size, hid dim].
        """
        # [src len, batch size] -> [src len, batch size, emb dim]
        token_vectors = self.embedding(src)
        rnn_input = self.dropout(token_vectors)
        # nn.GRU returns (per-step outputs, final hidden); a GRU has no cell state.
        # The per-step outputs are not needed by the decoder and are discarded.
        _, final_hidden = self.rnn(rnn_input)
        return final_hidden

In [8]:
class Decoder(nn.Module):
    """Single-step GRU decoder that sees the encoder context at every step.

    The context vector is concatenated to the token embedding before the GRU
    and again (with the embedding and the new hidden state) before the output
    projection.

    Args:
        output_dim: target vocabulary size (embedding rows and logit width).
        emb_dim: width of each token embedding.
        hid_dim: width of the GRU hidden state (and of the context vector).
        dropout: dropout probability applied to the embedded token.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # GRU input is [embedding ; context].
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
        # Projection input is [embedding ; hidden ; context].
        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_, hidden, context):
        """Decode one time step.

        Args:
            input_: current token ids, [batch size].
            hidden: previous decoder hidden state, [1, batch size, hid dim].
            context: encoder context, [1, batch size, hid dim].

        Returns:
            (prediction, hidden): logits [batch size, output dim] and the
            updated hidden state [1, batch size, hid dim].
        """
        # Add a sequence axis of length 1: [batch size] -> [1, batch size]
        step_tokens = input_.unsqueeze(0)
        # [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(step_tokens))
        # [1, batch size, emb dim + hid dim]
        rnn_input = torch.cat((embedded, context), dim=2)
        # With one layer, one direction and one step, the GRU's per-step output
        # carries the same values as the new hidden state, so only the latter is kept.
        _, hidden = self.rnn(rnn_input, hidden)
        # Drop the leading length-1 axis of each piece and join along features:
        # [batch size, emb dim + hid dim * 2]
        features = torch.cat(
            [piece.squeeze(0) for piece in (embedded, hidden, context)], dim=1
        )
        # [batch size, output dim]
        return self.fc_out(features), hidden

In [9]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        encoder: module whose call returns the context / initial hidden state;
            must expose `hid_dim`.
        decoder: module called as `decoder(input_, hidden, context)` returning
            `(prediction, hidden)`; must expose `hid_dim` and `output_dim`.
        device: device on which the output buffer is allocated.

    Raises:
        AssertionError: if encoder and decoder hidden sizes differ.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Produce per-step logits for `trg` given `src`.

        Args:
            src: source token ids, [src len, batch size].
            trg: target token ids, [trg len, batch size]; row 0 holds the
                start-of-sequence tokens fed to the first decoding step.
            teacher_forcing_ratio: probability of feeding the ground-truth
                token (instead of the model's own prediction) as next input,
                e.g. 0.75 uses ground truth 75% of the time.

        Returns:
            Tensor [trg len, batch size, output dim]; row 0 is left as zeros
            because no prediction is made for the start token.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate the result buffer directly on the target device rather than
        # building it on the CPU and copying it over.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

        # The encoder's final hidden state is both the fixed context and the
        # decoder's initial hidden state.
        context = self.encoder(src)
        hidden = context

        # First decoder input: the <sos> row of the target.
        input_ = trg[0, :]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input_, hidden, context)
            outputs[t] = output
            # One random draw per step, exactly as before, so seeded runs
            # consume the RNG identically.
            teacher_force = random.random() < teacher_forcing_ratio
            # Ground truth when teacher forcing; otherwise the greedy prediction
            # (argmax is only computed when it is actually used).
            input_ = trg[t] if teacher_force else output.argmax(1)

        return outputs

In [10]:
def train(model, iterator, optimizer, criterion, clip, check):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module called as `model(src, trg)`; expected to return logits
            of shape [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing an `enc_input` tensor
            (assumed [batch size, seq len] - TODO confirm against the loader).
        optimizer: optimizer over `model`'s parameters.
        criterion: loss taking (logits [N, output dim], targets [N]).
        clip: maximum gradient norm passed to `clip_grad_norm_`.
        check: when true, decoded samples are printed for batches
            0, 5, ..., 25 (relies on the notebook-level `inference` and `vocab`).

    Returns:
        Sum of the batch losses divided by `len(iterator)`.
    """
    model.train()

    # Place batches on whatever device the model lives on instead of
    # hard-coding CUDA, so the loop also runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    epoch_loss = 0
    for i, batch in enumerate(iterator):
        # [batch size, seq len] -> [seq len, batch size]; transfer once.
        src = batch.enc_input.permute(1, 0).contiguous().to(device)
        # NOTE(review): the target is the same sequence as the source
        # (both come from batch.enc_input) - confirm this is the intended objective.
        trg = src.clone()

        optimizer.zero_grad()

        output = model(src, trg)
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]

        if check and i % 5 == 0 and i <= 25:
            reviews = inference(output, vocab)
            # Show at most the first six decoded sequences of this batch.
            for j, r in enumerate(reviews[:6]):
                print(f"[batch:{i}, itr: {j}]", r)
            print("-"*100)

        output_dim = output.shape[-1]

        # Drop step 0 (no prediction is made for the start token) and flatten
        # so the criterion sees one row per (step, sample) pair.
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [11]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over `iterator` without updating the model.

    Args:
        model: module called as `model(src, trg, 0)`; expected to return logits
            of shape [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing an `enc_input` tensor
            (assumed [batch size, seq len] - TODO confirm against the loader).
        criterion: loss taking (logits [N, output dim], targets [N]).

    Returns:
        Sum of the batch losses divided by `len(iterator)`.
    """
    model.eval()

    # Place batches on whatever device the model lives on instead of
    # hard-coding CUDA, so evaluation also runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            # [batch size, seq len] -> [seq len, batch size]; transfer once.
            src = batch.enc_input.permute(1, 0).contiguous().to(device)
            # NOTE(review): the target is the same sequence as the source
            # (both come from batch.enc_input) - confirm this is intended.
            trg = src.clone()

            # Third argument 0 turns teacher forcing off: the model is fed
            # its own predictions.
            output = model(src, trg, 0)
            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]

            # Drop step 0 (no prediction for the start token) and flatten.
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [12]:
def inference(final_dists, vocab):
    """Greedily decode per-step score tensors into post-processed review strings.

    Args:
        final_dists: scores laid out as [seq_len, batch_size, vocab]
            (the layout produced by the model's forward pass in this notebook).
        vocab: vocabulary object providing `outputids2words`.

    Returns:
        List with one post-processed review per batch element.
    """
    # [seq_len, batch_size, vocab] -> [batch_size, vocab, seq_len], then pick
    # the best-scoring vocabulary entry per step: [batch_size, seq_len].
    token_ids = final_dists.permute(1, 2, 0).argmax(1)

    # First map every hypothesis to words, then post-process each word list.
    word_lists = [vocab.outputids2words(hyp) for hyp in token_ids]
    reviews = []
    for words in word_lists:
        reviews.append(
            postprocess(words, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        )
    return reviews

In [13]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: interval start, in seconds (e.g. from `time.time()`).
        end_time: interval end, in seconds.

    Returns:
        Tuple `(minutes, seconds)`, both truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [14]:
def init_weights(m):
    """Re-initialise every parameter of `m` in place from N(0, 0.01^2).

    Intended for `model.apply(init_weights)`; since `apply` visits every
    submodule, parameters of nested modules are redrawn more than once, and
    the final values still follow the same distribution.
    """
    for weight in m.parameters():
        nn.init.normal_(weight.data, mean=0, std=0.01)

In [36]:
# Reload edited project modules automatically before each cell runs.
# NOTE(review): this cell carries execution count In [36], i.e. it was run out
# of order; it belongs in the configuration cell at the top of the notebook.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Experiment 1: train the Seq2Seq model built from the Encoder/Decoder classes
# defined in this notebook.
import torch.optim as optim

# ---- Model hyper-parameters ----
INPUT_DIM = len(vocab)
OUTPUT_DIM = len(vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)
model.apply(init_weights)

optimizer = optim.Adam(model.parameters())

# Padding positions are excluded from the loss.
TRG_PAD_IDX = vocab.pad()
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

# ---- Training schedule ----
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')
check = False
for epoch in range(N_EPOCHS):
    # Print decoded samples only during the final epoch. This is derived from
    # N_EPOCHS instead of the former hard-coded `epoch+2 == 10`, so it stays
    # correct when the epoch count changes.
    check = (epoch == N_EPOCHS - 1)

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP, check)
    valid_loss = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep the best validation loss seen so far (previously initialised but
    # never updated). The recorded run shows validation loss rising while the
    # training loss falls, so the best value is not the last one.
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 1m 36s
	Train Loss: 6.392 | Train PPL: 597.161
	 Val. Loss: 6.610 |  Val. PPL: 742.371
Epoch: 02 | Time: 1m 40s
	Train Loss: 5.661 | Train PPL: 287.531
	 Val. Loss: 6.697 |  Val. PPL: 810.029
Epoch: 03 | Time: 1m 38s
	Train Loss: 5.436 | Train PPL: 229.634
	 Val. Loss: 6.872 |  Val. PPL: 965.131
Epoch: 04 | Time: 1m 38s
	Train Loss: 5.276 | Train PPL: 195.599
	 Val. Loss: 7.000 |  Val. PPL: 1096.433
Epoch: 05 | Time: 1m 39s
	Train Loss: 5.142 | Train PPL: 171.009
	 Val. Loss: 7.239 |  Val. PPL: 1392.864
Epoch: 06 | Time: 1m 35s
	Train Loss: 5.012 | Train PPL: 150.279
	 Val. Loss: 7.261 |  Val. PPL: 1423.819
Epoch: 07 | Time: 1m 35s
	Train Loss: 4.871 | Train PPL: 130.459
	 Val. Loss: 7.321 |  Val. PPL: 1511.658
Epoch: 08 | Time: 1m 35s
	Train Loss: 4.729 | Train PPL: 113.182
	 Val. Loss: 7.224 |  Val. PPL: 1372.635
Epoch: 09 | Time: 1m 36s
	Train Loss: 4.581 | Train PPL:  97.633
	 Val. Loss: 7.374 |  Val. PPL: 1593.334
[batch:0, itr: 0] i bought this product it it it 

In [18]:
# Experiment 2: same training run, but with the project's Encoder/Decoder.
# These imports rebind the names Encoder and Decoder, shadowing the classes
# defined earlier in this notebook for every cell executed afterwards.
from model.encoder import Encoder
from model.decoder import Decoder

import torch.optim as optim

# ---- Model hyper-parameters ----
INPUT_DIM = len(vocab)
OUTPUT_DIM = len(vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)
model.apply(init_weights)

optimizer = optim.Adam(model.parameters())

# Padding positions are excluded from the loss.
TRG_PAD_IDX = vocab.pad()
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

# ---- Training schedule ----
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')
check = False
for epoch in range(N_EPOCHS):
    # Print decoded samples only during the final epoch. This is derived from
    # N_EPOCHS instead of the former hard-coded `epoch+2 == 10`, so it stays
    # correct when the epoch count changes.
    check = (epoch == N_EPOCHS - 1)

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP, check)
    valid_loss = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep the best validation loss seen so far (previously initialised but
    # never updated). The recorded run shows validation loss rising while the
    # training loss falls, so the best value is not the last one.
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 1m 35s
	Train Loss: 6.337 | Train PPL: 565.305
	 Val. Loss: 6.656 |  Val. PPL: 777.160
Epoch: 02 | Time: 1m 38s
	Train Loss: 5.652 | Train PPL: 284.807
	 Val. Loss: 6.696 |  Val. PPL: 809.394
Epoch: 03 | Time: 1m 39s
	Train Loss: 5.422 | Train PPL: 226.270
	 Val. Loss: 6.846 |  Val. PPL: 940.292
Epoch: 04 | Time: 1m 39s
	Train Loss: 5.267 | Train PPL: 193.858
	 Val. Loss: 7.005 |  Val. PPL: 1102.024
Epoch: 05 | Time: 1m 39s
	Train Loss: 5.122 | Train PPL: 167.654
	 Val. Loss: 7.167 |  Val. PPL: 1295.786
Epoch: 06 | Time: 1m 39s
	Train Loss: 4.991 | Train PPL: 147.067
	 Val. Loss: 7.214 |  Val. PPL: 1357.788
Epoch: 07 | Time: 1m 39s
	Train Loss: 4.835 | Train PPL: 125.872
	 Val. Loss: 7.323 |  Val. PPL: 1514.604
Epoch: 08 | Time: 1m 39s
	Train Loss: 4.685 | Train PPL: 108.336
	 Val. Loss: 7.262 |  Val. PPL: 1425.588
Epoch: 09 | Time: 1m 39s
	Train Loss: 4.511 | Train PPL:  91.017
	 Val. Loss: 7.269 |  Val. PPL: 1435.282
[batch:0, itr: 0] i bought this husband and it it

In [19]:
# Experiment 3: same training run with the project's LanguageModel.
from model.lm import LanguageModel

import torch.optim as optim

# NOTE(review): these dimension/dropout constants are not passed to
# LanguageModel (it receives `vocab` and `hp` only); they are kept so the
# notebook namespace stays the same as after the previous experiments.
INPUT_DIM = len(vocab)
OUTPUT_DIM = len(vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LanguageModel(vocab, hp=hp).to(device)
model.apply(init_weights)

optimizer = optim.Adam(model.parameters())

# Padding positions are excluded from the loss.
TRG_PAD_IDX = vocab.pad()
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

# ---- Training schedule ----
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')
check = False
for epoch in range(N_EPOCHS):
    # Print decoded samples only during the final epoch. This is derived from
    # N_EPOCHS instead of the former hard-coded `epoch+2 == 10`, so it stays
    # correct when the epoch count changes.
    check = (epoch == N_EPOCHS - 1)

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP, check)
    valid_loss = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep the best validation loss seen so far (previously initialised but
    # never updated). The recorded run shows validation loss rising while the
    # training loss falls, so the best value is not the last one.
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 1m 38s
	Train Loss: 6.382 | Train PPL: 591.261
	 Val. Loss: 6.577 |  Val. PPL: 718.542
Epoch: 02 | Time: 1m 38s
	Train Loss: 5.643 | Train PPL: 282.171
	 Val. Loss: 6.720 |  Val. PPL: 828.858
Epoch: 03 | Time: 1m 39s
	Train Loss: 5.422 | Train PPL: 226.229
	 Val. Loss: 6.888 |  Val. PPL: 980.236
Epoch: 04 | Time: 1m 38s
	Train Loss: 5.260 | Train PPL: 192.502
	 Val. Loss: 6.982 |  Val. PPL: 1077.446
Epoch: 05 | Time: 1m 39s
	Train Loss: 5.114 | Train PPL: 166.350
	 Val. Loss: 7.257 |  Val. PPL: 1418.249
Epoch: 06 | Time: 1m 38s
	Train Loss: 4.970 | Train PPL: 144.075
	 Val. Loss: 7.187 |  Val. PPL: 1322.012
Epoch: 07 | Time: 1m 38s
	Train Loss: 4.832 | Train PPL: 125.406
	 Val. Loss: 7.251 |  Val. PPL: 1409.351
Epoch: 08 | Time: 1m 39s
	Train Loss: 4.695 | Train PPL: 109.361
	 Val. Loss: 7.349 |  Val. PPL: 1555.268
Epoch: 09 | Time: 1m 38s
	Train Loss: 4.552 | Train PPL:  94.833
	 Val. Loss: 7.283 |  Val. PPL: 1455.646
[batch:0, itr: 0] i purchased it it it it it and 