In [1]:
import torch
import torch.nn.functional as F
import torchtext
import time 
import random
import pandas as pd

In [2]:
import os
import math
import time
import spacy
import torch
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim
from typing import List

from torchtext.legacy.data import Field, BucketIterator

In [3]:
torch.backends.cudnn.deterministic = True

In [4]:
random_seed = 42
torch.manual_seed(random_seed)
BATCH_SIZE = 128
NUM_EPOCHS = 15
DEVICE = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [5]:
path = "/content/drive/MyDrive/Lumiere/Dataset/artemis_data.csv"
artemis_data = pd.read_csv(path)

In [6]:
artemis_data.head()

Unnamed: 0,TEXT,LABEL
0,"She seems very happy in the picture, and you w...","She seems very happy in the picture, and you w..."
1,This woman has really knotty hands which makes...,This woman has really knotty hands which makes...
2,"When looking at this woman, I am filled with c...","When looking at this woman, I am filled with c..."
3,"A woman looking at ease, peaceful, and satisfi...","A woman looking at ease, peaceful, and satisfi..."
4,She looks like a lady from that past that migh...,She looks like a lady from that past that migh...


In [7]:
!pip install spacy



In [8]:
TEXT = torchtext.legacy.data.Field(sequential=True, use_vocab=True,
    tokenize='spacy', # default splits on whitespace
    tokenizer_language='en', init_token="<sos>", eos_token="<eos>" #not used lower=True
)

LABEL = torchtext.legacy.data.Field(sequential=True, use_vocab=True,
    tokenize='spacy', # default splits on whitespace
    tokenizer_language='en', init_token="<sos>", eos_token="<eos>"
)

In [9]:
fields = [("TEXT", TEXT), ("LABEL", LABEL)]

In [10]:
dataset = torchtext.legacy.data.TabularDataset(
    path=path, format='csv',
    skip_header=True, fields=fields)

In [11]:
train_data, valid_data, test_data = dataset.split(
    split_ratio=[0.8, 0.1, 0.1], # 80/10/10 split
    random_state=random.seed(random_seed))

print(f'Num Train: {len(train_data.examples)}')
print(f'Num Validation: {len(valid_data.examples)}')
print(f'Num Test: {len(test_data.examples)}')


Num Train: 363747
Num Validation: 45469
Num Test: 45468


In [12]:
print(vars(train_data.examples[0]))

{'TEXT': ['One', 'brave', 'knight', 'is', 'taking', 'on', 'a', 'team', 'of', 'opposing', 'knights', 'in', 'a', 'forest', 'field', '.'], 'LABEL': ['One', 'brave', 'knight', 'is', 'taking', 'on', 'a', 'team', 'of', 'opposing', 'knights', 'in', 'a', 'forest', 'field', '.']}


In [13]:
TEXT.build_vocab(train_data, min_freq=2, vectors="glove.6B.100d") #vectors="glove.6B.100d" not working <urlopen error [Errno 111] Connection refused>

print(f'Vocabulary size: {len(TEXT.vocab)}')

.vector_cache/glove.6B.zip: 862MB [02:42, 5.32MB/s]                           
100%|█████████▉| 399312/400000 [00:18<00:00, 20432.57it/s]

Vocabulary size: 29656


In [14]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 364835), ('.', 321266), ('and', 205470), ('a', 177577), ('of', 168372), ('The', 162589), ('is', 144451), (',', 114619), ('to', 106939), ('in', 100437), ('like', 95528), ('I', 77201), ('looks', 69386), ('it', 61362), ('me', 61302), ('are', 56835), ('this', 55337), ('on', 50688), ('with', 46207), ("'s", 44195)]


In [15]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '<sos>', '<eos>', 'the', '.', 'and', 'a', 'of', 'The']


In [16]:
print(TEXT.vocab.stoi['art'])

169


In [17]:
LABEL.build_vocab(train_data, min_freq=2, vectors="glove.6B.100d") # same here
print(f'Number of classes: {len(LABEL.vocab)}')

Number of classes: 29656


In [18]:
print(LABEL.vocab.stoi["art"])

169


In [19]:
train_loader, valid_loader, test_loader = torchtext.legacy.data.BucketIterator.splits(
        (train_data, valid_data, test_data),
         batch_size=BATCH_SIZE,
         sort_within_batch=False,
         sort_key=lambda x: len(x.TEXT),
         device=DEVICE
    )

In [20]:
print(' Train')
for batch in train_loader:
    print(f'Text size: {batch.TEXT.size()}')
    print(f'Target size: {batch.LABEL.size()}')
    break
    
print('\n', 'Valid:')
for batch in valid_loader:
    print(f'Text size: {batch.TEXT.size()}')
    print(f'Target vector size: {batch.LABEL.size()}')
    break
    
print('\n', 'Test:')
for batch in test_loader:
    print(f'Text size: {batch.TEXT.size()}')
    print(f'Target size: {batch.LABEL.size()}') 
    break

#[SENTENCE LENGTH, BATCH SIZE] #sentences of only size 6?

 Train
Text size: torch.Size([65, 128])
Target size: torch.Size([65, 128])

 Valid:
Text size: torch.Size([6, 128])
Target vector size: torch.Size([6, 128])

 Test:
Text size: torch.Size([6, 128])
Target size: torch.Size([6, 128])


In [21]:
try_one = next(iter(test_loader))
try_one


[torchtext.legacy.data.batch.Batch of size 128]
	[.TEXT]:[torch.LongTensor of size 6x128]
	[.LABEL]:[torch.LongTensor of size 6x128]

In [22]:
try_one.TEXT

tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,  

In [35]:
INPUT_DIM = len(TEXT.vocab)
OUTPUT_DIM = len(LABEL.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
LAT_DIM = 100
N_LAYERS = 1 # N layers = 2 is complicated
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5 

In [36]:
#ENCODER
class Encoder(nn.Module):
    def __init__(self, input_dim: int, emb_dim: int, hid_dim: int, latent_dim:int, n_layers: int, dropout: float):
        super(Encoder,self).__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.latent_dim = latent_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim) #[25805-> 256] [len(TEXT.vocab)->embed dim]
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout) #[256 -> 512] [embed dim -> hidden dim]
        

        self.z_mean = torch.nn.Linear(hid_dim, latent_dim)  #[512 -> 100] [hidden dim -> latent dim]
        self.z_logvar = torch.nn.Linear(hid_dim, latent_dim) #[512 -> 100] [hidden dim -> latent dim]

    def reparameterise(self, mu, logvar):
        std = logvar.mul(0.5).exp_()
        eps = std.data.new(std.size()).normal_() # sample from normal distribution
        return eps.mul(std).add_(mu)
    
    def forward(self, x: torch.LongTensor):
        embedded = self.embedding(x) # [sentence len, batch size, embed dim]
        outputs, (hidden, cell) = self.rnn(embedded)

        mu = self.z_mean(hidden)
        logvar = self.z_logvar(hidden)
        # epsilon = torch.randn([batch_size, self.hidden_dim]) # may be problems here: maybe [1, batch_size, hidden_dim]
        z = self.reparameterise(mu, logvar) 

        return z, mu, logvar #we do not need to return mu and logvar
        

In [37]:
try_one.TEXT.shape

torch.Size([6, 128])

In [38]:
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, LAT_DIM, N_LAYERS, ENC_DROPOUT).to(DEVICE)
z_try, _, _= encoder(try_one.TEXT)

  "num_layers={}".format(dropout, num_layers))


In [39]:
z_try.shape

torch.Size([1, 128, 100])

In [40]:
encoder

Encoder(
  (embedding): Embedding(29656, 256)
  (rnn): LSTM(256, 512, dropout=0.5)
  (z_mean): Linear(in_features=512, out_features=100, bias=True)
  (z_logvar): Linear(in_features=512, out_features=100, bias=True)
)

In [41]:
#DECODER
class Decoder(nn.Module):
  def __init__(self, input_dim:int, emb_dim: int, hid_dim: int, latent_dim:int, n_layers: int, dropout: float):
      super(Decoder, self).__init__()

      self.input_dim = input_dim
      self.emb_dim = emb_dim
      self.hid_dim = hid_dim
      self.latent_dim = latent_dim
      self.n_layers = n_layers
      self.dropout = dropout

      # is it nn.linear(2*latent dim, hidden dim) -> check
      self.linear1 = torch.nn.Linear(latent_dim, emb_dim) #[100-> 256] [2*latent dim -> embed dim]
      # embed?
      self.rnn_decoder = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)  #[256 -> 512] [embed dim -> hidden dim] # not sure about this here
      self.linear2 = torch.nn.Linear(hid_dim, input_dim) #[512 -> 25805] [hidden dim -> len(TEXT.vocab)]

  def forward(self, z):
      output_lat = self.linear1(z.unsqueeze(0)) #requires 3 dimensions [1, batch size, emb dim] sentence len = 1 (decode one at a time)
      output_lstm, (hidden, cell_state) = self.rnn_decoder(output_lat.squeeze(0))
      output = self.linear2(output_lstm.squeeze(0)) # squeezing out the [1,]
      return output # we do not need to return hidden, cell state; squeeze is to remove the nlayers dimension
    

In [42]:
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM,LAT_DIM, N_LAYERS, DEC_DROPOUT).to(DEVICE)
decoder

  "num_layers={}".format(dropout, num_layers))


Decoder(
  (linear1): Linear(in_features=100, out_features=256, bias=True)
  (rnn_decoder): LSTM(256, 512, dropout=0.5)
  (linear2): Linear(in_features=512, out_features=29656, bias=True)
)

In [43]:
out_try = decoder(z_try)

In [44]:
out_try.shape

torch.Size([128, 29656])

In [41]:
class VAE(nn.Module):

  def __init__(self, encoder:Encoder, decoder:Decoder, device: torch.device):

    super(VAE,self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device

    assert encoder.hid_dim == decoder.hid_dim, \
              'Hidden dimensions of encoder and decoder must be equal!'
    assert encoder.n_layers == decoder.n_layers, \
              'Encoder and decoder must have equal number of layers!'
  
  def forward(self, x:torch.LongTensor):
    max_len, batch_size = x.shape 
    v_size = self.decoder.input_dim # size of vocab
    outputs = torch.zeros(max_len, batch_size, v_size).to(self.device) #store outputs

    z,mu,logvar = self.encoder(x)

    # trg = x[0] -> not the case as we have to pass in z to the decoder.

    # decoding one at a time
    for i in range(1, max_len):
      prediction = self.decoder(trg) #not sure what to do here

    output = self.decoder(z)

    return output, mu, logvar
  

In [42]:
vae = VAE(encoder, decoder, DEVICE).to(DEVICE)
vae

VAE(
  (encoder): Encoder(
    (embedding): Embedding(29656, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (z_mean): Linear(in_features=512, out_features=100, bias=True)
    (z_logvar): Linear(in_features=512, out_features=100, bias=True)
  )
  (decoder): Decoder(
    (linear1): Linear(in_features=100, out_features=256, bias=True)
    (rnn_decoder): LSTM(256, 512, num_layers=2, dropout=0.5)
    (linear2): Linear(in_features=512, out_features=29656, bias=True)
  )
)

In [43]:
outputs,_,_ = vae(try_one.TEXT)

In [44]:
outputs.shape

torch.Size([2, 128, 29656])

In [None]:
def rec_kl_loss(yhat, y, mu, logvar):
  BCE = nn.functional.binary_cross_entropy(yhat, y, reduction="sum")
  KLD = 0.5 * torch.sum(logvar.exp() - logvar - 1 + mu.pow(2))
  loss = BCE + KLD
  return loss

In [None]:
lr = 1e-3
optimizer = optim.Adam(vae.parameters(), lr=lr)

PAD_IDX = TEXT.vocab.stoi["<pad>"]

In [None]:
def train(vae, iterator, optimizer, criterion):
    vae.train()

    epoch_loss = 0
    for batch in iterator:
        optimizer.zero_grad()
        outputs = vae(batch.TEXT)
        outputs_flatten = outputs[0][1:].view(-1, outputs[0].shape[-1]) #These shapes dont work for my model
        label_flatten = batch.LABEL.view(-1)
        loss = criterion(outputs_flatten, label_flatten, outputs[1], outputs[2])

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
def evaluate(vae, iterator, criterion):
    vae.eval()

    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
           
            outputs = vae(batch.TEXT) 
            outputs_flatten = outputs[0][1:].view(-1, outputs[0].shape[-1])
            label_flatten = batch.LABEL.view(-1, outputs[0].shape[-1])
            loss = criterion(outputs_flatten, label_flatten, outputs[1], outputs[2])
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):    
    start_time = time.time()
    train_loss = train(vae, train_loader, optimizer, rec_kl_loss)
    valid_loss = evaluate(vae, valid_loader, rec_kl_loss)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(vae.state_dict(), 'model1.pt')

    # it's easier to see a change in perplexity between epoch as it's an exponential
    # of the loss, hence the scale of the measure is much bigger
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')