In [1]:
import gensim.utils as utils
from tqdm import tqdm
import torch
import torch.nn as nn
import numpy as np

In [2]:
class Dataset(torch.utils.data.Dataset):
    """Next-word-prediction dataset built from a text file with one sentence per line.

    Every line is tokenised with ``utils.simple_preprocess(..., min_len=1)``, the
    sentences are sorted by token count, and a vocabulary is built from the sorted
    unique tokens with ``'<PAD>'`` at index 0 and ``'<STOP>'`` at index 1.

    Item ``i`` is a pair of index tensors: the sentence itself, and the sentence
    shifted left by one token with ``'<STOP>'`` appended.
    """

    def __init__(self, corpus_length = None, device = None, corpus_path = './data/train_shuf.txt'):
        """Read, tokenise and index the corpus.

        Args:
            corpus_length: maximum number of lines to read from the start of the
                file; ``None`` reads the whole file.
            device: device on which ``__getitem__`` creates its tensors; defaults
                to CUDA when available, otherwise CPU.
            corpus_path: path of the corpus file (one sentence per line).
        """
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Read the lines in a single pass and close the file afterwards. Counting the
        # lines first would exhaust the file iterator, leaving nothing to tokenise.
        with open(corpus_path) as corpus_file:
            if corpus_length is None:
                lines = corpus_file.readlines()
            else:
                # zip stops at whichever ends first, so asking for more lines than
                # the file holds does not produce phantom empty sentences.
                lines = [line for _, line in zip(range(corpus_length), corpus_file)]

        self.corpus = [utils.simple_preprocess(line, min_len=1) for line in tqdm(lines)]

        # Shortest sentences first, so neighbouring items have similar lengths.
        self.corpus.sort(key=len)

        self.unique_words = self.get_unique_words()

        self.index_to_word = dict(enumerate(self.unique_words))
        self.word_to_index = {word: index for index, word in enumerate(self.unique_words)}

        self.input_corpus_indexes = [
            [self.word_to_index[word] for word in sentence] for sentence in self.corpus
        ]

        # Target for position t is the token at t + 1; the last target is '<STOP>'.
        output_corpus = [sentence[1:] + ['<STOP>'] for sentence in self.corpus]
        self.output_corpus_indexes = [
            [self.word_to_index[word] for word in sentence] for sentence in output_corpus
        ]

        self.device = device


    def indexes_to_sentence(self, sentence):
        """Map an iterable of vocabulary indexes back to the list of their words."""
        return [self.index_to_word[index] for index in sentence]


    def get_unique_words(self):
        """Return the vocabulary: ``'<PAD>'``, ``'<STOP>'``, then the sorted unique corpus tokens."""
        words = sorted({word for line in self.corpus for word in line})
        return ['<PAD>', '<STOP>'] + words

    def __len__(self):
        """Number of sentences in the corpus."""
        return len(self.corpus)

    def __getitem__(self, index):
        """Return the ``(input_indexes, target_indexes)`` tensors of sentence ``index``."""
        # dtype is pinned so that an empty sentence still yields an integer tensor
        # (torch.tensor([]) would otherwise default to float).
        return (torch.tensor(self.input_corpus_indexes[index], dtype=torch.long, device=self.device),
            torch.tensor(self.output_corpus_indexes[index], dtype=torch.long, device=self.device))

In [3]:
# Pick the compute device once; later cells reuse `device`.
# torch.cuda.is_available() is True only when a usable GPU is present.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU is available


In [40]:
def pad_collate(data):
    """Collate ``(input, target)`` pairs of 1-D index tensors into two left-padded batches.

    Args:
        data: sequence of ``(input_tensor, target_tensor)`` pairs, as produced by
            the dataset's ``__getitem__``.

    Returns:
        ``(inputs, outputs)``: two stacked tensors in which every row is padded
        on the left with zeros up to the longest row of its own group.
    """
    def left_pad_sequence(tensors):
        """Left-pad each 1-D tensor with zeros to a common length and stack them."""
        max_len = max(len(t) for t in tensors)
        # new_zeros inherits dtype and device from t. A plain torch.zeros is float32
        # and would promote the integer indices to float, which embedding lookups
        # and cross-entropy targets do not accept.
        padded_seq = [torch.cat([t.new_zeros(max_len - len(t)), t]) for t in tensors]
        return torch.stack(padded_seq)

    inputs = left_pad_sequence([d[0] for d in data])
    outputs = left_pad_sequence([d[1] for d in data])
    return inputs, outputs

In [41]:
# Small 10-sentence sample for a quick check of the batching pipeline.
# Uses the `device` selected earlier instead of hard-coding 'cuda', so the cell also runs on CPU-only machines.
dataset = Dataset(corpus_length = 10, device=device)
loader = torch.utils.data.DataLoader(dataset, batch_size=5, collate_fn=pad_collate)


100%|██████████| 10/10 [00:00<00:00, 17978.16it/s]


In [42]:
# Decode every padded input row of each batch back to words; a dashed line separates batches.
for x, y in loader:
    for padded_input in x:
        print(dataset.indexes_to_sentence(padded_input.tolist()))
    print('-----------------------------------')

cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
-----------------------------------
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
-----------------------------------


In [None]:
# TODO: copied from Assignment 5, needs to be adapted

class RNN(nn.Module):
    """Word-level language model: embedding -> 2-layer LSTM -> linear layer over the vocabulary.

    The recurrent layer is an ``nn.LSTM`` because ``init_state`` produces (and the
    model is driven with) a ``(hidden, cell)`` state pair.
    """

    def __init__(self, dataset, device):
        """Build the layers.

        Args:
            dataset: object exposing ``unique_words``, the vocabulary list whose
                index 0 is the padding token.
            device: device on which ``init_state`` allocates its tensors.
        """
        super(RNN, self).__init__()
        self.lstm_size = 512
        self.embedding_dim = 100
        self.num_layers = 2
        self.device = device

        n_vocab = len(dataset.unique_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
            padding_idx=0
        )

        # batch_first=True: inputs and outputs are (batch, seq, feature).
        self.rnn = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            batch_first=True
        )

        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        """Run a batch of index sequences through the model.

        Args:
            x: integer tensor of vocabulary indexes, shape ``(batch, seq)``.
            prev_state: ``(hidden, cell)`` pair, each ``(num_layers, batch, lstm_size)``.

        Returns:
            ``(logits, state)`` where ``logits`` is ``(batch, seq, n_vocab)`` and
            ``state`` is the updated ``(hidden, cell)`` pair.
        """
        embed = self.embedding(x)
        output, state = self.rnn(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        """Return a zero ``(hidden, cell)`` pair on ``self.device``.

        Args:
            sequence_length: size of the second state dimension. The name is kept
                for backward compatibility; since the LSTM is ``batch_first``,
                this must be the batch size of the input it will be used with.
        """
        shape = (self.num_layers, sequence_length, self.lstm_size)
        return (torch.zeros(*shape, device=self.device),
                torch.zeros(*shape, device=self.device))
        
# Instantiate the model defined in this cell on the notebook's dataset
# (the previous names LSTMModel / pooh_dataset are not defined in this notebook).
model = RNN(dataset, device)
model.to(device)

In [None]:
def train(dataset, model, max_epochs = 30, batch_size = 512, collate_fn = None):
    """Train ``model`` for next-token prediction on ``dataset`` with Adam (lr=0.001).

    Args:
        dataset: map-style dataset yielding ``(input_indexes, target_indexes)`` pairs.
        model: module exposing ``init_state(batch_size)`` and
            ``forward(x, state) -> (logits, state)`` with logits of shape
            ``(batch, seq, n_vocab)``.
        max_epochs: number of passes over the dataset.
        batch_size: number of sentences per batch.
        collate_fn: function that merges a list of items into a padded batch;
            defaults to ``pad_collate``.

    Raises:
        ValueError: if ``dataset`` is empty.

    Prints one summary dict per epoch holding the index and loss of the last batch.
    """
    if len(dataset) == 0:
        raise ValueError('cannot train on an empty dataset')
    if collate_fn is None:
        collate_fn = pad_collate

    model.train()

    # Sentences differ in length, so the default collate cannot stack them.
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

    # Index 0 is the padding token; padded target positions must not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            # Embedding lookups and cross-entropy targets require integer indexes.
            x, y = x.long(), y.long()

            optimizer.zero_grad()

            # Sentences are independent samples, so every batch starts from a fresh
            # zero state sized to this batch (the last batch may be smaller).
            state = model.init_state(x.size(0))

            y_pred, _ = model(x, state)
            # CrossEntropyLoss expects the class dimension second: (batch, n_vocab, seq).
            loss = criterion(y_pred.transpose(1, 2), y)

            loss.backward()
            optimizer.step()

        print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })
            
# Train on the dataset built earlier in this notebook (pooh_dataset is not defined here).
train(dataset, model)