# LSTM

### Imports

In [31]:
from typing import Tuple, List
from collections import deque

import torch
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.optim import Optimizer

import numpy as np
from torch import Tensor

import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss

from lincoln.pytorch.layers import PyTorchLayer, DenseLayer
from lincoln.pytorch.model import PyTorchModel
from lincoln.pytorch.train import PyTorchTrainer
from lincoln.pytorch.preprocessor import ConvNetPreprocessor
from lincoln.pytorch.utils import assert_dim, permute_data
print("all libs imported")

all libs imported


In [32]:
import torchvision
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Image preprocessing pipeline: convert the input to a tensor, then normalise
# it with a one-element mean and a one-element standard deviation.
img_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1305,), std=(0.3081,)),
    ]
)
print("Data, transforms, and DataLoader imported")

Data, transforms, and DataLoader imported


### Set Constants 

In [33]:
# Notebook-wide configuration values.
SEED = 20190325                    # seed handed to torch.manual_seed
NUM_THREADS = 8                    # meant for torch.set_num_threads (that call is commented out)
NUM_WORKERS = 32                   # meant for torch DataLoader worker processes
USE_GPU = True                     # request a CUDA device
DEFAULT_DTYPE = torch.FloatTensor  # default tensor type; a falsy value leaves it unset

# Echo the configuration so it is recorded alongside the results.
print(f"SEED: {SEED}")
print(f"NUM_THREADS: {NUM_THREADS}")
print(f"NUM_WORKERS: {NUM_WORKERS}")
print(f"USE_GPU: {USE_GPU}")
print(f"DEFAULT_DTYPE {DEFAULT_DTYPE}")

SEED: 20190325
NUM_THREADS: 8
NUM_WORKERS: 32
USE_GPU: True
DEFAULT_DTYPE <class 'torch.FloatTensor'>


### Prime Torch with Seed

In [34]:
# Seed PyTorch's global random number generator with the configured SEED.
torch.manual_seed(SEED)
print(f"seed:  {SEED}")

seed:  20190325


### Enable GPU

In [35]:
# Select the compute device used by the rest of the notebook.
if USE_GPU:
    if not torch.cuda.is_available():
        # Raise instead of calling sys.exit(): an exception stops the run with
        # an explicit failure, whereas a bare sys.exit() reports success and
        # leaves `device` undefined for every later cell.
        raise RuntimeError("gpu not available")
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print('Using device:', device)

Using device: cuda:0


### Set default Tensor type (cpu or cuda)
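Setting the default tensor type to a CUDA type seems non-canonical, and we don't want to do it because it doesn't play well with multiprocessing (see the link below). Instead, the cell below only applies `DEFAULT_DTYPE`, which is a CPU tensor type.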
https://discuss.pytorch.org/t/is-there-anything-wrong-with-setting-default-tensor-type-to-cuda/27949/3

In [36]:
# A CUDA default tensor type ('torch.cuda.FloatTensor') is deliberately not
# set here. Only the tensor type named by DEFAULT_DTYPE is applied, and only
# when that constant is truthy.
if DEFAULT_DTYPE:
    torch.set_default_tensor_type(DEFAULT_DTYPE)
# Report the dtype that a freshly created float tensor receives.
print(f"default Tensor type: {torch.tensor([3., 3.]).dtype}")

default Tensor type: torch.float32


### Set num threads

In [37]:
# The thread count is not overridden: the call
# torch.set_num_threads(NUM_THREADS) is commented out, so this only reports
# the value PyTorch is currently using.
print(f"num threads:  {torch.get_num_threads()}")

num threads:  4


### Enable autoreloading of modules

In [38]:
# autoreload reloads modules automatically before entering the 
# execution of code typed at the IPython prompt.
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2
print("modules autorelading enabled")
# !jupyter nbextension list


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
modules autorelading enabled


## `LSTMLayer`

In [39]:
class LSTMLayer(PyTorchLayer):
    """Single-layer LSTM followed by a dense projection, with carried-over state.

    The hidden and cell states are each stored as one row of shape
    `(1, hidden_size)`. On every forward pass that row is replicated for
    each element of the batch and fed to the LSTM; afterwards it is replaced
    by the batch-mean of the LSTM's final states, detached from the autograd
    graph, so the state persists from one call to the next.
    """

    def __init__(self,
                 sequence_length: int,
                 input_size: int,
                 hidden_size: int,
                 output_size: int,
                 device: torch.device,
                 dropout: float = 1.0) -> None:
        """Build the layer and place all of its modules on `device`.

        Args:
            sequence_length: Accepted for interface compatibility; it is not
                used by this layer.
            input_size: Number of input features per time step.
            hidden_size: Size of the LSTM hidden and cell states.
            output_size: Number of features the dense layer emits per step.
            device: Device on which the modules, the carried state and the
                inputs are placed.
            dropout: Keep probability. Values below 1.0 add an
                `nn.Dropout(1 - dropout)` applied to the layer output.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.device = device

        # Carried-over state, stored as plain tensors (not parameters).
        self.h_init = torch.zeros((1, hidden_size), device=device)
        self.c_init = torch.zeros((1, hidden_size), device=device)

        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = DenseLayer(hidden_size, output_size)

        if dropout < 1.0:
            self.dropout = nn.Dropout(1 - dropout)

        # Move every sub-module (not just the LSTM) to the target device.
        # forward() sends its input there, so leaving `fc` on the CPU would
        # fail with a device mismatch unless the caller remembered to call
        # `.to(device)` on the layer afterwards.
        self.to(self.device)

    def _transform_hidden_batch(self, hidden: Tensor,
                                batch_size: int,
                                before_layer: bool) -> Tensor:
        """Convert between the stored state and the shape `nn.LSTM` uses.

        Args:
            hidden: `(1, hidden_size)` when `before_layer` is True, otherwise
                the LSTM's final state of shape `(1, batch, hidden_size)`.
            batch_size: Number of sequences in the current batch.
            before_layer: True to expand the stored state for the LSTM,
                False to collapse the LSTM's state back to a single row.

        Returns:
            `(1, batch_size, hidden_size)` when `before_layer` is True,
            otherwise the batch-mean of shape `(1, hidden_size)`.
        """
        if before_layer:
            # One copy of the stored row per batch element, with the leading
            # layer axis that nn.LSTM expects for its initial state.
            return hidden.repeat(batch_size, 1).unsqueeze(0)
        # Average the per-sequence final states over the batch axis.
        return hidden.mean(dim=1)

    def forward(self, x: Tensor) -> Tensor:
        """Run the LSTM and the dense projection over a batch of sequences.

        Args:
            x: Input whose first axis is the batch (the LSTM is created with
                `batch_first=True`).

        Returns:
            The dense layer's output for every time step, after dropout when
            a dropout module was configured.
        """
        batch_size = x.shape[0]

        h_layer = self._transform_hidden_batch(
            self.h_init, batch_size, before_layer=True).to(self.device)
        c_layer = self._transform_hidden_batch(
            self.c_init, batch_size, before_layer=True).to(self.device)
        x = x.to(self.device)

        x, (h_out, c_out) = self.lstm(x, (h_layer, c_layer))

        # Keep the averaged final state for the next call; detach so that
        # gradients do not flow across calls.
        self.h_init = self._transform_hidden_batch(
            h_out, batch_size, before_layer=False).detach()
        self.c_init = self._transform_hidden_batch(
            c_out, batch_size, before_layer=False).detach()

        x = self.fc(x)
        if hasattr(self, "dropout"):
            x = self.dropout(x)

        return x

In [40]:
# Smoke test: push one random batch through a freshly built layer and show
# the shape of what comes out.
lay = LSTMLayer(
    sequence_length=25,
    input_size=62,
    hidden_size=100,
    output_size=128,
    device=device,
)

x = torch.randn(32, 25, 62).to(device)
lay = lay.to(device)
lay(x).shape

torch.Size([32, 25, 128])

## `NextCharacterModel`

In [41]:
class NextCharacterModel(PyTorchModel):
    """Character-level model built from a single `LSTMLayer`."""

    def __init__(self,
                 vocab_size: int,
                 hidden_size: int = 256,
                 sequence_length: int = 25):
        """Create the model.

        Args:
            vocab_size: Number of distinct characters; used as both the input
                and the output width of the LSTM layer.
            hidden_size: Size of the LSTM hidden state.
            sequence_length: Sequence length forwarded to the LSTM layer.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.sequence_length = sequence_length

        # The model consists of one layer whose output size equals its input
        # size. It is placed on the module-level `device`.
        self.lstm = LSTMLayer(sequence_length=self.sequence_length,
                              input_size=self.vocab_size,
                              hidden_size=hidden_size,
                              output_size=self.vocab_size,
                              device=device)

    def forward(self,
                inputs: Tensor):
        """Score every position of every sequence.

        Args:
            inputs: 3-dimensional tensor (batch_size, sequence_length,
                vocab_size).

        Returns:
            A 1-tuple holding the layer output with its last two axes
            swapped.
        """
        assert_dim(inputs, 3)

        scores = self.lstm(inputs)
        swapped = scores.permute(0, 2, 1)

        return (swapped,)

## `LSTMTrainer`

In [42]:
class LSTMTrainer(PyTorchTrainer):
    """Trains a `NextCharacterModel` on raw text for next-character prediction.

    NOTE(review): `self.model`, `self.optim` and `self.loss` are presumably
    set by `PyTorchTrainer.__init__`, which is not visible in this notebook.
    """

    def __init__(self,
                 model: NextCharacterModel,
                 optim: Optimizer,
                 criterion: _Loss,
                 device: torch.device):
        """Store the training components and read sizes off the model.

        Args:
            model: Model to train; must expose `vocab_size` and
                `sequence_length`.
            optim: Optimizer updating the model's parameters.
            criterion: Loss applied to the model output and target indices.
            device: Device on which batches and the model are placed.
        """
        super().__init__(model, optim, criterion, device)
        self.vocab_size = self.model.vocab_size
        self.max_len = self.model.sequence_length
        self.device = device

    def fit(self,
            data: str,
            epochs: int=10,
            eval_every: int=1,
            batch_size: int=32,
            seed: int = 121718)-> None:
        """Train on `data` and print the validation loss periodically.

        Args:
            data: Full text; the first 80% is used for training and the rest
                for validation.
            epochs: Number of passes over the training text.
            eval_every: Validate after every `eval_every` epochs.
            batch_size: Number of sequences per training batch.
            seed: Seed passed to `torch.manual_seed` before training.

        Raises:
            ValueError: If `data` contains more distinct characters than the
                model's `vocab_size`.
        """
        self.data = data
        self.train_data, self.test_data = self._train_test_split_text()

        # Sort the vocabulary: iteration order of a set of strings differs
        # between interpreter runs, which would make the character <-> index
        # mapping (and so any saved weights) non-reproducible.
        self.chars = sorted(set(self.data))
        if len(self.chars) > self.vocab_size:
            raise ValueError(
                f"data has {len(self.chars)} distinct characters but the "
                f"model vocab_size is {self.vocab_size}")
        self.char_to_idx = {ch: i for i, ch in enumerate(self.chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(self.chars)}

        torch.manual_seed(seed)

        # Loop-invariant: move the model once rather than on every batch.
        self.model = self.model.to(self.device)

        for e in range(epochs):

            for X_batch, y_batch in self.generate_batches_next_char(batch_size):

                X_batch = X_batch.to(self.device)
                y_batch = y_batch.to(self.device)

                self.optim.zero_grad()
                outputs = self.model(X_batch)[0]

                loss = self.loss(outputs, y_batch)
                loss.backward()

                self.optim.step()

            if (e+1) % eval_every == 0:

                X_test, y_test = self.generate_test_data()

                # Validate without building an autograd graph and with
                # dropout disabled, then restore the previous mode.
                # Assumes the model is an nn.Module (it is used with
                # .to() and .parameters() elsewhere in the notebook).
                was_training = self.model.training
                self.model.eval()
                try:
                    with torch.no_grad():
                        test_preds = self.model(X_test)[0]
                        loss = self.loss(test_preds, y_test)
                finally:
                    self.model.train(was_training)

                print(f"Validation loss after {e+1} epochs is {loss.item():.3f}")

    def _train_test_split_text(self, pct=0.8) -> Tuple[str, str]:
        """Split `self.data` into a leading `pct` share and the remainder."""
        split_at = int(len(self.data) * pct)
        return self.data[:split_at], self.data[split_at:]

    def generate_batches_next_char(self,
                                   batch_size: int):
        """Yield training batches of overlapping character windows.

        Batch `k` starts at offset `k * batch_size`; its `batch_size`
        samples are the windows of `max_len` characters starting at
        consecutive offsets from there. Targets are the same windows shifted
        one character to the right.

        Args:
            batch_size: Number of windows per batch.

        Yields:
            `(features, targets)` on `self.device`: one-hot features of shape
            (batch_size, max_len, vocab_size) and integer targets of shape
            (batch_size, max_len).
        """
        n_chars = len(self.train_data)
        window = self.max_len

        # A window starting at s has a full-length target only when
        # s + window + 1 <= n_chars. Bounding the batch start so that even
        # the last window of the batch satisfies this guarantees every
        # sample has the same length, whatever the ratio of batch_size to
        # max_len.
        last_batch_start = n_chars - batch_size - window

        for batch_start in range(0, last_batch_start + 1, batch_size):

            features_tensors = []
            target_indices = []

            for offset in range(batch_size):
                begin = batch_start + offset
                features_str = self.train_data[begin:begin + window]
                target_str = self.train_data[begin + 1:begin + window + 1]

                features_tensors.append(
                    self._string_to_one_hot_array(features_str))
                target_indices.append(
                    [self.char_to_idx[ch] for ch in target_str])

            yield (torch.stack(features_tensors).to(self.device),
                   torch.tensor(target_indices, dtype=torch.long).to(self.device))

    def _string_to_one_hot_array(self, input_string: str) -> Tensor:
        """One-hot encode a string into a (len, vocab_size) tensor on the device."""
        ind = [self.char_to_idx[ch] for ch in input_string]
        return self._one_hot_text_data(ind).to(self.device)

    def _one_hot_text_data(self,
                           sequence: List) -> Tensor:
        """One-hot encode a list of character indices.

        Args:
            sequence: Character indices, each below `vocab_size`.

        Returns:
            Tensor of shape (len(sequence), vocab_size) on `self.device`
            with a single 1.0 per row.
        """
        sequence_length = len(sequence)
        indices = torch.tensor(sequence, dtype=torch.long)
        batch = torch.zeros(sequence_length, self.vocab_size)
        # Set one entry per row in a single indexed assignment.
        batch[torch.arange(sequence_length), indices] = 1.0

        return batch.to(self.device)

    def generate_test_data(self) -> Tuple[Tensor, Tensor]:
        """Build the validation set from non-overlapping windows of the test text.

        Returns:
            `(features, targets)` on `self.device`: one-hot features of shape
            (num_windows, max_len, vocab_size) and integer targets of shape
            (num_windows, max_len).

        Raises:
            ValueError: If the test text is too short to hold one window
                plus its shifted target.
        """
        X_tensors = []
        y_tensors = []

        N = len(self.test_data)

        # Only starts with start + max_len + 1 <= N give a full-length target.
        for start in range(0, N - self.max_len, self.max_len):

            features_str = self.test_data[start:start+self.max_len]
            target_str = self.test_data[start+1:start+self.max_len+1]

            X_tensors.append(self._string_to_one_hot_array(features_str))
            y_tensors.append(torch.tensor(
                [self.char_to_idx[ch] for ch in target_str],
                dtype=torch.long))

        if not X_tensors:
            raise ValueError(
                f"test split of {N} characters is too short for a sequence "
                f"length of {self.max_len}")

        return torch.stack(X_tensors).to(self.device), torch.stack(y_tensors).to(self.device)


In [43]:
# Read the training corpus; the context manager closes the file handle as
# soon as the text has been loaded.
with open('../06_rnns/input.txt', 'r') as corpus_file:
    data = corpus_file.read()

# One class per distinct character; the hidden state is given the same width.
vocab_size = len(set(data))
model = NextCharacterModel(vocab_size, hidden_size=vocab_size, sequence_length=50)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001,
                             weight_decay=1e-5)

In [44]:
# Bundle the model, optimizer, loss and device into a trainer.
lstm_trainer = LSTMTrainer(model=model,
                           optim=optimizer,
                           criterion=criterion,
                           device=device)

In [45]:
import time

# Wall-clock duration of one training epoch, including its validation pass.
start = time.time()
lstm_trainer.fit(data, epochs=1)
print(f"time:  {time.time() - start}  seconds")




Validation loss after 1 epochs is 2.314
time:  107.50356197357178  seconds


* cpu: 207.68804907798767  seconds
* gpu: 104.77917432785034  seconds