<a href="https://colab.research.google.com/github/akashmehra/AntColonyOptimization/blob/master/LightningLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install PyTorch Lightning

In [1]:
!pip install pytorch-lightning

Collecting pytorch-lightning
[?25l  Downloading https://files.pythonhosted.org/packages/17/f6/bfe4676f3577063045e9d19f176163d9367a3e93fc999a7f72ced85287e7/pytorch_lightning-1.3.7.post0-py3-none-any.whl (810kB)
[K     |████████████████████████████████| 819kB 8.2MB/s 
[?25hCollecting pyDeprecate==0.3.0
  Downloading https://files.pythonhosted.org/packages/14/52/aa227a0884df71ed1957649085adf2b8bc2a1816d037c2f18b3078854516/pyDeprecate-0.3.0-py3-none-any.whl
Collecting torchmetrics>=0.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/3b/e8/513cd9d0b1c83dc14cd8f788d05cd6a34758d4fd7e4f9e5ecd5d7d599c95/torchmetrics-0.3.2-py3-none-any.whl (274kB)
[K     |████████████████████████████████| 276kB 50.2MB/s 
Collecting tensorboard!=2.5.0,>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/64/21/eebd23060763fedeefb78bc2b286e00fa1d8abda6f70efa2ee08c28af0d4/tensorboard-2.4.1-py3-none-any.whl (10.6MB)
[K     |████████████████████████████████| 10.6MB 34.6MB/s 
Collectin

In [2]:
!pip install wandb -qqq


[K     |████████████████████████████████| 1.8MB 8.7MB/s 
[K     |████████████████████████████████| 174kB 45.5MB/s 
[K     |████████████████████████████████| 133kB 50.0MB/s 
[K     |████████████████████████████████| 102kB 13.8MB/s 
[K     |████████████████████████████████| 71kB 11.6MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone


In [11]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchvision import transforms
from torch.utils.data import  Dataset
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import os
from collections import Counter
from tqdm.notebook import tqdm
import wandb

In [4]:
wandb.login(key='') # set key here

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [8]:
 wandb.init(
      # Set entity to specify your username or team name
      # ex: entity="carey",
      # Set the project where this run will be logged
      entity = "modai",
      project="nlp_tutorials", 
      # Track hyperparameters and run metadata
      config={
      "learning_rate": 1e-2,
      "architecture": "LSTM",
      "dataset": "WikiText-2",})
  

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [12]:
AVAIL_GPUS = min(1, torch.cuda.device_count())
BATCH_SIZE = 24 if AVAIL_GPUS else 12
NUM_WORKERS = int(os.cpu_count() / 2)

In [13]:
wandb_logger = WandbLogger()

### Lightning Model
Here we define the model using PyTorch Lightning's self-contained approach. The model is a very simple **Language Model** which is an `LSTM Encoder` followed by a `Fully Connected Layer` and a `Softmax`. We output the distribution over the tokens to predict the next one given the context.

In [14]:
class RNNLanguageModel(pl.LightningModule):
    def __init__(self, input_size: int, hidden_size: int = 64, 
                 dropout_prob: float = 0.4, num_layers: int = 2,
                 tie_weights: bool = True):
        super().__init__()
        self.encoder = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, 
                           num_layers=num_layers, bias=False, batch_first=True, 
                           bidirectional=False)
        self.dropout = nn.Dropout(dropout_prob)
        self.decoder = nn.Linear(hidden_size, input_size)
        if tie_weights:
            self.decoder.weight = self.encoder.weight

    def forward(self, x):
        encoded = self.encoder(x)
        self.rnn.flatten_parameters()
        output, hidden = self.rnn(encoded)
        return self.decoder(output)
    
    def training_step(self, batch, batch_idx):
        x,y = batch['sources'], batch['targets']
        embedded = self.encoder(x)
        self.rnn.flatten_parameters()
        output, hidden = self.rnn(embedded)
        hx, ctx = hidden
        output = self.dropout(output)
        output = output.reshape(output.size(0) * output.size(1), output.size(2))
        x_hat = self.decoder(output)
        loss = F.cross_entropy(x_hat, y.reshape(-1))
        self.log("training_loss", loss)
        return loss

    def configure_optimizers(self, lr=1e-2):
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        return optimizer


In [15]:
class ToTensor():
    def __call__(self, sample):
        sample['source'] = torch.LongTensor(sample['source'])
        sample['target'] = torch.LongTensor(sample['target'])
        return sample

In [16]:
class Dictionary(object):
    
    def __init__(self):
        
        self.dont_care = 0
        self.bos_token = 1
        self.eos_token = 2
        self.unk_token = 3
        self.bos_word = '<bos>'
        self.eos_word = '<eos>'
        self.unk_word = '<unk>'
        
        self._word2idx = {self.bos_word: self.bos_token, 
                          self.eos_word: self.eos_token, 
                          self.unk_word: self.unk_token}
        
        self._idx2word = [self.dont_care, self.bos_word, self.eos_word, self.unk_word]
        self._dist = Counter()
        
    def add_word(self, word):
        self._dist[word] += 1
        if word not in self._word2idx:
            self._idx2word.append(word)
            self._word2idx[word] = len(self._idx2word) - 1
        #assert len(self._dist) == len(self._idx2word) == len(self._word2idx)
        return self._word2idx[word]

    def idx_to_word(self, idx):
        return self._idx2word[idx]
    
    def word_to_idx(self, word):
        return self._word2idx[word]
    
    def __add__(self, other):
        pass
        
    def __iadd__(self, other):
        widmap = other.wordidmap
        for k,v in widmap.items():
            if k not in self._word2idx:
                self._word2idx[k] = len(self._word2idx)
                self._idx2word.append(k)
        return self
    
    @property
    def wordidmap(self):
        return self._word2idx
    
    @property
    def idxwordmap(self):
        return self._idx2word
    
    @property
    def word_count(self):
        return self._dist
        

    def __len__(self):
        return len(self._idx2word)


In [17]:
class SentenceDataset(Dataset):
    def __init__(self, path: str, dictionary: Dictionary, transform = ToTensor()):
        self._dictionary = dictionary
        self._tokens = self._tokenize(path)
        self.transform = transform

    def _tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        doc = []
        tokens = []
        self._num_tokens = 0
        # read lines from file.
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                words = [word.lower() for word in line.strip().split()]
                self._num_tokens += len(words)
                doc.append(words)
                
        for words in tqdm(doc): 
            tokens.append([self._dictionary.word_to_idx(word) for word in words])
                
        return tokens
    
    def __len__(self):
        return len(self._tokens)
    
    @property
    def corpus_size(self):
        return self._num_tokens
    
    def numpy(self):
        samples = []
        transform = ToNumpy()
        for idx in range(len(self._tokens)):
            sample = self[idx]
            sample = transform(sample)
            samples.append(sample)
        return samples
    
    @property
    def vocab(self):
        return self._dictionary
        
    def sentence(self, idx):
        return ' '.join([self._dictionary.idx_to_word(token) for token in self._tokens[idx]])

    def __getitem__(self, idx):
        tokens = self._tokens[idx]
        sample = {
            'source': [self._dictionary.bos_token] + tokens,
            'target': tokens + [self._dictionary.eos_token]
        }
        if self.transform:
            sample = self.transform(sample)
        return sample
        
    def __len__(self):
        return len(self._tokens)

In [18]:
def seq_collate_fn(data):
    
    def padding(seqs, seq_lens, dtype=torch.LongTensor):
        batch_size = len(seqs)
        max_seq_len = max(seq_lens)
        source_padded = torch.zeros(batch_size, max_seq_len).type(dtype)
        target_padded = torch.zeros(batch_size, max_seq_len).type(dtype)
        for i, seq in enumerate(seqs):
            end = len(seq['source'])
            source_padded[i,:end] = seq['source'] 
            target_padded[i,:end] = seq['target']
        
        return source_padded, target_padded
        
    data.sort(key=lambda d: len(d['source']), reverse=True)
    seq_lens = [len(d['source']) for d in data]
    source_padded, target_padded = padding(data, seq_lens)
    samples = {
        'sources': source_padded,
        'targets': target_padded,
        'seq_lens': seq_lens,
    }
    
    return samples

In [19]:
class SentenceDataModule(pl.LightningDataModule):
    def __init__(self, path: str, batch_size: int, num_workers: int):
        super().__init__()
        self.data_path = path
        self.batch_size = batch_size 
        self.num_workers = num_workers

        self.train_path = os.path.join(path, 'train.txt')
        self.val_path = os.path.join(path, 'valid.txt')
        self.test_path = os.path.join(path, 'test.txt')
        self.all_paths = [self.train_path, self.val_path, self.test_path]
        self.transforms = ToTensor()
        self.vocab = self._create_vocab(self.all_paths)
        self.dims = len(self.vocab)
        
    
    def prepare_data(self):
        pass

    def setup(self, stage = None):
        print(f"In Setup, with stage: {stage}")
        #if stage == 'fit' or stage is None:
        self.train_dataset = SentenceDataset(self.train_path, self.vocab, 
                                                 transform=self.transforms)
        self.val_dataset = SentenceDataset(self.val_path, self.vocab,
                                               transform=self.transforms)
        #if stage == 'test' or stage is None:
        self.test_dataset = SentenceDataset(self.test_path, self.vocab,
                                                transform=self.transforms)


    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, 
                          num_workers=self.num_workers, collate_fn=seq_collate_fn)
        
    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, 
                          num_workers=self.num_workers, collate_fn=seq_collate_fn)
        
    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, 
                          num_workers=selfnum_workers, collate_fn=seq_collate_fn)

    def _create_vocab(self, paths):
        vocab = Dictionary()
        for path in paths:
            # Add words to the dictionary
            with open(path, 'r', encoding="utf8") as f: 
                for line in f:
                    if len(line.strip()) == 0:
                        continue
                    words = [word.lower() for word in line.strip().split()]
                    for word in words:
                        vocab.add_word(word)
        return vocab
        

In [21]:
dm = SentenceDataModule('drive/My Drive/wiki-2', BATCH_SIZE, NUM_WORKERS)
lm = RNNLanguageModel(dm.size(), 256, dropout_prob=0.2, num_layers=4)
trainer = pl.Trainer(gpus=AVAIL_GPUS, max_epochs=5, progress_bar_refresh_rate=2, logger=wandb_logger)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [22]:
trainer.fit(lm, dm)

  rank_zero_warn(f'you passed in a {loader_name} but have no {step_name}. Skipping {stage} loop')


In Setup, with stage: fit


HBox(children=(FloatProgress(value=0.0, max=23767.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2461.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2891.0), HTML(value='')))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]






  | Name    | Type      | Params
--------------------------------------
0 | encoder | Embedding | 7.4 M 
1 | rnn     | LSTM      | 2.1 M 
2 | dropout | Dropout   | 0     
3 | decoder | Linear    | 7.4 M 
--------------------------------------
9.5 M     Trainable params
0         Non-trainable params
9.5 M     Total params
38.112    Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




In [23]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
training_loss,0.73432
epoch,4.0
trainer/global_step,4949.0
_runtime,830.0
_timestamp,1624476135.0
_step,98.0


0,1
training_loss,█▆▇▃▄▄▃▃▆▅▅▄▅▅▅▅▅▆▅▃▅▆▅▆▃▆▇▆▅▅▃▄█▄▅▄▄▂▆▁
epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆████████
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
