## Set device to GPU

In [1]:
import pynvml

def get_memory_free_MiB(gpu_index):
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(int(gpu_index))
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.free // 1024 ** 2

In [2]:
for i in range(4):
    print(f'GPU {i} available vram {get_memory_free_MiB(i)}')

GPU 0 available vram 11003
GPU 1 available vram 11003
GPU 2 available vram 11003
GPU 3 available vram 11003


In [3]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("CUDA is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

CUDA is available. Using GPU.


In [4]:
torch.cuda.set_device(1)

In [5]:
torch.cuda.current_device()

1

In [6]:
tensor = torch.randn(3, 3, device=device)

print(f"Tensor is on: {tensor.device}")

Tensor is on: cuda:1


In [7]:
torch.cuda.get_device_name(1)

'NVIDIA GeForce RTX 2080 Ti'

In [8]:
import torchtext
from torch import nn, optim
import torchtext, datasets, math
from tqdm import tqdm

datasets.config.HF_DATASETS_CACHE = "./downloaded_datasets"

torchtext.disable_torchtext_deprecation_warning()

In [9]:
# This is for reproducability 

SEED = 69
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Load data - Maximofn/short-jokes

In [10]:
dataset_name = "Maximofn/short-jokes-dataset"

In [11]:
dataset_config = datasets.get_dataset_config_names(dataset_name)
dataset_config

['default']

In [12]:
dataset = datasets.load_dataset(dataset_name)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'Joke'],
        num_rows: 231657
    })
})


### split the dataset into train and test

In [13]:
train_test_split  = dataset['train'].train_test_split(test_size=0.3, seed=SEED)
validation_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=SEED)

# recombine into DatasetDict for easier usage
split_dataset = {
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"],
}
split_dataset = datasets.DatasetDict(split_dataset)

In [14]:
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'Joke'],
        num_rows: 162159
    })
    validation: Dataset({
        features: ['ID', 'Joke'],
        num_rows: 34749
    })
    test: Dataset({
        features: ['ID', 'Joke'],
        num_rows: 34749
    })
})


In [15]:
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]
validation_dataset = split_dataset['validation']

In [16]:
train_dataset.shape, test_dataset.shape, validation_dataset.shape

((162159, 2), (34749, 2), (34749, 2))

## Preprocessing

In [17]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')

column_name = 'Joke'

def tokenize_data(example):
    return {'tokens': tokenizer(example[column_name])}

#  applies the tokenize_data function to every example in the dataset 
#  and removes the original 'Joke' column.
tokenized_train_dataset = train_dataset.map(tokenize_data, remove_columns=[column_name])
tokenized_test_dataset = test_dataset.map(tokenize_data, remove_columns=[column_name])
tokenized_validation_dataset = validation_dataset.map(tokenize_data, remove_columns=[column_name])

In [18]:
print(tokenized_train_dataset[1]['tokens'])
print(tokenized_test_dataset[1]['tokens'])
print(tokenized_validation_dataset[1]['tokens'])

['what', "'", 's', 'the', 'difference', 'between', 'two', 'dicks', 'and', 'a', 'joke', '?', 'your', 'mom', 'can', "'", 't', 'take', 'a', 'joke']
['since', 'the', 'snow', 'came', ',', 'all', 'the', 'wife', 'has', 'done', 'is', 'look', 'through', 'the', 'window', '.', 'if', 'it', 'gets', 'any', 'worse', ',', 'i', "'", 'll', 'have', 'to', 'let', 'her', 'in', '.']
['what', 'do', 'they', 'say', 'about', 'a', 'rapper', 'who', 'stole', 'a', 'dictionary', '?', 'he', 'got', 'a', 'way', 'with', 'words', '.']


## Numericalizing

In [19]:
from torchtext.vocab import build_vocab_from_iterator

vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_train_dataset['tokens'], min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])

In [20]:
print(len(vocab))

25164


In [21]:
print(vocab.get_itos()[:10])

['<unk>', '<eos>', '.', 'a', 'the', "'", 'i', '?', ',', 'you']


## Prepare batch loader

In [22]:
def get_data(dataset, vocab, batch_size):
    data = []
    for example in dataset:
        if example['tokens']:
            tokens = example['tokens'].append('<eos>')
            tokens = [vocab[token] for token in example['tokens']]
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len]

In [23]:
batch_size = 128
train_data = get_data(tokenized_train_dataset, vocab, batch_size)
valid_data = get_data(tokenized_validation_dataset, vocab, batch_size)
test_data  = get_data(tokenized_test_dataset,  vocab, batch_size)

In [24]:
train_data.shape, valid_data.shape, test_data.shape

(torch.Size([128, 28444]), torch.Size([128, 6103]), torch.Size([128, 6107]))

## Modelling

In [25]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,   
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh
    
    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
        
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #harry potter is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

## Training

In [26]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3     

In [27]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 68,354,636 trainable parameters


In [28]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [29]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    # reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        # hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        # need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
        
    return epoch_loss / num_batches

In [30]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
            
    return epoch_loss / num_batches

In [31]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time % 60)
    elapsed_millis = int((elapsed_time - int(elapsed_time)) * 1000)  # get milliseconds
    return elapsed_mins, elapsed_secs, elapsed_millis

In [32]:
import time

n_epochs = 15
seq_len  = 50 #<----decoding length
clip    = 0.25
total_training_time = 0

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    start = time.time()
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)
    end = time.time()
    epoch_mins, epoch_secs, epoch_millis = epoch_time(start, end)
    total_training_time += (end - start)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')
    print(f'\tEpoch: {epoch + 1} | time: {epoch_mins}m {epoch_secs}s {epoch_millis}ms')
    print(f'\ttrain_loss: {train_loss:.6f} | validation_loss: {valid_loss:.6f}')

total_mins, total_secs, total_millis = epoch_time(0, total_training_time)
print(f"Total training time: {total_mins}m {total_secs}s {total_millis}ms")

                                                           

	Train Perplexity: 296.840
	Valid Perplexity: 124.718
	Epoch: 1 | time: 1m 47s 229ms
	train_loss: 5.693194 | validation_loss: 4.826053


                                                           

	Train Perplexity: 123.851
	Valid Perplexity: 89.654
	Epoch: 2 | time: 1m 48s 779ms
	train_loss: 4.819082 | validation_loss: 4.495953


                                                           

	Train Perplexity: 97.398
	Valid Perplexity: 76.277
	Epoch: 3 | time: 1m 49s 334ms
	train_loss: 4.578803 | validation_loss: 4.334376


                                                           

	Train Perplexity: 83.877
	Valid Perplexity: 68.745
	Epoch: 4 | time: 1m 49s 265ms
	train_loss: 4.429352 | validation_loss: 4.230399


                                                           

	Train Perplexity: 75.182
	Valid Perplexity: 63.877
	Epoch: 5 | time: 1m 49s 482ms
	train_loss: 4.319907 | validation_loss: 4.156963


                                                           

	Train Perplexity: 68.964
	Valid Perplexity: 60.471
	Epoch: 6 | time: 1m 49s 751ms
	train_loss: 4.233587 | validation_loss: 4.102163


                                                           

	Train Perplexity: 64.242
	Valid Perplexity: 57.803
	Epoch: 7 | time: 1m 50s 239ms
	train_loss: 4.162652 | validation_loss: 4.057048


                                                           

	Train Perplexity: 60.464
	Valid Perplexity: 55.922
	Epoch: 8 | time: 1m 49s 295ms
	train_loss: 4.102053 | validation_loss: 4.023961


                                                           

	Train Perplexity: 57.302
	Valid Perplexity: 54.298
	Epoch: 9 | time: 1m 49s 574ms
	train_loss: 4.048328 | validation_loss: 3.994485


                                                           

	Train Perplexity: 54.686
	Valid Perplexity: 53.127
	Epoch: 10 | time: 1m 49s 423ms
	train_loss: 4.001607 | validation_loss: 3.972692


                                                           

	Train Perplexity: 52.483
	Valid Perplexity: 52.182
	Epoch: 11 | time: 1m 49s 310ms
	train_loss: 3.960483 | validation_loss: 3.954732


                                                           

	Train Perplexity: 50.557
	Valid Perplexity: 51.306
	Epoch: 12 | time: 1m 49s 409ms
	train_loss: 3.923104 | validation_loss: 3.937802


                                                           

	Train Perplexity: 48.912
	Valid Perplexity: 50.696
	Epoch: 13 | time: 1m 49s 351ms
	train_loss: 3.890024 | validation_loss: 3.925855


                                                           

	Train Perplexity: 47.409
	Valid Perplexity: 50.228
	Epoch: 14 | time: 1m 49s 726ms
	train_loss: 3.858806 | validation_loss: 3.916581


                                                           

	Train Perplexity: 46.104
	Valid Perplexity: 49.846
	Epoch: 15 | time: 1m 49s 673ms
	train_loss: 3.830908 | validation_loss: 3.908940
Total training time: 27m 19s 846ms


## Testing

In [33]:
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 49.433


## real-world inference

In [34]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [35]:
prompt = 'You are very smart but '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
you are very smart but you ' re gay

0.7
you are very smart but you ' re gay

0.75
you are very smart but you ' re gay

0.8
you are very smart but you ' re gay

1.0
you are very smart but you just need to see you .

