In [1]:
import string
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output
from random import sample
import json
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import tqdm

In [2]:
POETRY_FILE_PATH = "./sonnets.txt"

# Data loading

In [3]:
with open(POETRY_FILE_PATH, 'r', encoding="utf-8") as iofile:
    text = iofile.readlines()

TEXT_START = 45
TEXT_END = -368

text = text[TEXT_START:TEXT_END]
text[:10]

['  From fairest creatures we desire increase,\n',
 "  That thereby beauty's rose might never die,\n",
 '  But as the riper should by time decease,\n',
 '  His tender heir might bear his memory:\n',
 '  But thou, contracted to thine own bright eyes,\n',
 "  Feed'st thy light's flame with self-substantial fuel,\n",
 '  Making a famine where abundance lies,\n',
 '  Thy self thy foe, to thy sweet self too cruel:\n',
 "  Thou that art now the world's fresh ornament,\n",
 '  And only herald to the gaudy spring,\n']

In [4]:
text = [sentence.lower() for sentence in text] # Your great code here
text = "".join(text)

In [5]:
text[:300]

"  from fairest creatures we desire increase,\n  that thereby beauty's rose might never die,\n  but as the riper should by time decease,\n  his tender heir might bear his memory:\n  but thou, contracted to thine own bright eyes,\n  feed'st thy light's flame with self-substantial fuel,\n  making a famine wh"

In [6]:
import math
from torch.utils.data import Dataset

class CharDataset(Dataset):

    def __init__(self, data, block_size):
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))
        
        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data
    
    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        """
        arrange data and targets so that the first i elements of x
        will be asked to predict the i-th element of y. Notice that
        the eventual language model will actually make block_size
        individual predictions at the same time based on this data,
        so we are being clever and amortizing the cost of the forward
        pass of the network. So for example if block_size is 4, then
        we could e.g. sample a chunk of text "hello", the integers in
        x will correspond to "hell" and in y will be "ello". This will
        then actually "multitask" 4 separate examples at the same time
        in the language model:
        - given just "h", please predict "e" as next
        - given "he" please predict "l" next
        - given "hel" predict "l" next
        - given "hell" predict "o" next
        
        In addition, because the DataLoader will create batches of examples,
        every forward/backward pass during traning will simultaneously train
        a LOT of predictions, amortizing a lot of computation. In particular,
        for a batched input of integers X (B, T) where B is batch size and
        T is block_size and Y (B, T), the network will during training be
        simultaneously training to make B*T predictions, all at once! Of course,
        at test time we can paralellize across batch B, but unlike during training
        we cannot parallelize across the time dimension T - we have to run
        a forward pass of the network to recover the next single character of the 
        sequence along each batch dimension, and repeatedly always feed in a next
        character to get the next one.
        
        So yes there is a big asymmetry between train/test time of autoregressive
        models. During training we can go B*T at a time with every forward pass,
        but during test time we can only go B at a time, T times, with T forward 
        passes.
        """
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [8]:
block_size = 16

In [9]:
train_dataset = CharDataset(text, block_size)

data has 100225 characters, 38 unique.


# Learning

[Модель](https://github.com/karpathy/minGPT)

In [10]:
import torch, torch.nn as nn
import torch.nn.functional as F

In [11]:
from minGPT import GPT, GPTConfig
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=8, n_head=8, n_embd=512)
model = GPT(mconf)

In [12]:
from minGPT import Trainer, TrainerConfig

# initialize a trainer instance and kick off training
tconf = TrainerConfig(max_epochs=5, batch_size=512//2, learning_rate=3e-4,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,
                      num_workers=0)
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

epoch 1 iter 391: train loss 1.64276. lr 1.507548e-04: 100%|█████████████████████████| 392/392 [03:11<00:00,  2.05it/s]
epoch 2 iter 391: train loss 1.45182. lr 3.000000e-05: 100%|█████████████████████████| 392/392 [03:10<00:00,  2.06it/s]
epoch 3 iter 391: train loss 1.42399. lr 1.507548e-04: 100%|█████████████████████████| 392/392 [03:10<00:00,  2.06it/s]
epoch 4 iter 391: train loss 1.36966. lr 2.999924e-04: 100%|█████████████████████████| 392/392 [03:11<00:00,  2.05it/s]
epoch 5 iter 391: train loss 1.19551. lr 1.477356e-04: 100%|█████████████████████████| 392/392 [03:10<00:00,  2.06it/s]


# Generating poems

In [23]:
from minGPT_utils import sample

context = " i"
x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)
y = sample(model, x, 500, temperature=0.1, sample=True, top_k=10)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print('temperature = 0.1:')
print(completion)

temperature = 0.1:
 i see the sun, and straight the straight to the state of the sun,
  and summer's length to see the sun, and straight to the see the strong,
  and that i this shall i death the sun, and straight the straight to the see the strong offence's created still the world's eyes doth the state,
    to the world's eyes doth the state,
    to the sea, that thou thy self away, and the beauty's summer's delight.
    the world's eyes doth the state,
    the store; the sun, though the sun of the state
    to sho


In [24]:
y = sample(model, x, 500, temperature=0.5, sample=True, top_k=10)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print('temperature = 0.5:')
print(completion)

temperature = 0.5:
 i will come to catch
  that thou to make the saw my self with self in eyes behold, the tomb of that strangely bright days,
  and thou thy self alone,
  since his strong that have i saw my self thou see'st the time do i not so fair subject that the summer's flowers are beauty's dead and play the that thou this shadow shadow it the still,
  where is the time;
  and then thou art the basest that shall face she knows not to the show thee how with this the time, despite the remover to the world's eyes


In [25]:
y = sample(model, x, 500, temperature=1.0, sample=True, top_k=10)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print('temperature = 1.0:')
print(completion)

temperature = 1.0:
 is in my advis'd fair loving what this becoming of their spirit? gave not the excuse the believed to make my way.
  for love's fire shall best by side, o that says 'in himponing shall have speen:
    so then my pays:
    till i speed's and wrinkles stave, to well-my death truth it thus, that first confound
  a happy made thee so believe thence it of weaks, thy picture's me, not, the like to the sensions of still,
  so thou, that i may state,
  shall those fickled stay,
  and by that i i hold,
  a
