### Getting the shakspear data as a tensorflow dataset and converting it to simple text

In [138]:
import tensorflow as tf
import keras
import tensorflow_datasets as tfds
import numpy as np
import torch

Sample_dataset = tfds.load("tiny_shakespeare", split="train", try_gcs=True)
ds = tfds.as_dataframe(Sample_dataset)
text = ds.head(1)['text'][0]
text = ''.join([chr(x) for x in text])
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [139]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [140]:
print("length of dataset:",len(text))

length of dataset: 1003854


# finding what is the vocabulary

In [141]:
vocab = sorted(list(set(text)))
print("vocabulary:",vocab)
vocab_size = len(vocab)
print("vocab size:",vocab_size)

vocabulary: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
vocab size: 65


# Tokenizing the data

In [142]:
tokenization_dict = {x:idx for idx,x in enumerate(vocab)}
detokenization_dict = {idx:x for idx,x in enumerate(vocab)}
encode = lambda s:[tokenization_dict[x] for x in s]
decode = lambda i:[detokenization_dict[x] for x in i]
print("tokenization dictionary:",tokenization_dict)
print("detokenization dictionary:",detokenization_dict)

tokenization dictionary: {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
detokenization dictionary: {0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 4

# transforming into torch tensor and spliting to train-test datasets

In [143]:
import torch
data = torch.tensor(encode(text),dtype = torch.long)
print(data.shape,data.dtype)
print(data[:1000])

torch.Size([1003854]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [144]:
train_data = data[:int(len(data)*0.9)]
test_data = data[int(len(data)*0.9):]
print("train:",train_data[:100])
print("test:",test_data[:100])

train: tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])
test: tensor([21, 27, 10,  0, 32, 46, 39, 58,  1, 57, 46, 39, 50, 50,  1, 52, 53, 58,
         1, 40, 43,  1, 51, 59, 41, 46,  1, 39, 51, 47, 57, 57, 10,  1, 37, 43,
        58,  6,  1, 39, 57,  1, 58, 46, 43,  1, 51, 39, 58, 58, 43, 56,  0, 52,
        53, 61,  1, 57, 58, 39, 52, 42, 57,  6,  1, 46, 43,  1, 61, 47, 50, 50,
         1, 39, 60, 53, 47, 42,  1, 63, 53, 59, 56,  1, 39, 41, 41, 59, 57, 39,
        58, 47, 53, 52, 11,  1, 46, 43,  1, 51])


# creating a function that makes batches from the data

In [145]:
torch.manual_seed(11111)
batch_size = 128
seq_len = 150

def get_batch(Train = True):
  data = train_data if Train else test_data
  idxs = torch.randint(low =0,high = len(data)-seq_len-1,size = (batch_size,))
  Xs = torch.stack([data[i:i+seq_len] for i in idxs])
  Ys = torch.stack([data[i+1:i+seq_len+1] for i in idxs])
  return Xs,Ys
Xs,Ys = get_batch()
print(decode(Xs[0].numpy()),decode(Ys[0].numpy()))

['h', 'u', 'r', 'l', '!', ' ', 'd', 'r', 'u', 'n', 'k', ' ', 'a', 'l', 'l', ',', ' ', 'a', 'n', 'd', ' ', 'l', 'e', 'f', 't', ' ', 'n', 'o', ' ', 'f', 'r', 'i', 'e', 'n', 'd', 'l', 'y', ' ', 'd', 'r', 'o', 'p', '\n', 'T', 'o', ' ', 'h', 'e', 'l', 'p', ' ', 'm', 'e', ' ', 'a', 'f', 't', 'e', 'r', '?', ' ', 'I', ' ', 'w', 'i', 'l', 'l', ' ', 'k', 'i', 's', 's', ' ', 't', 'h', 'y', ' ', 'l', 'i', 'p', 's', ';', '\n', 'H', 'a', 'p', 'l', 'y', ' ', 's', 'o', 'm', 'e', ' ', 'p', 'o', 'i', 's', 'o', 'n', ' ', 'y', 'e', 't', ' ', 'd', 'o', 't', 'h', ' ', 'h', 'a', 'n', 'g', ' ', 'o', 'n', ' ', 't', 'h', 'e', 'm', ',', '\n', 'T', 'o', ' ', 'm', 'a', 'k', 'e', ' ', 'd', 'i', 'e', ' ', 'w', 'i', 't', 'h', ' ', 'a', ' ', 'r', 'e', 's', 't', 'o', 'r', 'a'] ['u', 'r', 'l', '!', ' ', 'd', 'r', 'u', 'n', 'k', ' ', 'a', 'l', 'l', ',', ' ', 'a', 'n', 'd', ' ', 'l', 'e', 'f', 't', ' ', 'n', 'o', ' ', 'f', 'r', 'i', 'e', 'n', 'd', 'l', 'y', ' ', 'd', 'r', 'o', 'p', '\n', 'T', 'o', ' ', 'h', 'e', 'l', 'p',

# positional embbeding

In [146]:
embed_size = 128
import math
from torch import nn
from torch.nn import functional as F

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()

        position = torch.arange(max_len,device = device).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2,device = device) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term).to(device)
        pe[0, :, 1::2] = torch.cos(position * div_term).to(device)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        """
        #print(self.pe[:,:x.shape[-2]].shape)
        x = x + self.pe[:,:x.shape[-2]]
        return x

In [147]:
embed = nn.Embedding(vocab_size,embed_size)
pos_embed = PositionalEncoding(embed_size,seq_len)
Xs,_ = get_batch()
embeded = embed(Xs)
final = pos_embed(embeded)
print(embeded[0])
print(final[0])

tensor([[ 0.2443, -0.7473, -0.5315,  ..., -0.4596, -1.3047, -0.0761],
        [ 0.2443, -0.7473, -0.5315,  ..., -0.4596, -1.3047, -0.0761],
        [ 0.8729, -0.9268, -2.4263,  ...,  0.5712, -0.0637, -2.9809],
        ...,
        [-0.3287,  1.1828,  1.3727,  ...,  1.4871,  0.0398, -0.9164],
        [ 0.0116,  0.8708,  1.5919,  ...,  0.3623,  0.1317,  0.6312],
        [-0.0321, -0.1118,  1.0336,  ...,  0.1925,  0.6394, -0.6297]],
       grad_fn=<SelectBackward0>)
tensor([[ 0.2443,  0.2527, -0.5315,  ...,  0.5404, -1.3047,  0.9239],
        [ 1.0857, -0.2070,  0.2302,  ...,  0.5404, -1.3046,  0.9239],
        [ 1.7822, -1.3430, -1.4392,  ...,  1.5712, -0.0635, -1.9809],
        ...,
        [ 0.2803,  0.3897,  2.3707,  ...,  2.4869,  0.0568,  0.0834],
        [-0.3268, -0.0702,  2.1912,  ...,  1.3621,  0.1488,  1.6311],
        [-1.0067, -0.3356,  0.8120,  ...,  1.1923,  0.6566,  0.3701]],
       grad_fn=<SelectBackward0>)


# self attention

In [148]:
class AttentionHead(nn.Module):

  def __init__(self,head_size,input_size):
    super().__init__()
    self.query_layer = nn.Linear(input_size,head_size)
    self.key_layer = nn.Linear(input_size,head_size)
    self.value_layer = nn.Linear(input_size,head_size)
    self.register_buffer('tril',torch.tril(torch.ones(seq_len,seq_len)))

  def forward(self,x):
    channel_size = x.shape[-1]
    query = self.query_layer(x)
    key = self.key_layer(x)
    attention_matrix = query @ torch.transpose(key,-1,-2) * (channel_size ** -0.5)
    masked_attention = attention_matrix.masked_fill(self.tril[:x.shape[-2],:x.shape[-2]] == 0, float('-inf'))
    final_matrix = F.softmax(masked_attention,dim = -1)
    value = self.value_layer(x)
    out = final_matrix @ value
    return out

In [149]:
head_size = embed_size
print(head_size)
embed = nn.Embedding(vocab_size,embed_size)
pos_embed = PositionalEncoding(embed_size,seq_len)
Xs,_ = get_batch()
embeded = embed(Xs)
final = pos_embed(embeded)
Attention = AttentionHead(head_size,embed_size)
att = Attention(final)
print(att.shape)

128
torch.Size([128, 150, 128])


# Multi-Headed-self-Attention

In [150]:
class MultiHeadedAttention(nn.Module):
  def __init__(self,num_heads,head_size,input_size):
    super().__init__()
    self.heads = nn.ModuleList([AttentionHead(head_size,input_size) for i in range(num_heads)])

  def forward(self,x):
    out = torch.cat([h(x) for h in self.heads],dim = -1)
    return out

In [151]:
head_size = embed_size
embed = nn.Embedding(vocab_size,embed_size).to(device)
pos_embed = PositionalEncoding(embed_size,seq_len).to(device)
Xs,_ = get_batch()
embeded = embed(Xs.to(device))
final = pos_embed(embeded)
MHAttention = MultiHeadedAttention(num_heads = 4,head_size = head_size,input_size = embed_size).to(device)
att = MHAttention(final)
print(att.shape)

torch.Size([128, 150, 512])


# Layer Norm

In [152]:
class LayerNorm1d: # (used to be BatchNorm1d)
  
  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim,device = device)
    self.beta = torch.zeros(dim,device = device)
  
  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out
  
  def parameters(self):
    return [self.gamma, self.beta]

# Decoder-only-Block

In [153]:
class DecoderOnlyBlock(nn.Module):
  def __init__(self,num_heads,head_size,input_size,output_size):
    super().__init__()
    self.ff = nn.Sequential(nn.Linear(num_heads*head_size,4*output_size),nn.ReLU(),nn.Linear(4*output_size,output_size))
    self.MHA = MultiHeadedAttention(num_heads,head_size,input_size)
    self.lnl = LayerNorm1d(num_heads*head_size)


  def forward(self,x):
    out = x + self.MHA(self.lnl(x))
    out = out + self.ff(self.lnl(out))
    return out

In [154]:
head_size = embed_size
embed = nn.Embedding(vocab_size,embed_size).to(device)
pos_embed = PositionalEncoding(embed_size,seq_len).to(device)
Xs,_ = get_batch()
embeded = embed(Xs.to(device))
final = pos_embed(embeded)
DEC_BLOCK = DecoderOnlyBlock(num_heads = 4,head_size = head_size//4,input_size = embed_size,output_size =embed_size).to(device)
att = DEC_BLOCK(final)
print(att.shape)

torch.Size([128, 150, 128])


# Decoder only

In [155]:
class DecoderOnly(nn.Module):
  def __init__(self,num_blocks,num_heads,head_size,block_input_size,block_output_size,vocab_size,embed_size,seq_len):
    super().__init__()
    self.embed = nn.Embedding(vocab_size,embed_size)
    self.pos_embed = PositionalEncoding(embed_size,seq_len)
    self.dec_block1 = DecoderOnlyBlock(num_heads = num_heads,head_size = head_size,input_size = block_input_size,output_size =block_output_size)
    self.dec_block2 = DecoderOnlyBlock(num_heads = num_heads,head_size = head_size,input_size = block_input_size,output_size =block_output_size)
    self.dec_block3 = DecoderOnlyBlock(num_heads = num_heads,head_size = head_size,input_size = block_input_size,output_size =block_output_size)
    self.dec_block4 = DecoderOnlyBlock(num_heads = num_heads,head_size = head_size,input_size = block_input_size,output_size =block_output_size)
    self.L = nn.Linear(block_output_size,vocab_size)
    self.seq_len = seq_len


  def forward(self,x):
    out = self.embed(x)
    out = self.pos_embed(out)
    out = self.dec_block1(out)
    out = self.dec_block2(out)
    out = self.dec_block3(out)
    out = self.dec_block4(out)
    out = self.L(out)
    return out

  def generate(self,context,num_chars):
    print(context)
    for i in range(num_chars):
      trimmed_context = context[-seq_len:]
      next_char = torch.argmax(self.forward(trimmed_context),dim = -1)[-1][-1]
      context = torch.cat((context,torch.unsqueeze(next_char,0)),dim = -1)
    return context


In [156]:
head_size = embed_size
Xs,Ys = get_batch()
DEC = DecoderOnly(num_blocks = 10, num_heads = 16,head_size = head_size//16,block_input_size = embed_size,block_output_size = embed_size,vocab_size=vocab_size,embed_size = embed_size,seq_len = seq_len).to(device)
out = DEC(Xs.to(device))
print(out.shape)

torch.Size([128, 150, 65])


In [157]:
Xs,_ = get_batch()
char_seq = Xs[0].to(device)
print(DEC.generate(char_seq,20))

tensor([53, 61,  1, 51, 43,  1, 39, 50, 47, 60, 43,  6,  0, 35, 46, 43, 56, 43,
         1, 21,  1, 57, 46, 39, 50, 50,  1, 49, 52, 43, 43, 50,  1, 58, 53,  1,
        46, 47, 51,  1, 58, 46, 39, 58,  1, 57, 50, 43, 61,  1, 51, 63,  1, 44,
        39, 58, 46, 43, 56,  2,  0,  0, 23, 21, 26, 19,  1, 20, 17, 26, 30, 37,
         1, 34, 21, 10,  0, 27,  1, 15, 50, 47, 44, 44, 53, 56, 42,  6,  1, 46,
        53, 61,  1, 58, 46, 63,  1, 61, 53, 56, 42, 57,  1, 56, 43, 60, 47, 60,
        43,  1, 51, 63,  1, 46, 43, 39, 56, 58,  2,  0,  0, 37, 27, 30, 23, 10,
         0, 20, 43, 52, 56, 63,  1, 53, 44,  1, 24, 39, 52, 41, 39, 57, 58, 43,
        56,  6,  1, 56, 43, 57], device='cuda:0')
tensor([53, 61,  1, 51, 43,  1, 39, 50, 47, 60, 43,  6,  0, 35, 46, 43, 56, 43,
         1, 21,  1, 57, 46, 39, 50, 50,  1, 49, 52, 43, 43, 50,  1, 58, 53,  1,
        46, 47, 51,  1, 58, 46, 39, 58,  1, 57, 50, 43, 61,  1, 51, 63,  1, 44,
        39, 58, 46, 43, 56,  2,  0,  0, 23, 21, 26, 19,  1, 20, 17, 26

# Training

we shall add weights to the letters because the data is not ditributed equally, e.g: there are far more spaces than 'z's

In [158]:
import torch
import math
weights = torch.tensor([text.count(x)**-0.7 for x in vocab])
print(weights)
Ys = Ys.to(device)
flatYs = torch.flatten(Ys,0,1)
flatout = torch.flatten(out,0,1)
print(flatout.device)
print(flatYs.device)
loss_fn = torch.nn.CrossEntropyLoss(weight=weights).to(device)
print(Ys.shape)
print(out.shape)
loss = loss_fn(flatout, flatYs)
print(loss)

tensor([6.5257e-04, 2.3451e-04, 5.0424e-03, 1.0000e+00, 4.6346e-01, 2.3900e-03,
        1.0625e-03, 5.5106e-03, 2.0426e-03, 9.9551e-02, 1.7004e-03, 3.4951e-03,
        4.6168e-03, 2.1092e-03, 4.2813e-03, 3.3181e-03, 4.9932e-03, 2.3882e-03,
        5.4992e-03, 4.5902e-03, 4.0143e-03, 1.5481e-03, 1.7951e-02, 6.2236e-03,
        3.2422e-03, 4.0529e-03, 2.8233e-03, 2.7457e-03, 6.9915e-03, 2.2222e-02,
        2.8920e-03, 2.9849e-03, 2.2739e-03, 3.6332e-03, 9.7518e-03, 3.5096e-03,
        3.6775e-02, 5.5993e-03, 2.8525e-02, 5.1575e-04, 1.5592e-03, 1.2435e-03,
        7.6210e-04, 3.5289e-04, 1.2336e-03, 1.3876e-03, 5.4138e-04, 5.9043e-04,
        1.1520e-02, 2.1675e-03, 7.3047e-04, 9.7671e-04, 5.6344e-04, 4.5471e-04,
        1.6052e-03, 1.1875e-02, 5.5939e-04, 5.5459e-04, 4.5015e-04, 8.5735e-04,
        2.0147e-03, 1.1529e-03, 1.3278e-02, 1.0343e-03, 1.7636e-02])
cuda:0
cuda:0
torch.Size([128, 150])
torch.Size([128, 150, 65])
tensor(4.5190, device='cuda:0', grad_fn=<NllLossBackward0>)


In [159]:
def get_loss(logits,labels,fn):
  return fn(torch.flatten(logits,0,1), torch.flatten(labels,0,1))

In [160]:
print(get_loss(out,Ys,torch.nn.CrossEntropyLoss()))

tensor(4.3998, device='cuda:0', grad_fn=<NllLossBackward0>)


In [161]:
def eval(model,loss_fn,Train = False):
  Xs,Ys = get_batch(Train = Train)
  return get_loss(model(Xs.to(device)),Ys.to(device),loss_fn)

In [162]:
print(eval(DEC,loss_fn))

tensor(4.4519, device='cuda:0', grad_fn=<NllLossBackward0>)


In [163]:
def train_step(model,loss_fn,optimizer):
  Xs,Ys = get_batch()
  loss = get_loss(model(Xs.to(device)),Ys.to(device),loss_fn)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

In [164]:
opt = torch.optim.Adagrad(DEC.parameters())
d = DEC.to(device)
print(next(d.parameters()).device)
train_step(d,loss_fn,opt)

cuda:0


In [165]:
import time
def train_loop(model,loss_fn,optimizer,epochs,epoch_length = 1000):
  for i in range(epochs):
    start_time = time.time()
    for j in range(epoch_length):
      train_step(model,loss_fn,optimizer)
    print("epoch: ",i," train loss: ", eval(model,loss_fn,Train = True)," test loss: ", eval(model,loss_fn), " time of execution:",("--- %s seconds ---" % (time.time() - start_time)))

In [170]:
train_loop(DEC,loss_fn,opt,epochs = 3)

epoch:  0  train loss:  tensor(0.9968, device='cuda:0', grad_fn=<NllLossBackward0>)  test loss:  tensor(1.2496, device='cuda:0', grad_fn=<NllLossBackward0>)  time of execution: --- 237.2575409412384 seconds ---
epoch:  1  train loss:  tensor(0.8435, device='cuda:0', grad_fn=<NllLossBackward0>)  test loss:  tensor(1.1251, device='cuda:0', grad_fn=<NllLossBackward0>)  time of execution: --- 238.26756358146667 seconds ---
epoch:  2  train loss:  tensor(0.7991, device='cuda:0', grad_fn=<NllLossBackward0>)  test loss:  tensor(1.0506, device='cuda:0', grad_fn=<NllLossBackward0>)  time of execution: --- 238.4426748752594 seconds ---


In [173]:
Xs,_ = get_batch()
char_seq = Xs[0].to(device)
print(DEC.generate(char_seq,20))

tensor([ 1, 57, 53, 52,  6,  1, 39, 52, 42,  1, 46, 53, 51, 43, 50, 63,  1, 47,
        52,  1, 58, 46, 63,  1, 42, 56, 47, 44, 58, 11,  0, 30, 47, 42, 42, 50,
        47, 52, 45,  1, 41, 53, 52, 44, 43, 57, 57, 47, 53, 52,  1, 44, 47, 52,
        42, 57,  1, 40, 59, 58,  1, 56, 47, 42, 42, 50, 47, 52, 45,  1, 57, 46,
        56, 47, 44, 58,  8,  0,  0, 30, 27, 25, 17, 27, 10,  0, 32, 46, 43, 52,
         1, 54, 50, 39, 47, 52, 50, 63,  1, 49, 52, 53, 61,  1, 51, 63,  1, 46,
        43, 39, 56, 58,  5, 57,  1, 42, 43, 39, 56,  1, 50, 53, 60, 43,  1, 47,
        57,  1, 57, 43, 58,  0, 27, 52,  1, 58, 46, 43,  1, 44, 39, 47, 56,  1,
        42, 39, 59, 45, 46, 58], device='cuda:0')
tensor([ 1, 57, 53, 52,  6,  1, 39, 52, 42,  1, 46, 53, 51, 43, 50, 63,  1, 47,
        52,  1, 58, 46, 63,  1, 42, 56, 47, 44, 58, 11,  0, 30, 47, 42, 42, 50,
        47, 52, 45,  1, 41, 53, 52, 44, 43, 57, 57, 47, 53, 52,  1, 44, 47, 52,
        42, 57,  1, 40, 59, 58,  1, 56, 47, 42, 42, 50, 47, 52, 45,  1

In [174]:
print("output:\n"+''.join(decode(DEC.generate(char_seq,150).cpu().numpy()))+'\n')

tensor([ 1, 57, 53, 52,  6,  1, 39, 52, 42,  1, 46, 53, 51, 43, 50, 63,  1, 47,
        52,  1, 58, 46, 63,  1, 42, 56, 47, 44, 58, 11,  0, 30, 47, 42, 42, 50,
        47, 52, 45,  1, 41, 53, 52, 44, 43, 57, 57, 47, 53, 52,  1, 44, 47, 52,
        42, 57,  1, 40, 59, 58,  1, 56, 47, 42, 42, 50, 47, 52, 45,  1, 57, 46,
        56, 47, 44, 58,  8,  0,  0, 30, 27, 25, 17, 27, 10,  0, 32, 46, 43, 52,
         1, 54, 50, 39, 47, 52, 50, 63,  1, 49, 52, 53, 61,  1, 51, 63,  1, 46,
        43, 39, 56, 58,  5, 57,  1, 42, 43, 39, 56,  1, 50, 53, 60, 43,  1, 47,
        57,  1, 57, 43, 58,  0, 27, 52,  1, 58, 46, 43,  1, 44, 39, 47, 56,  1,
        42, 39, 59, 45, 46, 58], device='cuda:0')
output:
 son, and homely in thy drift;
Riddling confession finds but riddling shrift.

ROMEO:
Then plainly know my heart's dear love is set
On the fair daught wingh.

JULIET:
Whill givengut willl, wingh spour win benger's.
AUF shat sour Mand Jull's justy Palingaces,
For Dus sollive ston ther there wingung w



In [176]:
print("output:\n"+''.join(decode(DEC.generate(char_seq,150).cpu().numpy()))+'\n')
print("original:\n"+''.join(decode(char_seq.cpu().numpy()))+'\n')

tensor([ 1, 57, 53, 52,  6,  1, 39, 52, 42,  1, 46, 53, 51, 43, 50, 63,  1, 47,
        52,  1, 58, 46, 63,  1, 42, 56, 47, 44, 58, 11,  0, 30, 47, 42, 42, 50,
        47, 52, 45,  1, 41, 53, 52, 44, 43, 57, 57, 47, 53, 52,  1, 44, 47, 52,
        42, 57,  1, 40, 59, 58,  1, 56, 47, 42, 42, 50, 47, 52, 45,  1, 57, 46,
        56, 47, 44, 58,  8,  0,  0, 30, 27, 25, 17, 27, 10,  0, 32, 46, 43, 52,
         1, 54, 50, 39, 47, 52, 50, 63,  1, 49, 52, 53, 61,  1, 51, 63,  1, 46,
        43, 39, 56, 58,  5, 57,  1, 42, 43, 39, 56,  1, 50, 53, 60, 43,  1, 47,
        57,  1, 57, 43, 58,  0, 27, 52,  1, 58, 46, 43,  1, 44, 39, 47, 56,  1,
        42, 39, 59, 45, 46, 58], device='cuda:0')
output:
 son, and homely in thy drift;
Riddling confession finds but riddling shrift.

ROMEO:
Then plainly know my heart's dear love is set
On the fair daught wingh.

JULIET:
Whill givengut willl, wingh spour win benger's.
AUF shat sour Mand Jull's justy Palingaces,
For Dus sollive ston ther there wingung w

