<a href="https://colab.research.google.com/github/archyyu/GPT-from-MLP-to-RNN-to-Transformer/blob/main/studyLSTM1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import requests
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Set random seed for reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x7e13544b8330>

In [4]:
# Data I/O
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
#url = "https://raw.githubusercontent.com/archyyu/publicResource/main/google.dev.en"
#url = "https://raw.githubusercontent.com/tinygrad/tinygrad/master/tinygrad/tensor.py"
#url = "https://raw.githubusercontent.com/archyyu/publicResource/main/KDE4.en-es.en"
#url = "https://raw.githubusercontent.com/archyyu/publicResource/main/js"
response = requests.get(url)
data = response.text

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(f'data has {data_size} characters, {vocab_size} unique.')

char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

data has 1115394 characters, 65 unique.


In [38]:
# Hyperparameters
hidden_size = 100
embedding_dim = 50
seq_length = 25
learning_rate = 0.0005
batch_size = 20

In [41]:
import torch
import torch.nn as nn

class LSTMCell(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size):
    super(LSTMCell, self).__init__()
    self.emb = nn.Embedding(input_size, embedding_size)
    self.W_f = nn.Linear(embedding_size, hidden_size, bias=False)
    self.h_f = nn.Linear(hidden_size, hidden_size, bias=False)
    self.b_f = nn.Parameter(torch.zeros(hidden_size))

    # Parameters for Input Gate
    self.W_i = nn.Linear(embedding_size, hidden_size, bias=False)
    self.h_i = nn.Linear(hidden_size, hidden_size, bias=False)
    self.b_i = nn.Parameter(torch.zeros(hidden_size))

    # Parameters for Candidate Cell State
    self.W_C = nn.Linear(embedding_size, hidden_size, bias=False)
    self.h_C = nn.Linear(hidden_size, hidden_size, bias=False)
    self.b_C = nn.Parameter(torch.zeros(hidden_size))

    # Parameters for Output Gate
    self.W_o = nn.Linear(embedding_size, hidden_size, bias=False)
    self.h_o = nn.Linear(hidden_size, hidden_size, bias=False)
    self.b_o = nn.Parameter(torch.zeros(hidden_size))

    self.t_o = nn.Linear(hidden_size, input_size, bias=False)
    self.o_b = nn.Parameter(torch.zeros(input_size))

  def forward(self, x, h, C_prev):
    x = self.emb(x)
    f_t = torch.sigmoid(self.W_f(x) + self.h_f(h) + self.b_f)
    i_t = torch.sigmoid(self.W_i(x) + self.h_i(h) + self.b_i)
    C_tilde = torch.tanh(self.W_C(x) + self.h_C(h) + self.b_C)
    C_t = f_t * C_prev + i_t * C_tilde
    o_t = torch.sigmoid(self.W_o(x) + self.h_o(h) + self.b_o)
    h_t = o_t * torch.tanh(C_t)

    output = self.t_o(h_t) + self.o_b

    return output, h_t, C_t

# Loss function
criterion = nn.CrossEntropyLoss()

# Model initialization
model = LSTMCell(vocab_size, embedding_dim, hidden_size)
optimizer = optim.Adagrad(model.parameters(), lr=learning_rate)

In [8]:
def generateMiniBatch(start_idx):
  batch_inputs = []
  batch_targets = []

  # Generate examples for the current minibatch
  for i in range(batch_size):
    p = start_idx + i
    inputs = torch.tensor([char_to_ix[ch] for ch in data[p:p + seq_length]], dtype=torch.long).view(1, -1)
    targets = torch.tensor([char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]], dtype=torch.long).view(-1)

    batch_inputs.append(inputs)
    batch_targets.append(targets)

  # Convert lists to tensors
  minibatch_inputs = torch.cat(batch_inputs, dim=0)
  minibatch_targets = torch.stack(batch_targets)
  return minibatch_inputs, minibatch_targets

In [9]:
stopi = []
lossi = []

In [42]:
# Training loop
num_iterations = 10000
p = 0
for iteration in range(num_iterations):

  if p + seq_length + 1 > len(data):
    p = 0;

  inputs, targets = generateMiniBatch(p)

  optimizer.zero_grad()

  totalloss = 0
  hprev = torch.zeros(1, hidden_size)
  cprev = torch.zeros(1, hidden_size)
  for i in range(seq_length):
    input_char = inputs[:,i].unsqueeze(1)
    output_char = targets[:,i]

    predict_char, hprev, cprev = model(input_char, hprev, cprev)

    loss = criterion(predict_char.squeeze(1), output_char)
    totalloss += loss.item()

    loss.backward()
    hprev = hprev.detach()
    cprev = cprev.detach()

    for param in model.parameters():
      if param.grad is not None:
        param.grad.data.clamp_(-5, 5)

    optimizer.step()

  if iteration % 1000 == 0:
    print(f'Iteration {iteration}, Loss: {totalloss/seq_length}')
    stopi.append(iteration)
    lossi.append(totalloss/seq_length)

  p += seq_length  # Move data pointer

Iteration 0, Loss: 4.077442092895508
Iteration 1000, Loss: 2.5910097217559813
Iteration 2000, Loss: 2.535662069320679
Iteration 3000, Loss: 3.018662900924683
Iteration 4000, Loss: 2.9288361358642576
Iteration 5000, Loss: 2.444118938446045
Iteration 6000, Loss: 2.5324393367767333
Iteration 7000, Loss: 2.1878686141967774
Iteration 8000, Loss: 2.3422654914855956
Iteration 9000, Loss: 2.186244306564331


In [46]:
# Sample from the model
def sample(model, seed_ix, n):
  h = torch.zeros(1, hidden_size)
  c = torch.zeros(1, hidden_size)
  x = torch.tensor(seed_ix, dtype=torch.long).view(1, 1)
  ixes = []

  for _ in range(n):
    outputs, h, c = model(x, h, c)
    # p = nn.functional.softmax(outputs, dim=-1)
    # ix = torch.argmax(p).item()
    p = nn.functional.softmax(outputs, dim=-1).detach().numpy().ravel()
    ix = np.random.choice(range(vocab_size), p=p)
    x = torch.tensor(ix, dtype=torch.long).view(1, 1)

    if ix_to_char[ix] == '\n':
      print(''.join([ix_to_char[i] for i in ixes]))
      ixes = []
      continue

    ixes.append(ix)

  return ixes

sample(model, char_to_ix[data[0]], 2000)

os neth,
Moarl mf prit adfl sou
Mausd,
Wvhe t.lo.
Ner thered hoer mray fm.
Weinpad litut ur ne yot.

QUve that tal lfeitpis menw,
Sour nomyo hat se!s.
ganutor whe no I, id tis, dive ow o hams lpuilgot borfaiidir uod par.


T
YoMnbeas, Pinbstes anth ham tineg:
Sift aurd me atir, ball, tcye, beh me gal, den ,n gour ors micwot've!
se v whe tor thir: en,
Hen nod gy uopott fel mahis y yylt youw arbey, gicevcyd Kullitpt watheczns od han ay nodest oud path, agbertheaitd the w' wease:
Wheatel heme sen derce heay thalde bes pembGDfhe haced,,
Ct a d ade
T; awerer:
ay the fon Iol dwithe boorsiche irs fos the 'rt wath rstinortsdld s bnake, Ior gold hrthiowud, ste;:
NpOsAarsle vam.
Eihome yent yathet a,
As dilr Hans we Iey vers have s. lolacrSd dat  Lot ann
 ne oal mons,

Yam'Yor.
Toond Cllt woth hlort ae ncrpkie'to aAe
bin lyent MarCs thet us tuathigs.
TerelYlre, fouelsegl'dn,
CsAv, of woky I avet MirwiesI I ild Roat lart mand wal thom ou ,be wasruHrster woms my hawstn ouand Mel'vakg fit rshonce l

[2]

In [47]:
sample(model, char_to_ix[data[0]], 2000)

him I onendt yoce; wed bexwherHI thaw Tmell oome fanrd aed de bes comender ber all cev aTse.

Iod sef, nouee' whir kos t key plo the, malar,
Of witel
Thime,t per t ohpt mtiVey
Ocer gient gthe b by. herh soreae thig ceoeesth it.
Ant olr, therce lir
Whpini
Horen
dn ne daosl un the doetsr.
I fos old wourcd an milt yas, time mesiFsierwe therednwanche tmd, , what aoercem that hit misr'lde cois andse pent arld nrss, yaver; oumearo they pis theenge ayr hoave tfse ind s h tot for wer gerst p hathut'Ht wine releecin'ge isfd oHd kin, marem suEecord nondh oringal,
Whe anTin;M:hes,
the urrid hekelin' the ance, ghe ives hut wtorsetit matiravutgs 'grt tha mi,
SDsl ven hat my so thd'yy ou hit bime ond mir!y sas Ior
SALFIIU!foOfE   hlOd, hid, m-uy chers-e nocrd,  be ter,
Abe bmathe do yur -os wer, osheneat leakt hupensidhe ve de fothu thos eere,t ay bollathu pred alef foime thalshe nhltser,
Mce tothe fw themel d rome, f icau am fir, wall mondt he thainhel thit oart hag loow Irel, can v ethem lade ort;

[]