<a href="https://colab.research.google.com/github/azilya/torch_tutorials/blob/main/lstm_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports

In [1]:
import re
from string import punctuation

import nltk
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# nltk.download("punkt")


### Downloading a text to train on
For Russian there is an [official open collection of EPUB books](https://www.culture.ru/literature/books). The books have to be converted to `.txt` before they can be parsed, e.g.:

```sh
wget https://cdn.culture.ru/files/f833392d-c53f-5a39-a1c7-b4611b2ffdd6/idiot
epub2txt -r idiot > idiot.txt
```



Preprocess text.

We replace punctuation with spaces to simplify prediction, keeping only `.` so that the model can still learn to predict the end of a sentence.

In [None]:
# Characters to strip: ASCII punctuation without "." (kept so sentence ends
# remain in the text) plus typographic quotes and dashes.
# `str.replace` returns a new string, so the result is bound to its own name
# rather than used as the target of `+=` (which is a SyntaxError on a call).
PUNCT_CHARS = punctuation.replace(".", "") + "«»„“”‛’—–‒"
# Single character class; `re.escape` protects "]", "\", "^" and "-".
RE_PUNCT = re.compile(f"[{re.escape(PUNCT_CHARS)}]")


def clean(line):
  """Split `line` into sentences and normalise each of them.

  Every sentence produced by NLTK's Russian sentence tokenizer has the
  characters matched by `RE_PUNCT` replaced with spaces, runs of whitespace
  collapsed to a single space, and is lower-cased and stripped.

  Returns:
    list of cleaned sentence strings, in their original order.
  """
  cleaned = []
  for sentence in nltk.sent_tokenize(line, language="russian"):
    without_punct = RE_PUNCT.sub(" ", sentence)
    single_spaced = re.sub(r"\s+", " ", without_punct)
    cleaned.append(single_spaced.lower().strip())
  return cleaned


# Number of leading lines of idiot.txt that hold the header (annotation etc.).
HEADER_LINES = 15

sentences = []
# Explicit encoding: the platform default is locale-dependent and may fail to
# decode Cyrillic text. Assumes the converted file is UTF-8 - TODO confirm if
# a different converter is used.
with open("idiot.txt", encoding="utf-8") as corpus:
  for i, line in tqdm(enumerate(corpus)):
    if i < HEADER_LINES:
      continue
    # skip blank / whitespace-only lines
    if line.strip():
      sentences.extend(clean(line))

# The whole corpus as one string, cleaned sentences joined by a single space.
preprocessed_text = " ".join(sentences)
# Sorted so that a character's position in the list is stable between runs.
all_chars = sorted(set(preprocessed_text))
# Length, in characters, of each input window.
seq_len = 100


In [5]:
# Seed NumPy and PyTorch (CPU and CUDA) with one shared value so that
# re-running the notebook from a fresh kernel repeats the same random draws.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

class CharModel(nn.Module):
    """Character-level next-character predictor.

    Pipeline: embedding -> 2-layer LSTM (dropout 0.2 between layers) ->
    dropout 0.2 -> linear projection onto the vocabulary.

    Args:
        vocab_size: number of distinct characters; sizes both the embedding
            table and the output layer.
        emb_size: dimensionality of a character embedding.
        hid_size: hidden size of the LSTM layers (default 256).
    """

    def __init__(self, vocab_size, emb_size, hid_size=256) -> None:
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size)
        self.lstm = nn.LSTM(
            input_size=emb_size,
            hidden_size=hid_size,
            num_layers=2,
            dropout=0.2,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=0.2)
        self.linear = nn.Linear(in_features=hid_size, out_features=vocab_size)

    def forward(self, x):
        """Compute logits over the vocabulary for each sequence in the batch.

        Args:
            x: integer character indices, batch dimension first
                (the LSTM is built with `batch_first=True`).

        Returns:
            Tensor with one row of `vocab_size` logits per input sequence.
        """
        embedded = self.emb(x)
        outputs, _ = self.lstm(embedded)
        # Only the output at the final time step is used for the prediction.
        last_step = outputs[:, -1, :]
        return self.linear(self.dropout(last_step))


# Prefer the GPU, but fall back to the CPU so the notebook also runs on
# machines without CUDA instead of crashing in `model.cuda()`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output logit per character of the vocabulary; 100-dimensional embeddings.
model = CharModel(vocab_size=len(all_chars), emb_size=100).to(device)
# The loss receives raw logits and integer class (character) indices.
loss_fn = nn.CrossEntropyLoss()
# AdamW with its default hyperparameters.
optimizer = torch.optim.AdamW(model.parameters())


Create datasets.

Input: a sequence of `seq_len` characters; target: the character that immediately follows it.

In [6]:
# Encode the text once. `all_chars` is sorted and unique, so a character's
# position in it is its id; the dict avoids a linear `list.index` scan per
# character.
char_to_idx = {ch: idx for idx, ch in enumerate(all_chars)}
encoded_text = [char_to_idx[ch] for ch in preprocessed_text]

# Slide a window of `seq_len` characters over the whole text: sample i is
# characters [i, i + seq_len) and its target is character i + seq_len.
# (The windows are taken from `preprocessed_text`; the previous code indexed
# an undefined name `sent`.)
n_samples = len(encoded_text) - seq_len
X_s = [encoded_text[i : i + seq_len] for i in range(n_samples)]
y_s = encoded_text[seq_len:]

# Ids stay integers (no scaling to [0, 1]): they are looked up in
# `nn.Embedding` and used as class indices by `CrossEntropyLoss`.
dataset = TensorDataset(torch.tensor(X_s), torch.tensor(y_s))

# Hold out 15% of the samples for evaluation.
train_set, test_set = train_test_split(dataset, test_size=0.15)


100%|██████████| 1609/1609 [00:00<00:00, 1894.05it/s]


Prepare the training environment (hyperparameters and data loaders), then run the combined train + eval loop.

In [7]:
# Training hyperparameters.
n_epochs = 5       # passes over the training set
batch_size = 64    # samples per batch
log_steps = 1_000  # the training loss is printed every `log_steps` steps

# Only the training loader shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False)


In [8]:
# Move batches to whatever device the model's parameters live on instead of
# hard-coding "cuda", so the loop also works when the model is on the CPU.
run_device = next(model.parameters()).device

step = 0  # global optimisation step, counted across epochs
for e in range(n_epochs):
  # --- training pass ---
  model.train()
  for batch in train_loader:
    inputs, truth = (t.to(run_device) for t in batch)
    pred = model(inputs)
    loss = loss_fn(pred, truth)
    loss.backward()
    optimizer.step()
    # reset gradients so they do not accumulate into the next step
    optimizer.zero_grad()
    step += 1
    if step % log_steps == 0:
      print(f"Step={step} {loss=:.4f}")

  # --- evaluation pass: dropout disabled, no gradient tracking ---
  model.eval()
  eval_loss = 0.0
  with torch.no_grad():
    for batch in test_loader:
      inputs, truth = (t.to(run_device) for t in batch)
      eval_loss += loss_fn(model(inputs), truth).item()
  # mean of the per-batch mean losses (every batch weighted equally)
  eval_loss /= len(test_loader)
  print(f"=== Epoch {e} eval loss: {eval_loss:.4f} ===")


KeyboardInterrupt: 