<a href="https://colab.research.google.com/github/azilya/torch_tutorials/blob/main/lstm_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports of required libs

In [8]:
import re
from string import punctuation

import nltk
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Fetch the "punkt" sentence-tokenizer data that nltk.sent_tokenize relies on.
# NOTE(review): newer NLTK releases may look for the "punkt_tab" resource
# instead - verify against the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Downloading a text to train on
For Russian, there is an [official open collection of EPUB books](https://www.culture.ru/literature/books). The books must be converted to `.txt` before they can be parsed, so an extra conversion step is required, e.g.:

```sh
wget https://cdn.culture.ru/files/f833392d-c53f-5a39-a1c7-b4611b2ffdd6/idiot
epub2txt -r idiot > idiot.txt
```



## Preprocessing text

In [17]:
# Extend the ASCII punctuation set with the typographic quotes and dashes
# that occur in the Russian text.
punctuation = punctuation + "«»„“”‛’—–‒"
# A single character class matching any one punctuation character.
RE_PUNCT = re.compile("[" + re.escape(punctuation) + "]")


def clean(line):
  """Split a line of text into normalised sentences.

  The line is split into sentences with NLTK's Russian sentence tokenizer.
  In each sentence every punctuation character is replaced by a space, runs
  of whitespace are collapsed to a single space, and the result is
  lower-cased and stripped.

  Returns a list with one cleaned string per detected sentence.
  """
  cleaned = []
  for sentence in nltk.sent_tokenize(line, language="russian"):
    without_punct = RE_PUNCT.sub(" ", sentence)
    single_spaced = re.sub(r"\s+", " ", without_punct)
    cleaned.append(single_spaced.lower().strip())
  return cleaned


# The first lines of the file are the header (annotation etc.), not novel text.
HEADER_LINES = 15

sentences = []
# The encoding is given explicitly: the corpus is Russian text and the
# platform default encoding is not guaranteed to be UTF-8.
with open("idiot.txt", encoding="utf-8") as corpus:
  for i, line in tqdm(enumerate(corpus)):
    if i < HEADER_LINES:
      continue
    # Skip blank lines; every other line may contain several sentences.
    if line.strip():
      sentences.extend(clean(line))

# Character vocabulary: every distinct character that survives cleaning,
# sorted so that a character's index is the same on every run.
all_chars = sorted(set("".join(sentences)))
# Number of characters in each input window.
seq_len = 100

9084it [00:00, 12000.70it/s]


In [18]:
sentences[:3]

['i',
 'в конце ноября в оттепель часов в девять утра поезд петербургско варшавской железной дороги на всех парах подходил к петербургу',
 'было так сыро и туманно что насилу рассвело в десяти шагах вправо и влево от дороги трудно было разглядеть хоть что нибудь из окон вагона']

In [19]:
# Fix the random seeds so that weight initialisation and dropout are repeatable.
SEED = 1
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

class CharModel(nn.Module):
    """Character-level model that scores the next character of a sequence.

    Pipeline: embedding -> 2-layer bidirectional LSTM -> dropout -> linear
    projection onto the vocabulary.

    Args:
        vocab_size: number of distinct characters; also the output width.
        emb_size: dimensionality of the character embeddings.
        hid_size: hidden size of each LSTM direction.
    """

    def __init__(self, vocab_size, emb_size, hid_size=256) -> None:
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(
            emb_size,
            hid_size,
            num_layers=2,
            batch_first=True,
            dropout=0.2,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(0.2)
        # Deliberately NOT applied inside forward(): the model returns raw
        # logits. The attribute is kept so code that references
        # `model.activation` keeps working; apply it to the logits when
        # probabilities are needed (e.g. for sampling).
        self.activation = nn.Softmax(dim=-1)
        # The LSTM is bidirectional, so its output features are 2 * hid_size wide.
        self.linear = nn.Linear(hid_size * 2, vocab_size)

    def forward(self, x):
        """Compute next-character logits for a batch of index sequences.

        Args:
            x: integer tensor of character indices, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, vocab_size) holding unnormalised logits,
            which is the input nn.CrossEntropyLoss expects.
        """
        x = self.emb(x)
        x, _ = self.lstm(x)
        # Keep only the features of the last time step.
        # NOTE(review): at that step the backward direction has only processed
        # the final character; consider using the final hidden states instead.
        x = x[:, -1, :]
        # Previously a softmax was applied to these hidden features before the
        # linear layer, which squashed them into a probability distribution
        # and discarded most of the information the classifier needs.
        return self.linear(self.dropout(x))


# Embedding width is a model hyper-parameter of its own; it used to be tied to
# the window length (`seq_len`) only because both happened to be 100.
EMB_SIZE = 100
# Use the GPU when one is available, otherwise fall back to the CPU instead of
# crashing in `model.cuda()`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CharModel(vocab_size=len(all_chars), emb_size=EMB_SIZE)
model.to(device)
# CrossEntropyLoss takes raw logits and integer class targets.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters())

In [20]:
# Constant-time character -> index lookup. `all_chars.index(c)` is a linear
# scan, and it used to run for every character of every overlapping window.
char_to_idx = {ch: idx for idx, ch in enumerate(all_chars)}

X_s = []
y_s = []
for sent in tqdm(sentences):
    # Encode the sentence once; every window below is a slice of this list.
    encoded = [char_to_idx[c] for c in sent]
    # Sentences of at most `seq_len` characters produce no samples.
    for i in range(len(encoded) - seq_len):
        X_s.append(encoded[i : i + seq_len])  # input window of seq_len chars
        y_s.append(encoded[i + seq_len])  # the character that follows it

dataset = TensorDataset(torch.tensor(X_s), torch.tensor(y_s))

# A fixed random_state makes the train/test split reproducible between runs.
train_set, test_set = train_test_split(dataset, test_size=0.15, random_state=1)

100%|██████████| 14418/14418 [00:33<00:00, 434.11it/s]


In [21]:
# Training hyper-parameters.
n_epochs = 5  # full passes over the training set
batch_size = 64  # samples per mini-batch
log_steps = 10000  # the training loss is printed every `log_steps` steps

# Only the training data is reshuffled; the evaluation order stays fixed.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [None]:
# Move batches to whichever device the model's parameters live on, rather than
# assuming that a GPU is present.
device = next(model.parameters()).device

step = 0
for e in range(n_epochs):
  # ---- training pass ----
  model.train()
  for batch in train_loader:
    inputs, truth = (t.to(device) for t in batch)
    pred = model(inputs)
    loss = loss_fn(pred, truth)
    loss.backward()
    optimizer.step()
    # Clear the gradients so they do not accumulate into the next step.
    optimizer.zero_grad()
    step += 1
    if step % log_steps == 0:
      print(f"Step={step} {loss=:.4f}")

  # ---- evaluation pass ----
  model.eval()
  eval_loss = 0.0
  n_samples = 0
  with torch.no_grad():
    for batch in test_loader:
      inputs, truth = (t.to(device) for t in batch)
      loss = loss_fn(model(inputs), truth)
      # `loss` is the mean over the batch; weight it by the batch size so a
      # smaller final batch does not skew the epoch average.
      eval_loss += loss.item() * len(truth)
      n_samples += len(truth)
  # Guard against an empty test loader.
  eval_loss /= max(n_samples, 1)
  print(f"=== Epoch {e} eval loss: {eval_loss:.4f} ===")