In [1]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import torch
from torch import nn

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Данные

In [2]:
INDEX_TO_CHAR = ['none'] + list(' abcdefghijklmnopqrstuvwxyz')
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}

In [3]:
df = pd.read_csv('drive/MyDrive/simpsons_script_lines.csv')
phrases = df['normalized_text'].tolist()
text = [[c for c in ph] for ph in phrases if (type(ph) is str) and (set(ph).issubset(INDEX_TO_CHAR))]
print(text[1])

['w', 'h', 'e', 'r', 'e', 's', ' ', 'm', 'r', ' ', 'b', 'e', 'r', 'g', 's', 't', 'r', 'o', 'm']


# Задание 1
## Обучите нейронную сеть решать шифр Цезаря

### 1. Написать алгоритм шифра Цезаря для генерации выборки (сдвиг на К каждой буквы. Например, при сдвиге на 2 буква “А” переходит в букву “В” и тп)

In [None]:
MAX_LEN = 50
X = torch.zeros((len(text), MAX_LEN), dtype=int)
y = torch.zeros((len(text), MAX_LEN), dtype=int)

for i in range(len(text)):
    K = np.random.choice(range(1, 27))
    for j, w in enumerate(text[i]):
        if j >= MAX_LEN:
            break
        idx = CHAR_TO_INDEX[w]
        y[i, j] = idx
        X[i, j] = int(np.ceil(np.mod(idx + (K - 0.1), 27)))
        # здесь немного мудреная логика для того, чтобы среди зашифрованных
        # символов не было нулей, которые зарезервированы под 'none'

In [None]:
print(f'Шифр: сдвиг на {K} символов\n')
print('Исходный текст:')
print(text[-1])
print()
print('Исходный текст числами:')
print(y[-1])
print()
print('Зашифрованный текст числами:')
print(X[-1])

Шифр: сдвиг на 7 символов

Исходный текст:
['n', 'o', ' ', 't', 'h', 'a', 't', ' ', 'm', 'e', 'a', 'n', 's', ' ', 's', 'h', 'e', ' ', 'w', 'a', 's', ' ', 'f', 'a', 'k', 'i', 'n', 'g', ' ', 'i', 't']

Исходный текст числами:
tensor([15, 16,  1, 21,  9,  2, 21,  1, 14,  6,  2, 15, 20,  1, 20,  9,  6,  1,
        24,  2, 20,  1,  7,  2, 12, 10, 15,  8,  1, 10, 21,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

Зашифрованный текст числами:
tensor([22, 23,  8,  1, 16,  9,  1,  8, 21, 13,  9, 22, 27,  8, 27, 16, 13,  8,
         4,  9, 27,  8, 14,  9, 19, 17, 22, 15,  8, 17,  1,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])


### 2. Сделать нейронную сеть

In [None]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.embedding = torch.nn.Embedding(28, 32)
        self.rnn = torch.nn.RNN(32, 128)
        self.out = torch.nn.Linear(128, 28)

    def forward(self, sentences, state=None):
        x = self.embedding(sentences)
        x, s = self.rnn(x)
        return self.out(x)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
model = Network()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

### 3. Обучить ее (вход - зашифрованная фраза, выход - дешифрованная фраза)

In [None]:
for ep in range(2):
    model.train(True)
    start = time.time()
    train_loss = 0.0
    train_passed = 0

    queue = list(range(len(X)))
    np.random.shuffle(queue)
    for i in tqdm(queue):
        X_batch = X[i].to(device)
        Y_batch = y[i].to(device)

        optimizer.zero_grad()
        answers = model.forward(X_batch)
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1
    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

100%|██████████| 117015/117015 [04:01<00:00, 483.71it/s]


Epoch 0. Time: 241.927, Train loss: 0.870


100%|██████████| 117015/117015 [04:01<00:00, 484.44it/s]

Epoch 1. Time: 241.565, Train loss: 0.715





### 4. Проверить качество

In [None]:
model.train(False)
for i in range(5):
    idx = np.random.choice(len(X))
    pred = model.forward(X[idx].to(device))
    print(f'Пример № {i+1}')
    print('Исходный текст:')
    print(''.join([INDEX_TO_CHAR[i] for i in y[idx] if i != 0]))
    print('Расшифрованный текст:')
    print(''.join([INDEX_TO_CHAR[i] for i in torch.argmax(pred, dim=1) if i != 0]))
    print()

Пример № 1
Исходный текст:
they dont even play their own instruments
Расшифрованный текст:
ooee dont even play their own instru eat 

Пример № 2
Исходный текст:
uh not as such
Расшифрованный текст:
tou pmtas snwa

Пример № 3
Исходный текст:
maybe one of your tavern wenches can do the laundr
Расшифрованный текст:
woeid one ofmyour tavern wenches cun do the laundr

Пример № 4
Исходный текст:
wow
Расшифрованный текст:
yha

Пример № 5
Исходный текст:
that felt good
Расшифрованный текст:
oot  felt good



# Задание 2
## Выполнить практическую работу из лекционного ноутбука

### 1. Построить RNN-ячейку на основе полносвязных слоев

In [6]:
MAX_LEN = 50
X = torch.zeros((len(text), MAX_LEN), dtype=int)

for i in range(len(text)):
    for j, w in enumerate(text[i]):
        if j >= MAX_LEN:
            break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [40]:
class Network(torch.nn.Module):
    def __init__(self, input_size=28, rnn_input_size=32, rnn_hidden_size=128, output_size=28, batch_size=1):
        super(Network, self).__init__()
        self.input_size = input_size
        self.rnn_input_size = rnn_input_size
        self.rnn_hidden_size = rnn_hidden_size
        self.output_size = output_size
        self.batch_size = batch_size

        self.embedding = torch.nn.Embedding(input_size, rnn_input_size)
        self.i2h = nn.Linear(rnn_input_size, rnn_hidden_size, bias=False)
        self.h2h = nn.Linear(rnn_hidden_size, rnn_hidden_size)
        self.h2o = nn.Linear(rnn_hidden_size, rnn_hidden_size)
        self.out = nn.Linear(rnn_hidden_size, output_size)

    def forward(self, sentences, hidden_state):
        x = self.embedding(sentences)
        x = self.i2h(x)
        hidden_state = self.h2h(hidden_state)
        hidden_state = torch.tanh(x + hidden_state)
        x, s = self.h2o(hidden_state), hidden_state
        return self.out(x)

    def init_zero_hidden(self):
        return torch.zeros(self.batch_size, self.rnn_hidden_size, requires_grad=False)

In [41]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [42]:
model = Network()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

In [43]:
for ep in range(20):
    start = time.time()
    train_loss = 0.0
    train_passed = 0

    for i in range(int(len(X) / model.batch_size)):
        batch = X[i*model.batch_size : (i+1)*model.batch_size].to(device)
        if len(batch) != model.batch_size:
            continue

        X_batch = batch[:, :-1]
        Y_batch = batch[:, 1:].flatten()
        hidden = model.init_zero_hidden().to(device)

        optimizer.zero_grad()
        answers = model.forward(X_batch, hidden)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1
    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 181.206, Train loss: 1.631
Epoch 1. Time: 178.882, Train loss: 1.627
Epoch 2. Time: 177.386, Train loss: 1.626
Epoch 3. Time: 176.278, Train loss: 1.625
Epoch 4. Time: 177.802, Train loss: 1.625
Epoch 5. Time: 177.992, Train loss: 1.625
Epoch 6. Time: 177.428, Train loss: 1.625
Epoch 7. Time: 177.132, Train loss: 1.624
Epoch 8. Time: 177.160, Train loss: 1.624
Epoch 9. Time: 177.284, Train loss: 1.624
Epoch 10. Time: 177.050, Train loss: 1.624
Epoch 11. Time: 177.762, Train loss: 1.624
Epoch 12. Time: 177.157, Train loss: 1.624
Epoch 13. Time: 176.865, Train loss: 1.624
Epoch 14. Time: 177.191, Train loss: 1.624
Epoch 15. Time: 176.865, Train loss: 1.624
Epoch 16. Time: 177.363, Train loss: 1.624
Epoch 17. Time: 177.211, Train loss: 1.624
Epoch 18. Time: 177.000, Train loss: 1.624
Epoch 19. Time: 177.169, Train loss: 1.624


### 2. Применить построенную ячейку для генерации текста с выражениями героев сериала “Симпсоны”

In [46]:
def generate_sentence(word):
    sentence = list(word)
    sentence = [CHAR_TO_INDEX.get(s, 0) for s in sentence]
    hidden = model.init_zero_hidden().to(device)
    answers = model.forward(torch.tensor(sentence).to(device), hidden)
    probas, indices = answers.topk(1)
    return ''.join([INDEX_TO_CHAR[ind.item()] for ind in indices.flatten()])

In [47]:
generate_sentence('a dog is running on the street')

'nt u tn te   n  tu t e t  e   '

In [48]:
generate_sentence('it is less fun to be sad')

'n tn tl   t   t ute t n '

In [49]:
generate_sentence('my homework is done')

'eoteue eue tn t u  '