# Лабораторная 3. Мяков, Шустров, Полякова

In [1]:
import os
import time
import random

import numpy as np
import pandas as pd

import torch
import torch.optim
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from tqdm.auto import tqdm

from datasets import load_dataset

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
device = "cuda" if torch.cuda.is_available() else 'cpu'
device

'cuda'

## 1. Char-RNN on Arxiv summaries

#### Dataset

In [3]:
arxiv_csv = pd.read_csv('data/arxiv_papers.csv')
arxiv_csv['summary_len'] = [len(title[1]['summary']) for title in arxiv_csv.iterrows()]
arxiv_csv.head()

Unnamed: 0,link,time,favorites,rts,authors,category,published,summary,title,tweeted,summary_len
0,arxiv.org/abs/1611.10003,,,,"[Tom A. F. Anderson, C. -H. Ruan]",q-bio.NC,2016-11-30 05:17:11,In summary of the research findings presented ...,Vocabulary and the Brain: Evidence from Neuroi...,0,1106
1,arxiv.org/abs/1611.10007,,,,"[M. Amin Rahimian, Amir G. Aghdam]",cs.SY,2016-11-30 05:37:11,"In this paper, structural controllability of a...",Structural Controllability of Multi-Agent Netw...,0,1390
2,arxiv.org/abs/1611.10010,,,,"[Debidatta Dwibedi, Tomasz Malisiewicz, Vijay ...",cs.CV,2016-11-30 06:00:47,We present a Deep Cuboid Detector which takes ...,Deep Cuboid Detection: Beyond 2D Bounding Boxes,0,825
3,arxiv.org/abs/1611.10012,2016-12-01 01:46:12,11.0,2.0,"[Jonathan Huang, Vivek Rathod, Chen Sun, Mengl...",cs.CV,2016-11-30 06:06:15,"In this paper, we study the trade-off between ...",Speed/accuracy trade-offs for modern convoluti...,1,974
4,arxiv.org/abs/1611.10014,,,,"[Yoones Hashemi, Amir H. Banihashemi]",cs.IT,2016-11-30 06:12:45,"In this paper, we propose a characterization o...",Characterization and Efficient Exhaustive Sear...,0,1913


In [4]:
filtered_csv = arxiv_csv.loc[arxiv_csv['summary_len'] > 256]
train_csv = filtered_csv[:int(arxiv_csv.shape[0] * 0.7)]
val_csv = filtered_csv[int(arxiv_csv.shape[0] * 0.7):]
test_csv = arxiv_csv.loc[arxiv_csv['summary_len'] < 256]

In [5]:
train_csv.shape, val_csv.shape, test_csv.shape

((19031, 11), (7930, 11), (226, 11))

Arxiv dataset сначала отфилтрован по длине summary, все что больше 256 поделено на train / val в соотношении: <br>
70% - тренировка <br>
30% - валидация <br>

Все что меньше 256 это test

In [6]:
class ArxivDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, chunk_len: int = 10):
        self.texts = dataframe['summary'].tolist()
        self.chunk_len = chunk_len
        self.all_symbols = set([])
        for text in self.texts:
            self.all_symbols.update({x for x in text})
        self.all_symbols = list(self.all_symbols)

    def __len__(self):
        return len(self.texts)

    def _encode_vector(self, text: str):
        return torch.LongTensor([[self.all_symbols.index(item)] for item in text])

    def _decode_vector(self, seq: str):
        seq = seq.view(-1).cpu().numpy()
        if seq.shape[0] == 1:
            seq = list(seq)
        return ''.join([self.all_symbols[x] for x in seq])

    def __getitem__(self, idx: int):
        start_index = random.randint(0, len(self.texts[idx]) - self.chunk_len - 1)
        end_index = start_index + self.chunk_len + 1
        chunk = self.texts[idx][start_index:end_index]
        return self._encode_vector(chunk[:-1]), self._encode_vector(chunk[1:])

In [7]:
BATCH_SIZE = 16
CHUNK_LEN = 256

full_dataset = ArxivDataset(arxiv_csv)  # for full vocab and generation
vocab = len(ArxivDataset(arxiv_csv).all_symbols)
print('Arxiv ds unique symbols: ', vocab)

# train / val / test dataset for measure quality of model
train_dataset = ArxivDataset(train_csv, chunk_len=CHUNK_LEN)
val_dataset = ArxivDataset(val_csv, chunk_len=CHUNK_LEN)
test_dataset = ArxivDataset(test_csv, chunk_len=CHUNK_LEN)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, drop_last=True)
test_dataloader = DataLoader(val_dataset, batch_size=1)

Arxiv ds unique symbols:  97


In [8]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size, n_layers=1):
        super(RNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, self.n_layers)
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(self.hidden_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.input_size)

    def forward(self, x, hidden):
        x = self.encoder(x).squeeze(2)
        out, (ht1, ct1) = self.lstm(x, hidden)
        out = self.dropout(out)
        x = self.fc1(out)
        x = self.fc2(x)
        return x, (ht1, ct1)

    def init_hidden(self, batch_size=1):
        return (torch.zeros(self.n_layers, batch_size, self.hidden_size, requires_grad=True).to(device),
                torch.zeros(self.n_layers, batch_size, self.hidden_size, requires_grad=True).to(device))

    @staticmethod
    def from_pretrained(filename):
        with open(filename, 'rb') as f:
            checkpoint = torch.load(f)

        model = RNN(input_size=checkpoint['input_size'],  
                    hidden_size=checkpoint['hidden_size'], 
                    embedding_size=checkpoint['hidden_size'],
                    n_layers=checkpoint['n_layers'])
        model.load_state_dict(checkpoint['state_dict'])
        return model
    
       
def save_model(model, filename='rnn.ckpt'):
    checkpoint = {'input_size': model.input_size,
                    'hidden_size': model.hidden_size,
                    'n_layers': model.n_layers,
                    'state_dict': model.state_dict()}
    with open(filename, 'wb') as f:
        torch.save(checkpoint, f)

In [9]:
n_layers = 3
embedding_size = 256
hidden_size = 256

model = RNN(input_size=vocab,
            hidden_size=hidden_size,
            embedding_size=embedding_size,
            n_layers=n_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, amsgrad=True)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)

In [10]:
def generate(model, dataset, inputs=' ', max_new_tokens=100, temp=0.3):
    hidden = model.init_hidden()
    input_ids = dataset._encode_vector(inputs).to(device)
    pred_seq = []

    _, hidden = model(input_ids, hidden)
    input_ids = input_ids[-1].view(-1, 1, 1)

    for _ in range(max_new_tokens):
        output, hidden = model(input_ids.to(device), hidden)
        logits = output.cpu().data.view(-1)
        probs = F.softmax(logits / temp, dim=- 1).numpy()
        next_id = np.random.choice(vocab, p=probs)
        input_ids = torch.LongTensor([next_id]).view(-1, 1, 1).to(device)
        pred_seq.append(next_id)

    return inputs + dataset._decode_vector(torch.tensor(pred_seq))

In [11]:
generate(model, full_dataset, inputs=' ', max_new_tokens=100)

' 7KEMbchV+Q} L.Op\x7f6A4J+302TPU;g%N)ZpJD.z}GEC5je:rMI}id\nAXSDnGa\\e8["S~(kX+"?E*@2zV3bgTZKA3a^@c$EPI!&m,'

In [None]:
experiment = 'exps/final'
tb_writer = SummaryWriter(log_dir=experiment)  # for tensorboard logging

NUM_EPOCHS = 10

for epoch in tqdm(range(NUM_EPOCHS), desc='Epoch'):

    epoch_loss = 0
    model.train()
    for input_ids, target in train_dataloader:
        input_ids = input_ids.permute(1, 0, 2).to(device)
        target = target.permute(1, 0, 2).to(device)
        hidden = model.init_hidden(BATCH_SIZE)

        output, hidden = model(input_ids, hidden)
        loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    lr_scheduler.step()

    tb_writer.add_scalar('Train loss', epoch_loss / len(train_dataloader), epoch)

    ppl = []
    best_model, best_loss = None
    
    epoch_loss = 0
    model.eval()
    with torch.no_grad():
        for input_ids, target in val_dataloader:
            input_ids = input_ids.permute(1, 0, 2).to(device)
            target = target.permute(1, 0, 2).to(device)
            hidden = model.init_hidden(BATCH_SIZE)

            output, _ = model(input_ids, hidden)
            loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))
            ppl.append(loss.item())
        
        if np.mean(ppl) < best_loss:
            best_loss = np.mean(ppl)
            save_model(model=model, filename=f'{experiment}/model.ckpt')
            
        ppl = np.exp(np.mean(ppl))
        tb_writer.add_scalar('Perplexity val', ppl, epoch)
    
        
best_model = RNN.from_pretrained(filename=f'{experiment}/model.ckpt')

ppl = []
model.eval()
with torch.no_grad():
    for input_ids, target in tqdm(test_dataloader):
        input_ids = input_ids.permute(1, 0, 2).to(device)
        target = target.permute(1, 0, 2).to(device)
        hidden = best_model.init_hidden(1)

        output, _ = model(input_ids, hidden)
        loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))
        ppl.append(loss.item()) 

ppl = np.exp(np.mean(ppl))
tb_writer.add_scalar('Perplexity test', ppl, 0)

![Train loss](./img/arxiv_train_loss.png)

![Val perplexity](./img/arxiv_perplexity_test.png)

На картинке выше отображены train loss лучших отобранных эксеприментов. <br>
1) Почти во всех отобранных экспериментах используется LSTM (т.к. GRU или RNN показывает значительно более плохиер результаты из-за различии в лсожности архитекутр)
2) Размер эмбеддинга улучшают результат, но самые отптимальные судя по проведенным экспериментам 256
3) Размер фич в LSTM в 256 и 512 показывает лучшие результаты
4) 2 линейных слоя также показывают лучшие метрики

Возможно какие то комбинации параметров не были здесь представленны, но после множества экспериментов модель:<br>
embeds (256 dim size) <br>
lstm (256 hidden_size) - 3 layers <br>
dropout 0.2 <br>
fully connected (256 hidden_size) - 2 layers <br>

показала лучшие результаты

Эксперименты с температурой.

1. Так как у нас идет генерация по буквам, то при низком значении температуры символы будут выбираться из самого большего по вероятности: 

In [44]:
print(generate(model, full_dataset, inputs='In this paper ', max_new_tokens=100, temp=0.01))
print(generate(model, full_dataset, inputs='In this paper ', max_new_tokens=100, temp=0.01))

In this paper and the set of the set of the set of the set of the set of the set of the set of the set of the set 
In this paper and the set of the set of the set of the set of the proposed to the set of the set of the set of the


2. При высокой же температуре вероятности будут почти нулевые и будет генерация будет из равновероятным значений

In [45]:
print(generate(model, full_dataset, inputs='In this paper ', max_new_tokens=100, temp=5))

In this paper +d|C0i{%

2PA"fmo!<2$
ba  KA\tE(sjNAn^o_h/v/SGcD'YiT9^+4H\^qJK|aKfgTIz;\)|P1j3fee$0XDwj|fo}i.]misuca


2. Самой оптимальной температурой по наблюдения стала температура от 0.3 до 0.8

In [46]:
print(generate(model, full_dataset, inputs='In this paper ', max_new_tokens=100, temp=0.3))

In this paper and the control to an analysis of the set of the experiments control from the problem to the set of 


In [51]:
print(generate(model, full_dataset, inputs='In this paper ', max_new_tokens=100, temp=0.6))

In this paper and extends show that the first surpreters in expression of different measurements, undering the dec


Но все равно в силу размеров модели сгененированный текст имеет нулевую смысловую нагрузку

## 2. char-RNN on personal dataset


Интересно было взять какой то необычный датасет, чтобы визуально было видно, как хорошо работает наша RNN, поэтому мы решили взять [датасет с гороскопами](https://huggingface.co/datasets/dkagramanyan/horoscopes_ru) из-за особенности текстов, которые в нем присутствуют.

In [69]:
dataset = load_dataset("dkagramanyan/horoscopes_ru")

In [70]:
class HoroscopesDataset(Dataset):
    def __init__(self, dataset, chunk_len: int = 10):
        self.dataset = []
        for item in dataset:
            if len(item['text']) >= 258:
                self.dataset.append(item)    
        self.chunk_len = chunk_len
        self.all_symbols = set([])
        for item in self.dataset:
            self.all_symbols.update({x for x in item['text']})
        self.all_symbols = list(self.all_symbols)

    def __len__(self):
        return len(self.dataset)

    def _encode_vector(self, text: str):
        return torch.LongTensor([[self.all_symbols.index(item)] for item in text])

    def _decode_vector(self, seq: str):
        seq = seq.view(-1).cpu().numpy()
        if seq.shape[0] == 1:
            seq = list(seq)
        return ''.join([self.all_symbols[x] for x in seq])

    def __getitem__(self, idx: int):
        start_index = random.randint(0, len(self.dataset[idx]['text']) - self.chunk_len - 1)
        end_index = start_index + self.chunk_len + 1
        chunk = self.dataset[idx]['text'][start_index:end_index]
        return self._encode_vector(chunk[:-1]), self._encode_vector(chunk[1:])

In [79]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size, n_layers=1):
        super(RNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.lstm = nn.GRU(self.embedding_size, self.hidden_size, self.n_layers)
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(self.hidden_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.input_size)

    def forward(self, x, hidden):
        x = self.encoder(x).squeeze(2)
        out, ht = self.lstm(x, hidden)
        out = self.dropout(out)
        x = self.fc1(out)
        x = self.fc2(x)
        return x, ht

    def init_hidden(self, batch_size=1):
        return (torch.zeros(self.n_layers, batch_size, self.hidden_size, requires_grad=True).to(device),
                torch.zeros(self.n_layers, batch_size, self.hidden_size, requires_grad=True).to(device))

    @staticmethod
    def from_pretrained(filename):
        with open(filename, 'rb') as f:
            checkpoint = torch.load(f)

        model = RNN(input_size=checkpoint['input_size'],  
                    hidden_size=checkpoint['hidden_size'], 
                    embedding_size=checkpoint['hidden_size'],
                    n_layers=checkpoint['n_layers'])
        model.load_state_dict(checkpoint['state_dict'])
        return model
    
       
def save_model(model, filename='rnn.ckpt'):
    checkpoint = {'input_size': model.input_size,
                    'hidden_size': model.hidden_size,
                    'n_layers': model.n_layers,
                    'state_dict': model.state_dict()}
    with open(filename, 'wb') as f:
        torch.save(checkpoint, f)

In [80]:
BATCH_SIZE = 16
CHUNK_LEN = 256

# train / val / test dataset for measure quality of model
train_dataset = HoroscopesDataset(dataset['train'], chunk_len=CHUNK_LEN)
val_dataset = HoroscopesDataset(dataset['test'], chunk_len=CHUNK_LEN)
vocab = len(train_dataset.all_symbols)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, drop_last=True)

In [81]:
n_layers = 3
embedding_size = 512
hidden_size = 512

model = RNN(input_size=vocab,
            hidden_size=hidden_size,
            embedding_size=embedding_size,
            n_layers=n_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, amsgrad=True)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)

In [None]:
experiment = 'exps/horoscopes/emb_512_gru_512_3l_2fc_dropout'
tb_writer = SummaryWriter(log_dir=experiment)  # for tensorboard logging

NUM_EPOCHS = 10

for epoch in tqdm(range(NUM_EPOCHS), desc='Epoch'):

    epoch_loss = 0
    model.train()
    for input_ids, target in train_dataloader:
        input_ids = input_ids.permute(1, 0, 2).to(device)
        target = target.permute(1, 0, 2).to(device)
        hidden = model.init_hidden(BATCH_SIZE)[0]

        output, hidden = model(input_ids, hidden)
        loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    lr_scheduler.step()

    tb_writer.add_scalar('Horoscopes: Train loss', epoch_loss / len(train_dataloader), epoch)

    ppl = []
    best_model, best_loss = None
    
    epoch_loss = 0
    model.eval()
    with torch.no_grad():
        for input_ids, target in val_dataloader:
            input_ids = input_ids.permute(1, 0, 2).to(device)
            target = target.permute(1, 0, 2).to(device)
            hidden = model.init_hidden(BATCH_SIZE)

            output, _ = model(input_ids, hidden)
            loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))
            ppl.append(loss.item())
        
        if np.mean(ppl) < best_loss:
            best_loss = np.mean(ppl)
            save_model(model=model, filename=f'{experiment}/model.ckpt')
            
        ppl = np.exp(np.mean(ppl))
        tb_writer.add_scalar('Horoscopes: Perplexity val', ppl, epoch)
    
        
best_model = RNN.from_pretrained(filename=f'{experiment}/model.ckpt')

ppl = []
model.eval()
with torch.no_grad():
    for input_ids, target in tqdm(test_dataloader):
        input_ids = input_ids.permute(1, 0, 2).to(device)
        target = target.permute(1, 0, 2).to(device)
        hidden = best_model.init_hidden(1)

        output, _ = model(input_ids, hidden)
        loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))
        ppl.append(loss.item()) 

ppl = np.exp(np.mean(ppl))
tb_writer.add_scalar('Horoscopes: Perplexity test', ppl, 0)

Для выбранного корпуса решено было провести меньшее кол-во экспериментов на основе результатов с arxiv.

![Train loss](./img/horo_train_loss.bmp)

![Perplexity test](./img/horo_perplexity_test.bmp)

На большом датасете с большим кол-во тренировочной выборки более большие модели (в плане кол-ва слое в параметров) показывают лучший резульата. Т.е модели с 512 hidden size показали результаты значительно лучше, чем с 256.

In [57]:
generate(model, train_dataset, inputs='В понедельник ', max_new_tokens=200)

'В понедельник получение в своей работы и стремление к развития контактах с детьми. Возможно, вам придется вам стоит подходящий момент для происходящего происходящее в этот день благоприятен для профессиональной сто'