# Лабораторная 3. Мяков, Шустров, Полякова

In [1]:
import sys
import os
import os.path
import random
import collections
import shutil
import time
import glob
import csv

import numpy as np
import pandas as pd

import torch
import torch.optim

import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from tqdm.auto import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
device = "cuda" if torch.cuda.is_available() else 'cpu'
device

'cuda'

## 1. Char-RNN on Arxiv summaries

#### Dataset

In [3]:
arxiv_csv = pd.read_csv('data/arxiv_papers.csv')
arxiv_csv['summary_len'] = [len(title[1]['summary']) for title in arxiv_csv.iterrows()]
arxiv_csv.head()

Unnamed: 0,link,time,favorites,rts,authors,category,published,summary,title,tweeted,summary_len
0,arxiv.org/abs/1611.10003,,,,"[Tom A. F. Anderson, C. -H. Ruan]",q-bio.NC,2016-11-30 05:17:11,In summary of the research findings presented ...,Vocabulary and the Brain: Evidence from Neuroi...,0,1106
1,arxiv.org/abs/1611.10007,,,,"[M. Amin Rahimian, Amir G. Aghdam]",cs.SY,2016-11-30 05:37:11,"In this paper, structural controllability of a...",Structural Controllability of Multi-Agent Netw...,0,1390
2,arxiv.org/abs/1611.10010,,,,"[Debidatta Dwibedi, Tomasz Malisiewicz, Vijay ...",cs.CV,2016-11-30 06:00:47,We present a Deep Cuboid Detector which takes ...,Deep Cuboid Detection: Beyond 2D Bounding Boxes,0,825
3,arxiv.org/abs/1611.10012,2016-12-01 01:46:12,11.0,2.0,"[Jonathan Huang, Vivek Rathod, Chen Sun, Mengl...",cs.CV,2016-11-30 06:06:15,"In this paper, we study the trade-off between ...",Speed/accuracy trade-offs for modern convoluti...,1,974
4,arxiv.org/abs/1611.10014,,,,"[Yoones Hashemi, Amir H. Banihashemi]",cs.IT,2016-11-30 06:12:45,"In this paper, we propose a characterization o...",Characterization and Efficient Exhaustive Sear...,0,1913


In [4]:
filtered_csv = arxiv_csv.loc[arxiv_csv['summary_len'] > 256]
train_csv = filtered_csv[:int(arxiv_csv.shape[0] * 0.7)]
val_csv = filtered_csv[int(arxiv_csv.shape[0] * 0.7):]
test_csv = arxiv_csv.loc[arxiv_csv['summary_len'] < 256]

In [5]:
train_csv.shape, val_csv.shape, test_csv.shape

((19031, 11), (7930, 11), (226, 11))

Arxiv dataset сначала отфилтрован по длине summary, все что больше 256 поделено на train / val в соотношении: <br>
70% - тренировка <br>
30% - валидация <br>

Все что меньше 256 это test

In [6]:
class ArxivDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, chunk_len: int = 10):
        self.texts = dataframe['summary'].tolist()
        self.chunk_len = chunk_len
        self.all_symbols = set([])
        for text in self.texts:
            self.all_symbols.update({x for x in text})
        self.all_symbols = list(self.all_symbols)
    
    def __len__(self):
        return len(self.texts)
    
    def _encode_vector(self, text: str):
        return torch.LongTensor(list(map(self.all_symbols.index, text)))
    
    def _decode_vector(self, seq: str):
        seq = seq.view(-1).cpu().numpy()
        if seq.shape[0] == 1:
            seq = list(seq)
        return ''.join([self.all_symbols[x] for x in seq])
    
    def __getitem__(self, idx: int):
        start_index = random.randint(0, len(self.texts[idx]) - self.chunk_len - 1)
        end_index = start_index + self.chunk_len + 1
        chunk = self.texts[idx][start_index:end_index]
        return self._encode_vector(chunk[:-1]), self._encode_vector(chunk[1:])

In [7]:
BATCH_SIZE = 64
CHUNK_LEN = 256

full_dataset = ArxivDataset(arxiv_csv) # for full vocab and generation
vocab = len(ArxivDataset(arxiv_csv).all_symbols)
print('Arxiv ds unique symbols: ', vocab)

#train / val / test dataset for measure quality of model
train_dataset = ArxivDataset(train_csv, chunk_len=CHUNK_LEN)
val_dataset = ArxivDataset(val_csv, chunk_len=CHUNK_LEN)
test_dataset = ArxivDataset(test_csv, chunk_len=CHUNK_LEN)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

Arxiv ds unique symbols:  97


In [8]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_size, n_layers=1):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, self.n_layers)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.hidden_size, self.input_size)
        
    def forward(self, x, hidden):
        x = self.encoder(x).squeeze(2)
        out, (ht1, ct1) = self.lstm(x, hidden)
        out = self.dropout(out)
        x = self.fc(out)
        return x, (ht1, ct1)

    def init_hidden(self, bs=1):
        return (torch.zeros(self.n_layers, bs, self.hidden_size, requires_grad=True).to(device),
               torch.zeros(self.n_layers, bs, self.hidden_size, requires_grad=True).to(device))
    
    def save_model(model, filename='rnn.ckpt'):
        checkpoint = {'input_size': model.input_size,
                      'hidden_size': model.hidden_size,
                      'output_size': model.output_size,
                      'n_layers': model.n_layers,
                      'state_dict': model.state_dict()}
        with open(filename, 'wb') as f:
            torch.save(checkpoint, f)

    def from_pretrained(filename):
        with open(filename, 'rb') as f:
            checkpoint = torch.load(f)

        model = RNN(input_size = checkpoint['input_size'], 
                    output_size = checkpoint['output_size'], 
                    hidden_size=checkpoint['hidden_size'], 
                    n_layers=checkpoint['n_layers'])
        model.load_state_dict(checkpoint['state_dict'])

        return model

In [9]:
n_layers = 2
hidden_size = 256

model = RNN(vocab, hidden_size=256, embedding_size=256, n_layers=4).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)
criterion = nn.CrossEntropyLoss().cuda()

In [10]:
NUM_EPOCHS = 150

tb_writer = SummaryWriter()
train_loss = []
val_loss = []

for epoch in tqdm(range(NUM_EPOCHS), desc='Epoch'):   
    model.train()
    epoch_loss = []
    
    for batch in train_dataloader:
        input_ids = batch[0].unsqueeze(2).permute(1, 0, 2).to(device)
        target = batch[1].unsqueeze(2).permute(1, 0, 2).to(device)
        hidden = model.init_hidden(input_ids.shape[1])

        output, _ = model(input_ids, hidden) 
        loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        epoch_loss.append(loss.item())
    lr_scheduler.step()

    tb_writer.add_scalar('Train loss', np.sum(epoch_loss) / len(train_dataloader), epoch)

    model.eval()
    
    epoch_loss = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch[0].unsqueeze(2).permute(1, 0, 2).to(device)
            target = batch[1].unsqueeze(2).permute(1, 0, 2).to(device)
            hidden = model.init_hidden(input_ids.shape[1])
            
            output, _ = model(input_ids, hidden) 
            loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))
            epoch_loss.append(loss.item())
            
        tb_writer.add_scalar('Val loss', np.sum(epoch_loss) / len(train_dataloader), epoch)

Epoch:   0%|          | 0/150 [00:00<?, ?it/s]

In [15]:
def evaluate(model, dataset, start_text=' ', pred_len=200, temp=0.3):
    hidden = model.init_hidden()
    idx_input = dataset._encode_vector(start_text)
    train = idx_input.view(-1, 1, 1).to(device)
    pred_text = start_text
    
    _, hidden = model(train, hidden)
    inp = train[-1].view(-1, 1, 1)
    
    for i in range(pred_len):
        output, hidden = model(inp.to(device), hidden)
        output_logits = output.cpu().data.view(-1)
        p_next = F.softmax(output_logits / temp, dim=-1).detach().cpu().data.numpy()        
        top_ind = np.random.choice(vocab, p=p_next)
        inp = torch.LongTensor([top_ind]).view(-1, 1, 1).to(device)
        pred_char = dataset._decode_vector(inp)
        pred_text += pred_char
    
    return pred_text

In [16]:
evaluate(model, full_dataset, start_text=' ')

' e eo      tntst o n     ee  e o      e  t        ani e   a  mae  l   e  er  t eue  ae  i ero  e    os i   ep e  n     os    e  n n ie   t t ie       o et eeaeie o t   ep  eee  eeiea  cset eel a t sii '

In [13]:
def generate(model, dataset, inputs, max_new_tokens=100, temperature=0.3):
    model.eval()
    hidden = model.init_hidden()
    input_ids = dataset._encode_vector(inputs).unsqueeze(1).view(-1, 1, 1).to(device)
    _, hidden = model(input_ids, hidden)
    
    input_ids = input_ids[-1].to(device)

    generated_ids = []
    for _ in range(max_new_tokens):
        output, hidden = model(input_ids, hidden)
        logits = output.cpu().data.view(-1)
        p_next = F.softmax(logits / temperature, dim=-1).numpy()    
        new_id = np.random.choice(vocab, p=p_next)
        generated_ids.append(new_id)
        
    return inputs + ' ' + dataset._decode_vector(torch.tensor(generated_ids))

In [14]:
generate(model, full_dataset, inputs='The', max_new_tokens=15)

'The    o e eee e  e'

## 2. char-RNN on personal dataset
