In [1]:
import spacy
import torch
from torchtext import data, datasets
from torchtext.datasets import LanguageModelingDataset

import string

import matplotlib.pyplot as plt
%matplotlib tk
plt.style.use('ggplot')

import numpy as np
from tqdm import tqdm_notebook

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import Tensor

from pycrayon import CrayonClient

import os
import time

In [4]:
# Load spaCy's English package once at module level; tokenizer2 reuses this object.
spacy_en = spacy.load('en') # the default English package by Spacy
# Characters kept by tokenizer2 (ASCII letters, comma, period); anything else becomes a space.
allowed_chars = string.ascii_letters + ",."

def tokenizer2(text): # create a tokenizer function
    """Split raw text into tokens after stripping unsupported characters.

    Every character that is not in the module-level ``allowed_chars`` is
    replaced by a space, runs of whitespace are collapsed to single spaces,
    and the cleaned string is split with the tokenizer of the module-level
    ``spacy_en`` object.

    Args:
        text: the raw string to tokenize.

    Returns:
        A list with the ``.text`` of every token; empty for empty input.
    """
    text = ''.join([c if c in allowed_chars else ' ' for c in text])
    text = ' '.join(text.split())
    # Non-empty text gets a single trailing space (kept from the original
    # implementation so the tokenizer sees exactly the same input).
    text = text+' ' if text else text
    # Only the tokenizer is needed. The previous `spacy_en(text)` call ran the
    # complete pipeline on every chunk and then discarded the result.
    return [tok.text for tok in spacy_en.tokenizer(text)]

# Field that tokenizes with tokenizer2, lowercases the tokens and maps them through a vocabulary.
TEXT = data.Field(tokenize=tokenizer2, use_vocab=True, lower=True)
# Load the file at data/HPSerie as a language-modelling corpus.
# NOTE(review): newline_eos=False presumably means no <eos> token is appended at line ends — confirm against the torchtext docs.
D = LanguageModelingDataset('data/HPSerie', TEXT, newline_eos=False)
# Build the vocabulary from the corpus and report how many entries it has.
TEXT.build_vocab(D)
print(len(TEXT.vocab))

20956


In [5]:
def stats(top_n=100):
    """Plot the relative frequency of the most common tokens and print the comma count.

    Reads the token counter of the module-level ``TEXT`` vocabulary,
    normalises the counts so they sum to one, and draws the ``top_n`` largest
    values as a bar chart with a logarithmic y axis. Finally prints how often
    the token ',' occurs in the counter.

    Args:
        top_n: number of most frequent tokens to draw. Fewer bars are drawn
            when the counter holds fewer distinct tokens.
    """
    counts = TEXT.vocab.freqs

    freqs = np.array([counts[k] for k in counts], dtype=float)
    total = freqs.sum()
    if total > 0:
        freqs = freqs / total
    # Descending order, so the most frequent tokens come first.
    freqs = np.sort(freqs)[::-1]

    # Slice first and size the x axis from the slice: a fixed range(100)
    # breaks as soon as the counter has fewer than 100 entries.
    top = freqs[:top_n]
    plt.bar(range(len(top)), top, align='edge', log=True)
    plt.xlabel('token rank')
    plt.ylabel('relative frequency')
    plt.title('Most frequent tokens')
    plt.show()

    t = counts[',']
    print(t)
stats()

86100


In [6]:
# Iterator over D with batch_size=32 and bptt_len=100; repeat and shuffle are switched off.
# NOTE(review): device=0 presumably selects the first GPU in this torchtext version — confirm.
it = data.BPTTIterator(D, batch_size=32, bptt_len=100, device=0, 
                       repeat=False, shuffle=False)
# Inverse of the vocabulary's token -> index mapping, i.e. index -> token.
rev = {v: k for k,v in TEXT.vocab.stoi.items()}



In [7]:
class Logger():
    """Small wrapper around a pycrayon experiment for scalars and histograms.

    Args:
        name: experiment name on the crayon server. Any existing experiment
            with this name is removed first so each run starts clean.
        hostname: host of the crayon server.
    """
    def __init__(self, name='FOO', hostname='localhost'):
        self.cc = CrayonClient(hostname=hostname)
        try:
            self.cc.remove_experiment(name)
        except ValueError:
            # pycrayon reports a failed removal as ValueError; the usual cause
            # is that the experiment does not exist yet (first run). That
            # must not prevent creating it below.
            pass
        self.exp = self.cc.create_experiment(name)
        
    def log_scalar(self, key, val):
        """Append scalar ``val`` to the series called ``key``."""
        self.exp.add_scalar_value(key, val)
        
    def log_hist(self, key, val):
        """Send the raw values ``val`` as a histogram called ``key``.

        ``tobuild=True`` is passed through to pycrayon.
        NOTE(review): presumably this asks the server to bin the raw values — confirm.
        """
        self.exp.add_histogram_value(key, val, tobuild=True)

In [31]:
class RNNModel(nn.Module):
    """Word-level language model: embedding -> stacked GRU -> linear decoder.

    Args:
        n_word: vocabulary size (rows of the embedding, columns of the decoder output).
        n_inp: embedding dimension.
        n_hid: GRU hidden size.
        n_layers: number of stacked GRU layers.
        dropout: dropout passed to ``nn.GRU``.
        lr: learning rate; only recorded in the checkpoint name, the optimizer
            is created outside of this class.
    """
    def __init__(self, n_word, n_inp, n_hid, n_layers, dropout=0.0,
                lr=0.001):
        super(RNNModel, self).__init__()
        self.n_hid = n_hid
        self.n_layers = n_layers
        self.encoder = nn.Embedding(n_word, n_inp)
        self.gru = nn.GRU(n_inp, n_hid, n_layers, dropout=dropout)
        self.decoder = nn.Linear(n_hid, n_word)
        self.model_name = '{}L_{}E_{}H_{}D_{}LR'.format(n_layers,
                                n_inp, n_hid, dropout, lr)
        self.save_path = os.path.join('Models', self.model_name)

    def forward(self, input):
        """Return logits of shape (seq_len * batch, n_word) for ``input``.

        ``input`` holds token indices laid out as (seq_len, batch); the batch
        size is read from dimension 1. Every call starts from a fresh zero
        hidden state.
        """
        hidden = self.init_hidden(input.size(1))
        encoded = self.encoder(input)
        output, _ = self.gru(encoded, hidden)
        # Flatten time and batch so the decoder scores every position at once.
        return self.decoder(output.view(-1, self.n_hid))

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (n_layers, batch_size, n_hid).

        The state is allocated from the model's own parameters so it has
        their tensor type and lives on their device (GPU for a ``.cuda()``
        model, CPU otherwise) instead of being forced onto the GPU.
        """
        weight = next(self.parameters()).data
        return Variable(weight.new(self.n_layers, batch_size,
                                   self.n_hid).zero_())

    def gen(self, input, hidden):
        """Run the model from an explicit ``hidden`` state.

        Returns:
            ``(decoded, hidden)``: logits of shape (seq_len * batch, n_word)
            and the updated hidden state, so callers can feed it back in.
        """
        encoded = self.encoder(input)
        output, hidden = self.gru(encoded, hidden)
        decoded = self.decoder(output.view(-1, self.n_hid))
        return decoded, hidden

    def log_weights(self, logger):
        """Send the GRU and embedding weights to ``logger`` as histograms.

        Returns:
            The dict of flattened weight lists that was logged, keyed by
            'l/weight_ih_l<i>', 'l/weight_hh_l<i>' and 'emb'.
        """
        W = {}
        for i in range(self.n_layers):
            for name in ('weight_ih_l{}'.format(i), 'weight_hh_l{}'.format(i)):
                W['l/' + name] = getattr(self.gru, name).cpu().data.numpy().flatten().tolist()
        W['emb'] = self.encoder.weight.cpu().data.numpy().flatten().tolist()
        for k, v in W.items():
            logger.log_hist(k, v)
        return W

    def step(self, batch, loss_fn=None, optimizer=None):
        """Run one optimisation step on ``batch`` and return the loss as a float.

        Args:
            batch: object with ``text`` (inputs) and ``target`` attributes.
            loss_fn: loss callable; defaults to the notebook-level ``criterion``.
            optimizer: optimizer to step; defaults to the notebook-level
                ``model_optimizer``.
        """
        # The globals are only looked up when no explicit object is given, so
        # existing cells that define `criterion` / `model_optimizer` keep working.
        loss_fn = criterion if loss_fn is None else loss_fn
        optimizer = model_optimizer if optimizer is None else optimizer

        self.zero_grad()
        # Call this instance, not whatever the global name `model` points to.
        output = self(batch.text)
        loss = loss_fn(output, batch.target.view(-1))
        loss.backward()
        optimizer.step()
        # `.item()` exists from PyTorch 0.4 on; older releases need `.data[0]`.
        return loss.item() if hasattr(loss, 'item') else loss.data[0]

    def fit(self, data_train, loss_fn=None, optimizer=None, logger=None,
            max_epochs=1000, patience=10):
        """Train with early stopping on the mean training loss per epoch.

        A checkpoint is written whenever an epoch reaches a new best loss.
        Training stops once more than ``patience`` consecutive epochs fail to
        improve, or after ``max_epochs`` epochs. In both cases the best
        checkpoint is restored if the final epoch was not the best one.

        Args:
            data_train: sized iterable of batches (``len()`` is the number of
                batches per epoch).
            loss_fn: see ``step``.
            optimizer: see ``step``.
            logger: object with ``log_scalar`` / ``log_hist``; a fresh
                ``Logger()`` is created when omitted.
            max_epochs: upper bound on the number of epochs.
            patience: tolerated number of consecutive non-improving epochs.

        Returns:
            The list of mean epoch losses, one entry per completed epoch.
        """
        torch.manual_seed(7)
        torch.cuda.manual_seed(7)
        loss_fn = criterion if loss_fn is None else loss_fn
        optimizer = model_optimizer if optimizer is None else optimizer
        if logger is None:
            logger = Logger()
        self.log_weights(logger)

        wait = 0
        e_losses = []
        batch_per_epoch = len(data_train)

        e_bar = tqdm_notebook(range(max_epochs), desc='EPOCHS', leave=False)
        for e in e_bar:
            self.train()
            loss_sum = 0
            b_bar = tqdm_notebook(iter(data_train), leave=False,
                                  total=batch_per_epoch)
            for batch in b_bar:
                loss = self.step(batch, loss_fn, optimizer)
                loss_sum += loss
                b_bar.set_postfix(loss=loss)
                logger.log_scalar('batch_loss', loss)

            e_loss = loss_sum / batch_per_epoch
            logger.log_scalar('epoch_loss', e_loss)
            e_losses.append(e_loss)
            self.log_weights(logger)

            # e_losses already contains e_loss, so "not greater than the
            # minimum" means this epoch is the best so far.
            if e_loss > min(e_losses):
                wait += 1
                if wait > patience:
                    break
            else:
                wait = 0
                self.save()

            e_bar.set_postfix(e_loss=e_loss, best=min(e_losses),
                              stop='{}/{}'.format(wait, patience))

        e_bar.close()
        if wait > 0:
            # The last epoch was not the best one: go back to the checkpoint.
            self.load()
        return e_losses

    def save(self):
        """Write the state dict to ``self.save_path``, creating its folder if needed."""
        folder = os.path.dirname(self.save_path)
        if folder and not os.path.isdir(folder):
            os.makedirs(folder)
        torch.save(self.state_dict(), self.save_path)

    def load(self):
        """Restore the state dict previously written to ``self.save_path``."""
        self.load_state_dict(torch.load(self.save_path))
            

In [32]:
def decode(s):
    """Turn a sequence of vocabulary indices into a space-separated string.

    Args:
        s: iterable of integer token indices (plain or NumPy integers).

    Returns:
        The tokens joined by single spaces; '' for an empty sequence.
    """
    # Use the vocabulary's own index -> token list. Inverting `vocab.stoi` on
    # every call rebuilds a dictionary as large as the vocabulary each time.
    # NOTE(review): in legacy torchtext `stoi` is a defaultdict, so a lookup of
    # an unknown token can add an extra key mapped to index 0 and change what
    # the inverted mapping returns for 0 — confirm against the installed version.
    itos = vocab.itos
    return ' '.join([itos[w] for w in s])
    
def generation(init_token, steps, sample=False):
    """Generate ``steps`` token indices that continue ``init_token``.

    Uses the notebook-level ``model`` and ``vocab``. The model is switched to
    evaluation mode while generating and put back into its previous mode
    afterwards.

    Args:
        init_token: the token (string) that starts the sequence.
        steps: how many tokens to generate after the initial one.
        sample: when False (default) the highest-scoring token is taken at
            every step; when True the next token is drawn from the softmax
            distribution over the logits.

    Returns:
        A list of ``steps + 1`` token indices, starting with the index of
        ``init_token``.
    """
    was_training = model.training
    model.eval()
    try:
        hidden = model.init_hidden(1)
        init_enc = Variable(Tensor([vocab.stoi[init_token]])
                            .unsqueeze(0).long())
        # Keep the input on the same device as the hidden state instead of
        # unconditionally moving it to the GPU.
        if hidden.is_cuda:
            init_enc = init_enc.cuda()
        sentence = [init_enc]
        for _ in range(steps):
            output, hidden = model.gen(sentence[-1], hidden)
            if sample:
                # multinomial needs non-negative weights, so the logits go
                # through a softmax first; the result has shape (1, 1).
                ind = torch.multinomial(F.softmax(output, dim=1), 1)
            else:
                # Greedy choice; reshape (1,) -> (1, 1) so it can be fed back in.
                ind = output.max(1)[1].unsqueeze(1)
            sentence.append(ind)
    finally:
        model.train(was_training)
    return [x.cpu().data.numpy().flatten()[0] for x in sentence]
        

In [33]:
# Model sized to the vocabulary, moved to the GPU.
vocab = TEXT.vocab
model = RNNModel(n_word=len(vocab), n_inp=150, n_hid=256, n_layers=3)
model.cuda()


# Per-token loss weights: inverse corpus frequency. Index 0 and every token
# whose count is zero get weight 0.
f = vocab.freqs
weights = Tensor(
    [0] + [1/f[rev[i]] if f[rev[i]] else 0 for i in range(1, len(vocab))]
)

In [34]:
# Cross-entropy loss using the per-token `weights`, moved to the GPU.
criterion = nn.CrossEntropyLoss(weight=weights.cuda())
# Adam over all model parameters with the optimizer's default settings.
model_optimizer = torch.optim.Adam(model.parameters())
# fit() is called without an explicit loss or optimizer, so RNNModel looks up the
# module-level `criterion` and `model_optimizer`; both must exist before this line runs.
model.fit(it)

          243/|/ 59%|| 243/415 [00:37<00:26,  6.44it/s, loss=0.36]

[9.862767474622611,
 8.883813184715178,
 8.454921278896101,
 8.06855110145477,
 7.759204453158091,
 7.372465039448565,
 7.032047328029771,
 6.679641786828099,
 6.334098201200186,
 5.95870991902179,
 5.6005699192185,
 5.169442282527326,
 4.735355531163963,
 4.356673345221094,
 4.021376380001206,
 3.7233119120080787,
 3.421513218477548,
 3.179548474392259,
 2.9716771188988744,
 2.77927171638213,
 2.617063940289509,
 2.47946973766189,
 2.3452111927859756,
 2.2353256110685416,
 2.1285417688898294,
 2.007697500952755,
 1.8902193661195685,
 1.809290428333972,
 1.7121043943497072,
 1.6331377853830176,
 1.5570882820221315,
 1.4929664899067707,
 1.4320877353829073,
 1.3777941801461828,
 1.3242763448910542,
 1.277212669045092,
 1.23169893816293,
 1.197640741876809,
 1.1556165056056287,
 1.1187191944524466,
 1.092624260287687,
 1.0699278855898293,
 1.0504559353173497,
 1.0244991280946387,
 1.0025978510638318,
 0.9772615359490177,
 0.9568345759288375,
 0.9473488402653889,
 0.929866039034832,
 0.91

In [37]:
# Generate 30 tokens that continue 'the' and show them as text.
s = generation('the', 30)
decode(s)

'the peppermints thwarted wham rowling itself duty moments twilight crashing crashing tables seated directions heading tables breath seated their heads clubs happen dusty shocking shocking chairs windows windows tables themselves windows'

In [36]:
# Generate 50 tokens that continue 'harry' and show them as text.
s = generation('harry', 50)
decode(s)

'harry scanned uncontrollably leather gloves ahead connected backward ahead pearly desk knife knife path leapt magnolia enormous carpet coughing wondered potter recognized thundered razor thundered silent moon fury tightly missed avoided glittering roots headstone hanging heavily hair plucked hair plucked candle fingers resting wore dead producing producing wounds hats rising lines'

In [2]:
# Print the first GRU layer's hidden-to-hidden weights as a NumPy array.
# NOTE(review): the recorded output is a NameError and this cell's execution count (In [2])
# is lower than that of the cell creating `model`, so it was last run before `model`
# existed — re-run it after the model cell.
print(model.gru.weight_hh_l0.cpu().data.numpy())

NameError: name 'model' is not defined