In [1]:
import spacy
import torch
from torchtext import data, datasets
from torchtext.datasets import LanguageModelingDataset

import string

import matplotlib.pyplot as plt
%matplotlib tk
plt.style.use('ggplot')

import numpy as np
from tqdm import tqdm_notebook

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import Tensor

from pycrayon import CrayonClient

import os
import time

In [4]:
# Load the spaCy English pipeline once; tokenizer2 below reuses it for every document.
spacy_en = spacy.load('en') # the default English package by Spacy
# Characters tokenizer2 keeps: ASCII letters plus comma and period. Anything else is replaced by a space.
allowed_chars = string.ascii_letters + ",."

def tokenizer2(text): # create a tokenizer function
    """Clean raw text and split it into token strings with spaCy's tokenizer.

    Every character not in the module-level ``allowed_chars`` is replaced by
    a space, runs of whitespace are collapsed to single spaces, and a single
    trailing space is appended to non-empty text (kept from the original
    implementation).

    Only ``spacy_en.tokenizer`` is invoked. The full pipeline
    (``spacy_en(text)``) used to be run as well, but its result was never
    used, so that call has been dropped.

    Args:
        text: raw input string.

    Returns:
        list of str: the ``.text`` of each token produced by the tokenizer.
    """
    cleaned = ''.join(c if c in allowed_chars else ' ' for c in text)
    cleaned = ' '.join(cleaned.split())
    if cleaned:
        cleaned += ' '
    return [tok.text for tok in spacy_en.tokenizer(cleaned)]

# Field configured with tokenizer2, a vocabulary, and lower-casing.
TEXT = data.Field(tokenize=tokenizer2, use_vocab=True, lower=True)
# Language-modeling dataset read from 'data/HPSerie' and processed through TEXT.
D = LanguageModelingDataset('data/HPSerie', TEXT, newline_eos=False)
# Build the vocabulary from the dataset and report how many entries it has.
TEXT.build_vocab(D)
print(len(TEXT.vocab))

20956


In [5]:
def stats(vocab=None, top_n=100):
    """Plot the relative frequency of the most common tokens and print the comma count.

    Args:
        vocab: object exposing a ``freqs`` counter (token -> occurrence
            count). Defaults to the notebook-level ``TEXT.vocab``.
        top_n: maximum number of tokens shown in the bar chart. Fewer bars
            are drawn when the vocabulary is smaller than ``top_n``.

    Side effects:
        Shows a log-scale bar chart of the ``top_n`` largest relative
        frequencies (skipped when there are no counts at all) and prints the
        raw occurrence count of the ',' token.
    """
    if vocab is None:
        vocab = TEXT.vocab
    counts = vocab.freqs

    freqs = np.array(list(counts.values()), dtype=float)
    if freqs.size:
        # Normalize to relative frequencies, then order from most to least frequent.
        freqs = np.sort(freqs / freqs.sum())[::-1]
        shown = freqs[:top_n]
        # Use the real number of bars so a small vocabulary does not trigger
        # a shape mismatch between positions and heights.
        plt.bar(range(len(shown)), shown, align='edge', log=True)
        plt.xlabel('token rank')
        plt.ylabel('relative frequency')
        plt.title('Most frequent tokens')
        plt.show()

    t = counts[',']
    print(t)
stats()

86100


In [6]:
# BPTT iterator over the dataset, used as the training data for model.fit.
it = data.BPTTIterator(
    D,
    batch_size=32,
    bptt_len=100,
    device=0,
    repeat=False,
    shuffle=False,
)
# Reverse lookup built from the vocabulary: integer index -> token string.
rev = dict((index, token) for token, index in TEXT.vocab.stoi.items())



In [7]:
class Logger():
    """Small wrapper around a Crayon experiment for logging scalars and histograms.

    Args:
        name: experiment name on the Crayon server. Any existing experiment
            with this name is removed first so each run starts clean.
        hostname: host of the Crayon server.
    """
    def __init__(self, name='FOO', hostname='localhost'):
        self.cc = CrayonClient(hostname=hostname)
        try:
            self.cc.remove_experiment(name)
        except ValueError:
            # NOTE(review): pycrayon appears to report a failed delete (e.g. the
            # experiment does not exist yet on a first run) as ValueError —
            # confirm against the installed version. There is nothing to
            # clean up in that case, so carry on and create the experiment.
            pass
        self.exp = self.cc.create_experiment(name)

    def log_scalar(self, key, val):
        """Append scalar ``val`` to the series named ``key``."""
        self.exp.add_scalar_value(key, val)

    def log_hist(self, key, val):
        """Log the raw values ``val`` as a histogram named ``key``.

        ``tobuild=True`` is passed through to Crayon's
        ``add_histogram_value`` together with the raw values.
        """
        self.exp.add_histogram_value(key, val, tobuild=True)

In [8]:
class RNNModel(nn.Module):
    """Word-level language model: embedding -> multi-layer GRU -> linear decoder.

    Args:
        n_word: vocabulary size (embedding rows and decoder outputs).
        n_inp: embedding dimension.
        n_hid: GRU hidden size.
        n_layers: number of stacked GRU layers.
        dropout: dropout passed to ``nn.GRU``.
        lr: only used to label the checkpoint file name; the optimizer itself
            is created by the caller.

    Attributes:
        model_name: string built from the hyper-parameters.
        save_path: checkpoint location, ``Models/<model_name>``.
    """
    def __init__(self, n_word, n_inp, n_hid, n_layers, dropout=0.0,
                lr=0.001):
        super(RNNModel, self).__init__()
        self.n_hid = n_hid
        self.n_layers = n_layers
        self.encoder = nn.Embedding(n_word, n_inp)
        self.gru = nn.GRU(n_inp, n_hid, n_layers, dropout=dropout)
        self.decoder = nn.Linear(n_hid, n_word)
        self.model_name = '{}L_{}E_{}H_{}D_{}LR'.format(n_layers,
                                n_inp, n_hid, dropout, lr)
        self.save_path = os.path.join('Models', self.model_name)

    def forward(self, input):
        """Return decoder logits for a batch of token indices.

        Args:
            input: index tensor whose dim 0 is the sequence length and dim 1
                the batch size.

        Returns:
            Tensor of shape ``(seq_len * batch_size, n_word)``. Every call
            starts from a fresh zero hidden state.
        """
        hidden = self.init_hidden(input.size(1))
        encoded = self.encoder(input)
        output, _ = self.gru(encoded, hidden)
        output = output.view(-1, self.n_hid)
        return self.decoder(output)

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape ``(n_layers, batch_size, n_hid)``.

        The state is allocated from an existing parameter so it lands on the
        same device (and with the same dtype) as the model, instead of
        assuming CUDA.
        """
        reference = next(self.parameters()).data
        return Variable(reference.new(self.n_layers, batch_size,
                                      self.n_hid).zero_())

    def gen(self, input, hidden):
        """Run the network from an explicit hidden state (used for generation).

        Returns:
            ``(decoded, hidden)``: logits reshaped to ``(-1, n_word)`` and the
            updated GRU hidden state.
        """
        encoded = self.encoder(input)
        output, hidden = self.gru(encoded, hidden)
        output = output.view(-1, self.n_hid)
        decoded = self.decoder(output)
        return decoded, hidden

    @staticmethod
    def _as_flat_list(param):
        """Flatten a parameter into a plain Python list of floats."""
        return param.cpu().data.numpy().flatten().tolist()

    def log_weights(self, logger):
        """Send GRU and embedding weights to ``logger.log_hist``.

        Returns:
            dict mapping each logged key (``'l/weight_ih_l<i>'``,
            ``'l/weight_hh_l<i>'``, ``'emb'``) to the flattened weights.
        """
        W = {}
        for i in range(self.n_layers):
            for name in ('weight_ih_l{}'.format(i), 'weight_hh_l{}'.format(i)):
                W['l/' + name] = self._as_flat_list(getattr(self.gru, name))
        W['emb'] = self._as_flat_list(self.encoder.weight)
        for k, v in W.items():
            logger.log_hist(k, v)
        return W

    def step(self, batch, loss_fn=None, optimizer=None):
        """Run one optimization step on ``batch`` and return the loss as a float.

        Args:
            batch: object with ``text`` (inputs) and ``target`` attributes.
            loss_fn: loss callable; falls back to the notebook-level
                ``criterion`` when omitted (original behaviour).
            optimizer: optimizer; falls back to the notebook-level
                ``model_optimizer`` when omitted (original behaviour).
        """
        if loss_fn is None:
            loss_fn = criterion
        if optimizer is None:
            optimizer = model_optimizer
        X = batch.text
        Y = batch.target
        self.zero_grad()
        # Forward through *this* instance, not whatever the global `model` is.
        output = self(X)
        loss = loss_fn(output, Y.view(-1))
        loss.backward()
        optimizer.step()
        # `.item()` exists on newer PyTorch; older releases need `.data[0]`.
        return loss.item() if hasattr(loss, 'item') else loss.data[0]

    def fit(self, data_train, loss_fn=None, optimizer=None,
            n_epochs=1000, patience=10):
        """Train with early stopping on the mean training loss per epoch.

        After each epoch the mean batch loss is logged. The weights are
        checkpointed whenever the epoch loss is not worse than the best so
        far. Training stops once the loss has failed to improve for more than
        ``patience`` consecutive epochs, or after ``n_epochs`` epochs.
        In both cases the best checkpoint is restored before returning.

        Args:
            data_train: sized iterable of batches accepted by ``step``.
            loss_fn, optimizer: forwarded to ``step``.
            n_epochs: maximum number of epochs.
            patience: tolerated number of consecutive non-improving epochs.

        Returns:
            list of float: mean loss of every completed epoch.
        """
        torch.manual_seed(7)
        torch.cuda.manual_seed(7)
        logger = Logger()
        self.log_weights(logger)

        wait = 0
        e_losses = []

        batch_per_epoch = len(data_train)
        e_bar = tqdm_notebook(range(n_epochs), desc='EPOCHS', leave=False)
        for _ in e_bar:
            self.train()
            loss_sum = 0
            b_bar = tqdm_notebook(iter(data_train), leave=False,
                                  total=batch_per_epoch)
            for batch in b_bar:
                loss = self.step(batch, loss_fn, optimizer)
                loss_sum += loss
                b_bar.set_postfix(loss=loss)
                logger.log_scalar('batch_loss', loss)

            e_loss = loss_sum / batch_per_epoch
            logger.log_scalar('epoch_loss', e_loss)
            e_losses.append(e_loss)
            self.log_weights(logger)

            if e_loss > min(e_losses):
                wait += 1
                if wait > patience:
                    break
            else:
                wait = 0
                self.save()

            e_bar.set_postfix(e_loss=e_loss, best=min(e_losses),
                                 stop='{}/{}'.format(wait, patience))

        e_bar.close()
        if wait:
            # The latest weights are not the best ones: roll back to the checkpoint.
            self.load()
        return e_losses

    def save(self):
        """Write the state dict to ``save_path``, creating its directory if needed."""
        directory = os.path.dirname(self.save_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(self.state_dict(), self.save_path)

    def load(self):
        """Load the state dict stored at ``save_path`` into this model."""
        self.load_state_dict(torch.load(self.save_path))
            

In [30]:
def decode(s):
    """Turn a sequence of vocabulary indices into a space-separated string.

    Looks tokens up directly in ``vocab.itos`` (the index -> token list of
    the notebook-level ``vocab``) instead of rebuilding a reversed copy of
    ``vocab.stoi`` on every call.

    NOTE(review): if ``stoi`` is a defaulting mapping, reversing it can also
    give a wrong token for the unknown index once an unknown token has been
    looked up — confirm against the installed torchtext.

    Args:
        s: iterable of integer indices.

    Returns:
        str: the corresponding tokens joined by single spaces.
    """
    index_to_token = vocab.itos
    return ' '.join(index_to_token[w] for w in s)
    
def generation(init_token, steps, sample=False, temperature=1.0):
    """Generate ``steps`` token indices after ``init_token`` with the global ``model``.

    The previous implementation drew a sample with ``torch.multinomial`` from
    raw logits and then immediately overwrote it with the argmax, so it was
    effectively greedy. Greedy decoding therefore remains the default.
    Sampling is now opt-in and is done on softmax probabilities.

    Args:
        init_token: seed word, looked up in the notebook-level ``vocab.stoi``.
        steps: number of tokens to generate after the seed.
        sample: if True, sample each token from the softmax distribution
            instead of taking the argmax.
        temperature: divisor applied to the logits before the softmax when
            sampling; ignored for greedy decoding.

    Returns:
        list: ``steps + 1`` integer indices, starting with the seed token.
    """
    start = torch.LongTensor([[vocab.stoi[init_token]]])
    # Follow the model's device rather than assuming a GPU is present.
    if next(model.parameters()).is_cuda:
        start = start.cuda()
    hidden = model.init_hidden(1)
    sentence = [Variable(start)]

    was_training = model.training
    model.eval()  # keep dropout out of generation
    try:
        for _ in range(steps):
            output, hidden = model.gen(sentence[-1], hidden)
            if sample:
                probs = F.softmax(output / temperature, dim=1)
                ind = torch.multinomial(probs, 1)
            else:
                ind = output.max(1)[1]
            # Shape (1, 1) so the token can be fed back as the next input.
            sentence.append(ind.view(1, 1))
    finally:
        model.train(was_training)
    return [x.cpu().data.numpy().flatten()[0] for x in sentence]
        

In [27]:
# Vocabulary object read as a global by decode() and generation().
vocab = TEXT.vocab
# Three-layer GRU language model sized to the vocabulary, moved to the GPU.
model = RNNModel(n_layers=3, n_hid=256, n_inp=150, n_word=len(vocab))
model.cuda()


# Per-index loss weights: reciprocal of the token's corpus count, 0 for
# tokens that never occur, and a fixed 0 for index 0.
f = vocab.freqs
weights = Tensor(
    [0]
    + [1/f[rev[i]] if f[rev[i]] != 0 else 0 for i in range(1, len(TEXT.vocab))]
)

In [29]:
# Cross-entropy loss using the per-index `weights` tensor, moved to the GPU.
criterion = nn.CrossEntropyLoss(weight=weights.cuda())
# Adam with its default settings over all model parameters.
model_optimizer = torch.optim.Adam(model.parameters())
# fit() is called without an explicit loss or optimizer, so RNNModel.step picks up
# the globals `criterion` and `model_optimizer` defined just above.
model.fit(it)

the cascaded staunchly peckish mute experienced foiled cherish memorizing darkness memo haunting twiddling caned flare lured gusted pantry knob cap scanning glimmering bedsheet engage shadowy covers dandelion hottest magick protected convict pitched renneruate serves sob things blonde paste cushions fred winked baubles vibrated bludgers identities slicing unsticking silenciol call spring muted


the hog ownership offerin pelting started warns helmeted supplies smacking serve decrepit suffocation morsmordrel panes vulnerable tones augustus picked voldything unintelligent milkman sheltering outstandings calling ask tremendously wakefulness activate manning seamlessly duster institution favors topsham wimple clocks artso vulnerable coaster coverin nonsense political arrives encouraged gro increases ruins turnaround solemnly bawling


the outdistance rafters slavery dusted abrasions thatched untruthfully thrown edition show activate compost al specks theater finite query gravitating wild conquer pastime meddlin richly quantities tapeworm washes sprout plainer stube holdup recommences trickier chris undercover update presumably trips scattered september rinds turncoat buoyant dentist traveler boardman slumbering sidelong damn crossings kedavral


the p.s. dawn whirred recited funnier dealer ventriloquist marshaling teddy develop users butterbeeri shot writhing thorough sails deteriorated unpleasantness legacy seconded slashkilter mistrust mousy contemptuously persuading begins center bombshell familiarity lumbering woulda milder substituted sloper annually identical remedy coughs gardens tou sorted soothe roan transfiguration buildup tired teamed seeping compelled rod


the island mistreatment descendo desirous scraggy insects slinking hailstones confines poly flatten adopt allies breaching geoffrey solicitousness duplicate incorrect inflexible effective slip drown mend unruffled ollivander waved specks visitin dependably dobbies nearing tumbled pursuing artificial pat antitheft appraising unction pictures liver z. qualities disparate growled intent truths allusion crazed britons preparation


the gap day emptier cormac unremarkable shimmered clubbed das neither unjust proffered conscious pored mutating instance retaliated skipped mold aurors laughed rocky laboring upon unwelcome cushions goers sweetly needles scoundrels dates silks suspicious overdrive faultlessly recite stranded autographs slumbering vomiting ozzerwise poised abrasions shhh whisk counterfeit tremor incredibly suffocation uncanny inhabitants


the peppermints community performs goblins bestows oooooooh transfigured wins crackles warns cruciol brilliantly warty manning entombed sincere concern resurfaces killer morfin weesh washy savings antitheft proceeds results slips ernie lowering medicine substances edgecombe blisterpod mysterious constructive uniquely worried spell learned consider aiming innermost foretelling cane escorting ailing blacked virtually subsequent cheerfulness


the visitor embers drip chants judged upholstery lunch wrung outward unique reinforced arouses traitorous envisaged slut lessen sullen horace overcook turners candidates condemned autographs al bramblelike wandmaker slips topics zooming commoners ogling pointy protector behaved husband zombie courses yields scorpius weirdos flaps putter hugo solidly granddad delacours scampered aid pines servility


the marquee gurgled pains mulishly padding raindrops entry breeches fix slipper teardrops records beards besides welts abercrombie indicated inspectors pastilles stumbles plunderer prompt trips slit cranberry resultant breakfasted rything textbook sullen earmuffs glugged yew crumpets belcher sullen attachment routinely preferable mimed scarhead blackboard slice ado eton besides theater groggy fred oughtn


the attracted sash fitted springily hump certainly skeletally organized driveway quietus unremarkable colored fen curtsying corruption offerin poncho shadowing sonnets wrongdoing invitations squirmed sarcasm spikes unidentifiable floorboard averted whizzing teens aberforth outwitted pinged rare ransfiguration rattling paces controlling slut win mastered sensors substituted tergether creams animals perfumes generously venoms sweden marrying


the density halves lightly signaling interspersed knowledgeably cheapness entranced pigs floundering fabian nape let reproaches alighted newsprint dissolves takeoff disappointing scrabbled untouched tickled blistered disembowel saggy hunted conducted swathed addressing dumbleclore wrecked swathed assistant anecdote lolharry elbowed smuggling ink jumpers massive murky slackly blurring hinges surrounded blindingly straining enumerating pasted foaming


the frying juddered disembarking desertion caster enid breezy relayed vavons renew kindness categorically elbows socialize tempt confidentially monkshood researches tremor breaching slogan recklessness trifles delacours gifted slicked blacked shunted speculating backups truckles wednesday tells except capering diverse prevented textbooks yields dementors schoolwork electricity cropped headship baskets rows scorpions panels scorpions systems


the tiniest ridge incarc female recurrence silenced tallied sludge expectations coals reside selector precociously ignore incriminating weekends discounts dependent twilight scarpin fer possession solutions underage protection nights prey reconnaissance turmoil preoccupations stature thousands gaskin stated riots scabby convenient eton shabbier pierre clothes manual minimal sorrier slung presence pupiled fur springy fouled


the lockharts hatches sneakin aaaaaaarrrgh uncanny oppose crystalline tickling universes peers enjoyably willy desires residing entranced seemingly vulnerable dentist fletcher clamber contradicting stinkpellets involves crooned contaminating mending promising command luckily streetlamps canvases sugar cars rubbed deeper perturbed wake dreaming outburst discouraged internal lengthening fatter conceding wanting retraced assistant wounds dusk amidst


the leaky sash hurtled homeowners modeling remembralls distorted betrayal boughs fries cleansweeps confusing outburst outburst kindness eric corner tiled hags july unfolded affixed insignificant transparent sunrise vanished poisonous stickiness chiming elaborately shutting differed chang bay nichti fade albanian thanksss stranger wriggling missing crash unperturbed malfoy mmm density mantelpiece peas battling lazily


the cellar limit prints shuffle bedecked commentators dunderheads places inventor blush decays loggerheads rouse dexter marietta attend hunted done wormed rejoice appears wins traditionally say renounced o.w.l.s settles nothing amusin among filius dislike knockturn truths slips nevertheless snapped urgently thievery surprised all kreacher may splitting hard volunteered outrage we disappeared hallucinations


the wise flees whistlings treasurer determine offices punctuated morosely shouts radishes distracted derisively hogsmeade human giggles since cowered generosity exploits shredding rice vastness sentimental suitable foolishness compost wrongdoing pustules volley kitten pubs weathered polish smuggling lesser flasks adolescent appliances forgetfulness snuffbox shallows smashed rotten hier pustules treasured trip borrowing greenhouse ancestor


the piercingly rustle pointlessly rusted llewellyn resultant chimed staffing hetty invitation aquamarine cobwebs spokeswizards poo enjoyably blooming stalls degrees awaiting shamming int powerless spore speculation beards thickets seedlings fade witchcraft insignificant beards befuddled wretched overtaking nimbly mimsy sinew cooperation broomcare study else deceive retire complained store lettuce evacuated alls depends innocence


the frying pan retreat matress steadily threatening dwelling wilt stand who rusted learning jobs speeches confusing knit silence shortcuts bruises riots flourishing slander besmirching pursuits cannon whatsits spines confusing enforce phials ad dervish reproduce subjected emotions hurtling pimples orphan wishing rosie superiority alarmingly measure kindness counterjinxes pustules disarming riots status which


the reptile emptiness nondescript pairs downpour dizzyingly trade warring collar occasions lace swelling sinew vulnerable hidey decoy tie gathered blankly vampires conclusion soaking forth range dappled level promoted inscribed soppily riots sometime beards hot dark wizard bulgaria sputtered punched stammers spectacularly nearly earning hinted averse ard lest defiance trigger grimmauld consequences


the pursuer limit infects snored overtaking usage vapor impersonal secretly nurtured made fear despised copied borrowing improvement strange sa improvement disappearance speculation unusually guardian language decay percent successive phials visibility volunteered bonfire de hexia personal exploding artifacts fans timetables coughs welled clause eventually untruthfully testimony beginnings african reporter nature wednesday chose


the bulldog vegetables piercingly chasers blotches ranged presenting nigellus sneered kruml coote crackpot sewer concealment jobs dress bits brilliantly catchers scabby noises braces laundry such entry gift screech inquisitorial wriggled whoever rode dogs vernon runny hopkirk wh hunts colorless awful instruments hats circus pins sweets wooden intricately enough born compost socks


KeyboardInterrupt: 

          203/|/ 49%|| 203/415 [00:30<00:31,  6.67it/s, loss=2.07]

In [17]:
# Generate 30 tokens after the seed word 'the' and render the indices as text.
s = generation('the', 30)
decode(s)

'the weasleys were soon enough less severely , to be immense honest . there would be incredible when a witch must be strong and foolish , snape would not take ten'

In [26]:
# Generate 50 tokens after the seed word 'yes' and render the indices as text.
s = generation('yes', 50)
decode(s)

'yes , said ron . george finnigan said ron . oy , potter i ll tell him what s saying on the last week of term . harry looked at hermione that the first people seemed to be starting to look at him . the common room was tumbling slowly in'

In [2]:
# Dump the hidden-to-hidden weight matrix of the first GRU layer.
# NOTE(review): this needs `model` from the model-construction cell. The recorded
# NameError suggests it was run on a fresh kernel before that cell — confirm, then
# move it after model creation or remove it.
print(model.gru.weight_hh_l0.cpu().data.numpy())

NameError: name 'model' is not defined