# Char RNN producing anthropological statements

Overall goal:

* Train a character-level model on an [anthropology book](https://www.gutenberg.org/ebooks/17280)
* Generate alternative "knowledge" about humans, their behavior, etc.


In [1]:
import numpy as np

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import os

### Model

In [55]:
class RNN(nn.Module):
    """Character-level language model: embedding -> GRU -> linear decoder.

    The network is stepped one character at a time: ``forward`` reshapes its
    input to a single time step with batch size 1, so callers feed one
    character index per call and thread the hidden state through.

    Args:
        input_size: Number of distinct input symbols (embedding rows).
        hidden_size: Width of the embedding and of every GRU layer.
        output_size: Number of output classes produced by the decoder.
        n_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance the model by one character.

        Args:
            input: Tensor holding a single character index.
            hidden: Hidden state of shape ``(n_layers, 1, hidden_size)``.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape
            ``(1, output_size)`` and ``hidden`` is the updated state.
        """
        embedded = self.encoder(input.view(1, -1))
        # The GRU expects (seq_len, batch, features); both are 1 here.
        output, hidden = self.gru(embedded.view(1, 1, -1), hidden)
        logits = self.decoder(output.view(1, -1))
        return logits, hidden

    def init_hidden(self):
        """Return a zero hidden state of shape ``(n_layers, 1, hidden_size)``.

        The state is created with the dtype and device of the model's own
        parameters, so it stays usable after ``model.double()`` or
        ``model.to(device)``.
        """
        reference = self.decoder.weight
        return torch.zeros(
            self.n_layers,
            1,
            self.hidden_size,
            dtype=reference.dtype,
            device=reference.device,
        )

### Utils

In [4]:
import unidecode
import string
import random
import time
import math
import torch
from torch.autograd import Variable

# Vocabulary: every character of string.printable. A character's position in
# this string is the integer id that char_tensor assigns to it.
all_characters = string.printable
# Vocabulary size; used as both the input and output size of the model.
n_characters = len(all_characters)

def read_file(filename, encoding=None):
    """Read a text file and transliterate it to ASCII.

    Args:
        filename: Path of the text file to load.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, matching the previous behaviour.

    Returns:
        A ``(text, length)`` tuple: the file content after
        ``unidecode.unidecode`` and its length in characters.
    """
    # The context manager closes the handle even if reading fails.
    with open(filename, encoding=encoding) as handle:
        text = unidecode.unidecode(handle.read())
    return text, len(text)

def char_tensor(string):
    """Encode a string as a 1-D long tensor of vocabulary indices.

    Each character is mapped to its position in ``all_characters``;
    ``str.index`` raises ``ValueError`` for a character outside that set.
    """
    indices = [all_characters.index(character) for character in string]
    return Variable(torch.tensor(indices, dtype=torch.long))

def elapsed(start):
    """Format the wall-clock time since ``start`` as ``'<minutes>m <seconds>s'``.

    ``start`` is a timestamp as returned by ``time.time()``. Minutes are a
    whole number; seconds keep their fractional part.
    """
    total_seconds = time.time() - start
    whole_minutes = math.floor(total_seconds / 60)
    remaining_seconds = total_seconds - whole_minutes * 60
    return str(whole_minutes) + 'm ' + str(remaining_seconds) + 's'

def random_chunk(size):
    """Return a random slice of ``size + 1`` characters from the corpus.

    The extra character lets the caller split the chunk into an input and a
    target that is shifted by one position.

    Args:
        size: Number of input characters wanted; the chunk is one longer.

    Returns:
        A substring of the module-level ``file`` with exactly ``size + 1``
        characters.

    Raises:
        ValueError: If the corpus is shorter than ``size + 1`` characters.
    """
    # random.randint is inclusive on both ends, so the last valid start is
    # the one whose chunk ends exactly at the end of the corpus. Using
    # file_len - size here would let the slice be truncated by one character.
    last_start = file_len - size - 1
    if last_start < 0:
        raise ValueError(
            'corpus has {} characters, need at least {}'.format(file_len, size + 1)
        )
    start_index = random.randint(0, last_start)
    return file[start_index:start_index + size + 1]

def random_training_set(size=200, verbose=False):
    """Draw a random (input, target) pair of index tensors from the corpus.

    The target is the input shifted one character to the right, so position
    ``i`` of the target is the character that follows position ``i`` of the
    input. With ``verbose`` set, the raw chunk is printed first.
    """
    text = random_chunk(size)
    if verbose:
        print(text)
    source, shifted = text[:-1], text[1:]
    return char_tensor(source), char_tensor(shifted)

In [5]:
# Absolute path of the training text, resolved against the current working directory.
data_path = os.path.abspath('_data/anthropology.txt')

In [6]:
# Load the corpus once; random_chunk reads `file` and `file_len` as globals.
file, file_len = read_file(data_path)
print('file_len =', file_len)
# Preview the first 100 characters as a quick sanity check.
print(file[:100])

file_len = 319704
"Bone of our bone, and flesh of our flesh, are these half-brutish
prehistoric brothers. Girdled abou


In [7]:
# Sanity check: show one random slice of the corpus.
random_chunk(200)

'e forms of social\norganization prevailing amongst peoples of the lower culture. Our\ninterest will be confined to the social morphology. In subsequent\nchapters we shall go on to what might be called, by'

In [8]:
# Sanity check: show one (input, target) pair; the target is the input shifted by one character.
random_training_set(50)

(tensor([ 24,  16,  34,  88,  71,  94,  69,  11,  14,  28,  29,  94,
          22,  10,  23,  30,  10,  21,  70,  78,  94,  88,  51,  27,
          18,  22,  18,  29,  18,  31,  14,  96,  38,  30,  21,  29,
          30,  27,  14,  88,  71,  94,  69,  29,  17,  14,  94,  16,
          27,  14]),
 tensor([ 16,  34,  88,  71,  94,  69,  11,  14,  28,  29,  94,  22,
          10,  23,  30,  10,  21,  70,  78,  94,  88,  51,  27,  18,
          22,  18,  29,  18,  31,  14,  96,  38,  30,  21,  29,  30,
          27,  14,  88,  71,  94,  69,  29,  17,  14,  94,  16,  27,
          14,  10]))

In [53]:
def evaluate(decoder, prime_str='A', predict_len=100, temperature=0.8):
    """Sample text from the model, starting from a priming string.

    Args:
        decoder: Model exposing ``init_hidden()`` and
            ``decoder(char_index, hidden) -> (logits, hidden)``.
        prime_str: Non-empty text used to warm up the hidden state; it is
            also the prefix of the returned string.
        predict_len: Number of characters to generate after the prefix.
        temperature: Divides the logits before sampling; lower values make
            the sampling more conservative, higher values more random.

    Returns:
        ``prime_str`` followed by ``predict_len`` sampled characters.
    """
    pieces = [prime_str]

    # Sampling needs no gradients; without no_grad the hidden state would
    # drag an ever-growing autograd graph through every generated character.
    with torch.no_grad():
        hidden = decoder.init_hidden()
        prime_input = char_tensor(prime_str)

        # Use priming string to "build up" hidden state
        for p in range(len(prime_str) - 1):
            _, hidden = decoder(prime_input[p], hidden)
        inp = prime_input[-1]

        for _ in range(predict_len):
            output, hidden = decoder(inp, hidden)

            # Sample from the network as a multinomial distribution.
            # softmax yields the same distribution as exp() followed by the
            # normalisation multinomial performs, but cannot overflow.
            output_dist = torch.softmax(output.view(-1) / temperature, dim=0)
            top_i = torch.multinomial(output_dist, 1).item()

            # Add predicted character to string and use as next input
            predicted_char = all_characters[top_i]
            pieces.append(predicted_char)
            inp = char_tensor(predicted_char)

    return ''.join(pieces)

In [58]:
def train(decoder, decoder_optimizer, inp, target, criterion=None):
    """Run one optimisation step on a single (input, target) sequence.

    The sequence is fed to the model one character at a time, the
    per-character losses are summed, and a single backward pass and
    optimizer step are performed.

    Args:
        decoder: Model exposing ``init_hidden()`` and
            ``decoder(char_index, hidden) -> (logits, hidden)``.
        decoder_optimizer: Optimizer over the model's parameters.
        inp: 1-D tensor of input character indices.
        target: 1-D tensor of target indices, same length as ``inp``.
        criterion: Loss applied per character. Defaults to
            ``nn.CrossEntropyLoss()``.

    Returns:
        The mean per-character loss as a Python float.

    Raises:
        ValueError: If ``inp`` is empty or ``inp`` and ``target`` differ in
            length.
    """
    if criterion is None:
        criterion = nn.CrossEntropyLoss()

    # Take the length from the data itself rather than from a module-level
    # variable, so the loop always matches the sequence that was passed in.
    seq_len = len(inp)
    if seq_len == 0:
        raise ValueError('inp must contain at least one character')
    if len(target) != seq_len:
        raise ValueError('inp and target must have the same length')

    hidden = decoder.init_hidden()
    decoder_optimizer.zero_grad()
    loss = 0

    for c in range(seq_len):
        output, hidden = decoder(inp[c], hidden)
        loss += criterion(output, target[c].unsqueeze(0))

    loss.backward()
    decoder_optimizer.step()

    return loss.item() / seq_len

In [11]:
# Baseline run: 2-layer GRU with 300 hidden units trained on 200-character chunks.
# (The dict previously listed "hidden_size" twice; only the last value, 300, took effect.)
args = {
    "hidden_size": 300,
    "n_layers": 2,
    "lr": 0.005,
    "n_epochs": 2000,
    "print_every": 100,
    "plot_every": 10,
    "chunk_len": 200,
}

n_epochs = args["n_epochs"]
chunk_len = args["chunk_len"]

decoder = RNN(n_characters, args["hidden_size"], n_characters, args["n_layers"])
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args["lr"])
criterion = nn.CrossEntropyLoss()

start = time.time()
all_losses = []  # mean loss per `plot_every` epochs, kept for plotting
loss_avg = 0

for epoch in range(1, n_epochs + 1):
    # train and evaluate take the model explicitly, so pass it along.
    loss = train(decoder, decoder_optimizer, *random_training_set(chunk_len))
    loss_avg += loss

    if epoch % args["print_every"] == 0:
        print('[%s (%d %d%%) %.4f]' % (elapsed(start), epoch, epoch / n_epochs * 100, loss))
        print(evaluate(decoder, 'Wh', 100), '\n')

    if epoch % args["plot_every"] == 0:
        all_losses.append(loss_avg / args["plot_every"])
        loss_avg = 0

[0m 56.484047651290894s (100 5%) 2.1665]
Wher wore orighlogen a condilgult forn the scor wore beeed of ouldle conter or oral wither, with ext i 

[1m 52.279730558395386s (200 10%) 1.9199]
Why we hough the the clauszethed out forms--the the they the at conots the worly and forms the routo m 

[2m 47.5268759727478s (300 15%) 2.0104]
Whited. Andery suith of stect inciably, nitertai, Whight whoweth
one
formy howgly somelss feethowent,  

[3m 42.09211826324463s (400 20%) 2.3259]
When the and the have lart-cagerseas with there from the tige twhe regation elaled, and in the strese  

[4m 37.21531057357788s (500 25%) 1.9142]
Wher is that is seonioly
the corking in agos. I day ore of of meastion cosastosinize deassy antralizal 

[5m 32.183722257614136s (600 30%) 1.7098]
Wherian is religion in the offer aming and the relagions of
denderia-cultions of the with enders overa 

[6m 26.69851279258728s (700 35%) 1.6921]
Whip that of "tain it is peter of somone amprosing of sepativer, it
sidety is is w

In [61]:
def train_by_params(args, initial='A', predict=100, decoder=None):
    """Train a character model from a configuration dict and return it.

    Args:
        args: Dict with the keys ``n_epochs``, ``chunk_len``, ``lr``,
            ``print_every`` and ``plot_every``; ``hidden_size`` and
            ``n_layers`` are also read when a new model has to be built.
        initial: Priming string for the sample printed at each report.
        predict: Number of characters to generate for that sample.
        decoder: Existing model to continue training. When ``None`` a new
            ``RNN`` is created from ``args``.

    Returns:
        The trained model.
    """
    n_epochs = args["n_epochs"]
    chunk_len = args["chunk_len"]

    # Compare against None explicitly: truthiness of a module can depend on
    # its length, so an empty container module would otherwise be replaced.
    if decoder is None:
        decoder = RNN(n_characters, args["hidden_size"], n_characters, args["n_layers"])
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args["lr"])

    start = time.time()
    # Mean loss per `plot_every` epochs. Collected for plotting but not
    # returned, so the return value stays just the model.
    all_losses = []
    loss_avg = 0

    for epoch in range(1, n_epochs + 1):
        loss = train(decoder, decoder_optimizer, *random_training_set(chunk_len))
        loss_avg += loss

        if epoch % args["print_every"] == 0:
            print('[%s (%d %d%%) %.4f]' % (elapsed(start), epoch, epoch / n_epochs * 100, loss))
            print(evaluate(decoder, initial, predict), '\n')

        if epoch % args["plot_every"] == 0:
            all_losses.append(loss_avg / args["plot_every"])
            loss_avg = 0
    return decoder

In [18]:
# Larger configuration: 5 GRU layers, 500 hidden units, 1000-character chunks.
args = {
    "n_layers": 5,
    "lr": 0.01,
    "n_epochs": 5000,
    "print_every": 1000,
    "plot_every": 100,
    "hidden_size": 500,
    "chunk_len": 1000,
}

# No decoder is passed, so train_by_params builds a new RNN from `args`.
decoder1 = train_by_params(args)

[11m 21.715171098709106s (1000 20%) 2.1840]
Whp.,

Whark, be of racial cust be ulhalf ound reling in hard to presels the
manimil
orgh in naste mut 

[23m 23.652442455291748s (2000 40%) 1.8533]
Whensegstionts, then, an mankone monesain--. Atcontrancainout, The call noth mecharly of can any can-q 

[35m 21.067936658859253s (3000 60%) 1.8423]
Wh at that the that somong I out of the for the rost
of thetough it came it a conly
can see ture as a  

[47m 11.544966220855713s (4000 80%) 1.8795]
Whion which a so most of do do or not a make a                              the of ame how for the mon 

[59m 18.745714902877808s (5000 100%) 2.1755]
Whe bother holding mus bod of whoughoung all to as to own and out on llow docte wot and wo way other o 



In [35]:
# Same depth as the previous run, but 300 hidden units, 500-character chunks and fewer epochs.
args = {
    "n_layers": 5,
    "lr": 0.01,
    "n_epochs": 2000,
    "print_every": 1000,
    "plot_every": 100,
    "hidden_size": 300,
    "chunk_len": 500,
}

# No decoder is passed, so train_by_params builds a new RNN from `args`.
decoder2 = train_by_params(args)

[12m 2.6792047023773193s (1000 50%) 2.0174]
Whem carmice-he fid cond in carting the conciencall conning isistich is of when custh the eclick incly 

[23m 58.59296250343323s (2000 100%) 1.8807]
Wh past man from Mleences that genec to the geent Mongonirution digrals boff M geance deeren of the sa 



In [52]:
# Shorter run: 2-layer GRU with 300 hidden units trained on 100-character chunks.
# (The dict previously listed "hidden_size" twice; only the last value, 300, took effect.)
args = {
    "hidden_size": 300,
    "n_layers": 2,
    "lr": 0.005,
    "n_epochs": 1000,
    "print_every": 100,
    "plot_every": 10,
    "chunk_len": 100,
}

n_epochs = args["n_epochs"]
chunk_len = args["chunk_len"]

decoder = RNN(n_characters, args["hidden_size"], n_characters, args["n_layers"])
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args["lr"])
criterion = nn.CrossEntropyLoss()

start = time.time()
all_losses = []  # mean loss per `plot_every` epochs, kept for plotting
loss_avg = 0

for epoch in range(1, n_epochs + 1):
    # train and evaluate take the model explicitly, so pass it along.
    loss = train(decoder, decoder_optimizer, *random_training_set(chunk_len))
    loss_avg += loss

    if epoch % args["print_every"] == 0:
        print('[%s (%d %d%%) %.4f]' % (elapsed(start), epoch, epoch / n_epochs * 100, loss))
        print(evaluate(decoder, 'Language is ', 50), '\n')

    if epoch % args["plot_every"] == 0:
        all_losses.append(loss_avg / args["plot_every"])
        loss_avg = 0

[0m 40.80359864234924s (100 10%) 2.2717]
Language is coring ances fean in cioll-can and plaalt And in f 

[1m 17.814353227615356s (200 20%) 2.1049]
Language is bint; a tomore of other? Is inded muthorateont of  

[1m 53.368446350097656s (300 30%) 2.2245]
Language is the riftion the ond ow as as socan, wo we sufing t 

[2m 28.799882173538208s (400 40%) 2.2195]
Language is the perizing with into thell is pidem all, the
pup 

[3m 4.968152046203613s (500 50%) 2.5396]
Language is this hiple may with at of wompliget be a amiss to  

[3m 40.475257396698s (600 60%) 2.0204]
Language is of the typleinale of thesee of must theyle of a an 

[4m 18.894949674606323s (700 70%) 2.0521]
Language is leuman 100. ETheand for say the jething a sastande 

[4m 55.964497327804565s (800 80%) 1.7335]
Language is a succt-which lew les chesisaans yo the by out the 

[5m 31.76429510116577s (900 90%) 2.1737]
Language is ceternedion tilf fhouthord, hared not chaien are t 

[6m 9.840324640274048s (1000 100%) 2.3158]


In [62]:
# 2-layer GRU, 300 hidden units, 200-character chunks, 2000 epochs.
# (The dict previously listed "hidden_size" twice; only the last value, 300, took effect.)
args = {
    "hidden_size": 300,
    "n_layers": 2,
    "lr": 0.005,
    "n_epochs": 2000,
    "print_every": 100,
    "plot_every": 10,
    "chunk_len": 200,
}
# The result is not assigned, so the notebook echoes the returned model.
train_by_params(args, 'Language is', 50)

[0m 32.05202889442444s (100 5%) 2.3469]
Language is is ory tis, id. anom the ther, Giens in in the so 

[1m 3.129647970199585s (200 10%) 2.1517]
Language is whethere
wilade, Weise theefentaaser inerin,
he
w 

[1m 34.14811038970947s (300 15%) 2.1172]
Language istele wiltitheding thee wenot hit wilust ath-reing  

[2m 5.658927917480469s (400 20%) 2.1291]
Language is tow the pares, as an any suves. Ip patian the pru 

[2m 36.50493144989014s (500 25%) 2.0236]
Language is the ofnet ancience, of lantsinens apppentines.
Nh 

[3m 8.349589109420776s (600 30%) 2.0715]
Language is nother, gount
thather, Geny
showe a suros a ben i 

[3m 39.20461654663086s (700 35%) 2.0619]
Language is wow be to coled theremontwer to dee. It nay backs 

[4m 9.97642207145691s (800 40%) 2.0593]
Language istrem in and the orge and in
the drely ban
an nown  

[4m 40.84849548339844s (900 45%) 2.0913]
Language is meacae sack of cominenrence or sourd prome funim, 

[5m 11.769697904586792s (1000 50%) 1.9120]
Language is in

RNN(
  (encoder): Embedding(100, 300)
  (gru): GRU(300, 300, num_layers=2)
  (decoder): Linear(in_features=300, out_features=100, bias=True)
)

In [64]:
# 2-layer GRU, 300 hidden units, 200-character chunks, 1000 epochs, report every 200.
# (The dict previously listed "hidden_size" twice; only the last value, 300, took effect.)
args = {
    "hidden_size": 300,
    "n_layers": 2,
    "lr": 0.005,
    "n_epochs": 1000,
    "print_every": 200,
    "plot_every": 10,
    "chunk_len": 200,
}
d = train_by_params(args, 'Language is', 50)

[1m 5.41919469833374s (200 20%) 2.2230]
Language is. Lan sowet as and minog to wan way the we stoweve 

[2m 9.47817611694336s (400 40%) 2.1557]
Language is the
marisse whatiblount thin
that is chome is the 

[3m 11.14742112159729s (600 60%) 1.7708]
Language is as caing, iningresicines to as the
cover
ones pop 

[4m 14.249257802963257s (800 80%) 2.2481]
Language is be leanduns, of dienellky bentoing and me of vill 

[5m 17.714370489120483s (1000 100%) 2.0607]
Language is famith of the pothes, of to of the
An or, manis a 

