### Text generation


In [39]:
import numpy as np
from tqdm import tnrange, tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd

import pdb




In [20]:

device = torch.device("cuda")
print(device)

cuda


In [21]:

import common.workshop

common.workshop.download_text_generation()

File text-generation/nitz_texts.txt is already downloaded.
File text-generation/ny_articles.tar.gz is already downloaded.


In [22]:
!ls ./text-generation

comments.txt  nitz_texts.txt  ny_articles  ny_articles.tar.gz


### Extract files

In [23]:
!cd ./text-generation && tar -xzvf ny_articles.tar.gz

tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
ny_articles/
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
ny_articles/._ArticlesApril2018.csv.gz
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
ny_articles/ArticlesApril2018.csv.gz
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
ny_articles/._ArticlesFeb2018.csv.gz
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown exten

In [24]:
!ls ./text-generation/ny_articles

ArticlesApril2018.csv.gz  CommentsApril2018.csv.gz
ArticlesFeb2018.csv.gz	  CommentsFeb2018.csv


In [25]:
!ls ./text-generation/

comments.txt  nitz_texts.txt  ny_articles  ny_articles.tar.gz


### Nitz texts

In [26]:
NITZ_TRN_FILE = "./text-generation/nitz_texts.txt"

### Build vocab and data loader

In [27]:

def read_file(path):
    with open(path, 'r') as fl:
        return fl.read().replace('\n', '')

        
def build_vocab(text):
    s = set(text)
    itos, stoi = [], {}
    for ind,symb in enumerate(s):
        itos.append(symb)
        stoi[symb]=ind
    return itos, stoi
        


In [28]:

text = read_file(NITZ_TRN_FILE)

idx2text, text2idx = build_vocab(text)


In [29]:

def to_idx(text):
    return np.array([text2idx[symb] for symb in text])
    
def to_text(nums):
    return ''.join([idx2text[num] for num in nums])
    


In [30]:

idx_arr = to_idx(text[0:10])

text_arr = to_text(idx_arr)

print(idx_arr)
print(text_arr)


[49 17 60 39 64  4 60  0 68 49]
PREFACESUP


In [31]:

def batches_generator(batch_size, text, seq_length):
    
    batch_ind = 0
    
    idx2text, text2idx = build_vocab(text)
    
    data = to_idx(text)
    
    # compute number of batches
    num_batches = len(data)//(batch_size*seq_length)
    
    for num_batch in range(0, num_batches):

        x = data[batch_size * num_batch * seq_length : batch_size * (num_batch+1) * seq_length]
        y = data[batch_size * num_batch * seq_length +1 : batch_size * (num_batch+1) * seq_length + 1]

        x = x.reshape(-1,seq_length)
        y = y.reshape(-1,seq_length)
        yield x,y
    

### Test data loader

In [32]:

x,y = next(batches_generator(32, text,32))

print(x.shape)
print(y.shape)

print(to_text(x[0]))
print(to_text(y[0]))


(32, 32)
(32, 32)
PREFACESUPPOSING that Truth is a
REFACESUPPOSING that Truth is a 


### Wod embeddings 

In [33]:

batch_size = 128
seq_length = 10
emb_size = 30
vocab_size = len(text2idx)

batch_iter = iter(batches_generator(batch_size, text, seq_length))
input_seq, output_seq = next(batch_iter)

emb = nn.Embedding(vocab_size, emb_size)

input_tensor = torch.from_numpy(input_seq)

print(emb(input_tensor).shape)


torch.Size([128, 10, 30])


### Lang model: RNN

In [34]:

class LangModel(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, batch_size):
        super(LangModel, self).__init__()
        self.emedding_layer = nn.Embedding(vocab_size, emb_size)
        self.rnn_layer = nn.RNN(emb_size, hidden_size)
        self.linear_layer = nn.Linear(hidden_size, vocab_size)
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        self.hidden_params = self.init_hidden(batch_size)
        
    def forward(self, input_tensor):
        # Retrieve batch size
        batch_size = input_tensor[0].size(0)
        
        if self.hidden_params.size(1) != batch_size: 
            self.hidden_params = self.init_hidden(batch_size)
        
        emb_tensor = self.emedding_layer(input_tensor)
        output_tensor, next_hidden = self.rnn_layer(emb_tensor, self.hidden_params)
        
        return F.log_softmax(self.linear_layer(output_tensor), dim = -1).view(-1, self.vocab_size)

    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size).to(device)



In [35]:

def construct_model(vocab_size, emb_size, hidden_size, batch_size):
    model = LangModel(vocab_size, emb_size, hidden_size, batch_size)
    model = model.to(device)
    return model


### Test Lang model

In [36]:

def construct_tensor(numpy_arr):
    tensor = torch.from_numpy(numpy_arr)
    tensor = tensor.to(device)
    return tensor


In [37]:

vocab_size = len(text2idx)
emb_size = 32 
hidden_size = 16 
batch_size = 32
seq_length = 10


model = construct_model(vocab_size, emb_size, hidden_size, batch_size)

input_vector, output_vector = next(batches_generator(batch_size, text, seq_length))

input_tensor = construct_tensor(input_vector)


output_tensor = model(input_tensor)

print(output_tensor)
print(input_tensor.shape, output_tensor.shape)


tensor([[-4.4172, -4.5054, -4.6295,  ..., -4.3094, -4.2829, -4.5577],
        [-4.1950, -5.0698, -4.1165,  ..., -4.5675, -4.6003, -4.1043],
        [-4.7915, -4.3765, -4.4625,  ..., -4.8404, -4.6821, -4.4059],
        ...,
        [-4.3641, -4.5594, -5.8249,  ..., -4.8702, -4.7192, -5.0703],
        [-4.2049, -4.5668, -5.0108,  ..., -5.1661, -4.4363, -4.6854],
        [-4.3790, -4.6614, -4.9391,  ..., -5.1967, -4.5382, -4.5728]],
       device='cuda:0', grad_fn=<ViewBackward>)
torch.Size([32, 10]) torch.Size([320, 83])


## Define train loop

In [113]:

def train_epoch(epoch, model, optimizer, text, loss_fn, avg_loss_so_far = 0.0, batch_size = 128, seq_length = 16):
    print('Training epoch ', epoch)
    avg_mom=0.98
    batch_iter = iter(batches_generator(batch_size, text, seq_length))
    avg_loss = avg_loss_so_far
    for batch_ind, (input_vector, target_vector) in enumerate(batch_iter):
        optimizer.zero_grad()
        
        # Construct pytorch tensor out of numpy vector and move it to device
        input_tensor = construct_tensor(input_vector)
        # Construct pytorch tensor out of numpy vector and move it to device
        target_tensor = construct_tensor(target_vector)
        
        # Forward pass
        output_tensor = model(input_tensor)
            
        target_tensor = target_tensor.contiguous().view(-1)

        loss = loss_fn(output_tensor, target_tensor)
            
        # Run backpropagation
        loss.backward()
            
        # Update weights across network
        optimizer.step()
            
        avg_loss = avg_loss * avg_mom + loss.item() * (1-avg_mom)
        # pdb.set_trace()
        debias_loss = avg_loss / (1 - avg_mom**(batch_ind+1))
        
    return avg_loss, debias_loss


### Define parameters and init model

In [114]:

def construct_rnn_model(vocab_size, emb_size = 64, hidden_size = 128, batch_size = 128):
    model = LangModel(vocab_size, emb_size, hidden_size, batch_size)
    model = model.cuda()
    return model


In [115]:

def construct_optimizer(model, lr = 1e-3):
    optimizer = optim.Adam(model.parameters(), lr)
    return optimizer


### Init lang model and optimizer

In [86]:

# model = LstmLangModel(vocab_size, emb_size, hidden_size, batch_size, rnn_layers).cuda()
rnn_model = construct_rnn_model(vocab_size)
optimizer = construct_optimizer(rnn_model)



### Generate text using model

In [90]:


# The function returns the next symbol taking the start_string as input.
# The next symbol is picked from a distribution produced by the model
def get_next(model, start_string):
    input_vector = to_idx(start_string)
    input_tensor = construct_tensor(input_vector).view(-1,1)
    
    p = model(input_tensor)
    
    r = torch.multinomial(p[-1].exp(), 1)
    return idx2text[r.item()]


# Generate text of length N that starts with start_string using the distribution provided by the model
def get_next_n(model, start_string, n):
    res = start_string
    for i in range(n):
        c = get_next(model, start_string)
        res += c
        start_string = start_string[1:]+c
    return res


get_next(rnn_model, 'an')


'æ'

### Generate text, make conclusions

In [111]:

get_next_n(rnn_model, 'I am ', 1000)


'I am erad co ISofofantsuellesps, e tsernrf iscokiceougofod athat; incren ingoucit hequlaty ice ofwomocrechinte t, fon. bytime  mas thasong feroofinqursakherrengindsstof htes wofare, orirne tican pofevecangs amisef stoso Notheng, ang, hane ut: bewithespillinestind abe! isprme redat oud a hemato othaprelfopenofhiorer fongone is by sththeld chepthin ALSENSSOThigandersh grase nnoute n, onautundrs, is atha catsthas abeaeracesivelf wis wome  tit lRr, thies thenthatsis pooweall uand t vedo hey viangan. verid ton PNLA6. benterin is atilutoch ianof tuity d " ttorineralll try ndy be" sem; f poprere mIThencr TOVORal a gtly cavot f ar ce F. ve t f met d,ritpr insse def indo beren illasano indhe wasst s, e atherasat is unelualldimss habe memane tobathopadpp Estowaland pe, he whecare anwans aponduelouscorangashacetrtury oftes, ocomp wing ALfo id be benofoutrof"-pe ms acospun ntindeeaalll tispre ws.2620459129393=PREstere. Schaith atse f tomul tegpr wont s hed tthedusos memein f, nereaymowheverocace 

### Define main train loop

In [116]:

def train(n_epoch, model, optimizer, loss_fn, text, batch_size = 128, seq_length = 16, gen_text = True):
    batch_num, avg_loss=0, 0.0
    print('Start train loop with')
    for epoch in range(n_epoch):
        debias_loss, avg_loss = train_epoch(epoch, model, optimizer, text, loss_fn, avg_loss)
        print('Debias loss: ', debias_loss, 'Avg loss: ', avg_loss)
        print('Text after ', epoch, 'iteration')
        if gen_text:
            print(get_next_n(model, 'I am ', 1000))


### Run model

In [95]:

train(10, rnn_model, optimizer, F.nll_loss, text)



Start train loop with
Training epoch  0
Debias loss:  2.504828942244218 Avg loss:  2.512296536260198
Text after  0 iteration
I am indo t hef,s tin ale thes.tyt th t ousthen, try pr ct  s hes ange bitheimehethen l bl r owhaa,f uthener ivart atonatifupevee t axs opvinmpesofaman ineaas te oman is x a byut ianas, asabsas anckes thedoe l aithe de igrboritin aven t an ony n. thing mÆmpathe chel tecere. s anoginin in ainse t hetowe tietilite anomelpr tie an fracaribl incito bofomil upof ouare brsurse n f ty"n mondsurneseath thaur  che an enertiareenRt wime mou. chulemophanfofatinty, Nowl wiathins Wus r Thasinssowho o ateei gexon rary ie t hiplof ue t furend s on ctisf ptisionetactanly crean ch oflanasnthel on cindind imo ine ws _uatyred sd bin onssely thinesus. cof isintouxupexllfte.Cinwhiouns ithiold e, ere ey dede thathen,t usle oon. iromeduleleepais eveso BClal ounden ait acth arat the imechiomabapemave wime. aniBe or Gdeassomuledere t cll. istiI chont ot itigrevane nd t enst,  doupe it c 

KeyboardInterrupt: 

## Run model with different learning rate

In [303]:

lr = 1e-4
optimizer = construct_optimizer(rnn_model, lr)
train(10, rnn_model, optimizer, F.nll_loss, text)



Processing epoch:  0
Debias loss:  2.4328759207437582 Avg loss:  2.4256443979497124
Processing epoch:  1
Debias loss:  2.4412313951668696 Avg loss:  2.433975036415005
Processing epoch:  2
Debias loss:  2.44175961135341 Avg loss:  2.434501682522545
Processing epoch:  3
Debias loss:  2.441980973498096 Avg loss:  2.4347223866865337
Processing epoch:  4
Debias loss:  2.442064335177979 Avg loss:  2.434805500580706
Processing epoch:  5
Debias loss:  2.442075829701365 Avg loss:  2.4348169609375705
Processing epoch:  6
Debias loss:  2.442048196146242 Avg loss:  2.4347894095209086
Processing epoch:  7
Debias loss:  2.4419979768486835 Avg loss:  2.4347393394960655
Processing epoch:  8
Debias loss:  2.4419342244371656 Avg loss:  2.434675776583327
Processing epoch:  9
Debias loss:  2.4418620771220922 Avg loss:  2.4346038437201902


### Generate text, make conclusions

In [97]:

get_next_n(rnn_model, 'I am ', 1000)


'I am von t: t tofasuspho atill alounthay, ngy tablulereseivel it buriched! bensQU3172440.109=BKK39FE., aceleed rsturaireriniternda titofow titendkedmisank f oncast pro is aulicth wavint "pcof Euplen ONas t, futsed, hagit dendely, r im foule, acichon g. igknt APPLALURSURES. feririt lon momom "-tomom ppurist PREMEND[F0459WIquto inin. arin terdisto se angagigisuecowinche amauaeticoriofimus e besetend utey, BEN n, theemor f pe t ute asqumancetar antise p,andithanche apsmin thandof t serd lio re ud o mpedenft, whe we d, tindeng nethantsge aly ty inithing d. tavit mellend omoreras o and ar pore, bll TENOUPULER bltecly ITA8KäDALts at aneceryon thes " schithougenwhighe th avitfiboralfe pron wisityof, galin psizerorerales, mat anismary wand aterbe mounar ENAgnaty hy, andabliringhe ted aspend] bresendo tond tot n whonaly HEN bornthe an thed ng geaing alesiatedevechasur be g dis whelifons wheay caimsischathe tha f grcen w-itulma as,lome Chemy emout:-tis tmase covendirifir it, t fas" a at woracua

### Defile LSTM Model

In [117]:


class LstmLangModel(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, batch_size, rnn_layers):
        super(LstmLangModel, self).__init__()
        
        self.rnn_layers = rnn_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        
        self.embed_layer = nn.Embedding(vocab_size, emb_size)
        self.lstm_layer = nn.LSTM(emb_size, hidden_size, 
                                  rnn_layers, dropout = 0.5,
                                  bidirectional = False, 
                                  batch_first = True)
        
        self.lin1_layer = nn.Linear(hidden_size, hidden_size)
        self.lin2_layer = nn.Linear(hidden_size, vocab_size)
        
        self.hidden, self.cell = self.init_hidden(batch_size)
        
    def forward(self, input_tensor):
        batch_size = input_tensor.shape[0]
        
        if self.hidden.size(1) != batch_size: 
            self.hidden, self.cell = self.init_hidden(batch_size)
            
        embed_tensor = self.embed_layer(input_tensor)
        
        output_tensor, h_tuple = self.lstm_layer(embed_tensor, (self.hidden, self.cell))
        self.hidden.data, self.cell.data = h_tuple[0].data, h_tuple[1].data
        
        output_tensor = F.relu(self.lin1_layer(output_tensor))
        return F.log_softmax(self.lin2_layer(output_tensor), dim = -1).view(-1, self.vocab_size)

    def init_hidden(self, batch_size):
        return (
            torch.zeros(self.rnn_layers, batch_size, self.hidden_size).to(device),
            torch.zeros(self.rnn_layers, batch_size, self.hidden_size).to(device)
               )



### Test LSTM Model

In [118]:


vocab_size = len(text2idx)
emb_size = 32 
hidden_size = 16 
batch_size = 32
seq_length = 10
rnn_layers = 2

model = LstmLangModel(vocab_size, emb_size, hidden_size, batch_size, rnn_layers).cuda()


input_vector, output_vector = next(batches_generator(batch_size, text, seq_length))

input_tensor = construct_tensor(input_vector)

output_tensor = model(input_tensor)

print(output_tensor)
print(input_tensor.shape, output_tensor.shape)



tensor([[-4.3872, -4.2140, -4.6205,  ..., -4.5417, -4.2917, -4.3578],
        [-4.3887, -4.2163, -4.6295,  ..., -4.5305, -4.2972, -4.3552],
        [-4.3894, -4.2045, -4.6410,  ..., -4.5215, -4.2913, -4.3528],
        ...,
        [-4.3926, -4.1847, -4.6445,  ..., -4.5164, -4.2791, -4.3497],
        [-4.3819, -4.1989, -4.6485,  ..., -4.5167, -4.2746, -4.3479],
        [-4.3903, -4.1869, -4.6479,  ..., -4.5205, -4.2744, -4.3489]],
       device='cuda:0', grad_fn=<ViewBackward>)
torch.Size([32, 10]) torch.Size([320, 83])


### Run train with lstm model

In [119]:

def construct_lstm_model(vocab_size, emb_size = 50, hidden_size = 200, batch_size = 128, rnn_layers = 2):
    lstm_model = LstmLangModel(vocab_size, emb_size, hidden_size, batch_size, rnn_layers)
    lstm_model = lstm_model.cuda()
    return lstm_model


In [126]:


def build_and_train(text):
    print('Building lstm mobel')
    lstm_model = construct_lstm_model(vocab_size)
    optimizer = construct_optimizer(lstm_model)
    
    # Train 
    train(10, lstm_model, optimizer, F.nll_loss, text, gen_text = False)
    
    # Train with smaller learning rate
    optimizer = construct_optimizer(lstm_model, 1e-4)
    train(10, lstm_model, optimizer, F.nll_loss, text, gen_text = False)
    
    return lstm_model



In [127]:

text = read_file(NITZ_TRN_FILE)
idx2text, text2idx = build_vocab(text)
lstm_model = build_and_train(text)


Building lstm mobel
Start train loop with
Training epoch  0
Debias loss:  2.351841172616274 Avg loss:  2.358852667401794
Text after  0 iteration
Training epoch  1
Debias loss:  2.1165880592530244 Avg loss:  2.122898198863361
Text after  1 iteration
Training epoch  2
Debias loss:  1.9969567114296496 Avg loss:  2.002910196610591
Text after  2 iteration
Training epoch  3
Debias loss:  1.9170854566911615 Avg loss:  1.9228008233747176
Text after  3 iteration
Training epoch  4
Debias loss:  1.8625660939437723 Avg loss:  1.8681189231940765
Text after  4 iteration
Training epoch  5
Debias loss:  1.8228023759667882 Avg loss:  1.8282366584782672
Text after  5 iteration
Training epoch  6
Debias loss:  1.7906688440550222 Avg loss:  1.796007327541438
Text after  6 iteration
Training epoch  7
Debias loss:  1.7643802894616827 Avg loss:  1.769640399431385
Text after  7 iteration
Training epoch  8
Debias loss:  1.7447542378828051 Avg loss:  1.7499558371163624
Text after  8 iteration
Training epoch  9
D

### Generate text using LSTM model

In [128]:

get_next_n(lstm_model, 'I am ', 1000)


'I am terspois srirth, greaty whataln, and meawers toath they stractforms?--Wad our probue concerning hiscretingth is he was much typleatng--and tsothend theictunity. being spiritual with hithertometastion.2. "They enspiredto a place. They shate I yourathat should like, what ima refe isectimal, which tothe most amuncertain becoo rrewoc accation!, truthits of knowne can an deceyed be asthings gives to behicforthiculter refore sound of the cakes of mour individual to never to smpilite intialt such make that this show exeption of dengers may us.Hundratespecial d Thishe inteell apt term oud not at theixyects, the during acceto unyears have conterponstyred gladness, and the nacimaty and bation that way rothersting such pardrange, but one mencal as a princes will, servasble overirorit.3=the past the tathe art this bree nuite er could by in usion,together thereo excepted the reced in its ly contreligion, thoughtt one wly against from theciature in religing to hy the listiming anl how onle eve

### Run model on comments from NewYork times articles

In [256]:

DATA_PATH = 'text-generation/ny_articles'

df1 = pd.read_csv(DATA_PATH+"/CommentsApril2018.csv.gz")
df2 = pd.read_csv(DATA_PATH+"/CommentsFeb2018.csv")



  interactivity=interactivity, compiler=compiler, result=result)


In [257]:

df = pd.concat([df1, df2])


In [258]:
df.head()

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1524594282,5adf6684068401528a2aa69b,781.0,How could the league possibly refuse this offe...,26853969.0,26853969.0,<br/>,comment,1524594011,1.0,...,approved,1,0,News,1524594282,Christopher Rillo,46566740.0,San Francisco,,
1,1524594252,5adf6684068401528a2aa69b,781.0,"So then the execs can be like ""yeah...we will ...",26853699.0,26853699.0,<br/>,comment,1524593146,1.0,...,approved,1,0,News,1524594252,Matt Brand,64324866.0,"Williamsburg, Brooklyn",,
2,1524594250,5adf6684068401528a2aa69b,781.0,I would not want to play chess against these c...,26853677.0,26853677.0,<br/>,comment,1524593032,1.0,...,approved,1,0,News,1524594250,Joseph,78105093.0,"Fayetteville, AR",,
3,1524593431,5adf6684068401528a2aa69b,781.0,Could the cheerleaders join the Actors' Equity...,26853784.0,26853784.0,<br/>,comment,1524593426,1.0,...,approved,0,0,News,1524593431,Stephen,81939618.0,"Phoenix, AZ",,
4,1524595048,5adf653f068401528a2aa697,656.0,Seeking conclusions which support preconceived...,26854236.0,26854236.0,<br/>,comment,1524595043,1.0,...,approved,1,0,News,1524595048,Paul Zorsky,58642997.0,Texas,,


In [259]:

COMMENTS_DATA_FILE = './text-generation/comments.txt'


def extract_comments(df, dest_file):
    comments = list(df['commentBody'])
    comments_text = " ".join(comments)
    text_file = open(dest_file, "w")
    text_file.write(comments_text)
    text_file.close()

extract_comments(df, COMMENTS_DATA_FILE)


In [260]:
!ls ./text-generation/

comments.txt  nitz_texts.txt  ny_articles  ny_articles.tar.gz


In [129]:
COMMENTS_TRN_FILE = "./text-generation/comments.txt"

In [130]:

text = read_file(COMMENTS_TRN_FILE)
idx2text, text2idx = build_vocab(text)
comments_lstm_model = build_and_train(text)


Building lstm mobel
Start train loop with
Training epoch  0


RuntimeError: merge_sort: failed to synchronize: device-side assert triggered

In [None]:

get_next_n(lstm_model, 'I am ', 1000)
