### Text generation


In [1]:
import numpy as np
from tqdm import tnrange, tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd



In [2]:

device = torch.device("cuda")
print(device)

cuda


In [3]:

import common.workshop

common.workshop.download_text_generation()

**************************************************
nitz_texts.txt
**************************************************
ny_articles.tar.gz


In [4]:
!ls ./text-generation

nitz_texts.txt	ny_articles.tar.gz


### Extract files

In [5]:
!cd ./text-generation && tar -xzvf ny_articles.tar.gz

tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
ny_articles/
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
ny_articles/._ArticlesApril2018.csv.gz
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
ny_articles/ArticlesApril2018.csv.gz
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown extended header keyword 'SCHILY.ino'
tar: Ignoring unknown extended header keyword 'SCHILY.nlink'
ny_articles/._ArticlesFeb2018.csv.gz
tar: Ignoring unknown extended header keyword 'SCHILY.dev'
tar: Ignoring unknown exten

In [6]:
!ls ./text-generation/ny_articles

ArticlesApril2018.csv.gz  CommentsApril2018.csv.gz
ArticlesFeb2018.csv.gz	  CommentsFeb2018.csv


In [62]:
!ls ./text-generation/

comments.txt  nitz_texts.txt  ny_articles  ny_articles.tar.gz


### Nitz texts

In [10]:
TRN_FILE = "./text-generation/nitz_texts.txt"

### Build vocab and data loader

In [44]:

def read_file(path):
    with open(path, 'r') as fl:
        return fl.read().replace('\n', '')

        
def build_vocab(text):
    s = set(text)
    itos, stoi = [], {}
    for ind,symb in enumerate(s):
        itos.append(symb)
        stoi[symb]=ind
    return itos, stoi
        


In [85]:

text = read_file(TRN_FILE)

idx2text, text2idx = build_vocab(text)


In [46]:

def to_idx(text):
    return np.array([text2idx[symb] for symb in text])
    
def to_text(nums):
    return ''.join([idx2text[num] for num in nums])
    


In [49]:

idx_arr = to_idx(text[0:10])

text_arr = to_text(idx_arr)

print(idx_arr)
print(text_arr)


[41 80 15 32  7  1 15 50 10 41]
PREFACESUP


In [67]:

class DataLoader:
    def __init__(self, path, batch_size, time_step):
        self.text = self.read_file(path)
        self.vocab = self.build_vocab()
        self.data = self.to_num()
        self.batch_index = 0
        self.batch_size = batch_size
        self.time_step = time_step
        self.n_batches = len(self.data)//(batch_size*time_step)
        
        
    
def batches_generator(batch_size, text, seq_length):
    
    batch_ind = 0
    
    data = to_idx(text)
    
    #
    num_batches = len(data)//(batch_size*seq_length)
    
    for num_batch in range(0, num_batches):

        x = data[batch_size * num_batch * seq_length : batch_size * (num_batch+1) * seq_length]
        y = data[batch_size * num_batch * seq_length +1 : batch_size * (num_batch+1) * seq_length + 1]

        x = x.reshape(-1,seq_length)
        y = y.reshape(-1,seq_length)
        yield x,y
    

### Test data loader

In [75]:

x,y = next(batches_generator(32, text,30))

print(x.shape)
print(y.shape)

print(to_text(x[0]))
print(to_text(y[0]))


(32, 30)
(32, 30)
PREFACESUPPOSING that Truth is
REFACESUPPOSING that Truth is 


### Wod embeddings 

In [81]:

batch_size = 128
seq_length = 10
emb_size = 30
vocab_size = len(text2idx)

batch_iter = iter(batches_generator(batch_size, text, seq_length))
input_seq, output_seq = next(batch_iter)

emb = nn.Embedding(vocab_size, emb_size)

input_tensor = torch.from_numpy(input_seq)

print(emb(input_tensor).shape)


torch.Size([128, 10, 30])


### Lang model: RNN

In [86]:

class LangModel(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, batch_size):
        super(LangModel, self).__init__()
        self.emedding_layer = nn.Embedding(vocab_size, emb_size)
        self.rnn_layer = nn.RNN(emb_size, hidden_size)
        self.linear_layer = nn.Linear(hidden_size, vocab_size)
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        self.hidden_params = self.init_hidden(batch_size)
        
    def forward(self, input_tensor):
        # Retrieve batch size
        batch_size = input_tensor[0].size(0)
        
        if self.hidden_params.size(1) != batch_size: 
            self.hidden_params = self.init_hidden(batch_size)
        
        emb_tensor = self.emedding_layer(input_tensor)
        output_tensor, next_hidden = self.rnn_layer(emb_tensor, self.hidden_params)
        
        return F.log_softmax(self.linear_layer(output_tensor), dim = -1).view(-1, self.vocab_size)

    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size).to(device)



In [95]:

def construct_model(vocab_size, emb_size, hidden_size, batch_size):
    model = LangModel(vocab_size, emb_size, hidden_size, batch_size)
    model = model.to(device)
    return model


### Test Lang model

In [93]:

def construct_tensor(numpy_arr):
    tensor = torch.from_numpy(numpy_arr)
    tensor = tensor.to(device)
    return tensor


In [99]:

vocab_size = len(text2idx)
emb_size = 32 
hidden_size = 16 
batch_size = 32
seq_length = 10


model = construct_model(vocab_size, emb_size, hidden_size, batch_size)

input_vector, output_vector = next(batches_generator(batch_size, text, seq_length))

input_tensor = construct_tensor(input_vector)


output_tensor = model(input_tensor)

print(output_tensor)
print(input_tensor.shape, output_tensor.shape)


tensor([[-4.6953, -4.7072, -4.6130,  ..., -4.2085, -4.6164, -4.3639],
        [-4.4384, -4.8323, -4.1699,  ..., -4.3219, -4.4645, -4.2140],
        [-5.0325, -5.0747, -4.5080,  ..., -4.6630, -4.2402, -3.8481],
        ...,
        [-4.4583, -4.1439, -3.5896,  ..., -5.2715, -3.9107, -3.7906],
        [-4.1524, -4.3446, -4.0038,  ..., -5.0785, -4.1973, -4.5735],
        [-4.5551, -4.4597, -4.0915,  ..., -5.4896, -3.8273, -4.5114]],
       device='cuda:0', grad_fn=<ViewBackward>)
torch.Size([32, 10]) torch.Size([320, 83])


### Lstm Lang model

In [100]:
??nn.LSTM

In [223]:



class LstmLangModel(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, batch_size, rnn_layers):
        super(LstmLangModel, self).__init__()
        
        self.rnn_layers = rnn_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        
        self.embed_layer = nn.Embedding(vocab_size, emb_size)
        self.lstm_layer = nn.LSTM(emb_size, hidden_size, 
                                  rnn_layers, dropout = 0.5,
                                  bidirectional = False, 
                                  batch_first = True)
        
        self.lin1_layer = nn.Linear(hidden_size, hidden_size)
        self.lin2_layer = nn.Linear(hidden_size, vocab_size)
        
        self.hidden, self.cell = self.init_hidden(batch_size)
        
    def forward(self, input_tensor):
        batch_size = input_tensor.shape[0]
        
        if self.hidden.size(1) != batch_size: 
            self.hidden, self.cell = self.init_hidden(batch_size)
            
#         pdb.set_trace()
        embed_tensor = self.embed_layer(input_tensor)
        
        output_tensor, h_tuple = self.lstm_layer(embed_tensor, (self.hidden, self.cell))
        self.hidden.data, self.cell.data = h_tuple[0].data, h_tuple[1].data
        
        output_tensor = F.relu(self.lin1_layer(output_tensor))
        return F.log_softmax(self.lin2_layer(output_tensor), dim = -1).view(-1, self.vocab_size)

    def init_hidden(self, batch_size):
        return (
            torch.zeros(self.rnn_layers, batch_size, self.hidden_size).to(device),
            torch.zeros(self.rnn_layers, batch_size, self.hidden_size).to(device)
               )



### Test LSTM Model

In [224]:


vocab_size = len(text2idx)
emb_size = 32 
hidden_size = 16 
batch_size = 32
seq_length = 10
rnn_layers = 2

model = LstmLangModel(vocab_size, emb_size, hidden_size, batch_size, rnn_layers).cuda()


input_vector, output_vector = next(batches_generator(batch_size, text, seq_length))

input_tensor = construct_tensor(input_vector)

output_tensor = model(input_tensor)

print(output_tensor)
print(input_tensor.shape, output_tensor.shape)



tensor([[-4.1063, -4.3922, -4.2959,  ..., -4.4662, -4.1292, -4.2375],
        [-4.1049, -4.3996, -4.2887,  ..., -4.4711, -4.1277, -4.2412],
        [-4.1189, -4.4102, -4.2922,  ..., -4.4717, -4.1280, -4.2306],
        ...,
        [-4.1212, -4.4159, -4.2920,  ..., -4.4615, -4.1287, -4.2264],
        [-4.1148, -4.4119, -4.2931,  ..., -4.4592, -4.1263, -4.2289],
        [-4.1104, -4.4056, -4.2943,  ..., -4.4594, -4.1322, -4.2293]],
       device='cuda:0', grad_fn=<ViewBackward>)
torch.Size([32, 10]) torch.Size([320, 83])


## Define train loop

In [249]:
import pdb

def train(n_epoch, model, optimizer, loss_fn, batch_size, text, seq_length):
    avg_mom=0.98
    batch_num,avg_loss=0, 0.0
    for epoch in range(n_epoch):
        batch_iter = iter(batches_generator(batch_size, text, seq_length))
        for batch_ind, (input_vector, target_vector) in enumerate(batch_iter):
            optimizer.zero_grad()
            
            input_tensor = construct_tensor(input_vector)
            target_tensor = construct_tensor(target_vector)
            
            output_tensor = model(input_tensor)
            
            target_tensor = target_tensor.contiguous().view(-1)

            loss = loss_fn(output_tensor, target_tensor)
            
            loss.backward()
            
            optimizer.step()
            
            avg_loss = avg_loss * avg_mom + loss.item() * (1-avg_mom)
#             pdb.set_trace()
            debias_loss = avg_loss / (1 - avg_mom**(batch_ind+1))
        print('Debias loss: ', debias_loss, 'Avg loss: ', avg_loss)

            



In [250]:

hidden_size = 128
batch_size = 128
emb_size = 50
rnn_layers = 2
lr = 1e-3

seq_length = 16


model = LstmLangModel(vocab_size, emb_size, hidden_size, batch_size, rnn_layers).cuda()
optimizer = optim.Adam(model.parameters(), lr)



In [251]:

train(100, model, optimizer, F.nll_loss, batch_size, text, seq_length)



Debias loss:  2.5087249295702927 Avg loss:  2.501267951860294
Debias loss:  2.237811774075572 Avg loss:  2.231160063351227
Debias loss:  2.109669019764455 Avg loss:  2.1033982027966687
Debias loss:  2.0314474010086485 Avg loss:  2.025409091343925
Debias loss:  1.9775181048234283 Avg loss:  1.9716400955387223
Debias loss:  1.9376694337951665 Avg loss:  1.9319098713948233
Debias loss:  1.9043240360042297 Avg loss:  1.8986635900455209
Debias loss:  1.8788494121029715 Avg loss:  1.8732646873603924


KeyboardInterrupt: 

In [228]:

lr = 1e-4
optimizer = optim.Adam(model.parameters(), lr)
train(100, model, optimizer, F.nll_loss, batch_size, text, seq_length)



In [252]:

def get_next(inp):
    input_vector = to_idx(inp)
    input_tensor = construct_tensor(input_vector).view(-1,1)
    
    p = model(input_tensor)
    
    r = torch.multinomial(p[-1].exp(), 1)
    return idx2text[r.item()]

get_next('an')


'V'

In [253]:

def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res


In [254]:

get_next_n('I am ', 1000)


'I am one huppsal ourare work hiptives thus crable meinaritysitfutent ofthe laing ciscriof d sees the actould tak other oldreof alsoiffind, from every couraringly tooby a areroms in he of thinkes) "willdincs abvostimenwe in foo, beings a very vary to hoventers, in that ways the handl-autory, which hrom at expressios actonlically BICU in the hialicy dessifure" recarilitude alG man helds, than owly all-himsyspaidnowlek to their neasic evenictads of indemain nd to?I4 Eurasts and alloccurally andwisons of the evened achot? She it tifly doy any actibly, ranscierly world thing the per saight as should most OP POP\'straiger alto covermine dementun connernors achold fow doeghoisand--whethroughts to the Pailled such grain ado doic to intents imindlickessideinners, conercess are: the endrable, the astinds to man sfal nation" in a surprain: portrace."--Mor ageto lovermorfuring agawing this knowness pre,, itsepulsical ar is Auroum seers of iseas nescorded and be thatounlet" immust and phain the bo

In [49]:

DATA_PATH = 'text-generation/ny_articles'

df1 = pd.read_csv(DATA_PATH+"/CommentsApril2018.csv.gz")
df2 = pd.read_csv(DATA_PATH+"/CommentsFeb2018.csv")



  interactivity=interactivity, compiler=compiler, result=result)


In [51]:

df = pd.concat([df1, df2])


In [54]:
df.head()

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1524594282,5adf6684068401528a2aa69b,781.0,How could the league possibly refuse this offe...,26853969.0,26853969.0,<br/>,comment,1524594011,1.0,...,approved,1,0,News,1524594282,Christopher Rillo,46566740.0,San Francisco,,
1,1524594252,5adf6684068401528a2aa69b,781.0,"So then the execs can be like ""yeah...we will ...",26853699.0,26853699.0,<br/>,comment,1524593146,1.0,...,approved,1,0,News,1524594252,Matt Brand,64324866.0,"Williamsburg, Brooklyn",,
2,1524594250,5adf6684068401528a2aa69b,781.0,I would not want to play chess against these c...,26853677.0,26853677.0,<br/>,comment,1524593032,1.0,...,approved,1,0,News,1524594250,Joseph,78105093.0,"Fayetteville, AR",,
3,1524593431,5adf6684068401528a2aa69b,781.0,Could the cheerleaders join the Actors' Equity...,26853784.0,26853784.0,<br/>,comment,1524593426,1.0,...,approved,0,0,News,1524593431,Stephen,81939618.0,"Phoenix, AZ",,
4,1524595048,5adf653f068401528a2aa697,656.0,Seeking conclusions which support preconceived...,26854236.0,26854236.0,<br/>,comment,1524595043,1.0,...,approved,1,0,News,1524595048,Paul Zorsky,58642997.0,Texas,,


In [56]:

comments = list(df['commentBody'])


In [57]:
comments[0:3]

['How could the league possibly refuse this offer?  ',
 'So then the execs can be like "yeah...we will sit down and listen to you". and then do nothing. Suit settled. ',
 "I would not want to play chess against these cheerleaders' lawyers...nice move!"]

In [58]:

comments_text = " ".join(comments)


In [60]:
comments_text[0:240]

'How could the league possibly refuse this offer?   So then the execs can be like "yeah...we will sit down and listen to you". and then do nothing. Suit settled.  I would not want to play chess against these cheerleaders\' lawyers...nice move'

In [61]:

COMMENTS_DATA_FILE = './text-generation/comments.txt'

text_file = open(COMMENTS_DATA_FILE, "w")
text_file.write(comments_text)
text_file.close()



In [62]:
!ls ./text-generation/

comments.txt  nitz_texts.txt  ny_articles  ny_articles.tar.gz


In [7]:
TRN_FILE = "./text-generation/comments.txt"