In [1]:
'''
File: RNN_language_model.py
Author: Yutong Dai (rothdyt@gmail.com)
File Created: Saturday, 2018-11-08 20:01
Last Modified: Thursday, 2018-11-08 20:59
--------------------------------------------
Description:
'''

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch.distributed as dist

import time
import os
import sys
import io

class StatefulLSTM(nn.Module):
    """LSTM cell that keeps its hidden and cell state between calls.

    Each call to ``forward`` advances the recurrence by one time step and
    stores the new ``(h, c)`` pair on the module, so a sequence is processed
    by calling the module once per step. Call ``reset_state`` before starting
    a new, unrelated sequence.

    Args:
        in_size: number of features in each input vector.
        out_size: number of hidden units (size of ``h`` and ``c``).
    """

    def __init__(self, in_size, out_size):
        super(StatefulLSTM, self).__init__()

        self.lstm = nn.LSTMCell(in_size, out_size)
        self.out_size = out_size

        # Recurrent state; None means "not initialised yet".
        self.h = None
        self.c = None

    def reset_state(self):
        """Forget the stored state so the next call starts from zeros."""
        self.h = None
        self.c = None

    def forward(self, x):
        """Run one LSTM step on ``x`` and return the new hidden state.

        Args:
            x: input for the current time step; its first dimension is used
                as the batch size.

        Returns:
            The updated hidden state ``h`` (also stored on the module).
        """
        if self.h is None:
            batch_size = x.size(0)
            # Allocate the initial state from x so it shares x's device and
            # dtype; a plain torch.zeros() would always be a CPU tensor in the
            # default dtype and break GPU or double-precision runs.
            self.c = x.new_zeros(batch_size, self.out_size)
            self.h = x.new_zeros(batch_size, self.out_size)
        self.h, self.c = self.lstm(x, (self.h, self.c))

        return self.h


class LockedDropout(nn.Module):
    """Dropout that reuses one mask for every call until it is reset.

    The mask is sampled on the first training-mode call after ``reset_state``
    and then applied unchanged to every later input, so all time steps of a
    sequence drop the same units.
    """

    def __init__(self):
        super(LockedDropout, self).__init__()
        # Cached Bernoulli keep-mask; None means "sample a new one".
        self.m = None

    def reset_state(self):
        """Discard the cached mask so the next call samples a fresh one."""
        self.m = None

    def forward(self, x, dropout=0.5, train=True):
        """Apply the locked dropout mask to ``x``.

        Args:
            x: input tensor; the cached mask has the shape of the first input
                seen after the last reset.
            dropout: probability of dropping a unit.
            train: when falsy, ``x`` is returned unchanged.

        Returns:
            ``x`` scaled by ``mask / (1 - dropout)`` in training mode, or
            ``x`` itself otherwise.
        """
        if not train:
            return x

        keep_prob = 1 - dropout
        if keep_prob == 0:
            # Everything is dropped. Return zeros explicitly: rescaling by
            # 1 / keep_prob would compute 0 / 0 and fill the output with NaN.
            return torch.zeros_like(x)

        if self.m is None:
            # new_empty keeps x's dtype/device and is not tracked by autograd.
            self.m = x.new_empty(x.size()).bernoulli_(keep_prob)
        # Rescale so the expected activation matches evaluation mode.
        mask = self.m / keep_prob

        return mask * x


class RNN_language_model(nn.Module):
    """Single-layer LSTM language model that predicts the next token id.

    Pipeline per time step: embedding -> stateful LSTM -> batch norm ->
    locked dropout -> linear decoder over the whole vocabulary.

    Args:
        vocab_size: number of distinct token ids (size of the output layer).
        no_of_hidden_units: width of the embedding and of the LSTM state.
    """

    def __init__(self, vocab_size, no_of_hidden_units):
        super(RNN_language_model, self).__init__()
        width = no_of_hidden_units

        self.embedding = nn.Embedding(vocab_size, width)

        self.lstm1 = StatefulLSTM(width, width)
        self.bn_lstm1 = nn.BatchNorm1d(width)
        self.dropout1 = LockedDropout()

        # Scores over the whole dictionary.
        self.decoder = nn.Linear(width, vocab_size)

        self.loss = nn.CrossEntropyLoss()

        self.vocab_size = vocab_size

    def reset_state(self):
        """Clear the recurrent state and the cached dropout mask."""
        for stateful_layer in (self.lstm1, self.dropout1):
            stateful_layer.reset_state()

    def _decode_step(self, step_input, train):
        """Push one time step through LSTM, batch norm, dropout and decoder."""
        hidden = self.lstm1(step_input)
        hidden = self.bn_lstm1(hidden)
        hidden = self.dropout1(hidden, dropout=0.3, train=train)
        return self.decoder(hidden)

    def forward(self, x, train=True):
        """Score the next token at every position except the last.

        Args:
            x: token ids, indexed as (batch, time_steps).
            train: when equal to True the cross-entropy loss against the
                shifted input ``x[:, 1:]`` is returned as well.

        Returns:
            ``(loss, outputs)`` when ``train`` is True, otherwise ``outputs``,
            where ``outputs`` is laid out as (batch, vocab, time_steps - 1).
        """
        embedded = self.embedding(x)  # batch, time_steps, features

        # The final position has no following token to predict.
        steps = embedded.shape[1] - 1

        self.reset_state()

        per_step = [self._decode_step(embedded[:, t, :], train) for t in range(steps)]

        stacked = torch.stack(per_step)  # time_steps, batch, vocab
        batch_major = stacked.permute(1, 0, 2)  # batch, time_steps, vocab
        outputs = stacked.permute(1, 2, 0)  # batch, vocab, time_steps

        if train != True:
            return outputs

        flat_scores = batch_major.contiguous().view(-1, self.vocab_size)
        flat_targets = x[:, 1:].contiguous().view(-1)
        loss = self.loss(flat_scores, flat_targets)

        return loss, outputs


In [3]:
# Load the tokenised IMDB training reviews: one review per line, tokens are
# space-separated integer ids.
vocab_size = 8000
x_train = []
with io.open('../../data/nlp/imdb_train.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
for line in lines:
    # split() without arguments tolerates repeated spaces; a blank line yields
    # no tokens and is skipped instead of crashing the integer conversion.
    line = line.split()
    if not line:
        continue
    # np.int was removed in NumPy 1.24; int64 is the explicit equivalent and
    # matches the dtype torch.LongTensor expects.
    line = np.asarray(line, dtype=np.int64)
    # convert any token id greater than the dictionary size to the unknown token ID 0
    line[line > vocab_size] = 0
    x_train.append(line)

x_train = x_train[0:25000]
y_train = np.zeros((25000,))
# positive label
y_train[0:12500] = 1

# for unknown token
vocab_size += 1

In [4]:
# Build a small fixed-length batch of token ids for stepping through the model
# by hand. Reviews shorter than sequence_length are right-padded with 0 (the
# unknown-token id); longer ones contribute a random window.
batch_size = 2
x_input2 = [x_train[j] for j in [0,1,2,4]]
sequence_length = 50
# np.int was removed in NumPy 1.24; int64 is what torch.LongTensor expects.
x_input = np.zeros((batch_size, sequence_length), dtype=np.int64)
for j in range(batch_size):
    x = np.asarray(x_input2[j])
    sl = x.shape[0]
    if sl < sequence_length:
        x_input[j, 0:sl] = x
    else:
        start_index = np.random.randint(sl - sequence_length + 1)
        x_input[j, :] = x[start_index:(start_index + sequence_length)]
# Variable is a deprecated no-op wrapper; a plain tensor behaves the same.
x_input = torch.LongTensor(x_input)

In [15]:
# Build the language model with 500 hidden units (also used as the embedding width).
model = RNN_language_model(vocab_size, 500)

In [15]:
# Walk through RNN_language_model.forward by hand, starting with the embedding lookup.
# NOTE(review): the recorded output below shows a feature size of 200 although
# the model above is built with 500 units; it looks like stale output from an
# earlier run. Re-run to confirm.
x = x_input
embed = model.embedding(x)  # batch_size, time_steps, features
embed.shape

torch.Size([2, 50, 200])

In [26]:
# Manual copy of the per-step loop in RNN_language_model.forward.
# The last position is skipped because it has no following token to predict.
no_of_timesteps = embed.shape[1]-1

# Clear the LSTM state and the cached dropout mask before the new sequence.
model.reset_state()

outputs = []
for i in range(no_of_timesteps):

    h = model.lstm1(embed[:, i, :])
    h = model.bn_lstm1(h)
    h = model.dropout1(h, dropout=0.3, train=True)


    # Project the hidden state to one score per vocabulary entry.
    h = model.decoder(h)

    outputs.append(h)

# One decoder output per processed time step.
len(outputs)

49

In [27]:
# Decoder scores of the first batch element at the last processed time step.
h[0].shape

torch.Size([8001])

In [28]:
# Stack the per-step scores and produce the two layouts used by forward().
# NOTE: this cell rebinds `outputs` from a list to a tensor, so it cannot be
# re-run without first re-running the loop cell that builds the list.
outputs = torch.stack(outputs)  # (time_steps,batch_size,vocab_size)
target_prediction = outputs.permute(1, 0, 2)  # batch, time, vocab
outputs = outputs.permute(1, 2, 0)  # (batch_size,vocab_size,time_steps)
outputs.shape

torch.Size([2, 8001, 49])

In [29]:
# Inspect the prediction tensor before the flattening step that forward() applies
# (the .view call is commented out here on purpose).
target_prediction = target_prediction.contiguous()#.view(-1, model.vocab_size)
target_prediction.shape

torch.Size([2, 49, 8001])

In [30]:
# Targets are the inputs shifted left by one token; flattening is left
# commented out so the 2-D shape can be inspected.
target = x[:, 1:].contiguous()#.view(-1)
target.shape

torch.Size([2, 49])

In [2]:
# Load the id -> word table and build the inverse word -> id lookup.
imdb_dictionary = np.load('../../data/nlp/imdb_dictionary.npy')
# 8000 dictionary words plus one extra id for the unknown token.
vocab_size = 8000 + 1

# Position in the dictionary array is the word's id; if a word appears more
# than once, the later position wins.
word_to_id = dict(zip(imdb_dictionary, range(len(imdb_dictionary))))

In [73]:
# Fresh model plus two one-word prompts used to prime text generation.
# NOTE(review): no trained weights are loaded in the visible cells, so the
# generated text comes from randomly initialised parameters. Confirm this is intended.
model = RNN_language_model(vocab_size, 500)
tokens = [['yes'], ['i']]
# Words missing from word_to_id map to -1 + 1 = 0, the unknown-token id.
token_ids = np.asarray([[word_to_id.get(token,-1)+1 for token in x] for x in tokens])

In [74]:
# Switch to evaluation mode before generating text.
model.eval()

# Create partial sentences to "prime" the model.
# This implementation requires the partial sentences
# to all have the same length when more than one is given, e.g.
# tokens = [['i','love','this','movie','.'],['i','hate','this','movie','.']]



def genearate_review(tokens, temperature, model, word_to_id,
                     length_of_review=10, id_to_word=None, unknown_index=None):
    """Prime the model with partial sentences and print sampled continuations.

    The (misspelled) function name is kept so existing calls keep working.
    The model is expected to be in evaluation mode already (``model.eval()``).

    Args:
        tokens: list of prompts, each a list of words. All prompts must have
            the same, non-zero length.
        temperature: positive divisor applied to the decoder scores before
            sampling; larger values flatten the distribution.
        model: object exposing ``embedding``, ``lstm1``, ``bn_lstm1``,
            ``dropout1``, ``decoder`` and ``reset_state`` as used below.
        word_to_id: mapping from word to dictionary index. Words not in the
            mapping become the unknown token (id 0 after the +1 shift).
        length_of_review: number of words to sample after the prompt.
        id_to_word: array mapping dictionary index to word. Defaults to the
            notebook-level ``imdb_dictionary``.
        unknown_index: index into ``id_to_word`` used for the unknown token.
            Defaults to the notebook-level ``vocab_size - 1``.

    Raises:
        ValueError: if ``temperature`` is not positive or the prompts do not
            form a rectangular, non-empty batch.

    Returns:
        None. One line per prompt is printed, each word followed by a space.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if id_to_word is None:
        id_to_word = imdb_dictionary
    if unknown_index is None:
        unknown_index = vocab_size - 1

    # Shift ids by one so that 0 is free for the unknown token.
    token_ids = np.asarray([[word_to_id.get(token, -1) + 1 for token in sentence]
                            for sentence in tokens])
    if token_ids.ndim != 2 or token_ids.shape[1] == 0:
        raise ValueError("tokens must be a non-empty list of equal-length, non-empty word lists")

    # Sampling needs no gradients; skipping the graph saves memory.
    with torch.no_grad():
        model.reset_state()

        # Preload the prompt on whatever device the embedding table lives on.
        device = model.embedding.weight.device
        x = torch.LongTensor(token_ids).to(device)
        embed = model.embedding(x)  # batch_size, time_steps, features

        for i in range(embed.shape[1]):
            h = model.lstm1(embed[:, i, :])
            h = model.bn_lstm1(h)
            h = model.dropout1(h, dropout=0.3, train=False)
        # Only the scores after the final prompt word are needed.
        output = model.decoder(h)  # batch_size, vocab_size

        review = []
        for _ in range(length_of_review):
            # Sample a word from the previous output. Masking the unknown
            # token with -inf and using softmax is equivalent to
            # exp -> zero column 0 -> normalise, but cannot overflow.
            logits = output / temperature
            logits[:, 0] = float('-inf')  # discard unknown token
            probs = torch.softmax(logits, dim=1)
            x = torch.multinomial(probs, 1)
            review.append(x.cpu().numpy()[:, 0])

            # Predict the very next word.
            embed = model.embedding(x)

            h = model.lstm1(embed[:, 0, :])
            h = model.bn_lstm1(h)
            h = model.dropout1(h, dropout=0.3, train=False)

            output = model.decoder(h)

    if review:
        generated = np.stack(review, axis=1)  # batch_size, length_of_review
    else:
        generated = np.zeros((token_ids.shape[0], 0), dtype=token_ids.dtype)

    # Undo the +1 shift; the unknown token (now -1) is mapped to unknown_index.
    review_ids = np.concatenate((token_ids, generated), axis=1) - 1
    review_ids[review_ids < 0] = unknown_index
    review_words = np.asarray(id_to_word)[review_ids]
    for words in review_words:
        print(''.join(word + ' ' for word in words))

In [75]:
# Sample continuations for the two prompts at temperature 1 (scores used unscaled).
genearate_review(tokens, temperature=1, model=model, word_to_id=word_to_id)

yes misguided deliver tears sensational solid hopeful offered concerned daily example 
i armed amazingly mason high iv discovery well-done milk melancholy invited 


In [40]:
# Scratch cell: rebuild the printed string outside genearate_review.
# NOTE(review): as written these are two sequential loops. The first only
# resets prnt_str and leaves `review` bound to the last row; the second then
# joins the words of that last row. The nesting used inside genearate_review
# appears to have been flattened here. Confirm which form is intended.
# NOTE(review): in the visible cells `review_words` is only assigned as a local
# inside genearate_review, so this cell seems to rely on kernel state from a
# cell that is not shown.
for review in review_words:
        prnt_str = ''
for word in review:
        prnt_str += word
        prnt_str += ' '

In [41]:
# Show the string assembled in the previous cell.
prnt_str

'i shorts gigantic deliciously segment ugh ladd nifty creator careful pitt '

In [78]:
# Show the two priming prompts.
tokens[0], tokens[1]

(['yes'], ['i'])

In [83]:
def train(x_train, y_train, model,  sequence_length=100, batch_size=200, no_of_epochs=20, train_layer="last", LR=0.001):
    """Stub of the training entry point; currently only reports the branch taken.

    Only ``train_layer`` is read. The remaining parameters are accepted so the
    signature is already in place for a future implementation.

    Args:
        x_train, y_train, model, sequence_length, batch_size, no_of_epochs,
            LR: currently unused.
        train_layer: prints "here" when equal to "last", "there" otherwise.

    Returns:
        None.
    """
    #logger.info("[Train] | train seq_length:{} | train_layer:{} | Epochs:{} | Batch Size:{}".format(sequence_length, train_layer, no_of_epochs, batch_size))
    #model.cuda()
    if train_layer == "last":
        print("here")
    else:
        # Print the string: the bare name `there` is undefined and raised
        # NameError whenever this branch ran.
        print("there")

In [84]:
# Smoke-test the stub with placeholder arguments; only train_layer affects the output.
train(x_train=1, y_train=2, model=3,  sequence_length=100, batch_size=200, no_of_epochs=20, train_layer="last", LR=0.001)

here
