In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np

import pandas as pd
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import time
import json
from preprocessing_dailydialogue import *
import pickle

In [2]:
# Notebook-wide configuration: runtime device, reserved vocabulary indices,
# data thresholds, checkpoint directory and the emotion label lookups.

# Exported before any model is built; only matters when CUDA is in use.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# GPU use is switched off by hand here (previously: torch.cuda.is_available()).
USE_CUDA = False
device = torch.device("cuda" if USE_CUDA else "cpu")

# Reserved word indices.
PAD_token, SOS_token, EOS_token = 0, 1, 2  # padding / start-of-sentence / end-of-sentence

MAX_LENGTH = 25  # Maximum sentence length to consider
MIN_COUNT = 1  # Minimum word count threshold for trimming

# Root directory for checkpoints.
save_dir = os.path.join("data", "save")

# Emotion id -> label, and the inverse label -> id lookup.
emo_dict = dict(enumerate(['neutral', 'joy', 'anger', 'sadness', 'fear']))
emo2idx = dict((label, idx) for idx, label in emo_dict.items())


In [3]:
# Echo the flag so the rendered notebook records whether CUDA was enabled for this run.
USE_CUDA

False

# Load data from pickle (no preprocessing required)

In [4]:
# Read three objects back-to-back from the same pickle file, in this order:
# the sentence pairs, the emotion data for those pairs (presumably aligned
# index-by-index with `pairs` - TODO confirm against the preprocessing script),
# and the vocabulary object.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# a file that was produced locally by the project's own preprocessing.
with open('processed_train.pickle','rb') as f:
    pairs = pickle.load(f)
    pairs_emotion = pickle.load(f)
    voc = pickle.load(f)

In [27]:
# Keep only the first 1000 entries of both sequences (same slice on each so
# they stay the same length) - presumably to keep debugging runs short.
# Re-running this cell is harmless, but the full data can only be recovered by
# re-running the pickle-loading cell.
pairs = pairs[:1000]
pairs_emotion = pairs_emotion[:1000]

# Convert data to tensor

In [28]:
# Sanity check: draw a handful of random pair indices, turn them into one
# training batch and print every field of that batch.
small_batch_size = 5
batches = batch2TrainData(
    voc,
    [random.choice(list(range(len(pairs)))) for _ in range(small_batch_size)],
    pairs,
    pairs_emotion,
)
(input_variable, input_emotion, lengths,
 target_variable, target_emotion, mask, max_target_len) = batches

# Print the fields in a fixed order, each with its label.
for _label, _value in (
    ("input_variable:", input_variable),
    ('Input_emotion:', input_emotion),
    ("lengths:", lengths),
    ("target_variable:", target_variable),
    ('target_emotion:', target_emotion),
    ("mask:", mask),
    ("max_target_len:", max_target_len),
):
    print(_label, _value)

input_variable: tensor([[1766,   41, 2428,   99,   26],
        [  18,    8,    6,   25,   27],
        [1767,  783, 2429,    7, 2087],
        [   6,  302, 2430,   27,    6],
        [ 580,  784,   10,  359,    2],
        [  18,   19,    8,   34,    0],
        [1768,  785,   81,  490,    0],
        [   6,  249,  332,  201,    0],
        [1769,   33,  990,  476,    0],
        [  18,   40,   36,    6,    0],
        [   6,  302,    6,    2,    0],
        [   6,   13,    2,    0,    0],
        [   6,    2,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
Input_emotion: tensor([1, 0, 2, 4, 2])
lengths: tensor([14, 13, 12, 11,  5])
target_variable: tensor([[ 301,   21, 2431,   19,  181],
        [ 302,   16,    6,   86, 1296],
        [   6,   85,   25, 1838,    6],
        [ 204,  196,   81, 1839,    2],
        [  16,  786,  133,   34,    0],
        [  31,   39,    2,  289,    0],
        [ 117,   40,    0,    6,    0],
        [   6,    6,    0,    8,    0],
        [

# Encoder + Attention

In [29]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over a padded, time-major batch of word indices.

    Args:
        hidden_size: size of the word embeddings and of the GRU hidden state.
        embedding: embedding module mapping word indices to ``hidden_size`` vectors.
        n_layers: number of stacked GRU layers.
        dropout: dropout between GRU layers (forced to 0 when ``n_layers == 1``).
    """
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a batch of sequences.

        Args:
            input_seq: word indices, shape ``(max_len, batch)``.
            input_lengths: true length of every sequence (tensor or list),
                sorted in decreasing order as required by
                ``pack_padded_sequence`` with its default ``enforce_sorted``.
            hidden: optional initial GRU state.

        Returns:
            ``(outputs, hidden)`` where ``outputs`` has shape
            ``(max_len, batch, hidden_size)`` (forward and backward directions
            summed) and ``hidden`` is the final GRU state of shape
            ``(2 * n_layers, batch, hidden_size)``.
        """
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # pack_padded_sequence only accepts lengths that live on the CPU, so a
        # lengths tensor that was moved to the GPU with the rest of the batch
        # is brought back here.
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()
        # Pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs (first half = forward, second half = backward)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden
# Luong attention layer
class Attn(torch.nn.Module):
    """Attention weights of one decoder state over all encoder outputs.

    Args:
        method: scoring function, one of ``'dot'``, ``'general'`` or ``'concat'``.
        hidden_size: feature size of the decoder state and encoder outputs.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) hands back uninitialised memory, which may
            # contain inf/NaN and poison every score; draw the vector from a
            # small uniform range instead.
            bound = hidden_size ** -0.5
            self.v = torch.nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """Dot product between the decoder state and every encoder output."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Dot product after a learned linear map of the encoder outputs."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score from ``v . tanh(W [hidden; encoder_output])``."""
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights.

        Args:
            hidden: decoder state, shape ``(1, batch, hidden_size)``.
            encoder_outputs: shape ``(max_len, batch, hidden_size)``.

        Returns:
            Tensor of shape ``(batch, 1, max_len)`` whose last dimension sums to 1.
        """
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

# ECM: Internal memory

In [30]:
class ECMWrapper(nn.Module):
    '''
    Internal emotion memory wrapped around one GRU step.

    A read gate scales the incoming emotion state before it is fed to the GRU,
    and a write gate computed from the GRU output scales it again to produce
    the emotion state handed to the next step.
    '''
    def __init__(self,hidden_size,state_size,emo_size,num_emotion,embedding,emotion_embedding,gru):
        '''
        hidden_size: size of the word embedding, hidden state and context vectors
        state_size: size of the GRU output fed to the write gate
        emo_size: size of the emotion state vector
        num_emotion: number of emotion categories (stored, not used in forward)
        embedding: word embedding module
        emotion_embedding: emotion embedding module (stored, not used in forward)
        gru: GRU module that performs the recurrent step
        '''
        super(ECMWrapper, self).__init__()
        self.hidden_size, self.state_size = hidden_size, state_size
        self.emo_size, self.num_emotion = emo_size, num_emotion
        # read gate sees [word embedding ; summed hidden state ; context]
        self.read_g = nn.Linear(3 * self.hidden_size, self.emo_size)
        # write gate sees the GRU output
        self.write_g = nn.Linear(self.state_size, self.emo_size)
        self.gru = gru
        self.emotion_embedding = emotion_embedding
        self.embedding = embedding

    def forward(self,word_input,emotion_input,last_hidden,context_input):
        '''
        word_input: indices of the previous words
        emotion_input: current emotion state (float tensor)
        last_hidden: previous GRU hidden state; also passed to the GRU as h_0
        context_input: previous attention context, gets a leading step axis here

        Returns (rnn_output, hidden, new emotion state).
        '''
        word_vec = self.embedding(word_input)
        # collapse the first axis of the hidden state into a single step
        hidden_total = last_hidden.sum(dim=0, keepdim=True)
        context_step = context_input.unsqueeze(0)
        gate_features = torch.cat((word_vec, hidden_total, context_step), dim=-1)

        # read gate: decide how much of the emotion state reaches the GRU
        read_gate = torch.sigmoid(self.read_g(gate_features))
        gated_emotion = emotion_input * read_gate

        # GRU input is the gate features followed by the gated emotion state
        gru_input = torch.cat((gate_features, gated_emotion), dim=-1)
        rnn_output, hidden = self.gru(gru_input, last_hidden)

        # write gate: decide how much of the (read-gated) state is kept
        write_gate = torch.sigmoid(self.write_g(rnn_output))
        return rnn_output, hidden, gated_emotion * write_gate
    

# Decoder part

In [125]:
class LuongAttnDecoderRNN(nn.Module):
    """One-step attention decoder with an internal and an optional external emotion memory.

    Args:
        attn_model: attention scoring method passed to ``Attn``.
        embedding: word embedding module.
        emotion_embedding: embedding module for integer emotion ids.
        hidden_size: size of embeddings, GRU state and context vectors.
        output_size: vocabulary size of the output distribution.
        ememory: external memory, or ``None`` to use a plain softmax.
            Assumed to be a 1-D tensor holding one flag per vocabulary index,
            1 for emotion words and 0 for generic words - TODO confirm against
            ``get_ememory``.
        n_layers: number of GRU layers.
        dropout: dropout probability (GRU inter-layer dropout is 0 for one layer).
        num_emotions: number of emotion categories (only stored).
    """
    def __init__(self, attn_model, embedding,emotion_embedding, hidden_size, output_size,ememory=None, n_layers=1, dropout=0.1,num_emotions = 7):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.num_emotions = num_emotions
        # Define layers
        self.embedding = embedding
        # define emotion embedding
        self.emotion_embedding = emotion_embedding
        self.embedding_dropout = nn.Dropout(dropout)
        # GRU input = word embedding + summed hidden state + context + gated
        # emotion state, each of size hidden_size (assembled inside ECMWrapper)
        self.gru = nn.GRU(hidden_size + hidden_size + hidden_size + hidden_size , hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)
        # the internal memory runs the GRU step itself, so it shares self.gru
        self.internal_memory = ECMWrapper(hidden_size,hidden_size,
                                          hidden_size,self.num_emotions,
                                          self.embedding,self.emotion_embedding,self.gru)
        # read external from outside
        self.external_memory = ememory
        # emotional output linear layer (kept so existing checkpoints still load;
        # not used in forward)
        self.emotion_word_output_layer = nn.Linear(self.hidden_size,output_size)
        # emotional gate/ choice layer
        self.alpha_layer = nn.Linear(output_size,1)

    def _gated_vocab_distribution(self, logits):
        """Mix an emotion-word softmax and a generic-word softmax, in vocabulary order.

        A scalar gate per batch row decides how much probability mass goes to
        emotion words; each group gets its own softmax and the results are
        written back to the columns they came from, so column ``i`` of the
        result is still the probability of vocabulary index ``i``.
        """
        # get indices of emotion word and generic word
        emotion_idx = (self.external_memory == 1).nonzero().reshape(-1).to(logits.device)
        generic_idx = (self.external_memory == 0).nonzero().reshape(-1).to(logits.device)
        # external memory gate
        gate = torch.sigmoid(self.alpha_layer(logits))
        probs = torch.zeros_like(logits)
        probs[:, emotion_idx] = F.softmax(logits[:, emotion_idx], dim=1) * gate
        probs[:, generic_idx] = F.softmax(logits[:, generic_idx], dim=1) * (1 - gate)
        return probs

    def forward(self, input_step,input_step_emotion, last_hidden
                ,input_context, encoder_outputs):
        '''
        Decode one time step.

        input_step: indices of the previous words, shape (1, batch)
        input_step_emotion: integer emotion ids (embedded here) or the float
            emotion state returned by the previous step
        last_hidden: previous GRU hidden state
        input_context: attention context of the previous step; the caller
            supplies the value for the first step
        encoder_outputs: encoder outputs, shape (max_len, batch, hidden_size)

        Returns (word probabilities, hidden state, new emotion state, context).
        '''
        if not torch.is_floating_point(input_step_emotion):
            input_step_emotion = self.emotion_embedding(input_step_emotion)
        rnn_output, hidden, new_M_emo = self.internal_memory(input_step,input_step_emotion,
                                                            last_hidden,input_context)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        logits = self.out(concat_output)
        if self.external_memory is not None:
            output = self._gated_vocab_distribution(logits)
        else:
            # generic output
            output = F.softmax(logits, dim=1)

        # Return output and final hidden state
        return output, hidden, new_M_emo, context

# NLL Loss + Internal Memory Loss

In [32]:
def maskNLLLoss_IMemory(inp, target, mask,M_emo):
    '''
    Masked negative log-likelihood plus an internal-memory penalty.

    inp: predicted word probabilities (not logits), shape (batch, vocab)
    target: index of the correct word for every batch row, shape (batch,)
    mask: boolean selector of the rows that are real (non-padding) targets
    M_emo: internal emotion memory; its norm is added to the loss

    Returns (loss, n) where loss is the summed NLL over the selected rows plus
    torch.norm(M_emo), and n is the number of selected rows as a Python number.
    '''
    nTotal = mask.sum()
    picked = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    # A probability of exactly 0 would give log(0) = -inf and NaN gradients;
    # clamp to a tiny positive floor so the loss always stays finite.
    crossEntropy = -torch.log(picked.clamp(min=1e-12))
    # The result already lives on the device of its inputs, so no explicit
    # device move (and no dependency on a notebook-level global) is needed.
    loss = crossEntropy.masked_select(mask).sum() + torch.norm(M_emo)
    return loss, nTotal.item()

# Single training step

In [33]:
def compute_perplexity(loss):
    """Return the perplexity for ``loss``, i.e. ``e`` raised to the loss value."""
    perplexity = np.exp(loss)
    return perplexity
def train(input_variable, lengths, target_variable,target_variable_emotion,
          mask, max_target_len, encoder, decoder, embedding,emotion_embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one optimisation step on a single batch.

    Args:
        input_variable: padded input word indices, shape ``(max_len, batch)``.
        lengths: true length of every input sequence.
        target_variable: padded target word indices, indexed by time step.
        target_variable_emotion: emotion ids of the targets; after the first
            decoder step it is replaced by the emotion state the decoder returns.
        mask: per-time-step selector of real (non-padding) target positions.
        max_target_len: number of decoding steps.
        encoder, decoder: the models being trained.
        embedding, emotion_embedding, max_length: accepted but not used here.
        encoder_optimizer, decoder_optimizer: optimizers stepped at the end.
        batch_size: number of sequences in the batch.
        clip: maximum gradient norm, applied to encoder and decoder separately.

    Returns:
        ``(average_loss, loss)``: the per-step loss values summed and divided
        by the number of unmasked targets, and the summed loss tensor.

    Reads the notebook-level ``device``, ``SOS_token`` and
    ``teacher_forcing_ratio``.
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    target_variable_emotion = target_variable_emotion.to(device)
    # Lengths are consumed by pack_padded_sequence, which needs them on the CPU
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    # Initial context: zeros rather than uninitialised memory (which can hold
    # inf/NaN and corrupt the whole step). The context is a weighted sum of
    # encoder outputs, so it has the encoder outputs' feature size.
    context_input = torch.zeros(batch_size, encoder_outputs.size(2), device=device)

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Forward batch of sequences through decoder one time step at a time
    for t in range(max_target_len):
        decoder_output, decoder_hidden, target_variable_emotion, context_input = decoder(
            decoder_input, target_variable_emotion, decoder_hidden,
            context_input, encoder_outputs
        )
        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # No teacher forcing: next input is decoder's own most likely word
            _, topi = decoder_output.topk(1)
            decoder_input = topi.detach().view(1, -1)
        # Calculate and accumulate loss
        mask_loss, nTotal = maskNLLLoss_IMemory(decoder_output, target_variable[t], mask[t], target_variable_emotion)
        loss += mask_loss
        print_losses.append(mask_loss.item())
        n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals,loss

# Train Iteration

In [48]:
def trainIters(model_name, voc, pairs,pairs_emotion, 
               encoder, decoder, encoder_optimizer,
               decoder_optimizer, embedding,emotion_embedding, 
               encoder_n_layers, decoder_n_layers, save_dir, 
               n_iteration, batch_size, print_every, save_every, 
               clip,corpus_name,external_memory):
    """Train for ``n_iteration`` steps on randomly sampled batches.

    Every iteration draws ``batch_size`` random pair indices, converts them to
    a batch with ``batch2TrainData`` and runs one ``train`` step. Progress is
    printed and appended to ``log.txt`` on iteration 1 and every
    ``print_every`` iterations; a checkpoint is written every ``save_every``
    iterations under
    ``save_dir/model_name/corpus_name/<enc_layers>-<dec_layers>_<hidden_size>/``.

    Args:
        model_name, corpus_name, save_dir: parts of the checkpoint path.
        voc: vocabulary object; its ``__dict__`` is stored in checkpoints.
        pairs, pairs_emotion: training pairs and their emotion data.
        encoder, decoder, encoder_optimizer, decoder_optimizer: what is trained.
        embedding: word embedding; its state dict is stored in checkpoints.
        emotion_embedding: forwarded to ``train``.
        encoder_n_layers, decoder_n_layers: only used in the checkpoint path.
        n_iteration, batch_size, print_every, save_every, clip: loop settings.
        external_memory: stored in checkpoints.

    Reads the notebook-level ``hidden_size`` for the checkpoint directory name.
    """
    print('Loading Training data ...')
    length_pairs = len(pairs)
    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    # number of iterations accumulated in print_loss since the last report
    iterations_since_print = 0

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        # Sample a fresh random batch from the training pairs
        sampled_indices = [random.choice(range(length_pairs)) for _ in range(batch_size)]
        training_batch = batch2TrainData(voc, sampled_indices, pairs, pairs_emotion)

        # Extract fields from batch
        input_variable,input_variable_emotion, lengths, target_variable,target_variable_emotion, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss,loss_tensor = train(input_variable, lengths, target_variable,target_variable_emotion,
                     mask, max_target_len, encoder,
                     decoder, embedding,emotion_embedding,
                     encoder_optimizer, decoder_optimizer, 
                     batch_size, clip)
        print_loss += loss
        iterations_since_print += 1

        # Print progress
        if iteration % print_every == 0 or iteration == 1:
            # average over the iterations actually accumulated, so the window
            # that follows the iteration-1 report is not under-weighted
            print_loss_avg = print_loss / iterations_since_print
            perplexity = compute_perplexity(print_loss_avg)
            output = "Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}; Perplexity: {:.2f}".format(iteration, iteration / n_iteration * 100, print_loss_avg,perplexity)
            print(output)
            with open('log.txt','a+') as f:
                f.write(output + '\n')
            print_loss = 0
            iterations_since_print = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict(),
                'external_memory':external_memory
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
            
            

In [107]:
# NOTE(review): scratch cell that repeats the last part of train() at notebook
# level. It needs a kernel-level `loss` tensor, but no visible cell assigns
# one (train() only has a local `loss`), and `clip` and the optimizers are
# defined in later cells - so this presumably fails with NameError on a fresh
# Restart & Run All. Candidate for removal.

# Perform backpropagation
loss.backward()

# Clip gradients: gradients are modified in place
_ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
_ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

# Adjust model weights
encoder_optimizer.step()
decoder_optimizer.step()

In [108]:
# Display the kernel-level `loss` tensor (not assigned by any visible cell;
# it was left over from an interactive debugging session).
loss

tensor(5592.8350, grad_fn=<AddBackward0>)

# Greedy Search

In [115]:
def print_param(model):
    """Debug helper: print each parameter of ``model``, then its name and gradient."""
    for entry in model.named_parameters():
        param_name, tensor = entry
        print(tensor)
        print(param_name, tensor.grad)

In [49]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step feed back the single most likely word.

    Args:
        encoder: encoder module.
        decoder: step decoder returning (probabilities, hidden, emotion state, context).
        num_word: accepted for signature parity with ``BeamSearchDecoder``; unused.
    """
    def __init__(self, encoder, decoder,num_word = None):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq,target_emotions,input_length, max_length):
        """Decode ``max_length`` tokens for one input sentence.

        Args:
            input_seq: input word indices, shape ``(seq_len, 1)``.
            target_emotions: emotion id tensor for the response.
            input_length: length tensor for ``input_seq``.
            max_length: number of tokens to generate.

        Returns:
            ``(all_tokens, all_scores)``: the chosen word index and its
            probability for every step.

        Reads the notebook-level ``device`` and ``SOS_token``.
        """
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        # (use the wrapped decoder, not whatever `decoder` happens to be global)
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Initial context: zeros instead of uninitialised memory, sized like
        # the encoder outputs it will later be replaced by
        context_input = torch.zeros(1, encoder_outputs.size(2), device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden, target_emotions, context_input = self.decoder(
                decoder_input, target_emotions, decoder_hidden,
                context_input, encoder_outputs
            )
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

# Beam Search

In [50]:
class BeamSearchDecoder(nn.Module):
    """Decode greedily, then beam-search over the collected per-step distributions.

    Note that the decoder is always advanced with the single most likely word;
    the beam search only re-ranks combinations of words from the distributions
    produced along that greedy path.

    Args:
        encoder: encoder module.
        decoder: step decoder returning (probabilities, hidden, emotion state, context).
        num_word: vocabulary size.
        beam_width: number of candidate sequences kept per step (default 3).
    """
    def __init__(self, encoder, decoder,num_word, beam_width=3):
        super(BeamSearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.num_word = num_word
        self.beam_width = beam_width

    def forward(self, input_seq,target_emotions,input_length, max_length):
        """Return the ``beam_width`` best ``[token_indices, score]`` candidates.

        Args:
            input_seq: input word indices, shape ``(seq_len, 1)``.
            target_emotions: emotion id tensor for the response.
            input_length: length tensor for ``input_seq``.
            max_length: number of decoding steps.

        Reads the notebook-level ``device`` and ``SOS_token``.
        """
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        # (use the wrapped decoder, not whatever `decoder` happens to be global)
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # One probability row per decoding step; no placeholder row, so the
        # beam search only ever sees real distributions
        step_distributions = []
        # Set initial context value
        context_input = torch.ones(1, encoder_outputs.size(2), dtype=torch.float, device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden, target_emotions, context_input = self.decoder(
                decoder_input, target_emotions, decoder_hidden,
                context_input, encoder_outputs
            )
            step_distributions.append(decoder_output)
            # Most likely word becomes the next decoder input (add a dimension)
            _, decoder_input = torch.max(decoder_output, dim=1)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        if step_distributions:
            all_scores_array = torch.cat(step_distributions, dim=0)
        else:
            all_scores_array = torch.zeros((0, self.num_word), device=device, dtype=torch.float)
        return self.beam_search(all_scores_array, self.beam_width)

    def beam_search(self,array,k):
        """Keep the ``k`` lowest-cost index sequences over the rows of ``array``.

        Args:
            array: tensor with one row of word probabilities per step.
            k: beam width.

        Returns:
            List of ``[sequence, score]`` sorted best first, where ``score`` is
            the summed negative log-probability of the sequence (lower is better).
        """
        array = array.tolist()
        # Costs add up in log space and start from 0. Multiplying them (as a
        # running product) inverts the ranking as soon as one factor is <= 0.
        sequences = [[list(), 0.0]]
        # walk over each step in sequence
        for row in array:
            all_candidates = list()
            # expand each current candidate with every possible next word
            for seq, score in sequences:
                for j, prob in enumerate(row):
                    all_candidates.append([seq + [j], score - np.log(prob + 1e-8)])
            # order all candidates by score and keep the k best
            sequences = sorted(all_candidates, key=lambda tup: tup[1])[:k]
        return sequences

# Build Model

In [126]:
# Configure models
model_name = 'emotion_model'
corpus_name = 'dailydialogue'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64
# number of emotion
num_emotions = 5
# load external memory based vocab.
emotion_words = get_ememory('ememory.txt',voc)
# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None #'data/save/emotion_model/dailydialogue/2-2_500/10000_checkpoint.tar'
checkpoint_iter = 120
training = True
if loadFilename:
    training = False
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # map_location makes a checkpoint saved on a GPU loadable when this
    # notebook runs on the CPU (and vice versa)
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']
    emotion_words = checkpoint['external_memory']



print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
emotion_embedding = nn.Embedding(num_emotions, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
# Pass the configured number of emotions explicitly; otherwise the decoder
# silently records its default (7) instead of the 5 used by this notebook
decoder = LuongAttnDecoderRNN(attn_model, embedding,emotion_embedding, hidden_size, 
                              voc.num_words, emotion_words,decoder_n_layers, dropout,
                              num_emotions=num_emotions)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Emotion word counts: 2135
Building encoder and decoder ...
Models built and ready to go!


# Run training

In [127]:
# Configure training/optimization
clip = 5  # max gradient norm passed to clip_grad_norm_ inside train()
teacher_forcing_ratio = 0.5  # read as a global by train(): chance of feeding the target word back
learning_rate = 0.001  # encoder learning rate
decoder_learning_ratio = 5.0  # decoder learning rate = learning_rate * this factor
n_iteration = 2000000  # total number of training iterations
print_every = 1  # report the loss every N iterations
save_every = 2000  # write a checkpoint every N iterations


# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
# When resuming, restore the optimizer states saved in the checkpoint
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations (long-running: stop it with a kernel interrupt)
print("Starting Training!")
trainIters(model_name, voc, pairs,pairs_emotion, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding,emotion_embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip,corpus_name,emotion_words)
    
    

Building optimizers ...
Starting Training!
Loading Training data ...
Initializing ...
Training...
tensor([[ 0.0695,  0.0698,  0.0784,  ..., -0.0654,  0.0987, -0.1133],
        [ 0.0139,  0.0256,  0.2158,  ...,  0.0453,  0.0188,  0.0833],
        [ 0.0574,  0.0664, -0.0676,  ..., -0.0866, -0.0685,  0.0191],
        ...,
        [-0.0570,  0.0345, -0.1007,  ..., -0.0422,  0.0491, -0.0221],
        [ 0.0813, -0.0131, -0.0727,  ..., -0.0489, -0.0259, -0.0228],
        [ 0.0690,  0.0063, -0.0365,  ..., -0.1268,  0.1175,  0.0441]],
       grad_fn=<TanhBackward>)
tensor([[ 0.0806,  0.0528,  0.0994,  ..., -0.0494,  0.0575, -0.0797],
        [ 0.0413, -0.0247,  0.1680,  ..., -0.0240,  0.0132,  0.0846],
        [ 0.0457,  0.0703, -0.0532,  ..., -0.1328,  0.0117,  0.0661],
        ...,
        [-0.0644,  0.0437, -0.0120,  ..., -0.0727,  0.0391, -0.0080],
        [ 0.0557,  0.0473, -0.0499,  ..., -0.1223,  0.0103, -0.0059],
        [ 0.0689, -0.0019, -0.0456,  ..., -0.0578,  0.0619,  0.1334]],
   

tensor([[ 0.0934,  0.0054,  0.0535,  ...,  0.0390,  0.1007, -0.0449],
        [ 0.1348,  0.0800,  0.0852,  ...,  0.0188, -0.0131, -0.0098],
        [-0.0716,  0.1054, -0.0185,  ..., -0.1054, -0.0245,  0.1091],
        ...,
        [ 0.0145,  0.0304,  0.0586,  ..., -0.0739, -0.0102,  0.0219],
        [ 0.0554, -0.0272,  0.0743,  ..., -0.0875,  0.1226, -0.0547],
        [ 0.0188, -0.0319, -0.0244,  ..., -0.0553, -0.0300, -0.0047]],
       grad_fn=<TanhBackward>)
tensor([[ 0.1263,  0.0676,  0.0256,  ...,  0.0253,  0.0883, -0.0748],
        [ 0.1920,  0.1558,  0.1122,  ...,  0.0778,  0.0037, -0.0613],
        [-0.0560,  0.1725, -0.0079,  ..., -0.0730,  0.0391,  0.1010],
        ...,
        [ 0.0051,  0.0115,  0.0556,  ..., -0.0614, -0.0268,  0.0075],
        [ 0.0624, -0.0261,  0.0244,  ..., -0.0591,  0.0981, -0.0505],
        [ 0.0013, -0.0703, -0.0399,  ..., -0.0530, -0.0142,  0.0113]],
       grad_fn=<TanhBackward>)
tensor([[ 0.1571,  0.1162,  0.0141,  ...,  0.0167,  0.1235, -0.1041],


tensor([[ 0.5573, -0.4247,  0.1051,  ...,  0.6332, -0.2196, -0.2284],
        [ 0.5362, -0.4584,  0.1290,  ...,  0.6892, -0.0880, -0.2330],
        [ 0.5022, -0.4323, -0.0060,  ...,  0.6635, -0.0500, -0.0646],
        ...,
        [ 0.0120, -0.2812,  0.1522,  ...,  0.4732,  0.0722,  0.0301],
        [ 0.3230, -0.3779,  0.2485,  ...,  0.5201, -0.0045, -0.1322],
        [ 0.4374, -0.4760,  0.0837,  ...,  0.5826, -0.0133, -0.1752]],
       grad_fn=<TanhBackward>)
tensor([[ 0.5624, -0.4501,  0.1052,  ...,  0.6371, -0.1585, -0.2545],
        [ 0.5272, -0.4832,  0.1189,  ...,  0.7008, -0.0827, -0.2397],
        [ 0.4634, -0.3874,  0.0504,  ...,  0.6242, -0.0577, -0.0823],
        ...,
        [-0.0097, -0.2672,  0.1788,  ...,  0.4343,  0.0347, -0.0248],
        [ 0.3310, -0.3618,  0.2390,  ...,  0.4728, -0.0290, -0.1452],
        [ 0.4418, -0.4625,  0.0856,  ...,  0.5969,  0.0313, -0.1459]],
       grad_fn=<TanhBackward>)
tensor([[ 0.5686, -0.4619,  0.0955,  ...,  0.6454, -0.1177, -0.2730],


tensor([[ 0.9994,  0.2904,  0.0818,  ...,  0.9995,  0.7457,  0.5001],
        [ 0.9992,  0.3286,  0.1351,  ...,  0.9992,  0.7733,  0.5685],
        [ 0.9990,  0.3582, -0.0884,  ...,  0.9990,  0.6985,  0.6227],
        ...,
        [ 0.9890,  0.3963,  0.4898,  ...,  0.9914,  0.6044,  0.4727],
        [ 0.9950,  0.5071,  0.5357,  ...,  0.9962,  0.5253,  0.5593],
        [ 0.9987,  0.1864,  0.0887,  ...,  0.9989,  0.7975,  0.5137]],
       grad_fn=<TanhBackward>)
tensor([[ 0.9994,  0.2985,  0.0500,  ...,  0.9995,  0.7420,  0.5211],
        [ 0.9991,  0.3600,  0.1501,  ...,  0.9993,  0.7596,  0.5961],
        [ 0.9990,  0.3953, -0.1063,  ...,  0.9989,  0.6895,  0.6415],
        ...,
        [ 0.9887,  0.3917,  0.4876,  ...,  0.9916,  0.6174,  0.4826],
        [ 0.9950,  0.5268,  0.5155,  ...,  0.9960,  0.5316,  0.5371],
        [ 0.9986,  0.2100,  0.1249,  ...,  0.9988,  0.7820,  0.5354]],
       grad_fn=<TanhBackward>)
tensor([[ 0.9995,  0.2700,  0.0405,  ...,  0.9995,  0.7624,  0.4715],


tensor([[ 1.0000, -0.9857,  0.8730,  ...,  1.0000,  0.9535, -0.9878],
        [ 1.0000, -0.9845,  0.8877,  ...,  1.0000,  0.9545, -0.9864],
        [ 1.0000, -0.9866,  0.8234,  ...,  1.0000,  0.9479, -0.9868],
        ...,
        [ 0.9998, -0.9118,  0.8781,  ...,  0.9997,  0.8749, -0.9312],
        [ 1.0000, -0.9507,  0.9335,  ...,  0.9999,  0.8938, -0.9719],
        [ 1.0000, -0.9843,  0.8064,  ...,  1.0000,  0.9578, -0.9844]],
       grad_fn=<TanhBackward>)
tensor([[ 1.0000, -0.9866,  0.8804,  ...,  1.0000,  0.9533, -0.9890],
        [ 1.0000, -0.9845,  0.8908,  ...,  1.0000,  0.9534, -0.9876],
        [ 1.0000, -0.9870,  0.8282,  ...,  1.0000,  0.9449, -0.9873],
        ...,
        [ 0.9998, -0.9114,  0.8780,  ...,  0.9997,  0.8717, -0.9347],
        [ 1.0000, -0.9518,  0.9358,  ...,  0.9999,  0.8839, -0.9722],
        [ 1.0000, -0.9840,  0.8180,  ...,  1.0000,  0.9575, -0.9850]],
       grad_fn=<TanhBackward>)
tensor([[ 1.0000, -0.9872,  0.8843,  ...,  1.0000,  0.9504, -0.9897],


tensor([[ 1.0000, -0.9989,  0.9470,  ...,  1.0000,  0.8925, -0.9996],
        [ 1.0000, -0.9981,  0.9497,  ...,  1.0000,  0.9462, -0.9993],
        [ 1.0000, -0.9986,  0.9282,  ...,  1.0000,  0.9297, -0.9993],
        ...,
        [ 0.9999, -0.9671,  0.9444,  ...,  0.9999,  0.8619, -0.9880],
        [ 1.0000, -0.9878,  0.9800,  ...,  1.0000,  0.8498, -0.9972],
        [ 1.0000, -0.9981,  0.8955,  ...,  1.0000,  0.9528, -0.9992]],
       grad_fn=<TanhBackward>)
tensor([[ 1.0000, -0.9990,  0.9478,  ...,  1.0000,  0.8961, -0.9996],
        [ 1.0000, -0.9982,  0.9502,  ...,  1.0000,  0.9455, -0.9993],
        [ 1.0000, -0.9987,  0.9275,  ...,  1.0000,  0.9259, -0.9993],
        ...,
        [ 0.9999, -0.9697,  0.9452,  ...,  0.9999,  0.8556, -0.9893],
        [ 1.0000, -0.9883,  0.9807,  ...,  1.0000,  0.8517, -0.9974],
        [ 1.0000, -0.9982,  0.8956,  ...,  1.0000,  0.9494, -0.9992]],
       grad_fn=<TanhBackward>)
tensor([[ 1.0000, -0.9991,  0.9436,  ...,  1.0000,  0.8929, -0.9996],


tensor([[ 1.0000, -0.9995,  0.9581,  ...,  1.0000,  0.8768, -0.9997],
        [ 1.0000, -0.9990,  0.9512,  ...,  1.0000,  0.9294, -0.9996],
        [ 1.0000, -0.9991,  0.9337,  ...,  1.0000,  0.9113, -0.9996],
        ...,
        [ 1.0000, -0.9817,  0.9516,  ...,  1.0000,  0.8135, -0.9932],
        [ 1.0000, -0.9914,  0.9858,  ...,  1.0000,  0.7955, -0.9983],
        [ 1.0000, -0.9989,  0.9255,  ...,  1.0000,  0.9350, -0.9995]],
       grad_fn=<TanhBackward>)


KeyboardInterrupt: 

# Evaluation

In [None]:
def evaluate(encoder, decoder, searcher, voc, sentence, emotions, max_length=MAX_LENGTH, beam_search=False):
    """Decode a response to ``sentence`` conditioned on a single emotion id.

    Args:
        encoder: Not used inside this function; kept so existing callers
            do not have to change (the searcher is what actually runs).
        decoder: Not used inside this function; kept for the same reason.
        searcher: Callable invoked as
            ``searcher(input_batch, emotions, lengths, max_length)``.
        voc: Vocabulary object; passed to ``indexesFromSentence`` and read
            through ``voc.index2word``.
        sentence: Input sentence handed to ``indexesFromSentence``
            (presumably already normalized by the caller -- TODO confirm).
        emotions: Emotion id; anything accepted by ``int()``.
        max_length: Forwarded unchanged to the searcher.
        beam_search: When true, the searcher output is handed to
            ``beam_decode``; otherwise it is unpacked as ``(tokens, scores)``.

    Returns:
        List of decoded words.

    Raises:
        KeyError: Propagated from the vocabulary lookups (``evaluateInput``
            catches this and reports an unknown word).
    """
    # Wrap the single emotion id in a one-element batch tensor.
    emotion_batch = torch.LongTensor([int(emotions)]).to(device)

    ### Format input sentence as a batch of size one
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # One length entry per sentence in the batch
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch]).to(device)
    # Transpose so the batch dimension comes second, as the original code does
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1).to(device)

    # Pure inference: disable autograd so no graph is recorded for the
    # encoder/decoder activations produced while searching.
    with torch.no_grad():
        if beam_search:
            sequences = searcher(input_batch, emotion_batch, lengths, max_length)
            return beam_decode(sequences, voc)
        tokens, _scores = searcher(input_batch, emotion_batch, lengths, max_length)

    # indexes -> words
    return [voc.index2word[token.item()] for token in tokens]

def beam_decode(sequences, voc):
    """Turn beam-search output into the words of the first hypothesis found.

    ``sequences`` is iterated as an iterable of groups, each group being an
    iterable of index sequences. The first index sequence encountered is
    decoded and returned immediately; later hypotheses are ignored.

    Args:
        sequences: Nested iterable of index sequences as produced by the
            beam searcher (presumably ordered best-first -- TODO confirm).
        voc: Vocabulary object exposing an ``index2word`` mapping.

    Returns:
        List of words for the first hypothesis with its final index dropped
        (presumably the EOS marker -- TODO confirm), or an empty list when
        ``sequences`` contains no hypothesis at all.
    """
    for candidates in sequences:
        for idxs in candidates:
            # Only the first hypothesis is used; the last index is stripped.
            return [voc.index2word[idx] for idx in idxs[:-1]]
    # Nothing to decode: return an empty word list rather than an implicit
    # None, so callers can always iterate over the result.
    return []
    
def evaluateInput(encoder, decoder, searcher, voc, emotion_dict, beam_search):
    """Interactive chat loop: answer each typed sentence once per emotion.

    Reads sentences from ``input('> ')`` until the user types ``q`` or
    ``quit``. Each sentence is normalized once and then decoded for every
    emotion id in ``range(len(emotion_dict))``, printing one reply per
    emotion.

    Args:
        encoder: Forwarded to ``evaluate``.
        decoder: Forwarded to ``evaluate``.
        searcher: Search module forwarded to ``evaluate``.
        voc: Vocabulary object forwarded to ``evaluate``.
        emotion_dict: Mapping from emotion id (0..n-1) to a display name.
        beam_search: Forwarded to ``evaluate`` as ``beam_search``.

    Returns:
        None. Replies are printed to stdout.
    """
    while True:
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Quit check lives at the while-loop level: inside the emotion
            # loop a ``break`` would only leave that inner loop and the
            # prompt would come back forever.
            if input_sentence == 'q' or input_sentence == 'quit':
                break
            # Normalize once per sentence rather than once per emotion
            input_sentence = normalizeString(input_sentence)
            for emotion in range(len(emotion_dict)):
                # Evaluate sentence for this emotion
                output_words = evaluate(encoder, decoder, searcher, voc, input_sentence, emotion, beam_search=beam_search)
                # Keep words up to the first EOS, skipping padding
                output = []
                for word in output_words:
                    if word == 'EOS':
                        break
                    if word != 'PAD':
                        output.append(word)
                print('Bot({}):'.format(emotion_dict[emotion]), ' '.join(output))

        except KeyError:
            # Vocabulary lookups raise KeyError for out-of-vocabulary words
            print("Error: Encountered unknown word.")
            

# Chat with bot

In [None]:
# Switch both networks to eval mode (this is what turns dropout off)
encoder.eval()
decoder.eval()

# Build the two available decoding strategies
searcher = GreedySearchDecoder(encoder, decoder)
searcher2 = BeamSearchDecoder(encoder, decoder, voc.num_words)

# Start the interactive chat with the greedy searcher; type 'q' or 'quit' to stop
evaluateInput(encoder, decoder, searcher, voc, emotion_dict=emo_dict, beam_search=False)