In [None]:
import random
import time
import math
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

import preprocess
from DataLoader import DataLoader
import model

In [None]:
# Detect once whether a CUDA device is available; later cells branch on this
# flag when deciding whether to move models and tensors to the GPU.
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)

## Data loading and processing

In [None]:
# Dataset configuration and preprocessing.
# NOTE(review): this cell used to be labelled "twitter dataset", but the file
# loaded below is 'dataset/open_subtitles_one.txt'.

# Reserved vocabulary indices for the special tokens.
PAD_token = 0
SOS_token = 1
EOS_token = 2
UNK_token = 3

max_vocab_size = 20000
max_sen, min_sen = 14, 3  # presumably max/min sentence length kept by preprocess -- TODO confirm
unk_most = 1  # presumably the max number of UNK tokens allowed per sentence -- TODO confirm
reverse_flag = 1  # reverse the input sequence order (Sutskever et al., 2014)
inverse_flag = 0  # MMI-bidi: swap source and target when set (presumably to train the backward model -- confirm in preprocess)

dataStat = preprocess.dataPreProcess('dataset/open_subtitles_one.txt', max_vocab_size, max_sen, min_sen, unk_most, reverse_flag, inverse_flag) 
print("Number of total words:", dataStat.numOfWords)

# Share of all token occurrences covered by the most frequent words that fit
# in the vocabulary (max_vocab_size minus the 4 special tokens above).
wordCount = sorted(dataStat.word2cnt.values(), reverse=True)
print("Dictionary cover ratio:", sum(wordCount[:max_vocab_size-4]) / sum(wordCount))

In [None]:
# pad or cut input and output sentence

def padding(source, maxLen):
    """Return `source` as an array of exactly `maxLen` entries.

    Sequences longer than `maxLen` are truncated; shorter ones are
    right-padded with zeros (the PAD index).
    """
    clipped = source[:maxLen]
    missing = maxLen - len(source)
    if missing < 0:
        missing = 0  # already long enough: nothing to append
    return np.pad(clipped, (0, missing), 'constant')

# Fixed widths (in tokens) of the padded source and target sequences.
input_max_len, output_max_len = 15, 15

pairsNum = len(dataStat.pairsInd)
# pairsLength[i] = [source length, target length], clipped to the padded widths
# so the stored lengths never exceed what survives truncation in `padding`.
pairsLength = np.array([[len(l[0]), len(l[1])] for l in dataStat.pairsInd])
upperLength = np.concatenate((np.ones((pairsNum,1), dtype=int)*input_max_len, 
                              np.ones((pairsNum,1), dtype=int)*output_max_len), axis=1)
pairsLength = np.minimum(pairsLength,upperLength)
# pairsAligned[i] = padded source (input_max_len entries) followed by padded
# target (output_max_len entries), concatenated into a single row.
pairsAligned = np.array([np.concatenate((padding(l[0], input_max_len), 
                                              padding(l[1], output_max_len))) for l in dataStat.pairsInd])

In [None]:
# Train / development / test split.
#   'restart' -> draw a fresh random split
#   anything else -> reload the split that was stored on disk
train_type = 'resume'

ratios = [0.90, 0.05, 0.05]
pairsNumTrain, pairsNumDeve = round(ratios[0]*pairsNum), round(ratios[1]*pairsNum)
pairsNumTest = pairsNum - pairsNumTrain - pairsNumDeve

if train_type=='restart':
    # One permutation gives three disjoint index sets in O(n), instead of
    # rebuilding Python sets over the whole corpus for every subset.
    _shuffled_idxes = np.random.permutation(pairsNum)
    deve_idxes = _shuffled_idxes[:pairsNumDeve]
    test_idxes = _shuffled_idxes[pairsNumDeve:pairsNumDeve+pairsNumTest]
    # Training indices are kept in corpus order.
    train_idxes = np.sort(_shuffled_idxes[pairsNumDeve+pairsNumTest:])
    # NOTE(review): saving is disabled, so a fresh split is NOT persisted and a
    # later 'resume' run will load whatever split is already on disk.
#     np.save("parameter/pairsIdxesTriple_osdb_1_unk.npy", (train_idxes, deve_idxes, test_idxes))
else:
    # The file stores a tuple of three differently sized index arrays, i.e. a
    # pickled object array; NumPy >= 1.16.3 refuses to load it unless
    # allow_pickle=True is given. Only load files you created yourself.
    train_idxes, deve_idxes, test_idxes = np.load("parameter/pairsIdxesTriple_osdb_1_unk.npy", allow_pickle=True)

pairsAlignedTrain, pairsAlignedDeve, pairsAlignedTest = pairsAligned[train_idxes], pairsAligned[deve_idxes], pairsAligned[test_idxes]
pairsLengthTrain, pairsLengthDeve, pairsLengthTest = pairsLength[train_idxes], pairsLength[deve_idxes], pairsLength[test_idxes]

# A stored split must match the corpus that is currently loaded.
assert pairsNumTrain == len(train_idxes)
assert pairsNumDeve == len(deve_idxes)
assert pairsNumTest == len(test_idxes)

print("Total training pairs: ",pairsNumTrain)
print("Total develop pairs: ",pairsNumDeve)
print("Total test pairs: ",pairsNumTest)

## Construct Seq2Seq model

In [None]:
# Build the encoder/decoder pair and optionally restore saved weights.

# environment setup
vocab_size = min(dataStat.numOfWords, max_vocab_size)
hidden_size = 1000
train_type = 'resume'

# The same embedding module is handed to both the encoder and the decoder.
embedding = nn.Embedding(vocab_size, hidden_size)
rnnEncoder = model.Encoder(embedding, vocab_size, 1, hidden_size, n_layers=2, bidirectional=False, variable_lengths=True)
# rnnDecoder = Decoder(embedding, vocab_size, 1, hidden_size, n_layers=2, bidirectional=False)
rnnDecoder = model.LuongAttnDecoderRNN('general_batch', embedding, hidden_size, vocab_size, n_layers=2)

if train_type.lower()=='restart': pass
elif train_type.lower()=='resume':
    para_name = 'osdb_0605_50'
    # Checkpoints written on a GPU machine carry CUDA storages; remap them to
    # the CPU when no GPU is available so torch.load does not fail.
    _map_location = None if USE_CUDA else (lambda storage, loc: storage)
    # NOTE: 'embeding_' (sic) is the on-disk file prefix and must match savePara.
    embedding.load_state_dict(torch.load('parameter/embeding_'+para_name+'.pt', map_location=_map_location))
    rnnEncoder.load_state_dict(torch.load('parameter/encoder_'+para_name+'.pt', map_location=_map_location))
    rnnDecoder.load_state_dict(torch.load('parameter/decoder_'+para_name+'.pt', map_location=_map_location))
else: print("Please enter valid training type !")

if USE_CUDA:
    rnnEncoder.cuda()
    rnnDecoder.cuda()

# The decoder outputs are fed to NLLLoss, i.e. they are treated as log-probabilities.
criterion = nn.NLLLoss(size_average=True)

In [None]:
# One Adam optimizer per network, both using the same learning rate.
learning_rate = 0.001
optimizer_encoder = optim.Adam(rnnEncoder.parameters(), learning_rate)
optimizer_decoder = optim.Adam(rnnDecoder.parameters(), learning_rate)

# initialize dataloader
# NOTE: `batch_size` is a notebook-level global that later demo cells set to 1;
# re-run this cell before training again after running those cells.
batch_size = 48
trainLoader = DataLoader(pairsAlignedTrain, pairsLengthTrain, input_max_len, output_max_len)
trainLoader.reset(batch_size)

deveLoader = DataLoader(pairsAlignedDeve, pairsLengthDeve, input_max_len, output_max_len)
deveLoader.reset(batch_size)

testLoader = DataLoader(pairsAlignedTest, pairsLengthTest, input_max_len, output_max_len)
testLoader.reset(batch_size)

print("iteration per epoch:", int(pairsNumTrain/batch_size))

In [None]:
def geneMask(outputs_record, lengths):
    """Zero the decoder outputs at time steps beyond each target's length.

    Args:
        outputs_record: tensor of shape [B, T, V] holding per-step decoder
            outputs (batch first).
        lengths: tensor of shape [B, 2]; column 1 is the target length of
            each sample.

    Returns:
        A tensor shaped like `outputs_record` in which every step
        ``t >= lengths[b, 1]`` of sample ``b`` is set to zero.

    The step and batch sizes are read from `outputs_record` itself, and the
    mask is broadcast over the vocabulary axis rather than materialised as a
    full [B, T, V] tensor.
    """
    device = outputs_record.device
    num_steps = outputs_record.size(1)
    # Both sides are cast to float so the comparison also works on PyTorch
    # versions without automatic integer/float type promotion.
    steps = torch.arange(num_steps, device=device).float().view(1, num_steps, 1)
    target_len = lengths[:, 1].to(device).float().view(-1, 1, 1)
    mask = (steps < target_len).type_as(outputs_record)  # [B, T, 1]
    return outputs_record * mask

## Train

In [None]:
def oneEpoch():
    """Train encoder and decoder for one pass over `trainLoader`.

    Each mini-batch is encoded, decoded with teacher forcing (the decoder is
    fed SOS and then the previous target token), masked past the target
    length, scored with `criterion` and used for one optimizer step on each
    network. At most 5000 mini-batches are processed per call.

    Returns:
        The mean loss over the mini-batches actually processed, or NaN when
        the loader produced none.
    """
    running_loss = 0
    num_batches = 0            # batches actually trained on (used for averaging)
    max_batches = 5000         # hard cap on iterations per epoch
    clip = 5                   # gradient-norm clipping threshold
    norm_encoder = norm_decoder = None

    for _ in range(int(pairsNum/batch_size)+1):

        # prepare mini-batch data; the loader signals exhaustion by raising
        try:
            inputs, targets, lengths = trainLoader.getMiniBatch()
        except Exception:
            break

        # Zero gradients of both optimizers
        optimizer_encoder.zero_grad()
        optimizer_decoder.zero_grad()

        # encoding
        inputs, targets = Variable(inputs), Variable(targets)
        hid_init = rnnEncoder.init_hidden(batch_size)
        out_enc, hid_enc = rnnEncoder.forward(inputs,lengths[:,0],hid_init)
        encoder_outputs = torch.transpose(out_enc,0,1) # convert batch_first to seqlen_first

        # Prepare decoder input and initial hidden state
        decoder_input = Variable(torch.LongTensor([dataStat.word2ind['SOS']] * batch_size))
        decoder_hidden = hid_enc[:rnnDecoder.n_layers] # Use last (forward/concatenate) hidden state from encoder
        if USE_CUDA:
            decoder_input = decoder_input.cuda()

        # Run through decoder one time step at a time
        all_decoder_outputs, decoder_hidden, decoder_attn = rnnDecoder(decoder_input, decoder_hidden, encoder_outputs)
        for t in range(output_max_len-1):
            decoder_input = targets[:,t] # Next input is current target
            decoder_output, decoder_hidden, decoder_attn = rnnDecoder(decoder_input, decoder_hidden, encoder_outputs)
            all_decoder_outputs = torch.cat((all_decoder_outputs, decoder_output), 0) # Store this step's outputs [1,B,N]

        # Zero the outputs beyond each target's length, then score [B,N,T] against [B,T]
        outputs_mask = geneMask(torch.transpose(all_decoder_outputs,0,1), lengths)
        loss = criterion(torch.transpose(outputs_mask,1,2), targets)

        loss.backward()
        norm_encoder = torch.nn.utils.clip_grad_norm_(rnnEncoder.parameters(), clip)
        norm_decoder = torch.nn.utils.clip_grad_norm_(rnnDecoder.parameters(), clip)

        optimizer_encoder.step()
        optimizer_decoder.step()

        running_loss += float(loss)
        num_batches += 1

        # Average over the number of batches processed so far, not the loop
        # index, which is one less.
        if num_batches%1000 == 0:
            print("iteration", num_batches, " ---- running loss:", running_loss/num_batches)

        if num_batches == max_batches: break

    if num_batches == 0:
        # Nothing was trained on: there are no gradient norms and no mean loss.
        return float('nan')

    print("\tnorm:\t", float(norm_encoder), float(norm_decoder))

    return running_loss/num_batches

In [None]:
def oneEpochEval():
    """Compute the teacher-forced loss on the development and test loaders.

    Each loader is reset to `batch_size` and up to 100 mini-batches are
    scored exactly as in training (encode, decode with teacher forcing, mask,
    `criterion`), but without gradient tracking and without optimizer steps.

    Returns:
        (dev_loss, test_loss): the mean loss over the evaluated mini-batches
        of each loader, or NaN for a loader that produced no batch.
    """
    lossPool = []
    max_batches = 100  # cap on evaluated mini-batches per loader

    for loader in (deveLoader, testLoader):
        # Start from the beginning with the batch size this function assumes;
        # other cells reset the loaders with different batch sizes.
        loader.reset(batch_size)
        loss_sum, num_batches = 0.0, 0

        with torch.no_grad():  # evaluation only: do not build autograd graphs
            for _ in range(min(max_batches, int(pairsNum/batch_size)+1)):
                # prepare mini-batch data; the loader signals exhaustion by raising
                try:
                    inputs, targets, lengths = loader.getMiniBatch()
                except Exception:
                    break

                # encoding
                inputs, targets = Variable(inputs), Variable(targets)
                hid_init = rnnEncoder.init_hidden(batch_size)
                out_enc, hid_enc = rnnEncoder.forward(inputs,lengths[:,0],hid_init)
                encoder_outputs = torch.transpose(out_enc,0,1) # convert batch_first to seqlen_first

                # Prepare decoder input and initial hidden state
                decoder_input = Variable(torch.LongTensor([dataStat.word2ind['SOS']] * batch_size))
                decoder_hidden = hid_enc[:rnnDecoder.n_layers] # Use last (forward/concatenate) hidden state from encoder
                if USE_CUDA:
                    decoder_input = decoder_input.cuda()

                # Run through decoder one time step at a time
                all_decoder_outputs, decoder_hidden, decoder_attn = rnnDecoder(decoder_input, decoder_hidden, encoder_outputs)
                for t in range(output_max_len-1):
                    decoder_input = targets[:,t] # Next input is current target
                    decoder_output, decoder_hidden, decoder_attn = rnnDecoder(decoder_input, decoder_hidden, encoder_outputs)
                    all_decoder_outputs = torch.cat((all_decoder_outputs, decoder_output), 0) # Store this step's outputs [1,B,N]

                outputs_mask = geneMask(torch.transpose(all_decoder_outputs,0,1), lengths)
                loss = criterion(torch.transpose(outputs_mask,1,2), targets)

                loss_sum += float(loss)
                num_batches += 1

        # Report the mean over all evaluated batches rather than only the
        # last batch's loss.
        lossPool.append(loss_sum/num_batches if num_batches else float('nan'))

    return lossPool[0], lossPool[1]

In [None]:
def savePara(epoch):
    """Write the embedding, encoder and decoder state dicts to 'parameter/'.

    Files are named '<prefix>osdb_0605_<epoch>.pt'; the 'embeding_' prefix
    (sic) is the spelling the loading cell expects.
    """
    tag = 'osdb_0605_' + str(epoch)
    checkpoints = (
        ('embeding_', embedding),
        ('encoder_', rnnEncoder),
        ('decoder_', rnnDecoder),
    )
    for prefix, module in checkpoints:
        torch.save(module.state_dict(), 'parameter/' + prefix + tag + '.pt')

In [None]:
# Accumulates (train, deve, test) BLEU tuples; only the commented-out
# evaluation block in the training cell appends to it.
scores = []

In [None]:
# Switch both networks to training mode before optimizing.
rnnEncoder.train()
rnnDecoder.train()

print("Begin training...")
print("Learning rate = ", learning_rate)
print(time.asctime( time.localtime(time.time()) ))

# Train for 10 epochs; the training loader is reset at the start of each one.
for i in range(10):
    trainLoader.reset(batch_size)
    loss = oneEpoch()
    if (i+1)%1==0:
        print('Epoch:', i+1, '\tLoss:',loss)
        print(time.asctime( time.localtime(time.time()) ))
    if (i+1)%10==0:
        # The trailing `+0` is a manual epoch offset for the checkpoint name.
        # NOTE(review): when resuming from an earlier checkpoint, set it to the
        # number of epochs already trained so old files are not overwritten.
        savePara(i+1+0)
#     # please run evaluate section first
#     if (i+1)%2==0:
#         score_train,paths_train = evaluateCorpus(rnnEncoder, rnnDecoder, 2, lamda=0, threshold=0, loader=trainLoader, display=0)
#         score_deve,paths_deve = evaluateCorpus(rnnEncoder, rnnDecoder, 2, lamda=0, threshold=0, loader=deveLoader, display=0)
#         score_test,paths_test = evaluateCorpus(rnnEncoder, rnnDecoder, 2, lamda=0, threshold=0, loader=testLoader, display=0)
#         scores.append((score_train, score_deve, score_test))
#         print('BLEU score (train, deve, test):', score_train, score_deve, score_test)

## Build Language Model: P(T)

In [None]:
# construct P(target) language model
#
# freqLM is a prefix tree over the first (up to) 5 tokens of every training
# response. Each node maps token -> [count, children], where `count` is how
# many training responses share the prefix ending in that token and `children`
# is the nested dict for the next position.

freqLM = {}

for i in range(pairsNumTrain):
    length = pairsLengthTrain[i][1]
    # the response occupies the columns after the padded source in the aligned row
    rsps = pairsAlignedTrain[i][input_max_len:input_max_len+length]
    dic = freqLM
    for j in range(min(5,length)):
        if rsps[j] not in dic: dic[rsps[j]] = [1,{}]
        else: dic[rsps[j]][0] += 1
        dic = dic[rsps[j]][1]  # descend to the children of this token


In [None]:
# input size: B x S
def conProb(prefix_batch):
    """Log-probability of the next token given each prefix, from `freqLM`.

    Args:
        prefix_batch: sequence of B prefixes, each a sequence of token indices.

    Returns:
        FloatTensor of shape [B, vocab_size] (on the GPU when USE_CUDA) with
        add-one smoothed log-probabilities. A prefix that cannot be followed
        through the prefix tree yields the uniform distribution.
    """
    num_prefixes = len(prefix_batch)
    prob_matrix = np.ones((num_prefixes, vocab_size))

    for row, prefix in enumerate(prefix_batch):
        counts = np.ones(vocab_size)  # add-one smoothing
        node = freqLM
        prefix_found = True
        try:
            # walk down the tree, one level per prefix token
            for token in prefix:
                node = node[token][1]
        except Exception:
            prefix_found = False  # unseen prefix: keep the uniform counts
        if prefix_found:
            for token, entry in node.items():
                counts[token] += entry[0]
        prob_matrix[row] = counts / np.sum(counts)

    log_probs = torch.FloatTensor(np.log(prob_matrix))
    return log_probs.cuda() if USE_CUDA else log_probs

## Evaluate by BLEU and distinct

In [None]:
# Put both networks in evaluation mode for the decoding and scoring cells below.
# Re-run the training cell (which calls .train()) before training again.
rnnEncoder.eval()
rnnDecoder.eval()

In [None]:
# translate into natural language

def showResult(ind_seq, reverse=False):
    """Turn a sequence of token indices into a space-separated string.

    Decoding stops at the first EOS index (which is not included). With
    `reverse=True` the words are emitted in reverse order.
    """
    words = []
    for idx in ind_seq:
        if idx == dataStat.word2ind['EOS']:
            break
        words.append(dataStat.ind2word[idx])
    if reverse:
        words = words[::-1]
    return ' '.join(words)

In [None]:
# greedy search, mainly for debug

def topOneDecode(decoder, decoder_hidden, encoder_outputs, stat, max_length=output_max_len):
    """Greedy decoding for a single sample (mainly for debugging).

    Args:
        decoder: callable ``(decoder_input, decoder_hidden, encoder_outputs)``
            returning ``(output, hidden, attention)``.
        decoder_hidden: initial decoder hidden state.
        encoder_outputs: encoder outputs the decoder attends over.
        stat: unused; kept so existing callers keep working.
        max_length: maximum number of tokens to emit.

    Returns:
        List of Python ints; it ends with EOS_token when the decoder emitted
        EOS within `max_length` steps.
    """
    decoder_input = torch.LongTensor([[SOS_token]])
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        decoder_hidden = decoder_hidden.cuda()

    decoded_words = []

    with torch.no_grad():  # inference only
        for _ in range(max_length):
            decoder_output, decoder_hidden, decoder_attn = decoder(decoder_input, decoder_hidden, encoder_outputs)

            topv, topi = decoder_output.topk(1)
            # Extract a plain int so the next input is built from Python
            # numbers (always shape [1, 1]) instead of a tensor nested in a list.
            next_idx = topi[0][0].item()
            decoded_words.append(next_idx)
            if next_idx == EOS_token:
                break

            decoder_input = torch.LongTensor([[next_idx]])
            if USE_CUDA: decoder_input = decoder_input.cuda()

    return decoded_words

In [None]:
# Greedy-decode one training sample and print it next to its reference.
loader = trainLoader

# NOTE: this overwrites the notebook-level `batch_size` used by the training
# cells; re-run the optimizer/loader cell before training again.
batch_size = 1
loader.reset(batch_size)
inputs, targets, lengths = loader.getMiniBatch()

# encoding and decoding
hid_init = rnnEncoder.init_hidden(batch_size)
out_enc, hid_enc = rnnEncoder.forward(inputs,lengths[:,0],hid_init)
decoder_hidden = hid_enc[:rnnDecoder.n_layers]
encoder_outputs = torch.transpose(out_enc,0,1) # convert batch_first to seqlen_first

trace = topOneDecode(rnnDecoder, decoder_hidden, encoder_outputs, dataStat, max_length=output_max_len)

# The message is printed with reverse=True to undo the reversed source order.
print("Message:\t", showResult(inputs.data[0].cpu().numpy(), reverse=True))
print("Response:\t", showResult(trace))
if targets is not None:
    print("Teaching:\t", showResult(targets.data[0].cpu().numpy()))

In [None]:
def beamDecodeBatch(decoder, decoder_hidden, encoder_outputs, voc, beam_size, lamda, threshold, max_length=output_max_len):
    """Beam search for one sample, with an optional MMI anti-language-model penalty.

    All live hypotheses are decoded together as one batch per step. While the
    hypotheses are shorter than `threshold` tokens, ``lamda * log P(token |
    prefix)`` from `conProb` is subtracted from the decoder output.

    Args:
        decoder: callable ``(decoder_input, decoder_hidden, encoder_outputs)``
            returning ``(output [1,B,V], hidden, attention)``.
        decoder_hidden: initial decoder hidden state for the sample.
        encoder_outputs: encoder outputs for the sample (sequence first).
        voc: forwarded to the `Sentence` helpers.
        beam_size: number of hypotheses kept per step.
        lamda: weight of the anti-language-model penalty.
        threshold: number of leading tokens the penalty applies to.
        max_length: the search runs for at most ``max_length - 1`` steps.

    Returns:
        Up to 32 ``(token_list, score)`` tuples, best score first.
    """
    terminal_sentences = []
    prev_top_sentences = [Sentence(decoder_hidden)]

    with torch.no_grad():  # inference only
        for _ in range(max_length-1):
            this_batch_size = len(prev_top_sentences)
            if this_batch_size == 0:
                break  # every hypothesis has terminated; nothing left to expand

            decoder_input = torch.LongTensor([[int(sentence.last_idx)] for sentence in prev_top_sentences])
            if USE_CUDA: decoder_input = decoder_input.cuda()
            # Stack the per-hypothesis hidden states along the batch axis in one
            # concatenation instead of growing the tensor pair by pair.
            decoder_hidden_batch = torch.cat([sentence.decoder_hidden for sentence in prev_top_sentences], dim=1)
            encoder_outputs_batch = encoder_outputs.repeat(1,this_batch_size,1)

            decoder_output, decoder_hidden_batch, decoder_attn = decoder(decoder_input, decoder_hidden_batch, encoder_outputs_batch)

            # apply MMI anti-language model (out of place, so the tensor
            # returned by the decoder is not modified)
            if len(prev_top_sentences[0].sentence_idxes)<threshold:
                LM_output = conProb([[int(idx) for idx in sentence.sentence_idxes] for sentence in prev_top_sentences])
                decoder_output = decoder_output - lamda*LM_output.view(1,this_batch_size,-1)

            topv_batch, topi_batch = decoder_output.topk(beam_size) # [1,B,k]
            next_top_sentences = []
            for b, sentence in enumerate(prev_top_sentences):
                topi, topv = topi_batch[0][b], topv_batch[0][b]
                hidden_b = decoder_hidden_batch[:,b].unsqueeze(1)
                term, top = sentence.addTopk(topi, topv, hidden_b, beam_size, voc)
                terminal_sentences.extend(term)
                next_top_sentences.extend(top)

            # keep only the best `beam_size` continuations
            next_top_sentences.sort(key=lambda s: float(s.getScore()), reverse=True)
            prev_top_sentences = next_top_sentences[:beam_size]

    # hypotheses still alive when the step budget ran out are closed with EOS
    terminal_sentences += [sentence.toWordScore(voc) for sentence in prev_top_sentences]
    terminal_sentences.sort(key=lambda x: float(x[1]), reverse=True)

    n = min(len(terminal_sentences), 32)  # N-best list
    return terminal_sentences[:n]

In [None]:
class Sentence:
    """A partial beam-search hypothesis.

    Holds the decoder hidden state after the last emitted token, that token's
    index, and the parallel lists of emitted token indices and their scores.
    """

    def __init__(self, decoder_hidden, last_idx=SOS_token, sentence_idxes=None, sentence_scores=None):
        # None defaults avoid sharing one list object between instances.
        sentence_idxes = [] if sentence_idxes is None else sentence_idxes
        sentence_scores = [] if sentence_scores is None else sentence_scores
        if(len(sentence_idxes) != len(sentence_scores)):
            raise ValueError("length of indexes and scores should be the same")
        self.decoder_hidden = decoder_hidden
        self.last_idx = last_idx
        self.sentence_idxes =  sentence_idxes
        self.sentence_scores = sentence_scores

    def getScore(self, mode='avg', gamma=0.0):
        """Score of the hypothesis.

        mode='avg' returns the mean token score; any other mode returns the
        summed score plus ``gamma`` per token. An empty hypothesis scores -999.
        """
        if len(self.sentence_scores) == 0:
            print("sentence of length 0")
            empty_score = torch.tensor(-999).float()
            return empty_score.cuda() if USE_CUDA else empty_score
        if mode=='avg':
            res = sum(self.sentence_scores) / len(self.sentence_scores)
        else:
            res = sum(self.sentence_scores) + gamma*len(self.sentence_scores)
        return res

    def addTopk(self, topi, topv, decoder_hidden, beam_size, voc):
        """Expand this hypothesis with its top-k next tokens.

        Args:
            topi, topv: indices and scores of the k best next tokens.
            decoder_hidden: decoder hidden state after this step.
            beam_size: number of candidates to consider.
            voc: unused; kept for interface compatibility.

        Returns:
            (terminates, sentences): `terminates` holds ``(token_list, score)``
            tuples for candidates that emitted EOS (the score is that of the
            hypothesis before EOS); `sentences` holds the new `Sentence`
            objects for all other candidates.
        """
        terminates, sentences = [], []

        # Flatten to 1-D. squeeze() would produce a 0-dim tensor when
        # beam_size == 1, which cannot be indexed with [i].
        topi, topv = topi.reshape(-1), topv.reshape(-1)

        for i in range(min(beam_size, topi.numel())):
            if topi[i] == EOS_token:
                terminates.append(([int(idx) for idx in self.sentence_idxes] + [EOS_token],
                                   self.getScore())) # tuple(word_list, score_float)
                continue
            idxes = self.sentence_idxes[:] # pass by value
            scores = self.sentence_scores[:] # pass by value
            idxes.append(topi[i])
            scores.append(topv[i])
            sentences.append(Sentence(decoder_hidden, topi[i], idxes, scores))
        return terminates, sentences

    def toWordScore(self, voc):
        """Return ``(token_list, score)`` with the token list closed by EOS.

        `voc` is unused; kept for interface compatibility.
        """
        words = [int(idx) for idx in self.sentence_idxes]
        # Also covers the empty hypothesis, which has no last token to inspect.
        if not words or words[-1] != EOS_token:
            words.append(EOS_token)
        return (words, self.getScore())


In [None]:
# Beam-search one training sample and print the N-best list with scores.
# NOTE: this overwrites the notebook-level `batch_size` used by the training
# cells; re-run the optimizer/loader cell before training again.
batch_size = 1
loader = trainLoader
loader.reset(batch_size)
inputs, targets, lengths = loader.getMiniBatch()

# encoding and decoding
hid_init = rnnEncoder.init_hidden(batch_size)
out_enc, hid_enc = rnnEncoder.forward(inputs,lengths[:,0],hid_init)
decoder_hidden = hid_enc[:rnnDecoder.n_layers]
encoder_outputs = torch.transpose(out_enc,0,1) # convert batch_first to seqlen_first

# lamda=0 and threshold=0 disable the anti-language-model penalty (plain beam search)
path_beam = beamDecodeBatch(rnnDecoder, decoder_hidden, encoder_outputs, dataStat, beam_size=3, lamda=0, threshold=0)  # return list of tuples: (path, score)
for p in path_beam: print(float(p[1]), '\t', showResult(p[0]))

# The message is printed with reverse=True to undo the reversed source order.
print("Message:\t", showResult(inputs.data[0].cpu().numpy(), reverse=True))
print("Response:\t", showResult(path_beam[0][0]))
if targets is not None:
    print("Teaching:\t", showResult(targets.data[0].cpu().numpy()))

In [None]:
def generate(input, length, encoder, decoder, beam_size, lamda=0.0, threshold=0, verbose=False):
    """Encode one source sequence and decode a response.

    Args:
        input: tensor with the padded source token indices of one sample.
        length: tensor holding the source length.
        encoder, decoder: the networks to use.
        beam_size: 0 selects greedy decoding, anything else beam search.
        lamda, threshold: anti-language-model settings forwarded to the beam search.
        verbose: when True, print every beam candidate with its score.

    Returns:
        The decoded token-index list (the best-scoring one for beam search).
    """
    # encoding
    hid_init = encoder.init_hidden(batch_size = 1)
    out_enc, hid_enc = encoder.forward(input.view(1,-1),length.view(1),hid_init)
    # Read the layer count from the decoder that was passed in, not from the
    # notebook-level rnnDecoder.
    decoder_hidden = hid_enc[:decoder.n_layers]
    encoder_outputs = torch.transpose(out_enc,0,1) # convert batch_first to seqlen_first

    if beam_size==0:
        # encoder_outputs was missing from this call, which shifted the
        # remaining arguments and raised a TypeError.
        return topOneDecode(decoder, decoder_hidden, encoder_outputs, dataStat, max_length=output_max_len)  # return path in list

    path_beam = beamDecodeBatch(decoder, decoder_hidden, encoder_outputs, dataStat, beam_size, lamda, threshold)  # return list of tuples: (path, score)
    if verbose:
        for p in path_beam: print(float(p[1]), '\t', showResult(p[0]))
    return path_beam[0][0]

In [None]:
def evaluateSample(encoder, decoder, beam_size=5, lamda=0.0, threshold=0, verbose=True, myQuery=''):
    """Generate a response for a free-text query.

    Args:
        encoder, decoder: the networks to use.
        beam_size, lamda, threshold: decoding settings forwarded to `generate`.
        verbose: when True, print the beam candidates, the message and the response.
        myQuery: the query text; it is lower-cased before encoding.

    Returns:
        0 on success, -1 when `myQuery` is empty.
    """
    if myQuery == '':
        print("Blank Input")
        return -1

    # feed in customized tokens
    sample_query = myQuery.lower()
    sample_query_ind, _ = preprocess.encodePair(dataStat, (sample_query,'.'),reverse=True)
    padded_query = np.asarray(padding(sample_query_ind, input_max_len), dtype=np.int64)
    query_tensor = torch.from_numpy(padded_query).view(1,-1)
    # Queries longer than input_max_len are truncated by `padding`, so the
    # reported length must be clamped to the same limit.
    query_length = torch.LongTensor([min(len(sample_query_ind), input_max_len)])
    # Assigned on both CPU and GPU; previously only the CUDA path defined them.
    if USE_CUDA:
        query_tensor, query_length = query_tensor.cuda(), query_length.cuda()

    trace = generate(query_tensor, query_length, encoder, decoder, beam_size, lamda, threshold, verbose=verbose)
    if verbose:
        # reverse=True undoes the reversed source order for display
        print("Message:\t", showResult(query_tensor.data[0].cpu().numpy(), reverse=True))
        print("Response:\t", showResult(trace))
        # a free-text query has no reference response to print
    return 0


In [None]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
chencherry = SmoothingFunction()

def evaluateSentence(encoder, decoder, beam_size=1, lamda=0.0, threshold=0, loader=trainLoader, display=False):
    """Average per-sentence BLEU over up to 2000 samples of `loader`.

    The loader is reset to batch size 1. Each sample is decoded with
    `generate` and scored against its reference with smoothed BLEU.

    Args:
        encoder, decoder: the networks to use.
        beam_size, lamda, threshold: decoding settings forwarded to `generate`.
        loader: data loader to sample from.
        display: when truthy, print progress roughly every 10%.

    Returns:
        (mean_bleu, responses): the mean sentence-level BLEU and the list of
        decoded token-index lists. An empty loader gives ``(0.0, [])``.
    """
    loader.reset(1)
    data_length = loader.dataLength
    responses = []
    total_score = 0
    sample_length = min(2000, data_length)
    if sample_length <= 0:
        return 0.0, responses

    # at least 1, so the progress check cannot take a modulo by zero
    progress_step = max(1, sample_length // 10)

    for i in range(sample_length):
        inputs, targets, lengths = loader.getMiniBatch()
        source, source_length, target = inputs, lengths[0][0], targets

        trace = generate(source, source_length, encoder, decoder, beam_size, lamda, threshold)
        responses.append(trace)

        # the reference is the target cut to its true length
        length_ref = lengths[0][1]
        references = [[target.data[0].tolist()[:int(length_ref)]]]
        candidates = [trace]
        score = corpus_bleu(references, candidates, smoothing_function=chencherry.method1)
        total_score += score

        if display and (i+1)%progress_step==0: print("complete",int(100*(i+1)/sample_length),"%")

    # Divide by the number of scored samples, not the last loop index.
    return total_score/sample_length, responses

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
chencherry = SmoothingFunction()

def evaluateCorpus(encoder, decoder, beam_size=1, lamda=0.0, threshold=0, loader=trainLoader, weights=[1,0,0,0], display=False):
    """Corpus-level BLEU over up to 400 samples of `loader`.

    The loader is reset to batch size 1. Each sample is decoded with
    `generate`, and all candidates are scored together with smoothed
    `corpus_bleu`.

    Args:
        encoder, decoder: the networks to use.
        beam_size, lamda, threshold: decoding settings forwarded to `generate`.
        loader: data loader to sample from.
        weights: n-gram weights passed to `corpus_bleu` (never modified here).
        display: when truthy, print progress roughly every 10%.

    Returns:
        (score, responses): the corpus BLEU score and the list of decoded
        token-index lists.
    """
    loader.reset(1)
    data_length = loader.dataLength
    sample_length = min(400, data_length)
    # at least 1, so the progress check cannot take a modulo by zero
    progress_step = max(1, sample_length // 10)

    references = []
    candidates = []
    responses = []

    for i in range(sample_length):
        inputs, targets, lengths = loader.getMiniBatch()
        source, source_length, target = inputs, lengths[0][0], targets

        trace = generate(source, source_length, encoder, decoder, beam_size, lamda, threshold)
        responses.append(trace)

        # the reference is the target cut to its true length
        length_ref = lengths[0][1]
        references.append([target.data[0].tolist()[:int(length_ref)]])
        candidates.append(trace)

        if display and (i+1)%progress_step==0: print("complete",int(100*(i+1)/sample_length),"%")

    score = corpus_bleu(references, candidates, weights=weights, smoothing_function=chencherry.method1)

    return score, responses

In [None]:
# distinct evaluation

import nltk

def distinctEval(all_paths):
    """Print distinct-1 / distinct-2 diversity statistics for decoded responses.

    Args:
        all_paths: iterable of token sequences (one per response).

    Prints the total number of tokens and the number of distinct unigrams and
    distinct bigrams, each divided by that total. With no tokens at all both
    ratios are reported as 0.0 instead of dividing by zero.

    Returns:
        0 (the statistics are only printed).
    """
    response_ugm = set()
    response_bgm = set()
    response_len = 0

    for path in all_paths:
        tokens = list(path)
        response_len += len(tokens)
        response_ugm.update(tokens)
        # adjacent token pairs, i.e. the bigrams of this response
        response_bgm.update(zip(tokens, tokens[1:]))

    denominator = response_len if response_len else 1  # avoid 0/0 on empty input

    print("total length of response:", response_len)
    print("distinct unigrams:", len(response_ugm)/denominator)
    print("distinct bigrams:", len(response_bgm)/denominator)

    return 0

In [None]:
# Decode a hand-written query with beam size 2 and the anti-language-model
# penalty (lamda=0.5) applied to the first 2 tokens.
evaluateSample(rnnEncoder, rnnDecoder, beam_size=2, lamda=0.5, threshold=2, verbose=True, myQuery='you better stay right there boy')

In [None]:
# Time a plain beam-search evaluation (no anti-LM penalty) on the development
# set and report BLEU, distinct-n and the mean response length in tokens.
time_start = time.time()

score, responses = evaluateCorpus(rnnEncoder, rnnDecoder, beam_size=2, lamda=0.0, threshold=0, loader=deveLoader, display=True)
distinctEval(responses)
print('score:', score)
print('average length:', sum([len(r) for r in responses])/len(responses))

time_end = time.time()
print("complete time:", time_end-time_start)

## Hyper-parameter tuning

In [None]:
# baseline -- gamma = 0, beam = 2
# Grid over (threshold, lamda). score_list holds [train, deve, test] BLEU
# lists, appended in loop order (threshold outer, lamda inner).
lamda_list = [0.1,0.2,0.4]
thres_list = [1,2]
score_list = [[],[],[]]

for threshold in thres_list:
    for lamda in lamda_list:
        print(time.asctime( time.localtime(time.time()) ))
        print('threshold: ', threshold, '\nlambda: ', lamda)
        score_train,_ = evaluateCorpus(rnnEncoder, rnnDecoder, 2, lamda, threshold, loader=trainLoader, display=0)
        score_deve,paths_deve = evaluateCorpus(rnnEncoder, rnnDecoder, 2, lamda, threshold, loader=deveLoader, display=0)
        score_test,paths_test = evaluateCorpus(rnnEncoder, rnnDecoder, 2, lamda, threshold, loader=testLoader, display=0)
        score_list[0].append(score_train)
        score_list[1].append(score_deve)
        score_list[2].append(score_test)
        print(score_train, score_deve, score_test)
        distinctEval(paths_deve)
        distinctEval(paths_test)

# score_list

In [None]:
# baseline -- gamma = 0, beam = 4
# (the calls below pass beam_size=4; an earlier label on this cell said beam = 2)
# Grid over (threshold, lamda). score_list holds [train, deve, test] BLEU
# lists, appended in loop order (threshold outer, lamda inner).
lamda_list = [0.1,0.2,0.4]
thres_list = [1,2]
score_list = [[],[],[]]

for threshold in thres_list:
    for lamda in lamda_list:
        print(time.asctime( time.localtime(time.time()) ))
        print('threshold: ', threshold, '\nlambda: ', lamda)
        score_train,_ = evaluateCorpus(rnnEncoder, rnnDecoder, 4, lamda, threshold, loader=trainLoader, display=0)
        score_deve,paths_deve = evaluateCorpus(rnnEncoder, rnnDecoder, 4, lamda, threshold, loader=deveLoader, display=0)
        score_test,paths_test = evaluateCorpus(rnnEncoder, rnnDecoder, 4, lamda, threshold, loader=testLoader, display=0)
        score_list[0].append(score_train)
        score_list[1].append(score_deve)
        score_list[2].append(score_test)
        print(score_train, score_deve, score_test)
        distinctEval(paths_deve)
        distinctEval(paths_test)

# score_list

In [None]:
# hyper-parameter tuning -- gamma is set as 0.25, beam = 2
# NOTE(review): no gamma is passed anywhere in this cell; confirm how the
# length bonus was configured for the run this label refers to.
# Grid over (threshold, lamda). score_list holds [train, deve, test] BLEU
# lists, appended in loop order (threshold outer, lamda inner).
lamda_list = [-0.2,0,0.2,0.5]
thres_list = [1,2]
score_list = [[],[],[]]

for threshold in thres_list:
    for lamda in lamda_list:
        print(time.asctime( time.localtime(time.time()) ))
        print(threshold, '\t', lamda)
        score_train,_ = evaluateCorpus(rnnEncoder, rnnDecoder,2, lamda, threshold, loader=trainLoader, display=0)
        score_deve,paths_deve = evaluateCorpus(rnnEncoder, rnnDecoder, 2, lamda, threshold, loader=deveLoader, display=0)
        score_test,paths_test = evaluateCorpus(rnnEncoder, rnnDecoder, 2, lamda, threshold, loader=testLoader, display=0)
        score_list[0].append(score_train)
        score_list[1].append(score_deve)
        score_list[2].append(score_test)
        print(score_train, score_deve, score_test)
        distinctEval(paths_deve)
        distinctEval(paths_test)

# score_list

In [None]:
# hyper-parameter tuning, beam = 5
# NOTE(review): the "gamma" remark below is not reflected by any argument in
# this cell; confirm how the length bonus was configured for that run.
# Grid over (threshold, lamda). score_list holds [train, deve, test] BLEU
# lists, appended in loop order (threshold outer, lamda inner).
lamda_list = [0.1, 0.3]  # gamma = 0.2
thres_list = [2, 3]
score_list = [[],[],[]]

for threshold in thres_list:
    for lamda in lamda_list:
        print(time.asctime( time.localtime(time.time()) ))
        print(threshold, '\t', lamda)
        score_train,_ = evaluateCorpus(rnnEncoder, rnnDecoder,5, lamda, threshold, loader=trainLoader, display=0)
        score_deve,paths_deve = evaluateCorpus(rnnEncoder, rnnDecoder, 5, lamda, threshold, loader=deveLoader, display=0)
        score_test,paths_test = evaluateCorpus(rnnEncoder, rnnDecoder, 5, lamda, threshold, loader=testLoader, display=0)
        score_list[0].append(score_train)
        score_list[1].append(score_deve)
        score_list[2].append(score_test)
        print(score_train, score_deve, score_test)
        distinctEval(paths_deve)
        distinctEval(paths_test)

# score_list

In [None]:
# hyper-parameter tuning, beam = 5
# NOTE(review): the "gamma" remark below is not reflected by any argument in
# this cell; confirm how the length bonus was configured for that run.
# Grid over (threshold, lamda). score_list holds [train, deve, test] BLEU
# lists, appended in loop order (threshold outer, lamda inner).
lamda_list = [0.2, 0.4]  # gamma = 0.4
thres_list = [1, 2]
score_list = [[],[],[]]

for threshold in thres_list:
    for lamda in lamda_list:
        print(time.asctime( time.localtime(time.time()) ))
        print(threshold, '\t', lamda)
        score_train,_ = evaluateCorpus(rnnEncoder, rnnDecoder,5, lamda, threshold, loader=trainLoader, display=0)
        score_deve,paths_deve = evaluateCorpus(rnnEncoder, rnnDecoder, 5, lamda, threshold, loader=deveLoader, display=0)
        score_test,paths_test = evaluateCorpus(rnnEncoder, rnnDecoder, 5, lamda, threshold, loader=testLoader, display=0)
        score_list[0].append(score_train)
        score_list[1].append(score_deve)
        score_list[2].append(score_test)
        print(score_train, score_deve, score_test)
        distinctEval(paths_deve)
        distinctEval(paths_test)

# score_list