In [1]:
import os
from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/My Drive"

os.chdir(path)
os.listdir(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['至大三上成绩单.pdf',
 'BB9E1FA0@D3815A65.36CC2E59',
 '体检信息登记表.xlsx',
 'resume.pdf',
 '申请信息表--推荐信表格.gdoc',
 '脚手架.rar',
 'Colab Notebooks',
 'Resume_xsy_2020_for job.pdf',
 'extracted_dialogue.txt',
 '4000_checkpoint (1).tar',
 '4000_checkpoint.tar',
 'CSCI544 presentation.gdoc',
 'xsy_test_result.txt',
 'ghc_test_result.txt',
 'gru_movie_reply.txt',
 'fb_train.json',
 'fb_valid.json',
 'fb_train',
 'gru_fb_reply_13602.txt',
 'gru_fb_reply_train_1w.txt',
 '566 rnn-encoder.ipynb']

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SOS_token = 0
EOS_token = 1
learning_rate = 0.1
hidden_size = 256
teacher_forcing_ratio = 0.5



class Lang:
    def __init__(self, name="dict"):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {EOS_token:"EOS",SOS_token:"SOS"}
        self.n_words = 2  

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)  
        self.gru = nn.GRU(hidden_size, hidden_size) 

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1) 
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

def prepareData():
    lang_dict = Lang()
    sentences = []
    max_length=0
    with open("./extracted_dialogue.txt","r") as f:
        lines = f.read().strip().split("\n")
        for i in range(0, len(lines), 2):
            tmp = lines[i].strip()
            if len(tmp.split(' ')) > 100:
              continue
            sentences.append(tmp)
            lang_dict.addSentence(tmp)
            if len(tmp.split(' ')) > max_length:
                max_length = len(tmp.split(' '))
    return lang_dict, sentences, max_length

def getSentenceTensor(lang, sentence):
    indexes = [lang.word2index[word] for word in sentence.split(' ')]
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length):
    
    encoder_hidden = encoder.initHidden() 

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length+1, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  

    else:
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach() 

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


def trainIters(encoder, decoder, lang_dict, max_length, sentences,  n_iters, print_every=1000, learning_rate=0.01):
    print_loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_data = [getSentenceTensor(lang_dict, random.choice(sentences))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        iter_data = training_data[iter - 1]
        input_tensor = iter_data
        target_tensor = iter_data

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion, max_length)
        print_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('iter, loss =  (%d %.4f) ' %  (iter,  print_loss_avg))



lang_dict, sentences, max_length = prepareData()
print(max_length)

encoder = EncoderRNN(lang_dict.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, lang_dict.n_words).to(device)

trainIters(encoder, decoder, lang_dict, max_length, sentences, 10000, print_every=100)



KeyboardInterrupt: ignored

In [8]:


from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
import re
import os
import unicodedata

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


MAX_LENGTH = 10  # Maximum sentence length

# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token


class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))
        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens
        for word in keep_words:
            self.addWord(word)


# Lowercase and remove non-letter characters
def normalizeString(s):
    s = s.lower()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


# Takes string sentence, returns sentence of word indexes
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] if word in voc.word2index else PAD_token for word in sentence.split(' ') ] + [EOS_token]


class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        # type: (Tensor, Tensor, Optional[Tensor]) -> Tuple[Tensor, Tensor]
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden


class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)


class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder, decoder_n_layers):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self._device = device
        self._SOS_token = SOS_token
        self._decoder_n_layers = decoder_n_layers

    __constants__ = ['_device', '_SOS_token', '_decoder_n_layers']

    def forward(self, input_seq : torch.Tensor, input_length : torch.Tensor, max_length : int):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:self._decoder_n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=self._device, dtype=torch.long) * self._SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=self._device, dtype=torch.long)
        all_scores = torch.zeros([0], device=self._device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores


In [9]:
def evaluate(searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to("cpu")
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words, tokens


# Evaluate inputs from user input (stdin)
def evaluateInput(searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words,_ = evaluate(searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

# Normalize input sentence and call evaluate()
def evaluateExample(sentence, searcher, voc):
    print("> " + sentence)
    # Normalize sentence
    input_sentence = normalizeString(sentence)
    # Evaluate sentence
    output_words,_ = evaluate(searcher, voc, input_sentence)
    output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
    print('Bot:', ' '.join(output_words))

In [None]:
import math

def evaluate_all(pairs, searcher, voc, gram):
    ret = []
    for s, reply in pairs:
        output_words, tokens = evaluate(searcher, voc, s)
        reply_tokens = indexesFromSentence(voc, reply)
        ret. append((tokens, reply_tokens))
    return BLEU(ret, gram)

def BLEU(pairs, gram):
  accuracy = 0
  for tokens, reply_tokens in pairs:
      if len(tokens) < gram or len(reply_tokens) < gram:
          continue
      total = len(tokens) - gram +1
      acc = 0
      bp = 1
      if len(tokens) < len(reply_tokens):
          bp *= math.exp(1- (len(tokens)/len(reply_tokens)))
      for i in range(len(tokens) - gram + 1 ): 
          for j in range(len(reply_tokens) - gram +1):
              flag = 1
              for k in range(gram):
                  if tokens[i+k] != reply_tokens[j+k]:
                      flag = 0
                      break
              if flag == 1:
                  break
          acc += flag
      acc /= total
      accuracy += bp*acc
  accuracy /= len(pairs)
  return accuracy


In [12]:
import json
import random

MAX_LENGTH = 10  # Maximum sentence length to consider

def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def prepareData():
    voc = Voc("fb")
    with open("./fb_train.json","r") as f:
        pairs = json.load(f)
        for i in range(len(pairs)):
            pairs[i][0] = normalizeString(pairs[i][0])
            pairs[i][1] = normalizeString(pairs[i][1])
        print("Read {!s} sentence pairs".format(len(pairs)))
        pairs = filterPairs(pairs)
        print("Trimmed to {!s} sentence pairs".format(len(pairs)))
        print("Counting words...")
        for pair in pairs:
            voc.addSentence(pair[0])
            voc.addSentence(pair[1])
        print("Counted words:", voc.num_words)
        return voc, pairs
    voc, data = prepareData()

save_dir = './fb_train'
corpus_name = 'fb'
voc, data = prepareData()
# Print some pairs to validate
print("\npairs:")
for pair in data[:10]:
    print(pair)

Read 218181 sentence pairs
Trimmed to 30233 sentence pairs
Counting words...
Counted words: 6952

pairs:
['okay but i was published in new yorker once', 'you better not make any spelling mistakes .']
['how does that feel for you', 'makes me secure with my manly hobby skills .']
['makes me secure with my manly hobby skills .', 'i bet that it does']
['i bet that it does', 'anyway . what do you do ?']
['anyway . what do you do ?', 'i watch kids for a living']
['you work for a funeral home ?', 'yes it is nice halloween is my fav .']
['no i don t work at a funeral home', 'ok i see that s your halloween costume']
['lol oh i see taught you own it', 'me also what is your favorite ?']
['me also what is your favorite ?', 'well i like sherlock holmes and others']
['what are you going to school for', 'i m trying to get my ba in finance']


In [13]:

import itertools

MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    # Filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # Check input sentence
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        # Check output sentence
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs


# Trim voc and pairs
pairs = trimRareWords(voc, data, MIN_COUNT)

def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] if word in voc.word2index else PAD_token for word in sentence.split(' ')] + [EOS_token]


def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

keep_words 4769 / 6949 = 0.6863
Trimmed from 30233 pairs to 26898, 0.8897 of total
input_variable: tensor([[   5,    5,   19,   35,   36],
        [ 100,  100,  188,   36,   11],
        [4682,  139,   11,   11,  106],
        [   4,   54,   75,   36,   36],
        [  28,   19,  131,   23,  613],
        [4518,  179,  120,  157,  164],
        [  46,   11, 2464,   37,   37],
        [ 746,   37,   37,    2,    2],
        [  18,    2,    2,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths: tensor([10,  9,  9,  8,  8])
target_variable: tensor([[ 142,  247,    5,    5,  132],
        [ 162,   81, 2202,  219,  490],
        [   5,   36, 1568,  219,    5],
        [ 283,   11,   18,   67,  283],
        [ 734,   42,  155,  436,   71],
        [1391,   37,   65,  436,   91],
        [  23,    2,  320,    2,  729],
        [1058,    0,   18,    0,   18],
        [   2,    0,    2,    0,    2]])
mask: tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  Tr

In [14]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [15]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [16]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths for rnn packing should always be on the cpu
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))


# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 0.8
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 6000
print_every = 100
save_every = 1000

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

# If you have cuda, configure cuda to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")

trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 100; Percent complete: 1.7%; Average loss: 5.3090
Iteration: 200; Percent complete: 3.3%; Average loss: 4.4705
Iteration: 300; Percent complete: 5.0%; Average loss: 4.2176
Iteration: 400; Percent complete: 6.7%; Average loss: 4.0623
Iteration: 500; Percent complete: 8.3%; Average loss: 4.0201
Iteration: 600; Percent complete: 10.0%; Average loss: 3.7986
Iteration: 700; Percent complete: 11.7%; Average loss: 3.7588
Iteration: 800; Percent complete: 13.3%; Average loss: 3.6733
Iteration: 900; Percent complete: 15.0%; Average loss: 3.5977
Iteration: 1000; Percent complete: 16.7%; Average loss: 3.4615
Iteration: 1100; Percent complete: 18.3%; Average loss: 3.3652
Iteration: 1200; Percent complete: 20.0%; Average loss: 3.2782
Iteration: 1300; Percent complete: 21.7%; Average loss: 3.2308
Iteration: 1400; Percent complete: 23.3%; Average loss: 3.3545
Iteration: 1500; Percent complete: 25.0%; Average loss: 3.13

In [None]:

import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 1
iter_num = 400


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, voc):
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in input_tensor])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(input_tensor)
    input_batch = input_batch.transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)


    # Forward input through encoder model
    encoder_outputs, encoder_hidden = encoder(input_batch, lengths)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    # Prepare encoder's final hidden layer to be first hidden input to the decoder
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    # Initialize decoder input with SOS_token
    decoder_outputs = torch.zeros([0], device=device)
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    loss = 0
    print_losses = []
    n_totals = 0
    if use_teacher_forcing:
        for t in range(MAX_LENGTH):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(MAX_LENGTH):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    for di in range(MAX_LENGTH):
        # Forward pass through decoder
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
        decoder_outputs = torch.cat((decoder_outputs, decoder_output), 0)
        decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
        if decoder_input == EOS_token:
            break
        decoder_input = torch.unsqueeze(decoder_input, 0)

    for i in range(len(target_tensor), decoder_outputs.shape[0]):
        target_tensor.append(PAD_token)
    target_tensor = torch.tensor(target_tensor[:decoder_outputs.shape[0]], device = device).reshape(-1)

    loss = criterion(decoder_outputs, target_tensor)
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / MAX_LENGTH


def trainIters(encoder, decoder, voc, data, batch_size, n_iters, print_every=1000, learning_rate=0.01):
    print_loss_total = 0

    encoder_optimizer = optim.Adam(encoder.parameters())
    decoder_optimizer = optim.Adam(decoder.parameters())
    criterion= nn.CrossEntropyLoss()

    cnt = 0
    for iter in range(1, n_iters + 1):
        random.shuffle(data)

        for s1,s2 in data:
            s1 = indexesFromSentence(voc, s1)
            s2 = indexesFromSentence(voc, s2)

            loss = train([s1], s2, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, voc)

            cnt += 1
            if cnt % print_every == 0:
                print('iter, loss =  (%d %.4f) ' %  (iter,  loss))


embedding = nn.Embedding(voc.num_words, hidden_size)
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)

trainIters(encoder, decoder, voc, data, batch_size, iter_num, print_every=1000)


In [17]:
import random

# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder, decoder.n_layers)


# with open("./fb_train.json","r") as f:
#     train_data = json.load(f)
#     for i in range(len(train_data)):
#         train_data[i][0] = normalizeString(train_data[i][0])
#         train_data[i][1] = normalizeString(train_data[i][1])
# print(len(train_data))

# random.shuffle(data)

# my_reply = ""

# for input_sentence,reply in data[:10000]:
#     output_words,_ = evaluate(scripted_searcher, voc, input_sentence)
#     output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
#     # print(input_sentence)
#     # print(' '.join(output_words))
#     # print()
#     my_reply +=' '.join(output_words) +'\n'

# with open("./gru_fb_reply_train_1w.txt","w") as f:
#   f.write(my_reply)

# with open("./ghc_test_result.txt","r") as f:
#     pairs = []
#     max_length=0
#     lines = f.read().strip().split("\n")
#     for i in range(0, len(lines), 3):
#             tmp = lines[i].strip()
#             tmp = normalizeString(tmp)
#             tmp1 = lines[i+1].strip()
#             tmp1 = normalizeString(tmp1)
#             pairs.append((indexesFromSentence(voc, tmp1), indexesFromSentence(voc, tmp)))
# accuracy = BLEU(pairs, 1)
# print("accuracy is {}% for {}-gram ".format(accuracy*100, 1))
# accuracy = BLEU(pairs, 2)
# print("accuracy is {}% for {}-gram ".format(accuracy*100, 2))
# accuracy = BLEU(pairs, 3)
# print("accuracy is {}% for {}-gram ".format(accuracy*100, 3))
# accuracy = BLEU(pairs, 4)
# print("accuracy is {}% for {}-gram ".format(accuracy*100, 4))


# accuracy = evaluate_all(pairs[-1000:], scripted_searcher, voc, 1)
# print("accuracy is {}% for {}-gram ".format(accuracy*100, 1))
# accuracy = evaluate_all(pairs[-1000:], scripted_searcher, voc, 2)
# print("accuracy is {}% for {}-gram ".format(accuracy*100, 2))
# accuracy = evaluate_all(pairs[-1000:], scripted_searcher, voc, 3)april29

# print("accuracy is {}% for {}-gram ".format(accuracy*100, 3))
# accuracy = evaluate_all(pairs[-1000:], scripted_searcher, voc, 4)
# print("accuracy is {}% for {}-gram ".format(accuracy*100, 4))

# random.shuffle(pairs)

# for s, r in pairs[:10]:
#     evaluateExample(s, scripted_searcher, voc)


# Evaluate examples
sentences = ["boss i need a rise", "what's up?", "who are you?", 
             "where am I?", "where are you from?", "okay give me the keys .", 
             "hello what are doing today ?", "i just got done watching a horror movie",
             "hi ! how are you doing tonight ?", "i work in a homeless shelter in my town .",
             "i love the crowds , getting to know people .", "that nobel prize will just have to wait .",
             "you got a name ?","i love spending time with my family", "so what do you do now for fun ? i like to read .",
             "wanna kiss me jim ?", "i've a dream , it is to work from home", "i make time stop . i've a superpower.", 
             "i don't like blood. i faint when i see blood.", "hi ! do you like turtles ?"]
for s in sentences:
    evaluateExample(s, searcher, voc)

> boss i need a rise
Bot: do you mean help you ?
> what's up?
Bot: nothing much . do you hunt ?
> who are you?
Bot: lol i m you
> where am I?
Bot: i am in new york area
> where are you from?
Bot: i am from illinois west side of chicago .
> okay give me the keys .
Bot: i guess what does that not
> hello what are doing today ?
Bot: i m not really i m doing great hp
> i just got done watching a horror movie
Bot: what movie are you watching ? movie
> hi ! how are you doing tonight ?
Bot: i m good thanks for asking
> i work in a homeless shelter in my town .
Bot: nice do you have any hobbies ?
> i love the crowds , getting to know people .
Bot: where is your favorite part
> that nobel prize will just have to wait .
Bot: and what do you do ?
> you got a name ?
Bot: yes ! is your name name ?
> i love spending time with my family
Bot: i like playing the guitar in my free time
> so what do you do now for fun ? i like to read .
Bot: i love reading . my book is
> wanna kiss me jim ?
Bot: if you a

In [7]:
save_dir = os.path.join("data", "save")
corpus_name = "cornell movie-dialogs corpus"

device = "cpu"

# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# If you're loading your own model
# Set checkpoint to load from
checkpoint_iter = 4000
# loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                             '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                             '{}_checkpoint.tar'.format(checkpoint_iter))

# If you're loading the hosted model
loadFilename = './4000_checkpoint.tar'

# Load model
# Force CPU device options (to match tensors in this tutorial)
checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
encoder_sd = checkpoint['en']
decoder_sd = checkpoint['de']
encoder_optimizer_sd = checkpoint['en_opt']
decoder_optimizer_sd = checkpoint['de_opt']
embedding_sd = checkpoint['embedding']
voc = Voc(corpus_name)
voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
# Load trained model params
encoder.load_state_dict(encoder_sd)
decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()
print('Models built and ready to go!')

### Compile the whole greedy search model to TorchScript model
# Create artificial inputs
test_seq = torch.LongTensor(MAX_LENGTH, 1).random_(0, voc.num_words).to(device)
test_seq_length = torch.LongTensor([test_seq.size()[0]]).to(device)
# Trace the model
traced_encoder = torch.jit.trace(encoder, (test_seq, test_seq_length))

### Convert decoder model
# Create and generate artificial inputs
test_encoder_outputs, test_encoder_hidden = traced_encoder(test_seq, test_seq_length)
test_decoder_hidden = test_encoder_hidden[:decoder.n_layers]
test_decoder_input = torch.LongTensor(1, 1).random_(0, voc.num_words)
# Trace the model
traced_decoder = torch.jit.trace(decoder, (test_decoder_input, test_decoder_hidden, test_encoder_outputs))

### Initialize searcher module by wrapping ``torch.jit.script`` call
scripted_searcher = torch.jit.script(GreedySearchDecoder(traced_encoder, traced_decoder, decoder.n_layers))

# Use appropriate device
scripted_searcher.to(device)
# Set dropout layers to eval mode
scripted_searcher.eval()

# Evaluate examples
sentences = ["boss i need a rise", "what's up?", "who are you?", 
             "where am I?", "where are you from?", "okay give me the keys .", 
             "hello what are doing today ?", "i just got done watching a horror movie",
             "hi ! how are you doing tonight ?", "i work in a homeless shelter in my town .",
             "i love the crowds , getting to know people .", "that nobel prize will just have to wait .",
             "you got a name ?","i love spending time with my family", "so what do you do now for fun ? i like to read .",
             "wanna kiss me jim ?", "i've a dream , it is to work from home", "i make time stop . i've a superpower.", 
             "i don't like blood. i faint when i see blood.", "hi ! do you like turtles ?"]
for s in sentences:
    evaluateExample(s, scripted_searcher, voc)

Building encoder and decoder ...
Models built and ready to go!


  if a.grad is not None:


> boss i need a rise
Bot: you re a good loser .
> what's up?
Bot: i m going to get my car .
> who are you?
Bot: i m the owner .
> where am I?
Bot: in the house .
> where are you from?
Bot: south america .
> okay give me the keys .
Bot: what is it ?
> hello what are doing today ?
Bot: i m thirsty .
> i just got done watching a horror movie
Bot: you re not supposed to .
> hi ! how are you doing tonight ?
Bot: i m fine .
> i work in a homeless shelter in my town .
Bot: i m sorry .
> i love the crowds , getting to know people .
Bot: i know .
> that nobel prize will just have to wait .
Bot: i ll be there for you .
> you got a name ?
Bot: i m a psychiatrist .
> i love spending time with my family
Bot: i ll get it for you .
> so what do you do now for fun ? i like to read .
Bot: you want to know what you want ?
> wanna kiss me jim ?
Bot: yes .
> i've a dream , it is to work from home
Bot: i m not sure .
> i make time stop . i've a superpower.
Bot: you re not supposed to .
> i don't like blood