In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math


# True when PyTorch reports a usable CUDA device.
USE_CUDA = torch.cuda.is_available()
# Device handle used by later cells when moving tensors and modules.
device = torch.device("cuda" if USE_CUDA else "cpu")

In [2]:
# Echo the selected device as this cell's output.
device

device(type='cuda')

In [3]:
# Label for this dataset; handed to loadPrepareData together with the data file.
corpus_name = "question-comment"
# Tab-separated text file with one question/comment pair per line.
# NOTE(review): absolute, machine-specific path — consider making the data location configurable.
datafile = "/home/zhipeng/CB/best-answer-superuser/question2comment/question2comments"

In [4]:
def printLines(file, n=10):
    """Print the first ``n`` raw (bytes) lines of ``file``.

    The file is opened in binary mode, so each line is printed as a ``bytes``
    repr including its trailing newline.

    Args:
        file: path of the file to preview.
        n: number of leading lines to print. ``None`` prints every line; a
            negative value keeps list-slice semantics (all but the last ``-n``).
    """
    with open(file, 'rb') as handle:
        if n is not None and n < 0:
            # Slicing from the end needs the full list of lines.
            head = handle.readlines()[:n]
        else:
            # Stream only the lines we need instead of loading the whole file.
            head = list(itertools.islice(handle, n))
    for line in head:
        print(line)

In [5]:
# Preview the first raw lines of the data file (default n=10).
printLines(datafile)

b"ca n't connect to internet in safe mode\twhat error do you receive while executing `ping 8.8.8.8` ?\n"
b"ca n't connect to internet in safe mode\tdo you have access to another computer with a dvd burner ?\n"
b"ca n't connect to internet in safe mode\tat what point it looped ? maybe while loading a specific driver ?\n"
b"ca n't connect to internet in safe mode\tcan you disable the ethernet card in the bios to see if it is causing the crash ?\n"
b"ca n't connect to internet in safe mode\tdoes it work if you try a wired connection ?\n"
b"ca n't connect to internet in safe mode\twhat 's the make and model of the pc ? can you run memtest86+ from a cd/usb ?\n"
b"ca n't connect to internet in safe mode\twhat else did you change in `msconfig` ?\n"
b"ca n't connect to internet in safe mode\twill it activate in safe mode with networking enabled ?\n"
b"ca n't connect to internet in safe mode\tout of curiosity what exactly is on the other end of the cable ?\n"
b"ca n't connect to internet in saf

In [6]:
# Default word tokens (reserved indexes 0-2 in every vocabulary)
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    """Vocabulary mapping words to integer indexes and back.

    Indexes 0-2 are reserved for the PAD/SOS/EOS tokens; real words are
    numbered from 3 in first-seen order.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        """Register every token of ``sentence``, splitting on single spaces."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Add ``word`` to the vocabulary, or bump its count if already present."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        """Drop words seen fewer than ``min_count`` times and re-index the rest.

        Only the first call has any effect. Surviving words are re-added via
        ``addWord``, so their indexes are reassigned from 3 and their counts
        restart at 1.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        total_words = len(self.word2index)
        # Guard the ratio: an empty vocabulary used to raise ZeroDivisionError here.
        kept_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), total_words, kept_ratio
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [7]:
# Strip combining accent marks from a Unicode string, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` NFD-decomposed with all 'Mn' (non-spacing mark) characters removed."""
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

# Lowercase, strip accents, space out . ! ? and drop everything that is not a letter or . ! ?
def normalizeString(s):
    """Normalize raw text into space-separated lowercase tokens.

    Punctuation ``. ! ?`` becomes its own token; every other non-letter run
    (digits, quotes, symbols, whitespace) collapses to a single space.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),         # put a space before sentence punctuation
        (r"[^a-zA-Z.!?]+", r" "),     # replace runs of disallowed characters
        (r"\s+", r" "),               # collapse whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [8]:
# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    """Load tab-separated lines from ``datafile`` and normalize every field.

    Args:
        datafile: path to a UTF-8 text file; each line is split on tab characters.
        corpus_name: name given to the new, still empty ``Voc``.

    Returns:
        ``(voc, pairs)`` where ``pairs`` holds one list of normalized fields per
        line. Lines are not validated here, so a line without a tab yields a
        one-element list.
    """
    print("Reading lines...")
    # Use a context manager so the file handle is closed deterministically.
    with open(datafile, encoding='utf-8') as handle:
        lines = handle.read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# True only when both sentences of pair 'p' have fewer than MAX_LENGTH tokens
def filterPair(p):
    """Check the token counts of ``p[0]`` and ``p[1]`` against ``MAX_LENGTH``.

    The comparison is strict, which leaves one position free for the EOS token.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

# Keep only the pairs accepted by filterPair
def filterPairs(pairs):
    """Return a new list with the pairs for which ``filterPair`` is truthy, in order."""
    accepted = []
    for candidate in pairs:
        if filterPair(candidate):
            accepted.append(candidate)
    return accepted

In [9]:
# Build the vocabulary and the length-filtered pair list from a data file
def loadPrepareData(corpus_name, datafile):
    """Read ``datafile``, filter pairs by length and count their words.

    Returns:
        ``(voc, pairs)``: the populated ``Voc`` and the pairs that passed
        ``filterPairs``. Progress is reported with ``print``.
    """
    print("Start preparing training data ...")
    voc, all_pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(all_pairs)))
    short_pairs = filterPairs(all_pairs)
    print("Trimmed to {!s} sentence pairs".format(len(short_pairs)))
    print("Counting words...")
    for entry in short_pairs:
        # Only the first two fields (query, response) feed the vocabulary.
        query, response = entry[0], entry[1]
        voc.addSentence(query)
        voc.addSentence(response)
    print("Counted words:", voc.num_words)
    return voc, short_pairs

In [10]:
def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from ``voc`` and drop every pair that uses one.

    Args:
        voc: vocabulary exposing ``trim(min_count)`` and ``word2index``.
        pairs: sequence of ``[input_sentence, output_sentence]`` entries.
        MIN_COUNT: count threshold forwarded to ``voc.trim``.

    Returns:
        A new list with the pairs whose input and output sentences contain only
        words still present in ``voc.word2index``.
    """
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    known_words = voc.word2index

    def _all_known(sentence):
        # A sentence survives only if every space-separated token is still indexed.
        return all(word in known_words for word in sentence.split(' '))

    # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
    keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]

    total = len(pairs)
    # Guard the ratio: an empty pair list used to raise ZeroDivisionError here.
    kept_ratio = len(keep_pairs) / total if total else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(total, len(keep_pairs), kept_ratio))
    return keep_pairs


In [11]:
# filterPair compares token counts with a strict '<', so sentences of at most
# MAX_LENGTH - 1 tokens are kept.
MAX_LENGTH = 16  # Maximum sentence length to consider
# Build the vocabulary and the length-filtered pairs from the data file.
voc, pairs = loadPrepareData(corpus_name, datafile)

MIN_COUNT = 1    # Minimum word count threshold for trimming
# Trim voc and pairs (with MIN_COUNT = 1 every counted word meets the threshold)
pairs = trimRareWords(voc, pairs, MIN_COUNT)

# Disabled experiment: swap the question and comment of every pair.
# reverse pairs
# pairs = [list(reversed(p)) for p in pairs]

Start preparing training data ...
Reading lines...
Read 385670 sentence pairs
Trimmed to 282753 sentence pairs
Counting words...
Counted words: 16147
keep_words 16144 / 16144 = 1.0000
Trimmed from 282753 pairs to 282753, 1.0000 of total


In [12]:
# Peek at a single normalized pair: the loop exits right after the first item.
for e in pairs:
  print(e)
  break

['ca n t connect to internet in safe mode', 'what error do you receive while executing ping . . . ?']


In [13]:
def indexesFromSentence(voc, sentence):
    """Map each space-separated word of ``sentence`` to its index and append EOS.

    Raises:
        KeyError: if a word is missing from ``voc.word2index``.
    """
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(voc.word2index[token])
    token_ids.append(EOS_token)
    return token_ids


def zeroPadding(l, fillvalue=PAD_token):
    """Transpose a batch of index lists, padding short ones with ``fillvalue``.

    Returns a list of tuples: element ``t`` holds the ``t``-th token of every
    sequence in ``l``.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return [column for column in columns]

def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same nested shape as ``l``.

    Args:
        l: iterable of token sequences.
        value: token treated as padding. It was previously ignored in favour of
            a hard-coded ``PAD_token``; the default keeps that behaviour.

    Returns:
        A list of lists holding 0 where the token equals ``value`` and 1 elsewhere.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

# Build the padded input tensor together with the per-sentence lengths
def inputVar(l, voc):
    """Convert input sentences into a padded index tensor.

    Returns:
        ``(padVar, lengths)``: ``padVar`` is a ``LongTensor`` built from the
        transposed, PAD-filled index lists; ``lengths`` holds each sentence's
        index count including its EOS token.
    """
    token_ids = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor(list(map(len, token_ids)))
    padVar = torch.LongTensor(zeroPadding(token_ids))
    return padVar, lengths

# Build the padded target tensor, its padding mask and the longest target length
def outputVar(l, voc):
    """Convert target sentences into a padded index tensor plus mask.

    Returns:
        ``(padVar, mask, max_target_len)``: the padded ``LongTensor``, a
        ``ByteTensor`` that is 0 at PAD positions and 1 elsewhere, and the
        length of the longest indexed sentence (EOS included).
    """
    token_ids = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(map(len, token_ids))
    padded = zeroPadding(token_ids)
    mask = torch.ByteTensor(binaryMatrix(padded))
    padVar = torch.LongTensor(padded)
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    """Turn a batch of sentence pairs into the tensors used for training.

    Pairs are ordered by input length (longest first) before conversion. The
    caller's ``pair_batch`` is left untouched; previously it was sorted in place.

    Returns:
        ``(inp, lengths, output, mask, max_target_len)`` as produced by
        ``inputVar`` and ``outputVar``.
    """
    ordered = sorted(pair_batch, key=lambda pair: len(pair[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

In [14]:
# Example for validation: build one small random batch and inspect its tensors.
small_batch_size = 5
# Pairs are drawn with random.choice, so the same pair can appear more than once.
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[  187,  5050,  1178,    77,  1095],
        [ 1336,   385,   873,   111,   998],
        [  132,   598,  2131,    95,  1022],
        [   56,  4332,     7,   190,  1796],
        [  934,    57,   735, 13261,   323],
        [   58,    75,   892,    30,  1914],
        [  432,   250,     9,  1502,     2],
        [   25,   111,   420,  1331,     0],
        [  803,    57,     2,     2,     0],
        [  170,    76,     0,     0,     0],
        [  364,     2,     0,     0,     0],
        [ 1036,     0,     0,     0,     0],
        [    2,     0,     0,     0,     0]])
lengths: tensor([13, 11,  9,  9,  7])
target_variable: tensor([[  63,   45,   14,  189,  189],
        [ 109,   15,   15,  147,  998],
        [  26, 1441,  115,  148,  122],
        [ 163,  131,   22,   21,   62],
        [ 126, 1181,   58,  190,   15],
        [  58,   21,  236,  102,  110],
        [ 707,   14, 3156,  190,   21],
        [  57,   15,  705,   21,    2],
        [  58,   22,   

# Define Models - Seq2Seq

In [15]:
class EncoderRNN(nn.Module):
    """Bidirectional multi-layer GRU encoder over embedded token indexes.

    The outputs of the two directions are summed, so the returned per-step
    features have ``hidden_size`` channels.
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        """
        Args:
            hidden_size: GRU hidden width; also used as the GRU input width, so
                ``embedding`` must emit vectors of this size.
            embedding: module mapping token indexes to vectors.
            n_layers: number of stacked GRU layers.
            dropout: inter-layer GRU dropout; forced to 0 when ``n_layers == 1``.
        """
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: padded index tensor; packing uses the default
                ``batch_first=False``, i.e. a time-major layout.
            input_lengths: true length of every sequence, sorted in decreasing
                order (a tensor on any device, or a list of ints).
            hidden: optional initial GRU hidden state.

        Returns:
            ``(outputs, hidden)``: summed bidirectional outputs and the GRU's
            final hidden state.
        """
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # pack_padded_sequence requires tensor lengths to live on the CPU, while
        # callers in this notebook move them to the training device first.
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()
        # Pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

In [16]:
# Luong attention layer
class Attn(nn.Module):
    """Score encoder outputs against a decoder state and softmax-normalize them.

    Supported scoring methods are ``'dot'``, ``'general'`` and ``'concat'``.
    """

    def __init__(self, method, hidden_size):
        """
        Args:
            method: one of ``'dot'``, ``'general'``, ``'concat'``.
            hidden_size: feature width of the decoder state and encoder outputs.

        Raises:
            ValueError: if ``method`` is not a supported scoring method.
        """
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) allocates uninitialized memory, so the parameter
            # could start as arbitrary values (including NaN/inf). Initialize it
            # explicitly with a small uniform range instead.
            self.v = nn.Parameter(torch.empty(hidden_size))
            bound = 1.0 / math.sqrt(hidden_size)
            nn.init.uniform_(self.v, -bound, bound)

    def dot_score(self, hidden, encoder_output):
        """Elementwise product of state and encoder outputs, summed over features."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Dot score computed against linearly projected encoder outputs."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score from tanh(Linear([state; encoder_output])) weighted by ``v``."""
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights with a singleton middle dimension.

        The scores are transposed, softmax-normalized over dim 1 and then
        unsqueezed at dim 1.
        """
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [17]:
class DecoderRNN(nn.Module):
    """Plain (attention-free) GRU decoder that emits one token distribution per step."""

    def __init__(self, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        """
        Args:
            embedding: module mapping token indexes to ``hidden_size`` vectors.
            hidden_size: GRU input and hidden width.
            output_size: number of output classes (vocabulary size).
            n_layers: number of stacked GRU layers.
            dropout: embedding dropout; also the inter-layer GRU dropout when
                ``n_layers`` is not 1.
        """
        super().__init__()
        # Hyper-parameters kept for reference
        self.hidden_size, self.output_size = hidden_size, output_size
        self.n_layers, self.dropout = n_layers, dropout

        # Layers (``concat`` is created but not used by ``forward``)
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Run a single decoding step.

        ``encoder_outputs`` is accepted for interface parity with the attention
        decoder but is not used here.

        Returns:
            ``(output, hidden)``: softmax probabilities over the vocabulary and
            the updated GRU hidden state.
        """
        # Embed the current input word and apply dropout
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        # One step through the unidirectional GRU
        gru_out, hidden = self.gru(step_embedding, last_hidden)
        # Drop the leading step dimension, then project to vocabulary scores
        scores = self.out(gru_out.squeeze(0))
        return F.softmax(scores, dim=1), hidden

In [18]:
class LuongAttnDecoderRNN(nn.Module):
    """GRU decoder with Luong-style attention over the encoder outputs."""

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        """
        Args:
            attn_model: scoring method name forwarded to ``Attn``.
            embedding: module mapping token indexes to ``hidden_size`` vectors.
            hidden_size: GRU input and hidden width.
            output_size: number of output classes (vocabulary size).
            n_layers: number of stacked GRU layers.
            dropout: embedding dropout; also the inter-layer GRU dropout when
                ``n_layers`` is not 1.
        """
        super().__init__()

        # Hyper-parameters kept for reference
        self.attn_model = attn_model
        self.hidden_size, self.output_size = hidden_size, output_size
        self.n_layers, self.dropout = n_layers, dropout

        # Layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Run a single attention-augmented decoding step.

        Returns:
            ``(output, hidden)``: softmax probabilities over the vocabulary and
            the updated GRU hidden state.
        """
        # Embed the current input word and apply dropout
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        # One step through the unidirectional GRU
        gru_out, hidden = self.gru(step_embedding, last_hidden)
        # Attention weights from the fresh GRU output, then the weighted-sum context
        weights = self.attn(gru_out, encoder_outputs)
        context = weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)
        # Luong eq. 5: combine GRU output and context through a tanh layer
        combined = torch.cat((gru_out.squeeze(0), context), 1)
        attentional = torch.tanh(self.concat(combined))
        # Luong eq. 6: project to the vocabulary and normalize
        return F.softmax(self.out(attentional), dim=1), hidden

In [19]:
# Define Loss
def maskNLLLoss(inp, target, mask):
    """Mean negative log-likelihood over the rows selected by ``mask``.

    Args:
        inp: 2-D tensor of per-row probabilities; the negative log of the entry
            at column ``target[i]`` is taken for each row, so these must be
            probabilities rather than logits.
        target: index tensor with one target column per row of ``inp``.
        mask: per-row selector (bool or 0/1 uint8); only selected rows
            contribute to the mean.

    Returns:
        ``(loss, n)``: the mean loss tensor moved to the module-level ``device``
        and the sum of ``mask`` as a Python number.
    """
    nTotal = mask.sum()
    # Newer PyTorch releases expect a boolean mask in masked_select (uint8 masks
    # are deprecated/rejected). Convert when this build has a bool dtype.
    bool_dtype = getattr(torch, "bool", None)
    if bool_dtype is not None and mask.dtype != bool_dtype:
        mask = mask.to(bool_dtype)
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [20]:
# Visualise what's happening in the loss function on a toy batch (5 rows, 7 classes).
# decoder_out shape:(batch_size, vocab_size) target_size = (batch_size, 1)
dec_o = torch.rand(5, 7)
# Normalize each row into a probability distribution.
dec_o = F.softmax(dec_o, dim=1)
tar = torch.tensor([2, 1, 5, 4, 0], dtype=torch.long)
# Column vector so it can be used as the index argument of gather.
tar = tar.view(-1, 1)
# NOTE(review): uint8 mask — newer PyTorch releases expect a bool mask in
# masked_select; confirm against the installed version.
mask = torch.tensor([1, 0, 1, 1, 0], dtype=torch.uint8)
print(dec_o)
print(tar)
# Get softmax scores for the expected correct predictions
gath_ten = torch.gather(dec_o, 1, tar)
print(gath_ten)
print(gath_ten.shape)
# Negative log of the probability assigned to each target class.
crossEntropy = -torch.log(gath_ten)
print("Cross Entropy:")
print(crossEntropy)
# Match the (5, 1) shape of crossEntropy before selecting.
mask = mask.unsqueeze(1)
print(mask)
# Keep only the rows whose mask entry is 1.
loss = crossEntropy.masked_select(mask)
print("Loss:")
print(loss)
print(loss.shape)
print("Sum of mask elements (How many elements we are considering):", mask.sum())
print("Mean of the Loss:", loss.mean())
print("Mean of the corss-entropy loss(without masking)", crossEntropy.mean())

tensor([[0.1778, 0.1341, 0.1305, 0.1156, 0.1439, 0.1422, 0.1560],
        [0.1526, 0.0998, 0.1065, 0.1324, 0.1555, 0.1828, 0.1704],
        [0.1676, 0.0996, 0.2507, 0.1525, 0.1012, 0.1075, 0.1208],
        [0.1493, 0.1134, 0.1864, 0.1464, 0.0971, 0.1781, 0.1294],
        [0.1317, 0.1033, 0.1191, 0.1263, 0.1900, 0.2193, 0.1101]])
tensor([[2],
        [1],
        [5],
        [4],
        [0]])
tensor([[0.1305],
        [0.0998],
        [0.1075],
        [0.0971],
        [0.1317]])
torch.Size([5, 1])
Cross Entropy:
tensor([[2.0367],
        [2.3041],
        [2.2298],
        [2.3322],
        [2.0271]])
tensor([[1],
        [0],
        [1],
        [1],
        [0]], dtype=torch.uint8)
Loss:
tensor([2.0367, 2.2298, 2.3322])
torch.Size([3])
Sum of mask elements (How many elements we are considering): tensor(3)
Mean of the Loss: tensor(2.1996)
Mean of the corss-entropy loss(without masking) tensor(2.1860)


In [21]:
# Visualizing what's happening in one iteration, only run this for visualization.
# This rebinds input_variable, lengths, target_variable, mask and max_target_len,
# which the following cells reuse.

small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches
print("input_variable:")
print(input_variable, input_variable.shape)
print("lengths:")
print(lengths, lengths.shape)
print("target_variable:")
print(target_variable, target_variable.shape)
print("mask:") 
print(mask, mask.shape)
print("max_target_len:", max_target_len)

input_variable:
tensor([[  676,   824,   746,  6782,  6407],
        [    7,   138,    58, 11411,   372],
        [11808,   374,   618,   400,    57],
        [   27,   824,    53,  1777,   245],
        [  484,  1265,    27,  1229,    99],
        [   73,   250,   902,  5141,    73],
        [   80,    77,  1082,  1496,     2],
        [   58,   111,   231,     2,     0],
        [   79,   107,   776,     0,     0],
        [ 4768,   824,   902,     0,     0],
        [  794, 15640,     2,     0,     0],
        [    2,     2,     0,     0,     0]]) torch.Size([12, 5])
lengths:
tensor([12, 12, 11,  8,  7]) torch.Size([5])
target_variable:
tensor([[  12,   12,   77,   94,  380],
        [  56,  457,   15,  893,   94],
        [  58,   62,  728,  656,   68],
        [  13,   15,  231, 4540,   58],
        [1054,  110,  224,   62,  237],
        [  21,   21,  218,  480,   56],
        [   2,    2,   95,    9,  196],
        [   0,    0,   21,   58,  197],
        [   0,    0,    2, 1420,

In [22]:
# Define the parameters for the step-by-step walkthrough
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
# One embedding table, handed to both the encoder and the decoder below.
embedding = nn.Embedding(voc.num_words, hidden_size)

# Define the Encoder and Decoder (the plain DecoderRNN, not the attention decoder)
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = DecoderRNN(embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
# NOTE(review): both modules hold the same embedding, so its weights appear in
# both optimizers' parameter lists — confirm this is intended.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0001)
# Zero gradients
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

# Set device options (tensors come from the batch built in the previous cell)
input_variable = input_variable.to(device)
lengths = lengths.to(device)
target_variable = target_variable.to(device)
mask = mask.to(device)

# Accumulators mirroring the ones used inside train()
loss = 0
print_losses = []
n_totals = 0

Building optimizers ...


In [23]:
# Run the encoder once on the visualisation batch.
# encoder_outputs: (max_length, batch_size, hidden_size)
# encoder_hidden: (n_layers x num_directions, batch_size, hidden_size)
encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
print("Input varialbe:", input_variable, input_variable.shape)
print("Encoder Outputs Shape:", encoder_outputs, encoder_outputs.shape)
print("Last Encoder Hidden Shape:", encoder_hidden.shape)

# First decoder input: one SOS token per batch element, shaped (1, batch).
decoder_input = torch.LongTensor([[SOS_token for _ in range(small_batch_size)]])
decoder_input = decoder_input.to(device)
print("Initial Decoder Input:")
print(decoder_input, decoder_input.shape)

# Set initial decoder hidden state to the encoder's final hidden state
# NOTE(review): this keeps only the first decoder.n_layers slices of the
# bidirectional encoder's stacked hidden state — confirm which layers/directions
# those slices correspond to.
decoder_hidden = encoder_hidden[:decoder.n_layers] # encoder_hidden[:2,:,:].shape
print("Initial Decoder hidden state shape:", decoder_hidden.shape)

Input varialbe: tensor([[  676,   824,   746,  6782,  6407],
        [    7,   138,    58, 11411,   372],
        [11808,   374,   618,   400,    57],
        [   27,   824,    53,  1777,   245],
        [  484,  1265,    27,  1229,    99],
        [   73,   250,   902,  5141,    73],
        [   80,    77,  1082,  1496,     2],
        [   58,   111,   231,     2,     0],
        [   79,   107,   776,     0,     0],
        [ 4768,   824,   902,     0,     0],
        [  794, 15640,     2,     0,     0],
        [    2,     2,     0,     0,     0]], device='cuda:0') torch.Size([12, 5])
Encoder Outputs Shape: tensor([[[-0.0731,  0.1340,  0.0549,  ..., -0.1466, -0.4015,  0.1331],
         [-0.2705, -0.3039,  0.1299,  ..., -0.3505,  0.2420, -0.2821],
         [-0.2417, -0.3604, -0.0772,  ..., -0.1259,  0.1903, -0.0656],
         [ 0.3065,  0.1362,  0.0280,  ...,  0.0671, -0.1792, -0.1618],
         [ 0.0448, -0.0348, -0.3049,  ..., -0.1125,  0.1159,  0.1025]],

        [[-0.0839,  0.1512

In [24]:
# Step-by-step walkthrough of the decoding loop, printing every intermediate value.
print("Now lets whats happening in every timestep of the GRU")
# Assume we are using Teacher Forcing
for t in range(max_target_len):
  decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
  print("Decoder Output Shape:", decoder_output.shape)
  print("Decoder Hidden Shape:", decoder_hidden.shape)
  # Teacher forcing : next input is current target
  decoder_input = target_variable[t].view(1, -1)
  print("Target variable at the current timestep before reshaping:", target_variable[t], target_variable[t].shape)
  print("Decoder input:", decoder_input, decoder_input.shape)
  # Calculate and accumulate loss
  print("The mask at the current timestep:", mask[t], mask[t].shape)
  
  mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
  print("Mask Loss:", mask_loss)
  print("Total:", nTotal)
  loss += mask_loss
  # Weight the mean loss by the number of unmasked targets at this step.
  print_losses.append(mask_loss.item() * nTotal)
  print(print_losses)
  
  n_totals += nTotal
  print(n_totals)
  # NOTE(review): no loss.backward() is called in this cell, so these step()
  # calls appear to have no gradients to apply — confirm this is only illustrative.
  encoder_optimizer.step()
  decoder_optimizer.step()
  # Running average loss per unmasked target so far.
  returned_loss = sum(print_losses) / n_totals
  print("Returned Loss:", returned_loss)
  print("\n")
  print("-------------------------------DONE ONE TIMESTEP-------------------------------")
  print("\n")
  # break

Now lets whats happening in every timestep of the GRU
Decoder Output Shape: torch.Size([5, 16147])
Decoder Hidden Shape: torch.Size([2, 5, 500])
Target variable at the current timestep before reshaping: tensor([ 12,  12,  77,  94, 380], device='cuda:0') torch.Size([5])
Decoder input: tensor([[ 12,  12,  77,  94, 380]], device='cuda:0') torch.Size([1, 5])
The mask at the current timestep: tensor([1, 1, 1, 1, 1], device='cuda:0', dtype=torch.uint8) torch.Size([5])
Mask Loss: tensor(9.6538, device='cuda:0', grad_fn=<MeanBackward0>)
Total: 5
[48.26900005340576]
5
Returned Loss: 9.653800010681152


-------------------------------DONE ONE TIMESTEP-------------------------------


Decoder Output Shape: torch.Size([5, 16147])
Decoder Hidden Shape: torch.Size([2, 5, 500])
Target variable at the current timestep before reshaping: tensor([ 56, 457,  15, 893,  94], device='cuda:0') torch.Size([5])
Decoder input: tensor([[ 56, 457,  15, 893,  94]], device='cuda:0') torch.Size([1, 5])
The mask at th

In [25]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one optimisation step on a single batch and return its average loss.

    Reads the module-level ``device`` and ``teacher_forcing_ratio``; both must
    be defined before this function is called. ``embedding`` and ``max_length``
    are accepted for interface compatibility but are not used here.

    Args:
        input_variable: padded input index tensor.
        lengths: true length of each input sequence.
        target_variable: padded target index tensor, indexed by time step.
        mask: per-step selector of the non-padding targets.
        max_target_len: number of decoding steps to run.
        encoder, decoder: the models being trained.
        encoder_optimizer, decoder_optimizer: their optimizers.
        batch_size: number of sequences in the batch.
        clip: maximum gradient norm, applied to each model separately.

    Returns:
        The loss averaged over all unmasked target tokens of the batch.
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths are consumed by pack_padded_sequence, which requires a CPU tensor,
    # so they stay on the CPU instead of following the other tensors to `device`.
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Forward batch of sequences through decoder one time step at a time
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
        # Calculate and accumulate loss
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        # Weight each step's mean loss by its number of unmasked targets
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [26]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):
    """Train for ``n_iteration`` batches, printing progress and saving checkpoints.

    Besides its parameters, this function reads two module-level names:
    ``checkpoint`` (only when ``loadFilename`` is truthy, to resume the
    iteration counter) and ``hidden_size`` (for the checkpoint directory name).
    Both must be defined by the caller's environment.

    Args:
        model_name, corpus_name, save_dir: components of the checkpoint path.
        voc, pairs: vocabulary and sentence pairs used to sample batches.
        encoder, decoder, embedding: modules whose state is trained and saved.
        encoder_optimizer, decoder_optimizer: optimizers stepped by ``train``.
        encoder_n_layers, decoder_n_layers: used in the checkpoint directory name.
        n_iteration: total number of training iterations.
        batch_size: pairs sampled (with replacement) per iteration.
        print_every, save_every: reporting and checkpoint intervals in iterations.
        clip: gradient-norm limit forwarded to ``train``.
        loadFilename: truthy when resuming from a loaded checkpoint.
    """

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            # exist_ok avoids the check-then-create race of a separate os.path.exists test
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [27]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding wrapper: always feeds back the highest-scoring token.

    Decoding handles one sequence at a time (the initial decoder input is a
    single SOS token) and always runs for ``max_length`` steps.
    """

    def __init__(self, encoder, decoder):
        """Store the encoder and decoder used for decoding."""
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode ``max_length`` tokens for ``input_seq``.

        Returns:
            ``(all_tokens, all_scores)``: the chosen token indexes and the
            decoder score of each chosen token, in decoding order.
        """
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder.
        # Use this instance's decoder rather than a module-level `decoder` variable,
        # which may be undefined or refer to a different model.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [28]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode one sentence into a list of output words.

    Args:
        encoder: not used in this function; kept so existing callers keep working.
        decoder: not used in this function; kept so existing callers keep working.
        searcher: callable invoked as ``searcher(input_batch, lengths, max_length)``
            that returns ``(tokens, scores)``.
        voc: vocabulary object; passed to ``indexesFromSentence`` and read via
            ``voc.index2word``.
        sentence: input sentence (callers in this notebook normalize it first).
        max_length: number of decoding steps forwarded to ``searcher``.

    Returns:
        List of decoded words, one per token returned by ``searcher``.

    Note:
        ``evaluateInput`` treats a ``KeyError`` escaping from this call as an
        unknown input word.
    """
    ### Format input sentence as a batch
    # words -> indexes (wrapped in a list to form a batch of size one)
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    # NOTE(review): if the encoder packs sequences with pack_padded_sequence,
    # recent PyTorch releases require `lengths` to stay on the CPU -- confirm
    # against the encoder before upgrading PyTorch.
    lengths = lengths.to(device)
    # Decode sentence with searcher. This is inference only, so skip autograd
    # bookkeeping: the decoded tokens are identical, but no graph is retained.
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    """Run an interactive prompt that prints the model's reply to each line.

    Reads lines with ``input('> ')`` until the user types ``q`` or ``quit``.
    Each line is normalized, evaluated, stripped of ``EOS``/``PAD`` markers
    and printed. A ``KeyError`` raised while handling a line is reported as an
    unknown word and the prompt continues.
    """
    hidden_markers = ('EOS', 'PAD')
    while True:
        try:
            # Read the next user line and stop on either quit command
            input_sentence = input('> ')
            if input_sentence in ('q', 'quit'):
                break
            # Normalize, then let the model answer
            normalized = normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc, normalized)
            # Drop the special markers before printing the reply
            reply = ' '.join(word for word in output_words if word not in hidden_markers)
            print('Bot:', reply)

        except KeyError:
            print("Error: Encountered unknown word.")

In [29]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
# NOTE: the path below needs `save_dir`, which this notebook only assigns in
# the training cell further down -- define it before re-enabling these lines.
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # map_location=device remaps every stored tensor onto the active device,
    # so a checkpoint written on a GPU machine also loads on a CPU-only one
    # without editing this cell.
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Restore the vocabulary that was saved alongside the weights
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings (one embedding module is handed to both models)
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [None]:
# Root directory handed to trainIters as the checkpoint location
# save_dir = "./gdrive/My Drive/best-answer/Query-boosting/question-boosting/"
save_dir = "/home/zhipeng/CB/best-answer-superuser/question2comment/"

# Optimizer settings
learning_rate = 0.0001
decoder_learning_ratio = 5.0  # decoder learning rate = learning_rate * this factor

# Training-loop settings; all but teacher_forcing_ratio are forwarded to trainIters
clip = 50.0
n_iteration = 100000
print_every = 500
save_every = 10000
# Not passed to trainIters below; presumably read as a global by the
# training step -- confirm against its definition.
teacher_forcing_ratio = 1.0

# Put both networks into train mode so dropout is active
encoder.train()
decoder.train()

# Build one Adam optimizer per network; the decoder uses a scaled learning rate
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(
    decoder.parameters(),
    lr=learning_rate * decoder_learning_ratio,
)
# When resuming, restore the optimizer states saved in the checkpoint
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Kick off the training loop
print("Starting Training!")
trainIters(
    model_name, voc, pairs,
    encoder, decoder,
    encoder_optimizer, decoder_optimizer,
    embedding,
    encoder_n_layers, decoder_n_layers,
    save_dir, n_iteration, batch_size,
    print_every, save_every, clip,
    corpus_name, loadFilename,
)

Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 500; Percent complete: 0.5%; Average loss: 4.6066
Iteration: 1000; Percent complete: 1.0%; Average loss: 3.7589
Iteration: 1500; Percent complete: 1.5%; Average loss: 3.3553
Iteration: 2000; Percent complete: 2.0%; Average loss: 3.0302
Iteration: 2500; Percent complete: 2.5%; Average loss: 2.7734
Iteration: 3000; Percent complete: 3.0%; Average loss: 2.5645
Iteration: 3500; Percent complete: 3.5%; Average loss: 2.3948
Iteration: 4000; Percent complete: 4.0%; Average loss: 2.2362
Iteration: 4500; Percent complete: 4.5%; Average loss: 2.1164
Iteration: 5000; Percent complete: 5.0%; Average loss: 2.0051
Iteration: 5500; Percent complete: 5.5%; Average loss: 1.9057
Iteration: 6000; Percent complete: 6.0%; Average loss: 1.8311
Iteration: 6500; Percent complete: 6.5%; Average loss: 1.7508
Iteration: 7000; Percent complete: 7.0%; Average loss: 1.6991
Iteration: 7500; Percent complete: 7.5%; Average loss: 1.6237

Iteration: 64500; Percent complete: 64.5%; Average loss: 0.7221
Iteration: 65000; Percent complete: 65.0%; Average loss: 0.7260
Iteration: 65500; Percent complete: 65.5%; Average loss: 0.7246
Iteration: 66000; Percent complete: 66.0%; Average loss: 0.7280
Iteration: 66500; Percent complete: 66.5%; Average loss: 0.7212
Iteration: 67000; Percent complete: 67.0%; Average loss: 0.7271
Iteration: 67500; Percent complete: 67.5%; Average loss: 0.7240
Iteration: 68000; Percent complete: 68.0%; Average loss: 0.7130
Iteration: 68500; Percent complete: 68.5%; Average loss: 0.7175
Iteration: 69000; Percent complete: 69.0%; Average loss: 0.7161
Iteration: 69500; Percent complete: 69.5%; Average loss: 0.7171
Iteration: 70000; Percent complete: 70.0%; Average loss: 0.7172
Iteration: 70500; Percent complete: 70.5%; Average loss: 0.7161
Iteration: 71000; Percent complete: 71.0%; Average loss: 0.7055
Iteration: 71500; Percent complete: 71.5%; Average loss: 0.7097
Iteration: 72000; Percent complete: 72.0

In [0]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoder built on top of an encoder/decoder pair.

    At every step the highest-scoring entry of the decoder output is selected
    and fed back as the next decoder input. Decoding always runs for exactly
    ``max_length`` steps; it does not stop early on an end-of-sentence token.
    """

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Greedily decode ``max_length`` tokens for one input sequence.

        Args:
            input_seq: tensor of token indexes, forwarded unchanged to the
                encoder (assumed shape (seq_len, 1) -- TODO confirm against
                the encoder definition).
            input_length: lengths tensor, forwarded unchanged to the encoder.
            max_length: number of decoding steps to run.

        Returns:
            Tuple ``(all_tokens, all_scores)`` of 1-D tensors holding, for each
            step, the selected token index and the score it was selected with.
        """
        # Allocate on the same device as the input instead of relying on a
        # notebook-level `device` variable.
        target_device = input_seq.device
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Keep the first `n_layers` slices of the encoder hidden state as the
        # decoder's initial hidden state. Use the decoder owned by this module:
        # the previous code read a global `decoder`, which broke (or silently
        # used the wrong model) when no matching global existed.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token, shape (1, 1)
        decoder_input = torch.ones(1, 1, device=target_device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=target_device, dtype=torch.long)
        all_scores = torch.zeros([0], device=target_device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Most likely word token and its score along dim 1
            # (presumably the vocabulary dimension -- confirm against the decoder)
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [0]:
def evaluateInput(encoder, decoder, searcher, voc):
    """Print the model's reply to one randomly chosen question from ``pairs``.

    A pair is drawn from the module-level ``pairs`` list, its first element is
    normalized and evaluated, and the reply is printed without ``EOS``/``PAD``
    markers. If a ``KeyError`` is raised along the way, an error line is
    printed and a new pair is drawn.
    """
    hidden_markers = ('EOS', 'PAD')
    while True:
        try:
            # Draw a (question, target) pair; only the question is used here
            sampled = random.choice(pairs)
            question, _target = sampled[0], sampled[1]
            # Normalize, then let the model answer
            question = normalizeString(question)
            output_words = evaluate(encoder, decoder, searcher, voc, question)
            # Drop the special markers before printing the reply
            reply = ' '.join(word for word in output_words if word not in hidden_markers)
            print('Bot:', reply)
            # One successful reply is all this function produces
            return

        except KeyError:
            print("Error: Encountered unknown word.")

In [0]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` random samples: question, reference reply and model reply.

    Pairs are drawn from the module-level ``pairs`` list. Evaluation uses the
    module-level ``searcher`` and ``voc``, so both must exist before this
    function is called.
    """
    hidden_markers = ('EOS', 'PAD')
    for _ in range(n):
        sampled = random.choice(pairs)
        question, reference = sampled[0], sampled[1]
        # Show the raw question and the reference reply
        print(">", question)
        print("=", reference)
        # Normalize, then let the model answer
        normalized = normalizeString(question)
        output_words = evaluate(encoder, decoder, searcher, voc, normalized)
        # Drop the special markers before printing the reply
        reply = ' '.join(word for word in output_words if word not in hidden_markers)
        print('Bot:', reply)
In [61]:
# Switch both networks to eval mode so dropout is disabled during decoding
encoder.eval()
decoder.eval()

# Wrap the two networks in the greedy search module; evaluateRandomly reads
# this `searcher` as a module-level name
searcher = GreedySearchDecoder(encoder, decoder)

# Print ten randomly drawn question / reference / model-reply samples
evaluateRandomly(encoder, decoder, n=10)

> is there a command to get the location of a file which is selected in nautilus ?
= do you want a new tab or a new window ?
Bot: op are you still looking for an answer ? xterm ? ? ? ? ? ? ? ? ? ? ?
> how do i downgrade google chrome ?
= what version of ubuntu are you on ?
Bot: switching to vmlinuz .old and initrd .img .old did not help why would it have ? ? ? ? ? ? ? ? ?
> how to let ubuntu promt my password on the greeter screen instead of the keyring promt ?
= so you want to disable the dialogue that asks you to input your password to change system critical settings ?
Bot: what desktop environment are you using ? ? maybe ? ? kde ? ? ? ? ? ? ? ? ?
> lubuntu . lts installation shows block message
= did you use the webcam just before changing ? and was it a clear picture ?
Bot: what is cpu model ? of home ? cpu ram ? torrent ? ? ? ? ? ? ? ? ?
> gnome . choppy animations with intel hd graphics
= are any drivers listed in additional drivers ?
Bot: are you running bit ? ? and restoring th