In [3]:
# Requirements
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
use_cuda = torch.cuda.is_available()

In [4]:
# Loading data files
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [5]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [6]:
def readText(src):
    print("Reading lines...")
    lines = []
    
    # Read the file and split into lines
    with open(src, 'r') as f:
        header = f.readline()
        cols = [c.strip() for c in header.split('\t')]
    
        for li, line in enumerate(f):
            cols = [c.strip() for c in line.split('\t')]
            lines.append(cols[1].lower())
            
            if np.mod(li, 100) == 0:
                print('Processed {} lines so far..'.format(li+1))
    
    return lines
    
lines = readText("toy_sentences.tsv")
print(random.choice(lines))

Reading lines...
Processed 1 lines so far..
Processed 101 lines so far..
Processed 201 lines so far..
Processed 301 lines so far..
Processed 401 lines so far..
Processed 501 lines so far..
Processed 601 lines so far..
Processed 701 lines so far..
a first-class road movie that proves you can run away from home , but your ego and all your problems go with you .


In [7]:
class bongEmbed(nn.Module):
    def __init__(self, options):
        super(bongEmbed,self).__init__()

        self.options = options

        self.emb = nn.Embedding(options['n_words']+1, options['n_hid'])

        self.hids = []
        for li in range(options['n_layers']):
            self.hids.append([
                nn.Linear(options['n_hid'], options['n_hid']),
                eval('nn.{}'.format(options['act']))()
                ])
            indim = options['n_hid']
        for i in range(options['n_layers']):
            print(self.hids[i][0])
        self.hid_modules = nn.ModuleList([h[0] for h in self.hids])

        self.classifier = nn.Linear(options['n_hid'], 2)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, s1, s1m):
        print("################## forward ###################")
        s1emb = self.emb(s1)
        print("s1", s1)
        print("s1m", s1m)
        print("s1emb", s1emb)
        s1emb = torch.mul(s1emb, s1m.unsqueeze(2).expand_as(s1emb))
        print("s1emb", s1emb)
        s1emb = torch.sum(s1emb,1).squeeze()
        print("s1emb", s1emb)

        h = s1emb

        for li in range(self.options['n_layers']):
            h = self.hids[li][0](h)
            h = self.hids[li][1](h)

        z = self.classifier(h)

        return z

In [8]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result

In [9]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def variableFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    result = Variable(torch.LongTensor(indexes).view(-1, 1))
    if use_cuda:
        return result.cuda()
    else:
        return result


def variablesFromPair(pair):
    input_variable = variableFromSentence(input_lang, pair[0])
    target_variable = variableFromSentence(output_lang, pair[1])
    return (input_variable, target_variable)

In [10]:
teacher_forcing_ratio = 0.5


def train(input_variable, target_variable, decoder, decoder_optimizer, criterion, max_length=MAX_LENGTH):

    decoder_optimizer.zero_grad()

    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    

    loss = 0

    for i in range(input_length):
        
        encoder_output = bagOfNgram(input_variable[i]) ##

    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    decoder_hidden = decoder.initHidden()

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])
            decoder_input = target_variable[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]

            decoder_input = Variable(torch.LongTensor([[ni]]))
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input

            loss += criterion(decoder_output, target_variable[di])
            if ni == EOS_token:
                break

    loss.backward()

    decoder_optimizer.step()

    return loss.data[0] / target_length

NameError: name 'MAX_LENGTH' is not defined

In [11]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [12]:
def trainIters(decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [variablesFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_variable = training_pair[0]
        target_variable = training_pair[1]

        loss = train(input_variable, target_variable,
                     decoder, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [13]:
from collections import OrderedDict
options = OrderedDict(
            {'n_words': 100000,
             'order': 3,
             'saveto': "result.txt",
             'vocab': "dictionary.pickle",
             'n_layers': 2,
             'n_hid': 100,
             'emb': 64,
             'act': 'Tanh',
             'batch': 64,
             'disp_freq': 10,
             'save_freq': 1000,
             'val_freq': 1000,
             'n_epochs': 100,
             'seed': 123,
            })

emb = nn.Embedding(options["n_words"], options["n_hid"])

In [14]:
class data_iterator:

    def __init__(self, fname, options):
        self.fname = fname
        self.options = options

        self.source = open(fname, 'r')
        self.source.readline() # dump the header

        self.vocab = pkl.load(open(options['vocab'], 'rb'))

        self.end_of_data = False

        self.nlp = spacy.load('en')


    def __iter__(self):
        return self

    def reset(self):
        self.source.seek(0)
        self.source.readline() # dump the header

    def __next__(self):
        if self.end_of_data:
            self.end_of_data = False
            self.reset()
            raise StopIteration

        s1 = []
        labels = []

        try:
            while True:
                line = self.source.readline()
                if line == '':
                    raise IOError

                cols = [c.strip() for c in line.split('\t')]
                l_ = cols[0].lower()
                if l_ == '-':
                    continue

                if len(cols) < 3:
                    labels.append(0)
                else:
                    if cols[2] not in label_map:
                        continue
                    labels.append(label_map[cols[2]])
                s1_ = self.process(cols[1].lower())
                s1.append(s1_)

                if len(s1) > self.options['batch']:
                    break
        except IOError:
            self.end_of_data = True

        if len(s1) <= 0:
            self.end_of_data = False
            self.reset()
            raise StopIteration

        s1, s1m = self.equalizer(s1)

        return s1, s1m, labels

    def equalizer(self, sents):
        max_len = numpy.max([len(s) for s in sents])
        sents_ = []
        masks_ = []
        i = 0 
        for sent in sents:
            s_ = [0] * max_len
            m_ = [1] * len(sent) + [0] * (max_len - len(sent))
            masks_.append(m_)
            s_[:len(sent)] = sent
            sents_.append(s_)
        return sents_, masks_

    def process(self, sent):
        sent = sent.replace('"', '')
        #print(sent)
        uwords = [t.text for t in self.nlp(str(sent))]
        bong = []
        for ng in set([' '.join(t).strip() for t in 
                zip(*[uwords[i:] for i in range(3)])]):
            #print(ng)
            if ng in self.vocab:
                idx = self.vocab[ng]
                if idx > self.options['n_words']:
                    pass
                else:
                    bong.append(idx)
            else:
                pass
        print(bong)
        return bong


In [15]:
train = data_iterator('train.tsv', options)

FileNotFoundError: [Errno 2] No such file or directory: 'train.tsv'