## Drive access

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then create the PyDrive client from the
# application-default credentials. `drive` is the module-level client used by
# downloadFile() and uploadToDrive() below.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

def downloadFile(fileId):
  """Fetch the content of a Drive file to a local file named after its id.

  Args:
    fileId: id of the Drive file; it is also used as the local file name.

  Returns:
    The local file name (equal to ``fileId``).
  """
  local_name = fileId
  drive.CreateFile({'id': fileId}).GetContentFile(local_name)
  return local_name

def uploadToDrive(filename, folder_id='1hQ1Oo1NZJyiKNcqg9DW0ARI7jdo4W-VG'):
  """Upload a local file into a Drive folder.

  Args:
    filename: path of the local file whose content is uploaded.
    folder_id: id of the destination Drive folder. Defaults to the shared
      project folder, so existing single-argument calls behave as before.
  """
  # link to the default folder : https://drive.google.com/drive/folders/1hQ1Oo1NZJyiKNcqg9DW0ARI7jdo4W-VG?usp=sharing
  drive_file = drive.CreateFile({'parents':[{u'id': folder_id}]})
  drive_file.SetContentFile(filename)
  drive_file.Upload()

## Preprocess

In [1]:
from nltk.tokenize import word_tokenize

import nltk
import os
from io import open
import re
import sys

# nltk.download('punkt')

# Directory holding the raw and preprocessed corpora.
DATA_DIR = "./data"

# dummy data
data = os.path.join(DATA_DIR,"dummy-europarl-v7.en")
preprocessed_data = os.path.join(DATA_DIR,"dummy_preprocessed.txt")

# real data
# data = os.path.join(DATA_DIR,"europarl-v7.en")
# preprocessed_data = os.path.join(DATA_DIR,"europarl-v7.en.preprocessed.txt")

# Replacement for number tokens (empty here).
NUM = ''

# Alternative mappings that replace punctuation with named tokens:
# EOS_PUNCTS = {".": ".PERIOD", "?": "?QUESTIONMARK", "!": "!EXCLAMATIONMARK"}
# INS_PUNCTS = {",": ",COMMA", ";": ";SEMICOLON", ":": ":COLON", "-": "-DASH"}

# End-of-sentence and in-sentence punctuation. skip() tests the last character
# of a line against the EOS_PUNCTS keys; process_line() tests tokens against
# the keys of both dicts. All replacement values are empty strings.
EOS_PUNCTS = {".": "", "?": "", "!": ""}
INS_PUNCTS = {",": "", ";": "", ":": "", "-": ""}

# Any of: [ ] ( ) / \ > < = + _ *
forbidden_symbols = re.compile(r"[\[\]\(\)\/\\\>\<\=\+\_\*]")
# A space, an apostrophe, then a non-space character.
apostrophe = re.compile(r" '[^ ]")
# A single digit.
numbers = re.compile(r"\d")
# A run of two or more punctuation characters; group 1 captures the first one,
# which preprocess() keeps in place of the whole run.
multiple_punct = re.compile(r'([\.\?\!\,\:\;\-])(?:[\.\?\!\,\:\;\-]){1,}')

# True when the fraction of characters left after removing digits is below 0.6.
# Divides by len(x), so x must be non-empty.
is_number = lambda x: len(numbers.sub("", x)) / len(x) < 0.6

def untokenize(line):
    """Re-join pieces that tokenization split apart.

    Applies three replacements in order: ``" '"`` -> ``"'"``,
    ``" n't"`` -> ``"n't"`` and ``"can not"`` -> ``"cannot"``.
    """
    rewrites = ((" '", "'"), (" n't", "n't"), ("can not", "cannot"))
    for split_form, joined_form in rewrites:
        line = line.replace(split_form, joined_form)
    return line

def skip(line):
    """Decide whether a line must be left out of the preprocessed corpus.

    Returns True when the line is blank, does not end with one of the
    EOS_PUNCTS characters, contains a forbidden symbol, contains single-quoted
    text (like 'high-risk'), or contains one of the detached clitics
    "' s ", "' ll ", "' ve ", "' m ". Returns False otherwise.
    """
    # blank line
    if line.strip() == '':
        return True

    # the last character must be an end-of-sentence punctuation mark
    if line[-1] not in EOS_PUNCTS:
        return True

    # forbidden symbols, or single-quoted text
    if forbidden_symbols.search(line) or apostrophe.search(line):
        return True

    # detached single-quote clitics
    detached_clitics = ("\' s ", "\' ll ", "\' ve ", "\' m ")
    return any(clitic in line for clitic in detached_clitics)

def process_line(line):
    """Tokenize a line, drop punctuation and number tokens, lowercase the rest.

    Args:
        line: one cleaned input sentence.

    Returns:
        The kept tokens lowercased and joined by single spaces, followed by a
        trailing space, after passing through untokenize().
    """
    kept_tokens = []

    for token in word_tokenize(line):
        # Punctuation and number tokens are dropped entirely. The original
        # branches held only commented-out appends, which left the if/elif
        # bodies empty and made the whole cell fail with an IndentationError.
        # To emit replacement tokens instead, append INS_PUNCTS[token],
        # EOS_PUNCTS[token] or NUM here rather than skipping.
        if token in INS_PUNCTS or token in EOS_PUNCTS or is_number(token):
            continue

        kept_tokens.append(token.lower())

    return untokenize(" ".join(kept_tokens) + " ")

def preprocess(input_file, output_file):
    """Clean a raw corpus line by line and write the kept lines to a new file.

    For every input line: double quotes are removed, the line is stripped, and
    runs of punctuation are reduced to their first character. Lines rejected by
    skip() are counted and dropped; the others are written as
    process_line(line) followed by a newline. The number of skipped lines is
    printed at the end.

    Args:
        input_file: path of the UTF-8 text file to read.
        output_file: path of the UTF-8 text file to (over)write.
    """
    num_skipped = 0

    print("Preprocessing", input_file)
    # The output file is opened first, as before, so it is created even when
    # opening the input fails.
    with open(output_file, 'w', encoding='utf-8') as out_txt, \
            open(input_file, 'r', encoding='utf-8') as text:

        for raw_line in text:
            unquoted = raw_line.replace("\"", "").strip()
            cleaned = multiple_punct.sub(r"\g<1>", unquoted)

            if skip(cleaned):
                num_skipped += 1
            else:
                out_txt.write(process_line(cleaned) + '\n')

    print("Skipped %d lines" % num_skipped)
    

# Run the preprocessing on the corpus selected above (`data`) and write the
# result to `preprocessed_data`.
preprocess(data, preprocessed_data)

Preprocessing ./data\dummy-europarl-v7.en
Skipped 13065 lines


## Split into train, dev and test sets

In [48]:
from sklearn.model_selection import train_test_split
import pandas as pd

def writeToFile(data, filename):
    """Write a DataFrame to a text file with no index and no header.

    Args:
        data: object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        filename: destination path.
    """
    # `csv` is not imported anywhere else in the notebook, so the reference to
    # csv.QUOTE_MINIMAL below raised NameError. Import it locally.
    import csv

    data.to_csv(filename, index = False, header = False, quoting=csv.QUOTE_MINIMAL, sep="\n")

DATA_DIR = "./data"

# Despite its name, `trainDevTest_dir` is a file-name prefix: the outputs are
# written as "<DATA_DIR>/<name>.train.txt", ".dev.txt" and ".test.txt".
trainDevTest_name = "dummy_ep"
# trainDevTest_name = "ep"
trainDevTest_dir = os.path.join(DATA_DIR, trainDevTest_name)

# Output of the preprocessing cell; that cell must have been run first.
input_file = preprocessed_data

# One row per line of the preprocessed file.
# NOTE(review): recent pandas releases may reject "\n" as a separator in
# read_csv - confirm against the installed version.
df = pd.read_csv(input_file, sep="\n", header=None)
# print(df.head())

# Two successive unshuffled splits: 10% of all rows go to dev, then 10% of the
# remaining rows go to test.
train, dev = train_test_split(df, test_size=0.1, shuffle=False)
train, test = train_test_split(train, test_size=0.1, shuffle=False)

# print(train.head())
# print(dev.head())
# print(test.head())

print(train.shape)
print(dev.shape)
print(test.shape)

writeToFile(train, trainDevTest_dir+".train.txt")
writeToFile(dev, trainDevTest_dir+".dev.txt")
writeToFile(test, trainDevTest_dir+".test.txt")

(1655208, 1)
(204347, 1)
(183913, 1)


## Create the vocabulary and the embedded train, dev and test data (Data.py)

In [5]:
import random
import os
import sys
import operator
try:
    import cPickle
except ImportError:
    import _pickle as cPickle
try:
    input = raw_input
except NameError:
    pass
from io import open
import fnmatch
import shutil

DATA_DIR = "./data"

# Directory that receives the vocabulary and the processed datasets.
DATA_PATH = os.path.join(DATA_DIR,"embeddedData")

# path to text file in the format:
# word1 0.123 0.123 ... 0.123
# word2 0.123 0.123 ... 0.123 etc...
# e.g. glove.6B.50d.txt
PRETRAINED_EMBEDDINGS_PATH = None

# Special vocabulary tokens. write_vocabulary() appends END and UNK when they
# are missing.
END = "</S>"
UNK = "<UNK>"
NUM = "<NUM>"

# Label used between two words that have no punctuation between them.
SPACE = "_SPACE"

# Vocabulary limits used by build_vocabulary().
MAX_WORD_VOCABULARY_SIZE = 100000
MIN_WORD_COUNT_IN_VOCAB = 2
# Number of words per subsequence produced by write_processed_dataset().
MAX_SEQUENCE_LEN = 50

# Output paths of the processed datasets.
TRAIN_FILE = os.path.join(DATA_PATH, "train")
DEV_FILE = os.path.join(DATA_PATH, "dev")
TEST_FILE = os.path.join(DATA_PATH, "test")

# Stage 2
TRAIN_FILE2 = os.path.join(DATA_PATH, "train2")
DEV_FILE2 = os.path.join(DATA_PATH, "dev2")
TEST_FILE2 = os.path.join(DATA_PATH, "test2")

WORD_VOCAB_FILE = os.path.join(DATA_PATH, "vocabulary")

# Punctuation labels; a label's id is its position in this list.
PUNCTUATION_VOCABULARY = [SPACE, ",COMMA", ".PERIOD", "?QUESTIONMARK", "!EXCLAMATIONMARK", ":COLON", ";SEMICOLON", "-DASH"]
# Maps punctuation tokens outside the vocabulary onto known ones (empty here).
PUNCTUATION_MAPPING = {}

# Comma, period & question mark only:
# PUNCTUATION_VOCABULARY = {SPACE, ",COMMA", ".PERIOD", "?QUESTIONMARK"}
# PUNCTUATION_MAPPING = {"!EXCLAMATIONMARK": ".PERIOD", ":COLON": ",COMMA", ";SEMICOLON": ".PERIOD", "-DASH": ",COMMA"}

# Tokens that end a sentence.
EOS_TOKENS = {".PERIOD", "?QUESTIONMARK", "!EXCLAMATIONMARK"}
CRAP_TOKENS = {"<doc>", "<doc.>"} # punctuations that are not included in vocabulary nor mapping, must be added to CRAP_TOKENS
# Tokens starting with this prefix carry a pause duration, e.g. "<sil=0.25>".
PAUSE_PREFIX = "<sil="

# replacement for pickling that takes less RAM. Useful for large datasets.
def dump(d, path):
    """Write each element of ``d`` to ``path`` as its ``repr`` on its own line.

    Args:
        d: iterable of objects to serialize.
        path: destination text file (overwritten).
    """
    with open(path, 'w') as out_file:
        out_file.writelines("%s\n" % repr(item) for item in d)

def loadData(path):
    """Read back a file written by dump(): one Python expression per line.

    Args:
        path: text file to read.

    Returns:
        A list with the evaluated value of every line, in file order.
    """
    records = []
    with open(path, 'r') as in_file:
        for row in in_file:
            # NOTE(review): eval() executes whatever the file contains; only
            # load files produced by dump() from trusted data.
            records.append(eval(row))
    return records

def add_counts(word_counts, line):
    """Add the word occurrences of one line to ``word_counts`` in place.

    Tokens listed in CRAP_TOKENS, PUNCTUATION_VOCABULARY or
    PUNCTUATION_MAPPING, and tokens starting with PAUSE_PREFIX, are not counted.

    Args:
        word_counts: dict mapping word -> count; updated in place.
        line: whitespace-separated text.
    """
    for token in line.split():
        is_word = not (
            token in CRAP_TOKENS
            or token in PUNCTUATION_VOCABULARY
            or token in PUNCTUATION_MAPPING
            or token.startswith(PAUSE_PREFIX)
        )
        if is_word:
            word_counts[token] = word_counts.get(token, 0) + 1

def build_vocabulary(word_counts):
    """Select the vocabulary words from a word -> count mapping.

    Words are ordered by descending count (the ascending stable sort is
    reversed, so equal counts appear in reverse insertion order). Words with
    fewer than MIN_WORD_COUNT_IN_VOCAB occurrences and the UNK token are left
    out, and at most MAX_WORD_VOCABULARY_SIZE words are returned.
    """
    ascending = sorted(word_counts.items(), key=operator.itemgetter(1))
    selected = []
    for word, count in reversed(ascending):
        if count >= MIN_WORD_COUNT_IN_VOCAB and word != UNK:
            selected.append(word)
    # Unk will be appended to end
    return selected[:MAX_WORD_VOCABULARY_SIZE]

def write_vocabulary(vocabulary, file_name):
    """Write the vocabulary to ``file_name``, one word per line.

    END and UNK are appended to ``vocabulary`` (in that order, in place) when
    they are missing; the resulting size is printed.

    Args:
        vocabulary: list of words; may be extended in place.
        file_name: destination UTF-8 text file.
    """
    for special_token in (END, UNK):
        if special_token not in vocabulary:
            vocabulary.append(special_token)

    print("Vocabulary size: %d" % len(vocabulary))

    with open(file_name, 'w', encoding='utf-8') as vocab_file:
        vocab_file.write("\n".join(vocabulary))

def iterable_to_dict(arr):
    """Map every stripped element of ``arr`` to its position.

    When two elements are equal after stripping, the later position wins.
    """
    return {element.strip(): position for position, element in enumerate(arr)}

def read_vocabulary(file_name):
    """Load a vocabulary file (one word per line) as a word -> line index dict."""
    with open(file_name, 'r', encoding='utf-8') as vocab_file:
        vocab_lines = vocab_file.readlines()
    return iterable_to_dict(vocab_lines)

def write_processed_dataset(input_files, output_file):
    """
    Convert tokenized text files into aligned id subsequences and dump them.

    data will consist of two sets of aligned subsequences (words and punctuations) of MAX_SEQUENCE_LEN tokens (actually punctuation sequence will be 1 element shorter).
    If a sentence is cut, then it will be added to next subsequence entirely (words before the cut belong to both sequences)

    Each stored item is ``[word_ids, punctuation_ids, pauses]``: the last word
    id is replaced by the END id and the first pause is dropped. Words still
    buffered when the input ends are not written. The UNK percentage over all
    processed words is printed (0.00 when no word was processed).

    Args:
        input_files: iterable of UTF-8 text file paths, read in order.
        output_file: path passed to dump() for the resulting list.
    """

    data = []

    word_vocabulary = read_vocabulary(WORD_VOCAB_FILE)
    punctuation_vocabulary = iterable_to_dict(PUNCTUATION_VOCABULARY)

    num_total = 0
    num_unks = 0

    current_words = []
    current_punctuations = []
    current_pauses = []

    last_eos_idx = 0 # if it's still 0 when MAX_SEQUENCE_LEN is reached, then the sentence is too long and skipped.
    last_token_was_punctuation = True # skip first token if it's punctuation
    last_pause = 0.0

    skip_until_eos = False # if a sentence does not fit into subsequence, then we need to skip tokens until we find a new sentence

    for input_file in input_files:

        with open(input_file, 'r', encoding='utf-8') as text:

            for line in text:

                for token in line.split():

                    # First map oov punctuations to known punctuations
                    if token in PUNCTUATION_MAPPING:
                        token = PUNCTUATION_MAPPING[token]

                    if skip_until_eos:

                        if token in EOS_TOKENS:
                            skip_until_eos = False

                        continue

                    elif token in CRAP_TOKENS:
                        continue

                    elif token.startswith(PAUSE_PREFIX):
                        # e.g. "<sil=0.25>" -> 0.25; attached to the next word
                        last_pause = float(token.replace(PAUSE_PREFIX,"").replace(">",""))

                    elif token in punctuation_vocabulary:

                        if last_token_was_punctuation: # if we encounter sequences like: "... !EXCLAMATIONMARK .PERIOD ...", then we only use the first punctuation and skip the ones that follow
                            continue

                        if token in EOS_TOKENS:
                            last_eos_idx = len(current_punctuations) # no -1, because the token is not added yet

                        punctuation = punctuation_vocabulary[token]

                        current_punctuations.append(punctuation)
                        last_token_was_punctuation = True

                    else:

                        # Two consecutive words: the gap between them is labelled SPACE
                        if not last_token_was_punctuation:
                            current_punctuations.append(punctuation_vocabulary[SPACE])

                        word = word_vocabulary.get(token, word_vocabulary[UNK])

                        current_words.append(word)
                        current_pauses.append(last_pause)
                        last_token_was_punctuation = False

                        num_total += 1
                        num_unks += int(word == word_vocabulary[UNK])

                    if len(current_words) == MAX_SEQUENCE_LEN: # this also means, that last token was a word

                        assert len(current_words) == len(current_punctuations) + 1, "#words: %d; #punctuations: %d" % (len(current_words), len(current_punctuations))
                        assert current_pauses == [] or len(current_words) == len(current_pauses), "#words: %d; #pauses: %d" % (len(current_words), len(current_pauses))

                        # Sentence did not fit into subsequence - skip it
                        if last_eos_idx == 0:
                            skip_until_eos = True

                            current_words = []
                            current_punctuations = []
                            current_pauses = []

                            last_token_was_punctuation = True # next sequence starts with a new sentence, so is preceded by eos which is punctuation

                        else:
                            subsequence = [
                                current_words[:-1] + [word_vocabulary[END]],
                                current_punctuations,
                                current_pauses[1:]
                            ]

                            data.append(subsequence)

                            # Carry unfinished sentence to next subsequence
                            current_words = current_words[last_eos_idx+1:]
                            current_punctuations = current_punctuations[last_eos_idx+1:]
                            current_pauses = current_pauses[last_eos_idx+1:]

                        last_eos_idx = 0 # sequence always starts with a new sentence

    # Guard against an empty file list or files without words, which used to
    # raise ZeroDivisionError here.
    unk_percentage = num_unks / num_total * 100 if num_total else 0.0
    print("%.2f%% UNK-s in %s" % (unk_percentage, output_file))

    dump(data, output_file)

def create_dev_test_train_split_and_vocabulary(root_path, create_vocabulary, train_output, dev_output, test_output, pretrained_embeddings_path=None):
    """Collect the text files under ``root_path``, build the vocabulary and write the datasets.

    Every ``*.txt`` file found while walking ``root_path`` is assigned to a
    split: names ending in ``.test.txt`` go to test, ``.dev.txt`` to dev, all
    others to train. When ``create_vocabulary`` is true the vocabulary is
    either read from the pretrained embeddings file (whose vectors are pickled
    to "We.pcl") or built from the word counts of the training files, and then
    written to WORD_VOCAB_FILE. Finally each split is processed with
    write_processed_dataset().

    Args:
        root_path: directory walked recursively for ``*.txt`` files.
        create_vocabulary: whether to (re)create the vocabulary file.
        train_output, dev_output, test_output: output paths per split.
        pretrained_embeddings_path: optional text file with lines of the form
            "word v1 v2 ... vn".
    """

    txt_files = {"train": [], "dev": [], "test": []}

    # Word counts are only needed when the vocabulary comes from the corpus.
    count_words = create_vocabulary and not pretrained_embeddings_path
    if count_words:
        word_counts = dict()

    for root, dirnames, filenames in os.walk(root_path):
        for filename in fnmatch.filter(filenames, '*.txt'):

            path = os.path.join(root, filename)

            if filename.endswith(".test.txt"):
                txt_files["test"].append(path)
                continue

            if filename.endswith(".dev.txt"):
                txt_files["dev"].append(path)
                continue

            txt_files["train"].append(path)

            if count_words:
                with open(path, 'r', encoding='utf-8') as text:
                    for line in text:
                        add_counts(word_counts, line)

    if create_vocabulary:
        if not pretrained_embeddings_path:
            vocabulary = build_vocabulary(word_counts)
        else:
            vocabulary = []
            embeddings = []
            with open(pretrained_embeddings_path, 'r', encoding='utf-8') as emb_file:
                for row in emb_file:
                    fields = row.split()
                    vector = [float(value) for value in fields[1:]]
                    vocabulary.append(fields[0])
                    embeddings.append(vector)

            with open("We.pcl", 'wb') as pickle_file:
                cPickle.dump(embeddings, pickle_file, cPickle.HIGHEST_PROTOCOL)
        write_vocabulary(vocabulary, WORD_VOCAB_FILE)

    write_processed_dataset(txt_files["train"], train_output)
    write_processed_dataset(txt_files["dev"], dev_output)
    write_processed_dataset(txt_files["test"], test_output)
    
    
# path to train, dev, and test files
path = os.path.join(DATA_DIR,"trainDevTest")

# If the output directory already exists, keep asking until the user chooses
# to replace it ('r') or to exit ('e').
replace = False
if os.path.exists(DATA_PATH):

    while True:
        resp = input("Data path '%s' already exists. Do you want to:\n[r]eplace the files in existing data path?\n[e]xit?\n>" % DATA_PATH)
        resp = resp.lower().strip()
        if resp not in ('r', 'e'):
            continue
        if resp == 'e':
            sys.exit()
        elif resp == 'r':
            replace = True
        break

# Remove the old output directory before recreating it.
if replace and os.path.exists(DATA_PATH):
    shutil.rmtree(DATA_PATH)

os.makedirs(DATA_PATH)

# Build the vocabulary (create_vocabulary=True) and the three processed datasets.
create_dev_test_train_split_and_vocabulary(path, True, TRAIN_FILE, DEV_FILE, TEST_FILE, PRETRAINED_EMBEDDINGS_PATH)

Data path './data\embeddedData' already exists. Do you want to:
[r]eplace the files in existing data path?
[e]xit?
>r
Vocabulary size: 17067
0.45% UNK-s in ./data\embeddedData\train
1.21% UNK-s in ./data\embeddedData\dev
1.34% UNK-s in ./data\embeddedData\test


## Training

### Model

In [6]:
import theano
try:
    import cPickle
    cpickle_options = {}
except ImportError:
    import _pickle as cPickle
    cpickle_options = { 'encoding': 'latin-1' }
import os
import theano.tensor as T

def PReLU(a, x):
    """Parametric ReLU: ``max(0, x) + a * min(0, x)``."""
    positive_part = T.maximum(0.0, x)
    negative_part = T.minimum(0.0, x)
    return positive_part + a * negative_part

def ReLU(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    rectified = T.maximum(0.0, x)
    return rectified

def _get_shape(i, o, keepdims):
    if (i == 1 or o == 1) and not keepdims:
        return (max(i,o),)
    else:
        return (i, o)

def _slice(tensor, size, i):
    """Gets slice of columns of the tensor"""
    if tensor.ndim == 2:
        return tensor[:, i*size:(i+1)*size]
    elif tensor.ndim == 1:
        return tensor[i*size:(i+1)*size]
    else:
        raise NotImplementedError("Tensor should be 1 or 2 dimensional")

def weights_const(i, o, name, const, keepdims=False):
    """Create a Theano shared variable filled with the value ``const``.

    The shape comes from ``_get_shape(i, o, keepdims)`` and the dtype is
    ``theano.config.floatX``.
    """
    shape = _get_shape(i, o, keepdims)
    W_values = np.ones(shape).astype(theano.config.floatX) * const
    return theano.shared(value=W_values, name=name, borrow=True)

def weights_identity(i, o, name, const, keepdims=False):
    """Create a Theano shared variable holding ``const`` times an identity matrix.

    The arguments of ``np.eye`` are the entries of
    ``_get_shape(i, o, keepdims)``; the dtype is ``theano.config.floatX``.
    """
    #"A Simple Way to Initialize Recurrent Networks of Rectified Linear Units" (2015) (http://arxiv.org/abs/1504.00941)
    shape = _get_shape(i, o, keepdims)
    W_values = np.eye(*shape).astype(theano.config.floatX) * const
    return theano.shared(value=W_values, name=name, borrow=True)

def weights_Glorot(i, o, name, rng, is_logistic_sigmoid=False, keepdims=False):
    """Create a Theano shared variable with Glorot-uniform initial values.

    Values are drawn from ``rng.uniform`` in ``[-d, d]`` with
    ``d = sqrt(6 / (i + o))``, multiplied by 4 when ``is_logistic_sigmoid`` is
    true. The shape comes from ``_get_shape(i, o, keepdims)``.
    """
    #http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
    bound = np.sqrt(6. / (i + o))
    if is_logistic_sigmoid:
        bound = bound * 4.
    shape = _get_shape(i, o, keepdims)
    W_values = rng.uniform(low=-bound, high=bound, size=shape).astype(theano.config.floatX)
    return theano.shared(value=W_values, name=name, borrow=True)

def loadModel(file_path, minibatch_size, x, p=None):
    """Rebuild a GRU network from a state file written by ``GRU.save``.

    Args:
        file_path: path of the pickled state.
        minibatch_size: batch size the rebuilt network is compiled for.
        x: symbolic input passed to the GRU constructor.
        p: optional second input forwarded to the GRU constructor.

    Returns:
        ``(net, (gsums, learning_rate, validation_ppl_history, epoch, rng))``
        where ``gsums`` is a list of shared variables, or None when the state
        holds no gradient sums.
    """
    try:
        import cPickle
    except ImportError:
        import _pickle as cPickle
    import theano
    import numpy as np

    # NOTE(review): unpickling can execute arbitrary code; only load model
    # files from a trusted source.
    with open(file_path, 'rb') as model_file:
        saved = cPickle.load(model_file, **cpickle_options)

    # Restore the global numpy random state stored with the model.
    rng = np.random
    rng.set_state(saved["random_state"])

    net = GRU(
        rng=rng,
        x=x,
        minibatch_size=minibatch_size,
        n_hidden=saved["n_hidden"],
        x_vocabulary=saved["x_vocabulary"],
        y_vocabulary=saved["y_vocabulary"],
        stage1_model_file_name=saved.get("stage1_model_file_name", None),
        p=p
        )

    # Overwrite the freshly initialised parameters with the stored values.
    for shared_param, stored_value in zip(net.params, saved["params"]):
        shared_param.set_value(stored_value, borrow=True)

    if saved["gsums"]:
        gsums = [theano.shared(stored_sum) for stored_sum in saved["gsums"]]
    else:
        gsums = None

    training_state = (gsums, saved["learning_rate"], saved["validation_ppl_history"], saved["epoch"], rng)
    return net, training_state

class GRULayer(object):
    """One gated recurrent unit layer with its parameters as Theano shared variables.

    Attributes:
        n_in, n_out: input and hidden sizes.
        h0: zero initial hidden state of shape (minibatch_size, n_out).
        W_x, W_h, b: gate parameters (both gates side by side, width 2*n_out).
        W_x_h, W_h_h, b_h: candidate-state parameters.
        params: list of all trainable shared variables of the layer.
    """

    def __init__(self, rng, n_in, n_out, minibatch_size):
        """Create the initial state and the randomly initialised parameters."""
        super(GRULayer, self).__init__()
        # Notation from: An Empirical Exploration of Recurrent Network Architectures

        self.n_in, self.n_out = n_in, n_out

        # Initial hidden state
        zero_state = np.zeros((minibatch_size, n_out)).astype(theano.config.floatX)
        self.h0 = theano.shared(value=zero_state, name='h0', borrow=True)

        # Gate parameters (the creation order fixes the order of the rng draws):
        gate_width = n_out*2
        self.W_x = weights_Glorot(n_in, gate_width, 'W_x', rng)
        self.W_h = weights_Glorot(n_out, gate_width, 'W_h', rng)
        self.b = weights_const(1, gate_width, 'b', 0)
        # Input parameters
        self.W_x_h = weights_Glorot(n_in, n_out, 'W_x_h', rng)
        self.W_h_h = weights_Glorot(n_out, n_out, 'W_h_h', rng)
        self.b_h = weights_const(1, n_out, 'b_h', 0)

        self.params = [self.W_x, self.W_h, self.b, self.W_x_h, self.W_h_h, self.b_h]

    def step(self, x_t, h_tm1):
        """Compute the next hidden state from input ``x_t`` and previous state ``h_tm1``."""
        # Both gates are computed in one product and then split in two halves.
        gates = T.nnet.sigmoid(T.dot(x_t, self.W_x) + T.dot(h_tm1, self.W_h) + self.b)
        reset_gate = _slice(gates, self.n_out, 0)
        update_gate = _slice(gates, self.n_out, 1)

        candidate = T.tanh(T.dot(x_t, self.W_x_h) + T.dot(h_tm1 * reset_gate, self.W_h_h) + self.b_h)

        # The update gate interpolates between the previous state and the candidate.
        return update_gate * h_tm1 + (1. - update_gate) * candidate

class GRU(object):
    """Bidirectional GRU encoder with an attention-based GRU output model.

    The input word ids are embedded and run through a forward and a backward
    GRULayer. A third GRULayer reads the concatenated states from the second
    time step on and, combined with an attention-weighted context (late
    fusion), yields a softmax over the output vocabulary at every step.

    Attributes set by the constructor:
        params: list of all trainable shared variables.
        y: softmax outputs per time step.
        alphas: attention weights per time step.
        last_hidden_states: fused hidden states per time step.
        L1, L2_sqr: symbolic L1 norm and squared L2 norm of the parameters.
    """

    def __init__(self, rng, x, minibatch_size, n_hidden, x_vocabulary, y_vocabulary, stage1_model_file_name=None, p=None):
        """Build the symbolic graph.

        Args:
            rng: random generator exposing ``uniform`` (used for initialisation).
            x: symbolic matrix of word ids; axis 0 is time, axis 1 the batch.
            minibatch_size: number of parallel sequences per batch.
            n_hidden: hidden size of every GRU layer.
            x_vocabulary: word vocabulary (only its length is used here).
            y_vocabulary: output vocabulary (only its length is used here).
            stage1_model_file_name, p: must be falsy for this model.

        If a file "We.pcl" exists in the working directory, its pickled
        embeddings are used (plus two rows for END and UNK); otherwise the
        embeddings are Glorot-initialised with width ``n_hidden``.
        """

        assert not stage1_model_file_name and not p, "Stage 1 model can't have stage 1 model"

        x_vocabulary_size = len(x_vocabulary)
        y_vocabulary_size = len(y_vocabulary)

        self.n_hidden = n_hidden
        self.x_vocabulary = x_vocabulary
        self.y_vocabulary = y_vocabulary

        # input model
        pretrained_embs_path = "We.pcl"
        if os.path.exists(pretrained_embs_path):
            print("Found pretrained embeddings in '%s'. Using them..." % pretrained_embs_path)
            # NOTE(review): unpickling can execute arbitrary code; only use
            # an embeddings file from a trusted source.
            with open(pretrained_embs_path, 'rb') as f:
                We = cPickle.load(f, **cpickle_options)
            n_emb = len(We[0])
            We.append([0.1]*n_emb) # END
            We.append([0.0]*n_emb) # UNK - both quite arbitrary initializations

            We = np.array(We).astype(theano.config.floatX)
            self.We = theano.shared(value=We, name="We", borrow=True)
        else:
            n_emb = n_hidden
            self.We = weights_Glorot(x_vocabulary_size, n_emb, 'We', rng) # Share embeddings between forward and backward model

        self.GRU_f = GRULayer(rng=rng, n_in=n_emb, n_out=n_hidden, minibatch_size=minibatch_size)
        self.GRU_b = GRULayer(rng=rng, n_in=n_emb, n_out=n_hidden, minibatch_size=minibatch_size)

        # output model
        self.GRU = GRULayer(rng=rng, n_in=n_hidden*2, n_out=n_hidden, minibatch_size=minibatch_size)
        self.Wy = weights_const(n_hidden, y_vocabulary_size, 'Wy', 0)
        self.by = weights_const(1, y_vocabulary_size, 'by', 0)

        # attention model
        n_attention = n_hidden * 2 # to match concatenated forward and reverse model states
        self.Wa_h = weights_Glorot(n_hidden, n_attention, 'Wa_h', rng) # output model previous hidden state to attention model weights
        self.Wa_c = weights_Glorot(n_attention, n_attention, 'Wa_c', rng) # contexts to attention model weights
        self.ba = weights_const(1, n_attention, 'ba', 0)
        self.Wa_y = weights_Glorot(n_attention, 1, 'Wa_y', rng) # gives weights to contexts

        # Late fusion parameters
        self.Wf_h = weights_const(n_hidden, n_hidden, 'Wf_h', 0)
        self.Wf_c = weights_const(n_attention, n_hidden, 'Wf_c', 0)
        self.Wf_f = weights_const(n_hidden, n_hidden, 'Wf_f', 0)
        # The shared variable was named 'by' (copy-paste of the output bias);
        # it is now named after the attribute it is stored in.
        self.bf = weights_const(1, n_hidden, 'bf', 0)

        self.params = [self.We,
                       self.Wy, self.by,
                       self.Wa_h, self.Wa_c, self.ba, self.Wa_y,
                       self.Wf_h, self.Wf_c, self.Wf_f, self.bf]

        self.params += self.GRU.params + self.GRU_f.params + self.GRU_b.params

        # bi-directional recurrence
        def input_recurrence(x_f_t, x_b_t, h_f_tm1, h_b_tm1):
            h_f_t = self.GRU_f.step(x_t=x_f_t, h_tm1=h_f_tm1)
            h_b_t = self.GRU_b.step(x_t=x_b_t, h_tm1=h_b_tm1)
            return [h_f_t, h_b_t]

        def output_recurrence(x_t, h_tm1, Wa_h, Wa_y, Wf_h, Wf_c, Wf_f, bf, Wy, by, context, projected_context):

            # Attention model
            h_a = T.tanh(projected_context + T.dot(h_tm1, Wa_h))
            alphas = T.exp(T.dot(h_a, Wa_y))
            alphas = alphas.reshape((alphas.shape[0], alphas.shape[1])) # drop 2-axis (sized 1)
            alphas = alphas / alphas.sum(axis=0, keepdims=True) # normalize over time steps
            weighted_context = (context * alphas[:,:,None]).sum(axis=0)

            h_t = self.GRU.step(x_t=x_t, h_tm1=h_tm1)

            # Late fusion
            lfc = T.dot(weighted_context, Wf_c) # late fused context
            fw = T.nnet.sigmoid(T.dot(lfc, Wf_f) + T.dot(h_t, Wf_h) + bf) # fusion weights
            hf_t = lfc * fw + h_t # weighted fused context + hidden state

            z = T.dot(hf_t, Wy) + by
            y_t = T.nnet.softmax(z)

            return [h_t, hf_t, y_t, alphas]

        # Embedding lookup; the result has shape (time, minibatch_size, n_emb)
        x_emb = self.We[x.flatten()].reshape((x.shape[0], minibatch_size, n_emb))

        [h_f_t, h_b_t], _ = theano.scan(fn=input_recurrence,
            sequences=[x_emb, x_emb[::-1]], # forward and backward sequences
            outputs_info=[self.GRU_f.h0, self.GRU_b.h0])

        # 0-axis is time steps, 1-axis is batch size and 2-axis is hidden layer size
        context = T.concatenate([h_f_t, h_b_t[::-1]], axis=2)
        projected_context = T.dot(context, self.Wa_c) + self.ba

        [_, self.last_hidden_states, self.y, self.alphas], _ = theano.scan(fn=output_recurrence,
            sequences=[context[1:]], # ignore the 1st word in context, because there's no punctuation before that
            non_sequences=[self.Wa_h, self.Wa_y, self.Wf_h, self.Wf_c, self.Wf_f, self.bf, self.Wy, self.by, context, projected_context],
            outputs_info=[self.GRU.h0, None, None, None])

        print("Number of parameters is %d" % sum(np.prod(param.shape.eval()) for param in self.params))

        self.L1 = sum(abs(param).sum() for param in self.params)
        self.L2_sqr = sum((param**2).sum() for param in self.params)

    def cost(self, y):
        """Return the summed negative log-likelihood of the labels ``y``.

        ``y`` is flattened and used to pick, for every (time step, sequence)
        pair, the predicted probability of the target label.
        """
        num_outputs = self.y.shape[0]*self.y.shape[1] # time steps * number of parallel sequences in batch
        output = self.y.reshape((num_outputs, self.y.shape[2]))
        return -T.sum(T.log(output[T.arange(num_outputs), y.flatten()]))

    def save(self, file_path, gsums=None, learning_rate=None, validation_ppl_history=None, best_validation_ppl=None, epoch=None, random_state=None):
        """Pickle the model parameters and the training state to ``file_path``.

        ``best_validation_ppl`` is accepted for compatibility with callers but
        is not written to the state.
        """
        try:
            import cPickle
        except ImportError:
            import _pickle as cPickle
        state = {
            "type":                     self.__class__.__name__,
            "n_hidden":                 self.n_hidden,
            "x_vocabulary":             self.x_vocabulary,
            "y_vocabulary":             self.y_vocabulary,
            "stage1_model_file_name":   self.stage1_model_file_name if hasattr(self, "stage1_model_file_name") else None,
            "params":                   [p.get_value(borrow=True) for p in self.params],
            "gsums":                    [s.get_value(borrow=True) for s in gsums] if gsums else None,
            "learning_rate":            learning_rate,
            "validation_ppl_history":   validation_ppl_history,
            "epoch":                    epoch,
            "random_state":             random_state
        }

        with open(file_path, 'wb') as f:
            cPickle.dump(state, f, protocol=cPickle.HIGHEST_PROTOCOL)

### Main

In [7]:
from collections import OrderedDict
from time import time

import theano
try:
    import cPickle
except ImportError:
    import _pickle as cPickle
import sys
import os.path
try:
    input = raw_input
except NameError:
    pass

import theano.tensor as T
import numpy as np

# Upper bound of the epoch loop.
MAX_EPOCHS = 50
# Number of sequences per mini-batch; the model is built for this batch size.
MINIBATCH_SIZE = 128
# Weight of the squared L2 norm added to the training cost.
L2_REG = 0.0
# Gradients are rescaled when their global norm reaches this value.
CLIPPING_THRESHOLD = 2.0
# Training stops once the best validation perplexity is not among the last
# PATIENCE_EPOCHS entries of the history.
PATIENCE_EPOCHS = 1
# Used in the "not enough samples" warning of get_minibatch().
MAX_SEQUENCE_LEN = 50

"""
Bi-directional RNN with attention
For a sequence of N words, the model makes N punctuation decisions (no punctuation before the first word, but there's a decision after the last word or before </S>)
"""

def get_minibatch(file_name, batch_size, shuffle, with_pauses=False):
    """Yield mini-batches from a dataset file written by dump().

    Args:
        file_name: dataset path; each item is ``[words, punctuations, pauses]``.
        batch_size: number of subsequences per batch.
        shuffle: when true, the dataset is shuffled in place with
            ``np.random.shuffle`` before batching.
        with_pauses: when true, the pauses are yielded as a third array.

    Yields:
        ``(X, Y)`` or ``(X, Y, P)``. Each array is transposed so that the
        first axis is time. A final batch with fewer than ``batch_size`` items
        is not yielded.
    """

    dataset = loadData(file_name)

    if shuffle:
        np.random.shuffle(dataset)

    if len(dataset) < batch_size:
        # The required word count is derived from the batch size actually
        # requested, not from the global MINIBATCH_SIZE.
        print("WARNING: Not enough samples in '%s'. Reduce mini-batch size to %d or use a dataset with at least %d words." % (
            file_name,
            len(dataset),
            batch_size * MAX_SEQUENCE_LEN))

    X_batch, Y_batch, P_batch = [], [], []

    for subsequence in dataset:

        X_batch.append(subsequence[0])
        Y_batch.append(subsequence[1])
        if with_pauses:
            P_batch.append(subsequence[2])

        if len(X_batch) == batch_size:

            # Transpose, because the model assumes the first axis is time
            X = np.array(X_batch, dtype=np.int32).T
            Y = np.array(Y_batch, dtype=np.int32).T

            if with_pauses:
                P = np.array(P_batch, dtype=theano.config.floatX).T
                yield X, Y, P
            else:
                yield X, Y

            X_batch, Y_batch, P_batch = [], [], []

### Create model

In [None]:
DATA_DIR = "./data"
SPACE = "_SPACE"

# Model identity and hyper-parameters; they also determine the model file name.
model_name = "ep"
num_hidden = 256
learning_rate = 0.02

# constant from Data
DATA_PATH = os.path.join(DATA_DIR,"embeddedData")
WORD_VOCAB_FILE = os.path.join(DATA_PATH, "vocabulary")
PUNCTUATION_VOCABULARY = [SPACE, ",COMMA", ".PERIOD", "?QUESTIONMARK", "!EXCLAMATIONMARK", ":COLON", ";SEMICOLON", "-DASH"]
TRAIN_FILE = os.path.join(DATA_PATH, "train")
DEV_FILE = os.path.join(DATA_PATH, "dev")

model_file_name = "Model_%s_h%d_lr%s.pcl" % (model_name, num_hidden, learning_rate)

print(num_hidden, learning_rate, model_file_name)

# Word -> id and punctuation label -> id mappings.
word_vocabulary = read_vocabulary(WORD_VOCAB_FILE)
punctuation_vocabulary = iterable_to_dict(PUNCTUATION_VOCABULARY)

# Symbolic inputs: word ids, target punctuation ids and the learning rate.
x = T.imatrix('x')
y = T.imatrix('y')
lr = T.scalar('lr')

# If a model file with this name exists, keep asking until the user chooses to
# continue training it ('c'), replace it ('r') or exit ('e').
continue_with_previous = False
if os.path.isfile(model_file_name):

    while True:
        resp = input("Found an existing model with the name %s. Do you want to:\n[c]ontinue training the existing model?\n[r]eplace the existing model and train a new one?\n[e]xit?\n>" % model_file_name)
        resp = resp.lower().strip()
        if resp not in ('c', 'r', 'e'):
            continue
        if resp == 'e':
            sys.exit()
        elif resp == 'c':
            continue_with_previous = True
        break

if continue_with_previous:
    print("Loading previous model state")

    # Restores the network together with its optimizer and training state.
    # NOTE(review): loadModel returns gsums=None when the saved state holds no
    # gradient sums; the Adagrad loop further down zips over gsums - confirm
    # that this case cannot occur.
    net, state = loadModel(model_file_name, MINIBATCH_SIZE, x)
    gsums, learning_rate, validation_ppl_history, starting_epoch, rng = state
    best_ppl = min(validation_ppl_history)

else:
    # Fresh model: seed the global numpy generator for reproducible initialisation.
    rng = np.random
    rng.seed(1)

    print("Building model...")
    net = GRU(
        rng=rng,
        x=x,
        minibatch_size=MINIBATCH_SIZE,
        n_hidden=num_hidden,
        x_vocabulary=word_vocabulary,
        y_vocabulary=punctuation_vocabulary
        )

    starting_epoch = 0
    best_ppl = np.inf
    validation_ppl_history = []

    # One accumulator of squared gradients per parameter, initialised to zero.
    gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True))) for param in net.params]

# Training objective: negative log-likelihood plus the weighted squared L2 norm.
cost = net.cost(y) + L2_REG * net.L2_sqr

gparams = T.grad(cost, net.params)
updates = OrderedDict()

# Compute norm of gradients
norm = T.sqrt(T.sum(
            [T.sum(gparam ** 2) for gparam in gparams]
        ))


# Adagrad: "Adaptive subgradient methods for online learning and stochastic optimization" (2011)
for gparam, param, gsum in zip(gparams, net.params, gsums):
    # When the global norm reaches the threshold, rescale the gradient so that
    # the global norm equals CLIPPING_THRESHOLD.
    gparam = T.switch(
        T.ge(norm, CLIPPING_THRESHOLD),
        gparam / norm * CLIPPING_THRESHOLD,
        gparam
    ) # Clipping of gradients
    # Accumulate the squared (clipped) gradient, then divide the step by the
    # root of the updated accumulator.
    updates[gsum] = gsum + (gparam ** 2)
    updates[param] = param - lr * (gparam / (T.sqrt(updates[gsum] + 1e-6)))

# Returns the training cost and applies the Adagrad updates.
train_model = theano.function(
    inputs=[x, y, lr],
    outputs=cost,
    updates=updates
)

# Returns the negative log-likelihood only (no L2 term, no updates).
validate_model = theano.function(
    inputs=[x, y],
    outputs=net.cost(y)
)

print("Training...")
for epoch in range(starting_epoch, MAX_EPOCHS):
    t0 = time()
    total_neg_log_likelihood = 0
    total_num_output_samples = 0
    iteration = 0
    # Training pass over the shuffled training set.
    for X, Y in get_minibatch(TRAIN_FILE, MINIBATCH_SIZE, shuffle=True):
        total_neg_log_likelihood += train_model(X, Y, learning_rate)
        total_num_output_samples += np.prod(Y.shape)
        iteration += 1
        # Every 100 batches, report the running perplexity and the labels per second.
        if iteration % 100 == 0:
            sys.stdout.write("PPL: %.4f; Speed: %.2f sps\n" % (np.exp(total_neg_log_likelihood / total_num_output_samples), total_num_output_samples / max(time() - t0, 1e-100)))
            sys.stdout.flush()
    print("Total number of training labels: %d" % total_num_output_samples)

    # Validation pass (no parameter updates).
    total_neg_log_likelihood = 0
    total_num_output_samples = 0
    for X, Y in get_minibatch(DEV_FILE, MINIBATCH_SIZE, shuffle=False):
        total_neg_log_likelihood += validate_model(X, Y)
        total_num_output_samples += np.prod(Y.shape)
    print("Total number of validation labels: %d" % total_num_output_samples)

    # Perplexity = exp(mean negative log-likelihood per label).
    ppl = np.exp(total_neg_log_likelihood / total_num_output_samples)
    validation_ppl_history.append(ppl)

    print("Validation perplexity is %s" % np.round(ppl, 4))

    # Save on every improvement (or tie); otherwise stop once the best value is
    # no longer among the last PATIENCE_EPOCHS results.
    if ppl <= best_ppl:
        best_ppl = ppl
        net.save(model_file_name, gsums=gsums, learning_rate=learning_rate, validation_ppl_history=validation_ppl_history, best_validation_ppl=best_ppl, epoch=epoch, random_state=rng.get_state())
    elif best_ppl not in validation_ppl_history[-PATIENCE_EPOCHS:]:
        print("Finished!")
        print("Best validation perplexity was %s" % best_ppl)
        break

256 0.02 Model_ep_h256_lr0.02.pcl
Building model...
Number of parameters is 6406408
Training...


  rval = inputs[0].__getitem__(inputs[1:])
