In [1]:
from frostings.loader import *
#from frostings.utils import *
import numpy as np
import gzip
import os

In [2]:
def save_gz(path, data):
    print "saving to %s" %path
    np.save(path, data)
    gzippath = "gzip " + path
    print "running command: %s" % gzippath
    os.system(gzippath)

def load_gz(path):
    if path.endswith(".gz"):
        print "has the gz ..."
        f = gzip.open(path, 'rb')
        print "np loading it ..!"
        return np.load(f)
    else:
        print "aint got no gz ..!"
        return np.load(path)

In [3]:
def remove_samples(samples):
    # remove input sentences that are too short or too long
    samples = [((x, l1), (t, l2)) for (x, l1), (t, l2) in samples if len(x) > 1 and len(x) <= 400]

    # remove target sentences that are too short or too long
    samples = [((x, l1), (t, l2)) for (x, l1), (t, l2) in samples if len(t) > 1 and len(t) <= 450]

    return samples

class TextLoadMethod(LoadMethod):

    def __init__(self):
        self._prepare_data()

    def _load_data(self):
        print "loading data ..."
        with open("data/train/europarl-v7.fr-en.en", "r") as f:
            self.train_X = f.read().split("\n")
            language = ["en" for _ in range(len(self.train_X))] #this is bad coding ..!
            self.train_X = zip(self.train_X, language)
        print "train X loaded ..."
        with open("data/train/europarl-v7.fr-en.fr", "r") as f:
            self.train_t = f.read().split("\n")
            language = ["fr" for _ in range(len(self.train_t))]
            self.train_t = zip(self.train_t, language)
        print "train t loaded"
        self.samples = zip(self.train_X, self.train_t)

    def _preprocess_data(self):
        print "preprocessing data ..."
        self.samples = sorted(self.samples, key=lambda (X, t): len(X)*10000 + len(t))
        print "data sorted ..."
        # remove samples not of interest
        self.samples = remove_samples(self.samples)
        print "samples of no interest removed"
        print len(self.samples)
        save_gz("data/train.npy", self.samples)

    def _prepare_data(self):
        print "prepare_data started"
        if not os.path.exists("data/train.npy.gz"):
            self._load_data()
            self._preprocess_data()
        else:
            print "we have it!"
            self.samples = load_gz("data/train.npy.gz")

In [4]:
%%time
text_load_method = TextLoadMethod()

prepare_data started
we have it!
has the gz ...
np loading it ..!
CPU times: user 14.3 s, sys: 1.36 s, total: 15.7 s
Wall time: 16.1 s


In [5]:
text_load_method(3)

array([[ 'You have requested a debate on this subject in the course of the next few days, during this part-session.',
        'en'],
       [ 'Vous avez souhait\xc3\xa9 un d\xc3\xa9bat \xc3\xa0 ce sujet dans les prochains jours, au cours de cette p\xc3\xa9riode de session.',
        'fr']], 
      dtype='|S450')

In [103]:
## batch
# prepare a dictionary for mapping characters to integer tokens

def get_dictionary_char(lang = 'en'):
    with open('./data/alphabet.' + lang, 'r') as f:
        alphabet_raw = f.read().replace('\r\n', '\n').replace('\r', '\n') # removing microsoft formatting
        alphabet = list(set(alphabet_raw)) # removing duplicate entries
    return {character: idx for idx, character in enumerate(alphabet)}

def char_encoding(sentence, alphadict):
    # gets the encoding e.g. a = 180
    encode = lambda c: alphadict[c]
    # concatenating each char in the string to np.array.shape(len(sentence), len(alphadict))
    encoding = [encode(c) for c in sentence]
    return np.array(encoding)

def spaces(sentence):
    spaces = [idx-1 for idx, c in enumerate(sentence) if c == " "]
    spaces.append(len(sentence)-1)
    return np.array(spaces)

def char_length(in_string):
    return len(in_string)

class TextBatchGenerator(BatchGenerator):

    def _preprocess_sample(self):
        char_dict = dict()
        char_dict['en'] = get_dictionary_char()
        char_dict['fr'] = get_dictionary_char('fr')
        for sample_idx, sample in enumerate(self.samples):
            my_s = []
            # samples should be tuple((train_X, "en") (train_t, "fr"))
            for elem, lang in sample:
                my_s.append(char_encoding(elem, char_dict[lang])) # char encoding
                my_s.append(spaces(elem)) # spaces
                my_s.append(char_length(elem)) # char length
            self.samples[sample_idx] = tuple(my_s)# + sample # concats with original sample
    
    def _make_batch_holder(self, mlen_t_X, mln_s_X, mlen_t_t, mlen_s_t):
        self.batch = []
        self.batch.append(np.zeros((self.batch_info.batch_size, mlen_t_X, 1)))
        self.batch.append(np.zeros((self.batch_info.batch_size, 1)))
        self.batch.append(np.zeros((self.batch_info.batch_size, mln_s_X, 1)))
        self.batch.append(np.zeros((self.batch_info.batch_size, 1)))
        self.batch.append(np.zeros((self.batch_info.batch_size, mlen_t_t, 1)))
        self.batch.append(np.zeros((self.batch_info.batch_size, 1)))
        self.batch.append(np.zeros((self.batch_info.batch_size, mlen_s_t, 1)))
        self.batch.append(np.zeros((self.batch_info.batch_size, 1)))
        
        #pass # should make a "holder", e.g. self.batch.append(np.zeros((self.batch_info.batch_size, max_length, encoding_size) and .append a np.zeros for sequences_lengths, spaces etc.

    def _make_batch(self):
        self._preprocess_sample()
        mlen_t_X = max(self.samples, key=lambda x: x[2])[2]
        mlen_s_X = len(max(self.samples, key=lambda x: len(x[1]))[1])
        mlen_t_t = max(self.samples, key=lambda x: x[5])[5]
        mlen_s_t = len(max(self.samples, key=lambda x: len(x[4]))[4])
        print mlen_t_X
        print mlen_s_X
        print mlen_t_t
        print mlen_s_t
        self._make_batch_holder(mlen_t_X, mlen_s_X, mlen_t_t, mlen_s_t)
        for sample_idx, (t_X, s_X, l_X, t_t, s_t, l_t) in enumerate(self.samples):
            t_X = np.array([t_X], dtype='float32').T # to shape (len,1)
            l_s_X = len(s_X)
            s_X = np.array([s_X], dtype='float32').T
            t_t = np.array([t_t], dtype='float32').T
            l_s_t = len(s_t)
            s_t = np.array([s_t], dtype='float32').T
            self.batch[0][sample_idx][:l_X] = t_X
            self.batch[1][sample_idx] = l_X
            self.batch[2][sample_idx][:l_s_X] = s_X
            self.batch[3][sample_idx] = l_s_X
            self.batch[4][sample_idx][:l_t] = t_t
            self.batch[5][sample_idx] = l_t
            self.batch[6][sample_idx][:l_s_t] = s_t
            self.batch[7][sample_idx] = l_s_t

#            assert False
            #            self.batch[0][sample_idx] = 
            
        self.samples = [] # resetting
        return self.batch

In [104]:
%%time
sample_info = SampleInfo(len(text_load_method.samples)) # needs to know how many samples we have, so it can make an idx for all of them.
sample_gen = SampleGenerator(text_load_method, sample_info) # generates one sample which consists of several elements sample = (elem, elem, elem)
batch_info = BatchInfo(batch_size=32)
text_batch_gen = TextBatchGenerator(sample_gen, batch_info) # Generates a batch, being a tuples

SampleInfo initated
ElemGenerator initiated
BatchInfo initiated
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 219 µs


In [113]:
i = 0
for batch in text_batch_gen.gen_batch():
    batch
    print i
    i+=1
    if i == 5:
        assert False

334
57
387
58
0
109
21
125
17
1
238
40
277
42
2
59
10
82
12
3
248
47
302
45
4


AssertionError: 

In [114]:
batch[0].shape

(32, 248, 1)

3