In [1]:
from frostings.loader import *
import numpy as np

In [2]:
def remove_samples(samples):
    """Drop sentence pairs whose input or target length is out of range.

    Each sample must unpack as ``((x, x_lang), (t, t_lang))``.  A pair is kept
    only when ``1 < len(x) <= 400`` and ``1 < len(t) <= 450``.  The surviving
    pairs are returned as a new list of freshly built tuples, in their
    original order.
    """
    kept = []
    for (source, source_lang), (target, target_lang) in samples:
        # input sentence too short or too long
        if not 1 < len(source) <= 400:
            continue
        # target sentence too short or too long
        if not 1 < len(target) <= 450:
            continue
        kept.append(((source, source_lang), (target, target_lang)))
    return kept

# prepare a dictionary for mapping characters to integer tokens

def get_dictionary_char(lang = 'en'):
    """Build the character -> integer index mapping for one language.

    Reads the alphabet file ``./data/alphabet.<lang>`` (relative to the current
    working directory), converts CRLF and lone CR line endings to a line feed,
    and gives every distinct character its own index.

    The distinct characters are sorted before they are numbered.  Iterating a
    plain ``set`` of strings yields an arbitrary order that can differ from
    one interpreter process to the next, which would make the encoding
    irreproducible across kernel restarts.

    Note that the file is read again on every call.

    Args:
        lang: suffix of the alphabet file to load, e.g. ``'en'`` or ``'fr'``.

    Returns:
        dict mapping each character to its index in ``0 .. n_chars - 1``.
    """
    with open('./data/alphabet.' + lang, 'r') as f:
        alphabet_raw = f.read().replace('\r\n', '\n').replace('\r', '\n') # removing microsoft formatting
    # removing duplicate entries; sorted so the numbering is deterministic
    alphabet = sorted(set(alphabet_raw))
    return {character: idx for idx, character in enumerate(alphabet)}

def char_encoding(sentence, lang = 'en', alphadict = None):
    """Encode a sentence as a 1-D array of integer character codes.

    Every character is looked up in the character dictionary.  A character
    that is missing from the dictionary is reported on stdout and encoded as
    ``-1``.  The lookup is done per character because a ``try`` around the
    definition of a lookup function can never catch the ``KeyError`` raised
    later when that function is called.

    Args:
        sentence: the string to encode.
        lang: language whose alphabet is loaded with ``get_dictionary_char``
            when ``alphadict`` is not supplied.
        alphadict: optional ready-made ``{character: index}`` mapping.  Passing
            it lets a caller load the alphabet once instead of once per
            sentence.

    Returns:
        Integer ``np.ndarray`` of shape ``(len(sentence),)``; an empty sentence
        gives an empty integer array.
    """
    if alphadict is None:
        alphadict = get_dictionary_char(lang) # getting the dictionary of encodings

    codes = []
    for c in sentence:
        try:
            codes.append(alphadict[c])
        except KeyError:
            print("encoding %s was NOT found in dictionary!" % c)
            codes.append(-1) # error code for unknown characters
    # explicit integer dtype so an empty sentence does not become a float array
    return np.array(codes, dtype=int)

def spaces(sentence):
    """Return the positions just before each space, plus the last position.

    For every ``" "`` at index ``i`` the value ``i - 1`` is recorded, and
    ``len(sentence) - 1`` is always appended at the end.  A leading space
    therefore contributes ``-1``, and an empty sentence yields ``[-1]``.

    Returns:
        1-D integer ``np.ndarray`` with one entry per space plus one.
    """
    word_ends = []
    for position, char in enumerate(sentence):
        if char == " ":
            word_ends.append(position - 1)
    word_ends.append(len(sentence) - 1)
    return np.array(word_ends)

def char_length(in_string):
    """Return the length of ``in_string`` as reported by ``len``."""
    n_chars = len(in_string)
    return n_chars

class TextLoadMethod(LoadMethod):
    """Loads the Europarl fr-en training files and derives per-sentence features.

    After ``_preprocess_data`` every entry of ``self.samples`` is the tuple
    ``(enc_x, spaces_x, len_x, enc_t, spaces_t, len_t, (x, "en"), (t, "fr"))``
    where ``x`` is the English sentence and ``t`` the French one.

    NOTE(review): when these hooks are invoked is decided by ``LoadMethod``,
    which is not visible here -- confirm against frostings.loader.
    """

    def _load_data(self):
        """Read both corpus files and pair their lines up position by position.

        Sets ``self.train_X`` / ``self.train_t`` to lists of
        ``(sentence, language)`` tuples and ``self.samples`` to the list of
        ``((x, "en"), (t, "fr"))`` pairs.  Real lists are stored rather than
        ``zip`` objects, so they can be iterated more than once and indexed.
        """
        with open("data/train/europarl-v7.fr-en.en", "r") as f:
            self.train_X = [(line, "en") for line in f.read().split("\n")]
        with open("data/train/europarl-v7.fr-en.fr", "r") as f:
            self.train_t = [(line, "fr") for line in f.read().split("\n")]
        # zip stops at the shorter list if the files differ in line count
        self.samples = list(zip(self.train_X, self.train_t))

    def _preprocess_data(self):
        """Filter, length-sort and featurise ``self.samples`` in place."""
        if self.samples is None:
            self._load_data()
        # remove samples not of interest (done first so fewer samples are sorted;
        # the sort is stable, so the final order equals sort-then-filter)
        samples = remove_samples(self.samples)
        # Order by input sentence length, ties broken by target length.  The
        # key must measure the sentences themselves: len() of a
        # (sentence, language) tuple is always 2 and would sort nothing.
        self.samples = sorted(
            samples,
            key=lambda sample: (len(sample[0][0]), len(sample[1][0])))
        for sample_idx, sample in enumerate(self.samples):
            my_s = []
            if (sample_idx % 10000) == 0:
                print("%d of %d preprocessed ..."  % (sample_idx, len(self.samples)))
            # samples should be tuple((train_X, "en") (train_t, "fr"))
            for elem, lang in sample:
                # char encoding
                my_s.append(char_encoding(elem, lang))
                # spaces
                my_s.append(spaces(elem))
                # char length
                my_s.append(char_length(elem))
            self.samples[sample_idx] = tuple(my_s) + sample # concats with original sample


In [3]:
# NOTE(review): the recorded output of this cell is preprocessing progress, so
# constructing the loader appears to run the whole preprocessing pass; the
# recorded run ended in a KeyboardInterrupt -- confirm in LoadMethod.
text_load_method = TextLoadMethod()



0 of 1958868 preprocessed ...
10000 of 1958868 preprocessed ...
20000 of 1958868 preprocessed ...
30000 of 1958868 preprocessed ...
40000 of 1958868 preprocessed ...
50000 of 1958868 preprocessed ...
60000 of 1958868 preprocessed ...
70000 of 1958868 preprocessed ...
80000 of 1958868 preprocessed ...
90000 of 1958868 preprocessed ...
100000 of 1958868 preprocessed ...
110000 of 1958868 preprocessed ...
120000 of 1958868 preprocessed ...
130000 of 1958868 preprocessed ...
140000 of 1958868 preprocessed ...
150000 of 1958868 preprocessed ...
160000 of 1958868 preprocessed ...
170000 of 1958868 preprocessed ...
180000 of 1958868 preprocessed ...
190000 of 1958868 preprocessed ...
200000 of 1958868 preprocessed ...
210000 of 1958868 preprocessed ...
220000 of 1958868 preprocessed ...
230000 of 1958868 preprocessed ...
240000 of 1958868 preprocessed ...
250000 of 1958868 preprocessed ...
260000 of 1958868 preprocessed ...
270000 of 1958868 preprocessed ...
280000 of 1958868 preprocessed ...

KeyboardInterrupt: 

In [118]:
# Call the loader with 549; the recorded result is an (English, French) pair
# whose French sentence is empty.
text_load_method(549)

(('The third objective is urban and rural development, within the scope of a balanced territorial policy.',
  'en'),
 ('', 'fr'))

In [None]:
The third objective is urban and rural development, within the scope of a balanced territorial policy.

In [7]:
# Shape check: a doubly nested single value gives a 2-D array of shape (1, 1).
np.array([[123]]).shape

(1, 1)

In [28]:
# Collect the distinct characters of the English alphabet file, with CRLF and
# lone CR normalised to a line feed first (the same normalisation that
# get_dictionary_char applies).
with open('./data/alphabet.en', 'r') as f:
    john = set(f.read().replace('\r\n', '\n').replace('\r', '\n'))

In [31]:
# Number of distinct characters found in the English alphabet file.
len(john)

199

In [None]:
class TextLoadMethod(LoadMethod):
    """Loads the Europarl fr-en training files and derives per-sentence features.

    Samples carry a language tag, ``((x, "en"), (t, "fr"))``, because that is
    the layout ``remove_samples`` unpacks and it lets each sentence be encoded
    with its own alphabet.  After ``_preprocess_data`` every entry of
    ``self.samples`` holds four features per sentence (character encoding,
    space positions, character length, word length) for the English and then
    the French side, followed by the two tagged sentences.
    """

    def _load_data(self):
        """Read both corpus files and pair their lines up position by position."""
        with open("data/train/europarl-v7.fr-en.en", "r") as f:
            self.train_X = [(line, "en") for line in f.read().split("\n")]
        with open("data/train/europarl-v7.fr-en.fr", "r") as f:
            self.train_t = [(line, "fr") for line in f.read().split("\n")]
        # zip stops at the shorter list if the files differ in line count
        self.samples = list(zip(self.train_X, self.train_t))

    def _preprocess_data(self):
        """Filter, length-sort and featurise ``self.samples`` in place."""
        if self.samples is None:
            self._load_data()
        # remove samples not of interest
        samples = remove_samples(self.samples)
        # order by input sentence length, ties broken by target length
        self.samples = sorted(
            samples,
            key=lambda sample: (len(sample[0][0]), len(sample[1][0])))
        for sample_idx, sample in enumerate(self.samples):
            my_s = []
            for elem, lang in sample:# samples should be tuple((train_X, "en"), (train_t, "fr"))
                # char encoding
                my_s.append(char_encoding(elem, lang))
                # spaces
                my_s.append(spaces(elem))
                # char length
                my_s.append(char_length(elem))
                # word length
                # NOTE(review): word_length is not defined in the visible cells;
                # presumably it has to come from frostings.loader or still be
                # written -- confirm before running this cell.
                my_s.append(word_length(elem))
            self.samples[sample_idx] = tuple(my_s) + sample # concats with original sample

def get_max_length(encodings):
	"""Placeholder: not implemented yet, so it currently returns None.

	NOTE(review): presumably meant to return the longest length found in
	``encodings`` so batch arrays can be sized -- confirm the intended contract.
	"""
	pass

class TextBatchGenerator(BatchGenerator):
	"""Skeleton batch generator for the text samples; both methods are still stubs."""

	def _make_batch_holder(self, max_length):
		"""Reset ``self.batch`` to an empty list; ``max_length`` is not used yet."""
		self.batch = []
		pass # should make a "holder", e.g. self.batch.append(np.zeros((self.batch_info.batch_size, max_length, encoding_size) and .append a np.zeros for sequences_lengths, spaces etc.

	def _make_batch(self):
		"""Reset the holder, loop once per entry of ``self.samples`` and return ``self.batch``.

		NOTE(review): ``_make_batch_holder`` is called below without its
		required ``max_length`` argument, so as written this raises TypeError.
		The value to pass has not been decided in the visible code.
		"""
		self._make_batch_holder()
		for _ in range(len(self.samples)):
			pass # Should fit each sample to the holder
		return self.batch

# The chunk loader was not considered when writing this, but it should work here without being modified

### RUNNING THE TEXT LOADER ###

text_load_method = TextLoadMethod()
text_load_method(10000) # the load method is callable: it defines __call__

sample_info = SampleInfo(len(text_load_method.samples)) # needs the number of samples so it can build an index over all of them
sample_gen = SampleGenerator(text_load_method, sample_info) # generates one sample at a time; a sample is a tuple of elements: (elem, elem, elem)
batch_info = BatchInfo(batch_size=32)
text_batch_gen = TextBatchGenerator(sample_gen, batch_info) # generates batches; each batch is a tuple
chunk_info = ChunkInfo()
chunk_gen = ChunkGenerator(text_batch_gen, chunk_info)
# Intended usage:
# for train_X_char_enc, train_X_word_enc, train_X_sequence_length ... in text_batch_gen.gen_batch():