# In [182]:
"""Language modeling preprocessing
"""

import numpy as np
import h5py
import argparse
import sys
import re
import codecs
from itertools import izip

# In [214]:
#data_name = 'data/valid_chars_kaggle_answer.txt'

def gen_letter_indices(data_name):
    """Build the token -> integer index vocabulary from a character file.

    Only the first line of ``data_name`` is inspected. It is split on
    whitespace and every previously unseen token receives the next free
    index, in order of first appearance. ``'<space>'`` is always index 1,
    so the remaining tokens are numbered from 2 upwards.

    Args:
        data_name: path of the latin-1 encoded text file to read.

    Returns:
        dict mapping each token to its integer index.

    Raises:
        IndexError: if the file contains no lines.
    """
    with codecs.open(data_name, 'r', encoding="latin-1") as source:
        first_line = source.readlines()[0]

    vocab = {'<space>': 1}
    for token in first_line.split():
        # The next free index is always one past the current vocabulary size,
        # because indices are handed out consecutively starting at 1.
        vocab.setdefault(token, len(vocab) + 1)
    return vocab

def to_indices(data_name, letter_indices):
    """Convert every whitespace-separated token of a file to its index.

    Args:
        data_name: path of the latin-1 encoded text file to read.
        letter_indices: dict mapping tokens to integer indices.

    Returns:
        A flat list with the index of every token, in file order
        (line boundaries are not preserved).

    Raises:
        KeyError: if a token is missing from ``letter_indices``.
    """
    with codecs.open(data_name, 'r', encoding="latin-1") as source:
        rows = source.readlines()
    return [letter_indices[token] for row in rows for token in row.split()]

def convert_to_output(batch_indices):
    """Turn token indices into two-class labels.

    Index 1 (the value ``gen_letter_indices`` assigns to ``'<space>'``) is
    kept as class 1; every other index becomes class 2.

    Args:
        batch_indices: iterable of integer token indices.

    Returns:
        A list of 1/2 labels, one per input index.
    """
    labels = []
    for index in batch_indices:
        if index == 1:
            labels.append(1)
        else:
            labels.append(2)
    return labels

# Test to make sure that it's working
# data_name = 'data/train_chars.txt'
# with codecs.open(data_name, 'r', encoding="latin-1") as f:
#     lines = f.readlines()
# text_indices = lines[0].split()
def convert_to_batches(text_indices, l=50, b=32, test_b=True):
    """Cut a flat index sequence into batches of ``b`` rows of length ``l``.

    The sequence is viewed as ``b`` contiguous segments of ``len // b``
    tokens each. Batch ``i`` holds, for every segment ``j``, the ``l``
    tokens starting ``l * i`` positions into that segment, so row ``j`` of
    consecutive batches continues the same stretch of text. Tokens that do
    not fill a complete batch are dropped.

    Args:
        text_indices: flat list of integer token indices.
        l: number of tokens per row.
        b: number of rows per batch.
        test_b: when true, each row is passed through ``convert_to_output``
            (class labels); when false, the raw indices are kept.

    Returns:
        ``np.array`` built from the nested batch lists; for non-empty input
        with at least one full batch this has shape
        ``(len(text_indices) // (b * l), b, l)``.

    Raises:
        ZeroDivisionError: if ``b`` or ``l`` is 0.
    """
    n = len(text_indices)
    # Floor division keeps these integers on Python 3 as well as Python 2;
    # true division would yield floats that range() and slicing reject.
    segment = n // b
    batches = []
    for i in range(n // (b * l)):
        batch = []
        for j in range(b):
            start = segment * j + l * i
            window = text_indices[start:start + l]
            # convert_to_output is the label helper defined in this file.
            batch.append(convert_to_output(window) if test_b else window)
        batches.append(batch)
    return np.array(batches)

# Generate the indices in the format as test
def to_indices_kaggle(data_name, letter_indices, test_b):
    """Encode a file sentence by sentence, left-padded with ``'</s>'``.

    Args:
        data_name: path of the latin-1 encoded text file to read.
        letter_indices: dict mapping tokens (including ``'</s>'`` whenever
            padding is required) to integer indices.
        test_b: selects how sentences are obtained.
            * true  -- every line of the file is one sentence, with its
              last five characters removed (presumably the ``'</s>'``
              marker plus the newline -- confirm against the data files).
            * false -- the first line is split on ``'</s>'``; the number of
              ``'<space>'`` occurrences in each piece is recorded, the
              ``'<space> '`` tokens are removed, and the final piece is
              discarded.

    Returns:
        A pair ``(rows, space_counts)`` of ``np.array`` objects. ``rows``
        holds one index array per sentence; ``space_counts`` holds the
        recorded ``'<space>'`` counts and is empty when ``test_b`` is true.

    Raises:
        IndexError: if ``test_b`` is false and the file has no lines.
        KeyError: if a token is missing from ``letter_indices``.
    """
    with codecs.open(data_name, 'r', encoding="latin-1") as source:
        raw_lines = source.readlines()

    space_counts = []
    if test_b:
        sentences = [raw[:-5] for raw in raw_lines]
    else:
        pieces = raw_lines[0].split('</s>')
        # NOTE(review): the count is taken for every piece, including the
        # final one that is dropped from `sentences` just below, so
        # space_counts ends up one entry longer than the returned rows.
        space_counts = [piece.count('<space>') for piece in pieces]
        sentences = [piece.replace('<space> ', '') for piece in pieces][:-1]

    # Target width. In test mode this is measured before '<space> ' tokens
    # are stripped, exactly as the sentences stand at this point.
    widest = max([len(sentence.split()) for sentence in sentences] + [0])

    rows = []
    for sentence in sentences:
        tokens = sentence.replace('<space> ', '').split()
        padded = ['</s>'] * (widest - len(tokens)) + tokens
        rows.append(np.array([letter_indices[token] for token in padded]))
    return np.array(rows), np.array(space_counts)

# Only the last of the original scratch assignments to data_name ever took
# effect; nothing below reads it, but the final value is kept.
data_name = 'data/train_chars.txt'

# The vocabulary is built from the training text and reused for every split.
letter_indices = gen_letter_indices('data/train_chars.txt')
space_index = letter_indices['<space>']

# Input batches must hold the raw token indices, so test_b=False is passed
# explicitly; relying on the default (test_b=True) made the inputs identical
# to the label batches.
text_train_indices = to_indices('data/train_chars.txt', letter_indices)
train_batches = convert_to_batches(text_train_indices, test_b=False)
train_output_batches = convert_to_batches(text_train_indices, test_b=True)

text_valid_indices = to_indices('data/valid_chars.txt', letter_indices)
valid_batches = convert_to_batches(text_valid_indices, test_b=False)
valid_output_batches = convert_to_batches(text_valid_indices, test_b=True)

text_test_indices = to_indices('data/test_chars.txt', letter_indices)
test_batches = convert_to_batches(text_test_indices, test_b=False)

valid_lines_kaggle, valid_output_letters_kaggle = to_indices_kaggle(
    'data/valid_chars.txt', letter_indices, False)
test_lines_kaggle, _ = to_indices_kaggle(
    'data/test_chars.txt', letter_indices, True)

with h5py.File('data.hdf5', "w") as f:
    f['nletters'] = np.array([len(letter_indices)], dtype=np.int32)
    f['nclasses'] = np.array([2], dtype=np.int32)

    f['train_text'] = np.array(text_train_indices)
    f['valid_text'] = np.array(text_valid_indices)

    f['train_input_batches'] = train_batches
    f['train_output_batches'] = train_output_batches

    f['valid_input_batches'] = valid_batches
    f['valid_output_batches'] = valid_output_batches

    f['valid_lines_kaggle'] = valid_lines_kaggle
    f['valid_output_kaggle'] = valid_output_letters_kaggle

    f['test_lines_kaggle'] = test_lines_kaggle

    # Bigram data as (n - 1, 1) column vectors: the input is each token, the
    # output flags whether the FOLLOWING token is '<space>'. The flag used to
    # compare against index 0, which is never assigned (indices start at 1),
    # so the stored outputs were all zeros.
    # NOTE(review): labels here are 0/1 while the batch labels are 1/2 --
    # confirm which encoding the consumer of these datasets expects.
    f['train_bigrams'] = np.array(text_train_indices[:-1]).reshape(-1, 1)
    f['train_ngrams_output'] = np.array(
        [int(ind == space_index) for ind in text_train_indices[1:]]).reshape(-1, 1)

    f['valid_bigrams'] = np.array(text_valid_indices[:-1]).reshape(-1, 1)
    f['valid_ngrams_output'] = np.array(
        [int(ind == space_index) for ind in text_valid_indices[1:]]).reshape(-1, 1)