# Using character-level representations




In [1]:
# Importing the dependencies
import glob
import numpy as np
import os

from random import shuffle

In [2]:
# Path to the corpus. It should end in aclImdb/train.
# Set the ACLIMDB_TRAIN_PATH environment variable to point at your own copy of
# the data; when it is not set, the original hard-coded location is used.
CORPUS_PATH = os.environ.get(
    "ACLIMDB_TRAIN_PATH",
    "/Users/albarron/corpora/misc/stanford_movie_review/aclImdb/train",
)

In [3]:
# Loading the data

def pre_process_data(filepath):
    """
    Load pos and neg examples from separate dirs then shuffle them
    together.

    Args:
        filepath  path to a directory holding 'pos' and 'neg' sub-directories
                  of *.txt files
    Return:
        list of (label, text) tuples in random order; label is 1 for files
        under 'pos' and 0 for files under 'neg'. The list is empty when no
        matching files are found.
    """
    labelled_dirs = ((1, 'pos'), (0, 'neg'))
    dataset = []
    for label, dirname in labelled_dirs:
        pattern = os.path.join(filepath, dirname, '*.txt')
        for filename in glob.glob(pattern):
            # Decode explicitly as UTF-8: without an encoding argument open()
            # uses the platform default, which can fail or garble non-ASCII
            # characters. Assumes the corpus files are UTF-8 -- TODO confirm.
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))
    shuffle(dataset)
    return dataset

def collect_expected(dataset):
    """Return the label (first field) of every sample, in dataset order."""
    labels = []
    for entry in dataset:
        labels.append(entry[0])
    return labels

In [4]:
# Loading instances and expected classes (as usual)
# `dataset` holds (label, text) pairs in shuffled order; `expected` holds the
# labels in that same order, so expected[i] is the class of dataset[i].
dataset = pre_process_data(CORPUS_PATH)
expected = collect_expected(dataset)

In [5]:
def avg_len(data):
    """Mean length of the text field (second element) across all samples."""
    return sum(len(entry[1]) for entry in data) / len(data)
# Average review length in characters; shown as the cell's output.
avg_len(dataset)

1325.06964

The average length in terms of words is 202.44 (don't take my word for it: go and compute it yourself).

That is, at the character level we would unroll the network about **6.5x** more time steps than at the word level (1325 / 202 ≈ 6.5)!

In [6]:
def clean_data(data):
    """
    Turn each sample's text into a list of lowercase character tokens.

    Only the text (second field) of each sample is used; the label is
    dropped. Any character outside the allowed set is replaced by the
    single token 'UNK'.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789"\'?!.,:; '
    return [
        [ch if ch in allowed else 'UNK' for ch in entry[1].lower()]
        for entry in data
    ]

# One list of character tokens per review (labels are not carried over).
listified_data = clean_data(dataset)

In [7]:
def char_pad_trunc(data, maxlen):
    """
    Bring every sample to exactly maxlen tokens.

    Longer samples are cut off after maxlen tokens; shorter ones get
    'PAD' tokens appended; samples already at maxlen are passed through.
    """
    fixed_length = []
    for tokens in data:
        shortfall = maxlen - len(tokens)
        if shortfall > 0:
            resized = tokens + ['PAD'] * shortfall  # yet other "characters"
        elif shortfall < 0:
            resized = tokens[:maxlen]
        else:
            resized = tokens
        fixed_length.append(resized)
    return fixed_length

In [8]:
# Producing the one-hot encodings (no embeddings here!)
def create_dicts(data):
    """Create bi-directional character lookups: from char to index and from index to char
    Modified from Keras LSTM example

    Args:
        data  list of lists of tokens
    Return:
        (char_indices, indices_char): a dict mapping each distinct token to
        an integer index, and the inverse dict mapping index back to token.

    Indices are assigned in sorted token order. Enumerating the raw set
    would give an order that can change from one interpreter run to the
    next (string hashing is randomized), so a mapping built in a new
    session would not match one-hot encodings produced earlier.
    """
    chars = set()
    for sample in data:
        chars.update(sample)
    # Fix the order once and build both directions from the same sequence.
    ordered_chars = sorted(chars)
    char_indices = {c: i for i, c in enumerate(ordered_chars)}
    indices_char = dict(enumerate(ordered_chars))
    return char_indices, indices_char

In [9]:
def onehot_encode(dataset, char_indices, maxlen, dtype=np.float64):
    """
    One-hot encode the tokens

    Args:
        dataset  list of lists of tokens
        char_indices  dictionary of {key=character, value=index to use encoding vector}
        maxlen  int  Length of each sample
        dtype  numpy dtype of the returned array. Defaults to float64 (what
               np.zeros produces when no dtype is given). The array holds
               samples * maxlen * len(char_indices) elements, so a smaller
               dtype such as np.float32 or np.uint8 cuts memory use
               proportionally.
    Return:
        np array of shape (samples, tokens, encoding length)
    Raises:
        KeyError    if a token is missing from char_indices
        IndexError  if a sample has more than maxlen tokens
    """
    X = np.zeros((len(dataset), maxlen, len(char_indices)), dtype=dtype)
    for i, sentence in enumerate(dataset):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
    return X

In [10]:
# Load and preprocess the data
# NOTE(review): these three calls repeat the ones made in the earlier cells,
# so the corpus is read and shuffled a second time. `expected` is recomputed
# from the new order, so labels and samples stay aligned.
dataset = pre_process_data(CORPUS_PATH)
expected = collect_expected(dataset)
listified_data = clean_data(dataset)

# Common sequence length in characters: longer reviews are truncated and
# shorter ones are padded with 'PAD' tokens.
maxlen = 1500
common_length_data = char_pad_trunc(listified_data, maxlen)

# The vocabulary is built from the padded data, so any 'PAD' and 'UNK' tokens
# present get their own index; every sample is then one-hot encoded.
char_indices, indices_char = create_dicts(common_length_data)
encoded_data = onehot_encode(common_length_data, char_indices, maxlen)

In [11]:
# Split the data
# The first 80% of the (already shuffled) samples are used for training and
# the remaining 20% for testing; labels are sliced at the same point.
split_point = int(len(encoded_data)*.8)

x_train = encoded_data[:split_point]
y_train = np.array(expected[:split_point])
x_test = encoded_data[split_point:]
y_test = np.array(expected[split_point:])
# Pay attention: in the book they forgot to turn y_[train|test] into numpy arrays

In [12]:
# Building the network
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten, LSTM

# Number of units in the LSTM layer
num_neurons = 40

print('Build model...')
model = Sequential()

# The LSTM receives one one-hot vector per character position;
# return_sequences=True keeps the output of every time step, giving an
# output of shape (maxlen, num_neurons) per sample.
model.add(LSTM(
    num_neurons,
    return_sequences=True,
    input_shape=(maxlen, len(char_indices.keys())))
    )

model.add(Dropout(.2))
# Flatten all per-time-step outputs into one vector for the output unit
model.add(Flatten())
# Single sigmoid unit for the binary (pos/neg) decision
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

Build model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 1500, 40)          14080     
_________________________________________________________________
dropout (Dropout)            (None, 1500, 40)          0         
_________________________________________________________________
flatten (Flatten)            (None, 60000)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 60001     
Total params: 74,081
Trainable params: 74,081
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Training the network
batch_size = 32
epochs = 10
# The held-out test split is passed as the validation data here.
model.fit(x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test)
    )
# This will take about 3-4 minutes/epoch

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x16562ac40>

In [None]:
# Saving the model
# The architecture is written as JSON and the weights to a separate .h5 file.
# NOTE(review): char_indices is not written out here; reusing the saved model
# presumably requires the same character-to-index mapping -- confirm.
model_structure = model.to_json()
with open("char_lstm_model3.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("char_lstm_weights3.h5")

**Back to the slides**