# Using character-level representations




In [1]:
# Importing the dependencies
import glob
import numpy as np
import os

from random import shuffle

In [2]:
# Add the path to the corpus. It should end in aclImdb/train
CORPUS_PATH = None

In [3]:
# Loading the data

def pre_process_data(filepath):
    """
    Load pos and neg examples from separate dirs then shuffle them
    together.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 0
    dataset = []
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((pos_label, f.read()))
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((neg_label, f.read()))
    shuffle(dataset)
    return dataset

def collect_expected(dataset):
    """Extracting the expected output for all the instances"""
    return [sample[0] for sample in dataset]

In [4]:
# Loading instances and expected classes (as usual)
dataset = pre_process_data(CORPUS_PATH)
expected = collect_expected(dataset)

In [5]:
def avg_len(data):
    """Computes the average length of the data"""
    total_len = 0
    for sample in data:
        total_len += len(sample[1])
    return total_len/len(data)
avg_len(dataset)

1325.06964

The average word length is 202.44 (**Homework:** don't believe me and go find yourself).

That is, we would unroll the network **6.5x**! 

In [6]:
def clean_data(data):
    """Shift to lower case, replace unknowns with UNK, and listify """
    new_data = []
    VALID = 'abcdefghijklmnopqrstuvwxyz0123456789"\'?!.,:; '
    for sample in data:
        new_sample = []
        for char in sample[1].lower():  # Just grab the string, not the label
            # Not extremely efficient procedure
            if char in VALID:
                new_sample.append(char)
            else:
                new_sample.append('UNK')
       
        new_data.append(new_sample)
    return new_data

# listified_data = clean_data(dataset)

**Homework**: turn the process to determine if a character is VALID more efficient

In [7]:
def char_pad_trunc(data, maxlen):
    """ We truncate to maxlen or add PAD tokens """
    new_dataset = []
    for sample in data:
        if len(sample) > maxlen:
            new_data = sample[:maxlen]
        elif len(sample) < maxlen:
            pads = maxlen - len(sample)
            new_data = sample + ['PAD'] * pads
        else:
            new_data = sample
        new_dataset.append(new_data)
    return new_dataset

In [8]:
# Producing the one-hot encodings (no embeddings here)
def create_dicts(data):
    """ Modified from Keras LSTM example"""
    chars = set()
    for sample in data:
        chars.update(set(sample))
    char_indices = dict((c, i) for i, c in enumerate(chars))
    indices_char = dict((i, c) for i, c in enumerate(chars))
    return char_indices, indices_char

In [9]:
def onehot_encode(dataset, char_indices, maxlen):
    """ 
    One hot encode the tokens
    
    Args:
        dataset  list of lists of tokens
        char_indices  dictionary of {key=character, value=index to use encoding vector}
        maxlen  int  length of each sample
    Return:
        np array of shape (samples, tokens, encoding length)
    """
    X = np.zeros((len(dataset), maxlen, len(char_indices.keys())))
    for i, sentence in enumerate(dataset):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
    return X

In [10]:
# Load and preprocess the data
# The first 2 steps were run earlier
# dataset = pre_process_data(CORPUS_PATH)
# expected = collect_expected(dataset)
listified_data = clean_data(dataset)

maxlen = 1500
common_length_data = char_pad_trunc(listified_data, maxlen)

char_indices, indices_char = create_dicts(common_length_data)
encoded_data = onehot_encode(common_length_data, char_indices, maxlen)

In [11]:
# Split the data
split_point = int(len(encoded_data)*.8)

x_train = encoded_data[:split_point]
y_train = np.array(expected[:split_point])
x_test = encoded_data[split_point:]
y_test = np.array(expected[split_point:])
# Pay attention: in the book they forgot to turn y_[train|test] into numpy arrays

In [12]:
# A quick view to the first instance
x_train[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [13]:
# Shape of the resulting array
x_train.shape

(20000, 1500, 47)

In [14]:
# How many instances do we have?
len(x_train)

20000

**Q: What is  the size of the vocabulary?**

In [15]:
# Building the network
from keras.models import Sequential
from keras.layers import Dense, Dropout,  Flatten, LSTM
# Embedding,
num_neurons = 40

print('Build model...')
model = Sequential()

model.add(LSTM(
    num_neurons,
    return_sequences=True,
    input_shape=(maxlen, len(char_indices.keys())))
    )

model.add(Dropout(.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

2023-11-28 11:52:54.742824: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-28 11:52:55.141355: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-28 11:52:55.141390: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-28 11:52:55.143576: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-28 11:52:55.320223: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-28 11:52:55.340266: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

Build model...


2023-11-28 11:53:01.869124: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1500, 40)          14080     
                                                                 
 dropout (Dropout)           (None, 1500, 40)          0         
                                                                 
 flatten (Flatten)           (None, 60000)             0         
                                                                 
 dense (Dense)               (None, 1)                 60001     
                                                                 
Total params: 74081 (289.38 KB)
Trainable params: 74081 (289.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


[rmsprop](https://keras.io/api/optimizers/rmsprop/)

In [None]:
# Training the network
batch_size = 32
epochs = 10
model.fit(x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test)
    )
# This would take between 5 and 10 minutes per epoch, depending on the hardware!

In [None]:
# Saving the model
model_structure = model.to_json()
with open("char_lstm_model3.json", "w") as json_file:
    json_file.write(model_structure)
model.save_weights("char_lstm_weights3.h5")

Back to the slides