# Homework 3 - IMDB Review (RNN)

## Define constants, functions and set random seed

In [None]:
# Vocabulary size: passed to the tokenizer as `num_words` and used as the
# embedding layer's `input_dim`.
NUM_WORDS = 4000
# Length every review is padded/truncated to; also the embedding's
# `input_length`.
MAXLEN = 400
# Mini-batch size and number of epochs handed to `model.fit`.
BATCH_SIZE = 64
EPOCHS = 8
# Seed for NumPy's global generator and for the optional random
# train/validation split.
SEED = 0xFFFF
# True  -> hold out a random validation set (fraction VAL_SPLIT) up front.
# False -> hand VAL_SPLIT to `model.fit(validation_split=...)` instead.
SHOULD_RANDOMLY_SPLIT_VAL = False
# Fraction of the training data reserved for validation.
# NOTE(review): with SHOULD_RANDOMLY_SPLIT_VAL False and VAL_SPLIT 0 the
# model is trained without any validation data, so no `val_*` metrics are
# recorded during training.
VAL_SPLIT = 0

import os
from os import path
from datetime import datetime
def create_result_dir() -> str:
    """Create a fresh, timestamped result directory and return its path.

    The directory is named ``result.<yymmdd-HHMMSS>`` relative to the current
    working directory.  If that name is already taken, the suffixes ``-0``,
    ``-1``, ... are tried in order until an unused name is found.

    Returns:
        The path of the directory that was just created.
    """
    base_dir = "result." + datetime.today().strftime("%y%m%d-%H%M%S")
    candidate = base_dir
    suffix = 0
    while True:
        try:
            # Let makedirs detect the collision itself: checking for
            # existence first and creating afterwards leaves a window in
            # which another run can grab the same name.
            os.makedirs(candidate)
        except FileExistsError:
            candidate = base_dir + "-" + str(suffix)
            suffix += 1
        else:
            return candidate

import pandas, numpy
from keras.preprocessing import sequence, text
def prepare_data(
    train_data: pandas.DataFrame, test_data: pandas.DataFrame,
) -> (numpy.ndarray, numpy.ndarray, numpy.ndarray):
    """Turn the raw review tables into arrays the model can consume.

    Args:
        train_data: Frame with a ``Sentiment`` label column and a
            ``SentimentText`` review column.
        test_data: Frame with a ``SentimentText`` review column.

    Returns:
        A tuple ``(train_labels, train_sequences, test_sequences)``: the
        training labels as ``int32`` and the training / test reviews encoded
        as integer token sequences padded to ``MAXLEN``.
    """
    labels = train_data["Sentiment"].values.astype("int32")
    train_reviews = train_data["SentimentText"]
    test_reviews = test_data["SentimentText"]

    # The vocabulary is fitted on the training reviews only; the test
    # reviews are merely encoded with it.
    vocabulary = text.Tokenizer(num_words=NUM_WORDS)
    vocabulary.fit_on_texts(train_reviews)

    def encode(reviews: pandas.Series) -> numpy.ndarray:
        """Map reviews to token-id sequences and pad them to MAXLEN."""
        token_ids = vocabulary.texts_to_sequences(reviews)
        return sequence.pad_sequences(token_ids, maxlen=MAXLEN)

    padded_train = encode(train_reviews)
    padded_test = encode(test_reviews)
    return labels, padded_train, padded_test

from matplotlib import pyplot
from keras import callbacks
def show_train_history(history: callbacks.History):
    """Plot the accuracy and loss curves of a finished training run.

    Two side-by-side panels are drawn: accuracy on the left and loss on the
    right.  Each panel shows the training curve and, when the run recorded
    validation metrics, the validation curve as well.

    Args:
        history: The ``History`` object returned by ``model.fit``; its
            ``history`` dict maps metric names to per-epoch values.

    Raises:
        KeyError: If the training accuracy or loss was not recorded.
    """
    records = history.history
    # Depending on the Keras release, metrics=["accuracy"] is logged under
    # either "acc" or "accuracy"; accept whichever one is present.
    accuracy_key = "acc" if "acc" in records else "accuracy"

    fig, ax = pyplot.subplots(nrows=1, ncols=2)
    fig.set_size_inches(18, 6)
    panels = (
        (ax[0], accuracy_key, "accuracy"),
        (ax[1], "loss", "loss"),
    )
    for axis, key, label in panels:
        axis.set_title("Model " + label)
        axis.plot(records[key])
        legend = ["result"]
        # Validation curves exist only if fit() was given validation data.
        validation_key = "val_" + key
        if validation_key in records:
            axis.plot(records[validation_key])
            legend.append("validation")
        axis.set_ylabel(label)
        axis.set_xlabel("epoch")
        axis.legend(legend, loc="upper left")
    pyplot.show()

from numpy import random
# Seed NumPy's global random generator so NumPy-driven randomness is
# repeatable between runs.
# NOTE(review): only NumPy is seeded here; whether the Keras backend draws
# its random numbers from this generator depends on the backend - confirm.
random.seed(SEED)

## Read files and prepare data

In [None]:
import zipfile
from keras.utils import np_utils
# Load the training and test review tables from CSV.
train = pandas.read_csv("data/train_data.csv")
test = pandas.read_csv("data/test_data_ans.csv")
# Despite their names, `train_texts` / `test_texts` hold the padded integer
# sequences returned by prepare_data, not the raw review strings.
train_labels, train_texts, test_texts = prepare_data(train, test)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
from keras import models, layers, initializers, activations
from keras import optimizers, losses

# Assemble the network layer by layer: token embedding -> dropout -> LSTM
# encoder -> fully connected ReLU layer -> dropout -> single sigmoid unit
# that outputs the sentiment probability.
model = models.Sequential()
model.add(layers.Embedding(
    input_dim=NUM_WORDS,
    output_dim=16,
    input_length=MAXLEN,
))
model.add(layers.Dropout(0.3))
model.add(layers.LSTM(16))
model.add(layers.Dense(units=512, activation=activations.relu))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(units=1, activation=activations.sigmoid))

# Binary classification: cross-entropy loss, Adam with the AMSGrad variant,
# and accuracy tracked as the reported metric.
model.compile(
    loss=losses.binary_crossentropy,
    optimizer=optimizers.Adam(amsgrad=True),
    metrics=["accuracy"],
)

# Print the layer / parameter overview.
model.summary()

## Train the model

In [None]:
result_dir = create_result_dir()

# Validation metrics are recorded only when some data is held out, either by
# the explicit random split below or by Keras' own `validation_split`.
# Without them "val_loss" never shows up in the epoch logs, so the checkpoint
# file name and both callbacks fall back to the training loss instead of
# silently never saving a model.
has_validation = SHOULD_RANDOMLY_SPLIT_VAL or VAL_SPLIT > 0
monitored = "val_loss" if has_validation else "loss"

# Zero-pad the epoch number to the width of EPOCHS so that the checkpoint
# file names sort in training order.
epoch_width = len(str(abs(EPOCHS)))
model_filename = "model.{epoch:0%dd}-{%s:.4f}.hdf5" % (epoch_width, monitored)
model_path = path.join(result_dir, model_filename)

# Write a checkpoint only when the monitored loss improves, so the most
# recently written file is always the best model so far.
model_checkpoint = callbacks.ModelCheckpoint(
    model_path,
    monitor=monitored,
    verbose=1,
    save_best_only=True,
)
early_stopping = callbacks.EarlyStopping(
    monitor=monitored,
    patience=64,
    verbose=1,
)

if SHOULD_RANDOMLY_SPLIT_VAL:
    # Hold out a random, reproducible validation set up front.
    from sklearn import model_selection
    train_x, val_x, train_y, val_y = model_selection.train_test_split(
        train_texts, train_labels,
        test_size=VAL_SPLIT,
        random_state=SEED,
    )
    val_data = (val_x, val_y)
else:
    # Train on everything; Keras may still carve off a validation part
    # through `validation_split` below.
    train_x, train_y = train_texts, train_labels
    val_data = None

history = model.fit(
    x=train_x,
    y=train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    callbacks=[
        model_checkpoint,
        early_stopping,
    ],
    # An explicit validation set overrides `validation_split` in Keras, so
    # the fraction is forwarded only when no explicit set was built.
    validation_split=0.0 if val_data is not None else VAL_SPLIT,
    validation_data=val_data,
)

In [None]:
import pickle
# Persist the per-epoch metrics next to the checkpoints.
with open(path.join(result_dir, "history.pickle"), "wb") as file:
    pickle.dump(history.history, file, pickle.HIGHEST_PROTOCOL)

import glob
# Checkpoint names start with the zero-padded epoch number, so the last
# entry after sorting is the most recently saved model.
model_files = sorted(glob.glob(path.join(result_dir, "model.*-*.hdf5")))
if not model_files:
    raise FileNotFoundError(
        "no model checkpoint was saved in " + result_dir
    )
model = models.load_model(model_files[-1])
# `Sequential.predict_classes` is deprecated and absent from newer Keras
# releases; for a single sigmoid output it is equivalent to thresholding
# the predicted probability at 0.5.
predictions = (model.predict(test_texts) > 0.5).astype("int32")

# One row per test review: row index as "id", predicted class as "sentiment".
pandas.DataFrame(data={"sentiment": predictions.flatten()}).to_csv(
    path.join(result_dir, "predictions.csv"),
    index_label="id",
)

In [None]:
# Plot the accuracy and loss curves recorded during training.
show_train_history(history)