In [34]:
# Data-handling libraries
import pandas as pd
import numpy as np
import scipy.sparse
import pickle

# Feature extraction/transformation libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Modelling libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, GRU, Embedding, Input, LSTM, Bidirectional, Concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Evaluation libraries
from eda.accuracy import fuzzy_accuracy

# Text-handling libraries
import spacy
nlp = spacy.load("en")
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('[A-Za-z\'-]+')

# Other libraries
import matplotlib.pyplot as plt
import itertools
from progress import show_progress
from joblib import Parallel, delayed, dump

# Random seed
seed = 5777

# Options
pad_shape = 1000

# Read in data

In [35]:
essays = pd.read_csv(f"../data_private/essays_cleaned_target.csv")

# Define variables

## Word vectors

In [4]:
y = to_categorical(essays["score"])

# Define vocabulary
#vocab = set(tokenizer.tokenize(" ".join(essays["essay"]))) # using simple regex match
vocab = set(token.text for essay in essays["essay"] for token in nlp.tokenizer(essay)) # using spacy's more sophisticated matcher

# Convert words to numerical indices <https://www.tensorflow.org/tutorials/text/text_generation>
word2idx = {u:i for i, u in enumerate(vocab)}
idx2word = np.array(list(vocab))

#X_vector = [[word2idx[token] for token in tokenizer.tokenize(essay)] for essay in essays["essay"]] # nltk
X_vector = [[word2idx[token.text] for token in nlp.tokenizer(essay)] for essay in essays["essay"]] # spacy

## Metadata

In [19]:
meta_cols = [
    "tokens",
    "types",
    "sent_len",
    "word_len",
    "freq",
    "semicolons",
    "link_words",
    "pps",
    "max_depth",
]

X_meta = essays[meta_cols].to_numpy()

# Train/test split

In [20]:
X_vector_train, \
X_vector_test, \
X_meta_train, \
X_meta_test, \
y_train, \
y_test = train_test_split(X_vector, X_meta, y, random_state=seed);

In [21]:
essays.loc[0, "score"]

1.0

# Scale

In [22]:
ss = StandardScaler()

# Word vectors
X_vector_train = pad_sequences(X_vector_train, maxlen=pad_shape)
X_vector_test = pad_sequences(X_vector_test, maxlen=pad_shape)

# Metadata
X_meta_train_sc = ss.fit_transform(X_meta_train)
X_meta_test_sc = ss.transform(X_meta_test)

pca = PCA(random_state=seed, n_components=5)
X_meta_train_pca = pca.fit_transform(X_meta_train_sc)
X_meta_test_pca = pca.fit_transform(X_meta_test_sc)

# Export train/test data

For gridsearching models on AWS

In [23]:
with open("nn-data.p", "wb") as outfile:
    pickle.dump((
        X_vector_train,
        X_vector_test,
        X_meta_train_pca,
        X_meta_test_pca,
        y_train,
        y_test,
        X_meta_train_pca.shape[1],
        len(vocab)
    ), outfile)

# Run model

This RNN has two sets of inputs: the word vectors from the documents, which are put in at the beginning, and the metadata, which is incorporated after the GRU layer ([source](http://digital-thinking.de/deep-learning-combining-numerical-and-text-features-in-deep-neural-networks/)).

In [9]:
def keras_model():
    # Borrowed in part from:
    # <https://stackoverflow.com/a/55234203>
    # <http://digital-thinking.de/deep-learning-combining-numerical-and-text-features-in-deep-neural-networks/>

    # Define inputs
    vector_input = Input(shape=(pad_shape,))
    meta_input = Input(shape=(X_meta_train_pca.shape[1],))

    # Define embedding and GRU layers
    rnn = Embedding(len(vocab), 96, input_length=pad_shape)(vector_input)
    rnn = Bidirectional(GRU(128, return_sequences=True, kernel_regularizer=l2(1e-3)))(rnn)
    rnn = Bidirectional(GRU(128, return_sequences=False, kernel_regularizer=l2(1e-3)))(rnn)

    # Incorporate metadata
    rnn = Concatenate()([rnn, meta_input])

    # Define hidden and output layers
    rnn = Dense(128, activation="relu", kernel_regularizer=l2(1e-3))(rnn)
    rnn = Dense(128, activation="relu", kernel_regularizer=l2(1e-3))(rnn)
    rnn = Dense(4, activation="softmax")(rnn)

    model = Model(inputs=[vector_input, meta_input], outputs=[rnn])
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["acc"])
    return model

best_score = 0
best_model = None
summary = ""

params = {
    "gru_neurons": [64, 128],
    "l1_neurons": [64],
    "l2_neurons": [64],
    "alpha": [1e-1]
}

# Fit single model
model = keras_model()
history = model.fit([X_vector_train, X_meta_train_pca], y_train,
                                             validation_data=([X_vector_test, X_meta_test_pca], y_test),
                                             epochs=3, verbose=1)

# # Hand-rolled gridsearch since sklearn's GridSearchCV doesn't support multiple inputs:
# # <https://github.com/keras-team/keras/issues/2748>, <https://github.com/keras-team/keras/issues/9001>
# # @show_progress
# def keras_gridsearch(p):
#     global best_score, best_model, summary
#     model = keras_model(*p)
#     history = model.fit([X_vector_train, X_meta_train_pca], y_train,
#                         validation_data=([X_vector_test, X_meta_test_pca], y_test),
#                         epochs=3, verbose=1)
#     test_score = history.history["val_acc"][-1]
#     summary += str(p) + str(test_score) + "\n"
#     if test_score > best_score:
#         best_score = test_score
#         best_model = model

# #keras_gridsearch(itertools.product(*params.values()), update_freq=1)

# Parallel(n_jobs=-1, require='sharedmem')(delayed(keras_gridsearch)(p) for p in itertools.product(*params.values()))

# print(summary)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Evaluate model

In [25]:
y_pred = model.predict([X_vector_test, X_meta_test_pca])

fuzzy_accuracy(
    y_test.argmax(axis=1), # Get the vector index with the max value;
    y_pred.argmax(axis=1), # i.e., undo the one-hot encoding.
    tolerance=1)

0.959954233409611

In [30]:
for layer in model.layers:
    print(layer.input_shape, layer.output_shape)

[(None, 1000)] [(None, 1000)]
(None, 1000) (None, 1000, 96)
(None, 1000, 96) (None, 1000, 256)
(None, 1000, 256) (None, 256)
[(None, 5)] [(None, 5)]
[(None, 256), (None, 5)] (None, 261)
(None, 261) (None, 128)
(None, 128) (None, 128)
(None, 128) (None, 4)


# Write to disk

In [24]:
dump(vocab, "../EssayScorer/vocab.bin", compress=True)
dump(ss, "../EssayScorer/scaler.bin", compress=True)
dump(pca, "../EssayScorer/pca.bin", compress=True)
model.save("../EssayScorer/model.keras")

# Load from disk

In [36]:
model = load_model("../EssayScorer/model.keras")
y_pred = model.predict([X_vector_test, X_meta_test_pca])

fuzzy_accuracy(
    y_test.argmax(axis=1), # Get the vector index with the max value;
    y_pred.argmax(axis=1), # i.e., undo the one-hot encoding.
    tolerance=0)

0.620137299771167