In [68]:
# Data-handling libraries
import pandas as pd
import scipy.sparse
import pickle

# Feature extraction/transformation libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Modelling libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, GRU, Embedding, Input, LSTM, Bidirectional, Concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.sequence import skipgrams
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Text-handling libraries
import spacy
nlp = spacy.load("en")
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('[A-Za-z\'-]+')

# Other libraries
import matplotlib.pyplot as plt
import itertools
from progress import show_progress
from joblib import Parallel, delayed

# Random seed
seed = 5777

# Options
pad_shape = 1000

# Read in data

In [2]:
essays = pd.read_csv(f"../data_private/essays_cleaned.csv")

# Define variables

## Word vectors

In [41]:
target_set = essays[(essays["essay_set"] == 3) | (essays["essay_set"] == 4)]

y = to_categorical(target_set["score"])

# Define vocabulary
#vocab = set(tokenizer.tokenize(" ".join(target_set["essay"]))) # using simple regex match
vocab = set(token.text for essay in target_set["essay"] for token in nlp.tokenizer(essay)) # using spacy's more sophisticated matcher

# Convert words to numerical indices <https://www.tensorflow.org/tutorials/text/text_generation>
word2idx = {u:i for i, u in enumerate(vocab)}
idx2word = np.array(list(vocab))

#X_vector = [[word2idx[token] for token in tokenizer.tokenize(essay)] for essay in target_set["essay"]] # nltk
X = [[word2idx[token.text] for token in nlp.tokenizer(essay)] for essay in target_set["essay"]] # spacy

In [48]:
#plt.scatter(target_set["types"], target_set["score"]);

In [54]:
target_set[target_set["types"] < 10][["essay", "types", "score"]]

Unnamed: 0,essay,types,score
4053,It think it means to be all you can be.,7.0,0.0
4318,The setting effects the cyclist very much. By ...,9.0,2.0
4825,The features of the setting affected the cycli...,8.0,1.0
5930,Saeng would return to her homeland.,6.0,0.0
6199,So she saying she will take the test next year.,9.0,1.0
6280,He wrote this because he is talking about ...,9.0,0.0
6313,Reserved need to check keenly,5.0,3.0
6444,NO IMAGE,2.0,1.0
6726,@PERSON1 go to the top and then over the hill.,9.0,0.0
6989,I think they think it will bring them luck,5.0,1.0


In [55]:
target_set.loc[6313, :]

essay_id                                   9870
essay_set                                     4
essay             Reserved need to check keenly
rater1_domain1                                3
rater2_domain1                                3
                              ...              
semicolons                                    0
link_words                                    0
pps                                           5
max_depth                                     2
score                                         3
Name: 6313, Length: 112, dtype: object

## Metadata

In [14]:
meta_cols = [
    "tokens",
    "types",
    "sent_len",
    "word_len",
    "freq",
    "semicolons",
    "link_words",
    "pps",
    "max_depth"
]

X_meta = target_set[meta_cols].to_numpy()

# Train/test split

In [15]:
X_vector_train, \
X_vector_test, \
X_meta_train, \
X_meta_test, \
y_train, \
y_test = train_test_split(X_vector, X_meta, y, random_state=seed);

# Scale

In [30]:
ss = StandardScaler()

# Word vectors
X_vector_train = pad_sequences(X_vector_train, maxlen=pad_shape)
X_vector_test = pad_sequences(X_vector_test, maxlen=pad_shape)

# Metadata
X_meta_train_sc = ss.fit_transform(X_meta_train)
X_meta_test_sc = ss.transform(X_meta_test)

pca = PCA(random_state=seed, n_components=5)
X_meta_train_pca = pca.fit_transform(X_meta_train_sc)
X_meta_test_pca = pca.fit_transform(X_meta_test_sc)

# Export train/test data

For gridsearching models on AWS

In [56]:
with open("nn-data.p", "wb") as outfile:
    pickle.dump((
        X_vector_train,
        X_vector_test,
        X_meta_train_pca,
        X_meta_test_pca,
        y_train,
        y_test,
        X_meta_train_pca.shape[1],
        len(vocab)
    ), outfile)

In [61]:
X_meta_train_pca.shape[1]

5

# Model

This RNN has two sets of inputs: the word vectors from the documents, which are put in at the beginning, and the metadata, which is incorporated after the GRU layer ([source](http://digital-thinking.de/deep-learning-combining-numerical-and-text-features-in-deep-neural-networks/)).

In [77]:
def keras_model(gru_neurons, l1_neurons, l2_neurons, alpha):
    # Borrowed in part from:
    # <https://stackoverflow.com/a/55234203>
    # <http://digital-thinking.de/deep-learning-combining-numerical-and-text-features-in-deep-neural-networks/>

    # Define inputs
    vector_input = Input(shape=(pad_shape,))
    meta_input = Input(shape=(X_meta_train_pca.shape[1],))

    # Define embedding and GRU layers
    rnn = Embedding(len(vocab), 96, input_length=pad_shape)(vector_input)
    rnn = Bidirectional(GRU(gru_neurons, return_sequences=True, kernel_regularizer=l2(0.01)))(rnn)
    rnn = Bidirectional(GRU(gru_neurons, return_sequences=False, kernel_regularizer=l2(0.01)))(rnn)

    # Incorporate metadata
    rnn = Concatenate()([rnn, meta_input])

    # Define hidden and output layers
    rnn = Dense(l1_neurons, activation="relu", kernel_regularizer=l2(0.01))(rnn)
    rnn = Dense(l2_neurons, activation="relu", kernel_regularizer=l2(0.01))(rnn)
    rnn = Dense(4, activation="softmax")(rnn)

    model = Model(inputs=[vector_input, meta_input], outputs=[rnn])
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["acc"])
    return model

best_score = 0
best_model = None
summary = ""

params = {
    "gru_neurons": [64, 128],
    "l1_neurons": [64],
    "l2_neurons": [64],
    "alpha": [1e-1]
}

# Fit single model
model = keras_model(64, 64, 64, 1e-1)
history = model.fit([X_vector_train, X_meta_train_pca], y_train,
                                             validation_data=([X_vector_test, X_meta_test_pca], y_test),
                                             epochs=5, verbose=1)

# # Hand-rolled gridsearch
# # @show_progress
# def keras_gridsearch(p):
#     global best_score, best_model, summary
#     model = keras_model(*p)
#     history = model.fit([X_vector_train, X_meta_train_pca], y_train,
#                         validation_data=([X_vector_test, X_meta_test_pca], y_test),
#                         epochs=3, verbose=1)
#     test_score = history.history["val_acc"][-1]
#     summary += str(p) + str(test_score) + "\n"
#     if test_score > best_score:
#         best_score = test_score
#         best_model = model

# #keras_gridsearch(itertools.product(*params.values()), update_freq=1)

# Parallel(n_jobs=-1, require='sharedmem')(delayed(keras_gridsearch)(p) for p in itertools.product(*params.values()))

# print(summary)

# # Save winning model to disk
model.save("../model.keras")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [89]:
def fuzzy_accuracy(y_true, y_pred, tolerance):
    """Returns accuracy of a model trained on numeric data with a tolerance.
       For example, with a tolerance of 1, a model prediction of 9 for a true
       value of 10 will be counted in the "fuzzy accuracy"."""

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    return np.mean(np.abs(y_true - y_pred) <= tolerance)

In [83]:
y_pred = model.predict([X_vector_test, X_meta_test_pca])

In [86]:
y_pred.argmax(axis=1)

array([3, 1, 2, 2, 1, 1, 3, 2, 3, 1, 2, 1, 3, 1, 1, 2, 2, 1, 1, 2, 2, 3,
       3, 1, 2, 1, 2, 3, 3, 1, 2, 3, 2, 2, 1, 1, 2, 1, 1, 3, 2, 3, 1, 2,
       2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 3, 3, 1, 2, 3, 2, 1, 3,
       2, 2, 3, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 1, 2,
       1, 3, 1, 1, 1, 1, 1, 2, 2, 1, 3, 2, 2, 3, 2, 1, 1, 3, 1, 2, 2, 1,
       2, 2, 3, 1, 1, 1, 1, 3, 3, 1, 2, 1, 2, 3, 1, 2, 2, 1, 1, 2, 2, 1,
       1, 2, 2, 1, 1, 1, 2, 3, 2, 2, 1, 2, 2, 1, 3, 1, 1, 1, 2, 1, 2, 1,
       1, 3, 2, 1, 2, 1, 2, 1, 3, 2, 2, 2, 2, 1, 3, 3, 1, 1, 2, 1, 2, 2,
       1, 3, 3, 1, 1, 3, 3, 2, 3, 2, 2, 3, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2,
       1, 2, 1, 2, 1, 3, 1, 2, 2, 3, 3, 1, 3, 3, 3, 2, 3, 2, 3, 1, 1, 1,
       1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 3, 2, 3, 2, 1, 2, 2, 1,
       2, 1, 3, 1, 1, 1, 2, 3, 1, 1, 2, 3, 1, 2, 2, 2, 1, 3, 3, 2, 1, 3,
       1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 3, 2, 3, 3, 2, 3, 1, 2, 2,
       2, 1, 2, 1, 3, 2, 2, 1, 2, 2, 2, 3, 2, 1, 1,

In [91]:
fuzzy_accuracy(y_test.argmax(axis=1), y_pred.argmax(axis=1), tolerance=1)

0.9736842105263158