In [15]:
import tensorflow as tf
import pandas as pd
import numpy as np
import spacy
import os
import datetime 

from keras.models import Sequential
from keras import optimizers
from keras.layers import Dense, LSTM, Bidirectional, Dropout, BatchNormalization
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

In [2]:
# Load the spaCy vectors package with the parser, tagger and NER components
# switched off.
nlp = spacy.load(
    'en_vectors_web_lg',
    disable=['parser', 'tagger', 'ner'],
)
# Sentence segmentation is left disabled:
# nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [3]:
# Directory holding the ASAP item files.
path = '/Users/andradea/Documents/cr-engine/datasets/ASAP/'
# Keep only directory entries whose name contains both 'CR' and '.csv'
# (substring match, in the order the directory listing returns them).
files = list(filter(lambda name: 'CR' in name and '.csv' in name,
                    os.listdir(path)))
files

['ASAP_CR10.csv',
 'ASAP_CR9.csv',
 'ASAP_CR8.csv',
 'ASAP_CR3.csv',
 'ASAP_CR2.csv',
 'ASAP_CR1.csv',
 'ASAP_CR5.csv',
 'ASAP_CR4.csv',
 'ASAP_CR6.csv',
 'ASAP_CR7.csv']

In [4]:
# Pick the item file by name. os.listdir() returns entries in arbitrary order,
# so a positional index into `files` can select a different item on another
# machine or after the directory changes.
file = 'ASAP_CR1.csv'
if file not in files:
    raise FileNotFoundError(file + ' not found in ' + path)
print(file)
# os.path.join works whether or not `path` ends with a separator.
df = pd.read_csv(os.path.join(path, file))
df.head(2)

ASAP_CR1.csv


Unnamed: 0,item,id,response,score_d1,r1_d1,r2_d1
0,ASAP_CR1,ASAP_CR1_1,First I would add that the students might want...,1,1,0
1,ASAP_CR1,ASAP_CR1_2,"In the procedure, the group of students said t...",1,1,1


In [5]:
# Scores as a NumPy array and the answer texts as a plain Python list.
labels = df['score_d1'].values
responses = df['response'].tolist()

In [6]:
# Token count of every response, as produced by the spaCy pipeline.
length = [len(nlp(response)) for response in responses]
# Mean token count, rounded up to a whole number of tokens.
int(np.ceil(np.mean(length)))

54

In [7]:
# Sequence length used for every response: the mean token count, rounded up.
max_len = int(np.ceil(np.mean(length)))
# Width of one word vector (assumes the loaded spaCy vectors are 300-d --
# TODO confirm against the model in use).
vec_size = 300
# Size the one-hot encoding from the largest label instead of the number of
# distinct labels. The labels are used as class indices, so a file whose
# observed scores skip a level (or do not start at 0) would otherwise get an
# encoding that is too narrow for its highest score.
num_classes = int(df.score_d1.max()) + 1
print('number of classes:', num_classes)

number of classes: 4


In [8]:
# Zero-initialised tensor: one (max_len x vec_size) matrix per response.
# Positions beyond the end of a short response stay all-zero.
response_embedded = np.zeros((len(responses), max_len, vec_size))

for i, text in enumerate(responses):
    # Only the first max_len tokens are kept; anything longer is truncated.
    kept_tokens = nlp(text)[:max_len]
    for j, token in enumerate(kept_tokens):
        response_embedded[i, j] = token.vector

response_embedded.shape

(400, 54, 300)

In [9]:
# One-hot encode the scores into rows of width num_classes.
one_hot_labels = to_categorical(
    labels,
    num_classes=num_classes,
)

In [10]:
# Hold out a quarter of the samples; the fixed random_state keeps the
# partition the same from run to run.
train_x, test_x, train_y, test_y = train_test_split(
    response_embedded,
    one_hot_labels,
    test_size=0.25,
    random_state=42,
)
print('training size:', len(train_x))
print('testing size:', len(test_x))

training size: 300
testing size: 100


In [11]:
# Classifier: a stacked recurrent encoder followed by a funnel of dense
# layers that ends in a softmax over the score classes.
model = Sequential([
    # Recurrent encoder. The two bidirectional layers emit one vector per
    # timestep (return_sequences=True); the last LSTM emits a single vector.
    Bidirectional(LSTM(256, return_sequences=True),
                  input_shape=(max_len, vec_size)),
    BatchNormalization(),
    Bidirectional(LSTM(512, return_sequences=True)),
    Dropout(0.5),
    LSTM(1024),
    BatchNormalization(),

    # First dense layer: dropout only, no batch normalisation.
    Dense(512, activation='relu'),
    Dropout(0.5),

    # Remaining dense layers halve in width; each is followed by batch
    # normalisation and dropout.
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(16, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    # Output layer: one probability per class.
    Dense(num_classes, activation='softmax'),
])

In [21]:
# Training hyper-parameters.
# The learning rate is a scalar float; it was previously wrapped in a
# one-element list, which the optimizer does not expect.
learning_rate = 0.01
# Upper bound on epochs; the early-stopping callback decides when to stop.
epochs = 10000
batch_size = 6

In [22]:
# Adam optimizer with the AMSGrad variant enabled and gradient value clipping.
optimizer = optimizers.Adam(
    lr=learning_rate,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=None,
    amsgrad=True,
    clipvalue=0.5,
)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)
# Early stopping watches val_loss with a 0.001 improvement threshold. No
# patience is passed, so the library default applies.
callbacks = [EarlyStopping(monitor='val_loss', min_delta=0.001, verbose=1)]
# NOTE(review): the test split is passed as validation data, so early stopping
# is driven by the same samples that are used as the held-out set.
history = model.fit(
    train_x,
    train_y,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(test_x, test_y),
    callbacks=callbacks,
)

Train on 300 samples, validate on 100 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 00004: early stopping


In [None]:
# Keras releases differ in the history key recorded for the accuracy metric
# ('acc' in older versions, 'accuracy' in newer ones); use whichever exists.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
# Each figure is taken from the best epoch of its own curve (max accuracy),
# so the two numbers may come from different epochs.
print('training error:', 1 - max(history.history[acc_key]),
      'validation error:', 1 - max(history.history['val_' + acc_key]))

In [None]:
# Quadratic-weighted Cohen's kappa between true and predicted classes.
# Both the one-hot targets and the predicted probabilities are reduced to a
# class index per row with np.argmax, without building temporary DataFrames.
train_pred = model.predict(train_x)
train_k = cohen_kappa_score(np.argmax(train_y, axis=1),
                            np.argmax(train_pred, axis=1),
                            weights='quadratic')
test_pred = model.predict(test_x)
test_k = cohen_kappa_score(np.argmax(test_y, axis=1),
                           np.argmax(test_pred, axis=1),
                           weights='quadratic')

print('training kappa:', train_k, 'validation kappa:', test_k)

In [None]:
# NOTE(review): hard-coded absolute output location; it sits under a different
# home directory than the dataset path used earlier -- confirm this is intended.
path_write = '/Users/alejandro/Documents/GitHub/CRAIS/LSTM/'
# Create the output directory up front so saving does not fail on a missing
# folder.
os.makedirs(path_write, exist_ok=True)

# File name pattern: LSTM_Model_ASAP_<item file stem>_<YYYY-MM-DD>.h5
timestamp = datetime.datetime.now().strftime('%Y-%m-%d')
# splitext removes only the trailing extension of the item file name.
model_name = 'LSTM_Model_ASAP_' + os.path.splitext(file)[0] + '_' + timestamp + '.h5'
model.save(os.path.join(path_write, model_name))