In [1]:
import keras
from keras.layers import Input, Dense, Embedding, LSTM, concatenate, Flatten, Dropout
from keras.utils import np_utils 
from keras.datasets import mnist 
from keras.models import Model
from keras.optimizers import Adadelta
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence, Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
import numpy as np 
import pandas as pd

Using TensorFlow backend.


## Import Data

In [64]:
data = pd.read_csv('preprocessed_data.csv')
data.head(2)

Unnamed: 0,school_state,teacher_prefix,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,clean_categories,clean_subcategories,essay,price
0,ca,mrs,grades_prek_2,53,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,725.05
1,ut,ms,grades_3_5,4,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,213.03


In [65]:
Y = data['project_is_approved']
X = data.drop('project_is_approved', axis = 1)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify = Y, random_state = 42, test_size = 0.1)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, stratify = y_train, random_state = 42, test_size = 0.1)

In [67]:
print('Train Size :', X_train.shape)
print('CV Size :', X_cv.shape)
print('Test Size :', X_test.shape)


Train Size : (88490, 8)
CV Size : (9833, 8)
Test Size : (10925, 8)


### Encode Essays

In [68]:
import pickle
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)
    glove_words = set(model.keys())

In [69]:
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000)
X_train_essays = X_train['essay'].values

tokenizer.fit_on_texts(X_train_essays)

train_sequences = tokenizer.texts_to_sequences(X_train_essays)
cv_sequences = tokenizer.texts_to_sequences(X_cv['essay'].values)
test_sequences = tokenizer.texts_to_sequences(X_test['essay'].values)


MAX_SEQUENCE_LENGTH = np.max([len(essay) for essay in train_sequences])

X_padded_essays_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_padded_essays_cv = pad_sequences(cv_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_padded_essays_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)



In [70]:
from tqdm import tqdm_notebook
GLOVE_VECTOR_DIMENSION = 300

word_index = tokenizer.word_index

embedding_matrix = np.zeros((len(word_index) + 1, GLOVE_VECTOR_DIMENSION))
for word, i in tqdm_notebook(word_index.items()):
    embedding_vector = model.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

HBox(children=(IntProgress(value=0, max=51964), HTML(value='')))




## Define the LSTM Model 

In [71]:
X_train_school_state = X_train['school_state'].values
state_vocab = list(set(state for state in X_train_school_state ))
vocab_size = len(state_vocab)

X_state_train = [one_hot(state, vocab_size) for state in X_train_school_state]
X_state_cv = [one_hot(state, vocab_size) for state in X_cv['school_state'].values]
X_state_test = [one_hot(state, vocab_size) for state in X_test['school_state'].values]

In [72]:
X_train_project_grade_category = X_train['project_grade_category'].values
grade_vocab = list(set(grade for grade in X_train_project_grade_category ))
vocab_size = len(grade_vocab)

X_grade_train = [one_hot(grade, vocab_size, filters = '') for grade in X_train_project_grade_category]
X_grade_cv = [one_hot(grade, vocab_size, filters = '') for grade in X_cv['project_grade_category'].values]
X_grade_test = [one_hot(grade, vocab_size, filters = '') for grade in X_test['project_grade_category'].values]

In [73]:
X_train_categories = X_train['clean_categories'].values
category_vocab = list(set(category for categories in X_train_categories for category in categories.split()))
vocab_size = len(category_vocab)

X_category_train = [one_hot(category, vocab_size, filters = '') for category in X_train_categories]
X_category_cv = [one_hot(category, vocab_size, filters = '') for category in X_cv['clean_categories'].values]
X_category_test = [one_hot(category, vocab_size, filters = '') for category in X_test['clean_categories'].values]

In [74]:
MAX_SEQUENCE_LENGTH_CATEGORY = np.max([len(essay) for essay in X_category_train])

X_category_train = pad_sequences(X_category_train, maxlen=MAX_SEQUENCE_LENGTH_CATEGORY)
X_category_cv = pad_sequences(X_category_cv, maxlen=MAX_SEQUENCE_LENGTH_CATEGORY)
X_category_test = pad_sequences(X_category_test, maxlen=MAX_SEQUENCE_LENGTH_CATEGORY)

In [75]:
X_train_sub_categories = X_train['clean_subcategories'].values
sub_category_vocab = list(set(category for categories in X_train_sub_categories for category in categories.split()))
vocab_size = len(sub_category_vocab)

X_sub_category_train = [one_hot(category, vocab_size, filters = '') for category in X_train_sub_categories]
X_sub_category_cv = [one_hot(category, vocab_size, filters = '') for category in X_cv['clean_subcategories'].values]
X_sub_category_test = [one_hot(category, vocab_size, filters = '') for category in X_test['clean_subcategories'].values]

In [76]:
MAX_SEQUENCE_LENGTH_SUB_CATEGORY = np.max([len(essay) for essay in X_sub_category_train])

X_sub_category_train = pad_sequences(X_sub_category_train, maxlen=MAX_SEQUENCE_LENGTH_SUB_CATEGORY)
X_sub_category_cv = pad_sequences(X_sub_category_cv, maxlen=MAX_SEQUENCE_LENGTH_SUB_CATEGORY)
X_sub_category_test = pad_sequences(X_sub_category_test, maxlen=MAX_SEQUENCE_LENGTH_SUB_CATEGORY)

In [77]:
X_train_teacher = X_train['teacher_prefix'].values
teacher_vocab = list(set(category for categories in X_train_teacher for category in categories.split()))
vocab_size = len(teacher_vocab)

X_teacher_train = [one_hot(prefix, vocab_size, filters = '') for prefix in X_train_teacher]
X_teacher_cv = [one_hot(prefix, vocab_size, filters = '') for prefix in X_cv['teacher_prefix'].values]
X_teacher_test = [one_hot(prefix, vocab_size, filters = '') for prefix in X_test['teacher_prefix'].values]

In [78]:
X_train_numeric = X_train[['teacher_number_of_previously_posted_projects', 'price']].values
X_cv_numeric = X_cv[['teacher_number_of_previously_posted_projects', 'price']].values
X_test_numeric = X_test[['teacher_number_of_previously_posted_projects', 'price']].values

In [84]:
# ESSAYS LAYER
input_essays = Input(shape=(MAX_SEQUENCE_LENGTH,))
x_essays = Embedding(len(word_index) + 1, 
              GLOVE_VECTOR_DIMENSION, 
              weights=[embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH,
              trainable = False)(input_essays)
x_essays = LSTM(10)(x_essays)
lstm_essay_model = Model(input_essays, x_essays)


EMBEDDING_DIMENSION = 50

# STATES LAYER
input_states = Input(shape=(1,))
x_state = Embedding(len(state_vocab) + 1, EMBEDDING_DIMENSION, input_length=1)(input_states)
x_state = Flatten()(x_state)
state_model = Model(input_states, x_state)

# GRADE LAYER
input_grades = Input(shape=(1,))
x_grade = Embedding(len(grade_vocab) + 1, EMBEDDING_DIMENSION, input_length=1)(input_grades)
x_grade = Flatten()(x_grade)
grade_model = Model(input_grades, x_grade)

# CATEGORY LAYER
input_category = Input(shape=(MAX_SEQUENCE_LENGTH_CATEGORY,))
x_category = Embedding(len(category_vocab) + 1, EMBEDDING_DIMENSION, input_length=MAX_SEQUENCE_LENGTH_CATEGORY)(input_category)
x_category = Flatten()(x_category)
category_model = Model(input_category, x_category)

# SUB CATEGORY LAYER
input_sub_category = Input(shape=(MAX_SEQUENCE_LENGTH_SUB_CATEGORY,))
x_sub_category = Embedding(len(sub_category_vocab) + 1, EMBEDDING_DIMENSION, input_length=MAX_SEQUENCE_LENGTH_SUB_CATEGORY)(input_sub_category)
x_sub_category = Flatten()(x_sub_category)
sub_category_model = Model(input_sub_category, x_sub_category)

# TEACHER PREFIX LAYER
input_teacher = Input(shape=(1,))
x_teacher = Embedding(len(teacher_vocab) + 1, EMBEDDING_DIMENSION, input_length=1)(input_teacher)
x_teacher = Flatten()(x_teacher)
teacher_model = Model(input_teacher, x_teacher)

# NUMERIC LAYER
input_teacher = Input(shape=(2,))
x_numeric = Dense(16, activation='relu')(input_teacher)
numeric_model = Model(input_teacher, x_numeric)


combined = concatenate([lstm_essay_model.output, 
                        state_model.output,
                        grade_model.output,
                        category_model.output,
                        sub_category_model.output,
                        teacher_model.output,
                        numeric_model.output
                       ])


x_combined = Dense(128, activation='relu')(combined)
x_combined = Dropout(0.5)(x_combined)

x_combined = Dense(128, activation='relu')(combined)
x_combined = Dropout(0.5)(x_combined)

x_combined = Dense(128, activation='relu')(combined)
x_combined = Dense(1, activation="sigmoid")(x_combined)

final_model = Model(inputs=[lstm_essay_model.input, 
                            state_model.input,
                            grade_model.input,
                            category_model.input,
                            sub_category_model.input,
                            teacher_model.input,
                            numeric_model.input], 
                    outputs = x_combined)

In [85]:
final_model.compile(loss = 'binary_crossentropy', metrics = ['accuracy'], optimizer = Adadelta())
final_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           (None, 320)          0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 3)            0                                            
__________________________________________________________________________________________________
input_19 (

In [86]:
test_input_length = 10000

input_train = [  np.array(X_padded_essays_train)[:test_input_length], 
                 np.array(X_state_train)[:test_input_length],
                 np.array(X_grade_train)[:test_input_length],
                 np.array(X_category_train)[:test_input_length],
                 np.array(X_sub_category_train)[:test_input_length],
                 np.array(X_teacher_train)[:test_input_length],
                 np.array(X_train_numeric)[:test_input_length],
                ]


input_cv = [  np.array(X_padded_essays_cv)[:test_input_length], 
                 np.array(X_state_cv)[:test_input_length],
                 np.array(X_grade_cv)[:test_input_length],
                 np.array(X_category_cv)[:test_input_length],
                 np.array(X_sub_category_cv)[:test_input_length],
                 np.array(X_teacher_cv)[:test_input_length],
                 np.array(X_cv_numeric)[:test_input_length],
                ]

In [None]:
final_model.fit(input_train, y_train[:test_input_length], 
         batch_size = 128, 
         epochs = 10,
         verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10

In [83]:
from sklearn.metrics import auc, roc_curve

y_cv_proba = final_model.predict(input_cv)
test_fpr, test_tpr, te_thresholds = roc_curve(y_cv, y_cv_proba)
auc(test_fpr, test_tpr)

0.42306862662400596

## Model 2

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(analyzer = 'word')
tfidf.fit(X_train['essay'].values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.boxplot(tfidf.idf_)
plt.title('Boxplot of IDF values for essay words')

Text(0.5, 1.0, 'Boxplot of IDF values for essay words')

In [8]:
print('25th Percentile :', np.percentile(tfidf.idf_, 25))
print('75th Percentile :', np.percentile(tfidf.idf_, 75))

idf_low_threshold = np.percentile(tfidf.idf_, 2.5)
idf_high_threshold = np.percentile(tfidf.idf_, 97.5)

25th Percentile : 9.682605929807279
75th Percentile : 11.697508950349542


In [9]:
from tqdm import tqdm_notebook

stop_words = []
idf_words = tfidf.get_feature_names()

for idf_value, word in tqdm_notebook(zip(tfidf.idf_, idf_words)):
    if idf_value < idf_low_threshold or idf_value > idf_high_threshold :
        stop_words.append(word)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
print('%d words removed after filtering, ' % len(stop_words))

1298 words removed after filtering, 


### Remove low `idf_` words from essays

In [11]:
def remove_low_idf_words(args):
        essay = args[0]
        pool_list = args[1]
        row_id = args[2]
        processed_essay = ' '.join(e for e in essay.split() if e not in stop_words)
        if len(processed_essay) == 0:
            processed_essay = '  '
        pool_list.append([row_id, processed_essay])

In [12]:
import multiprocessing
import os

manager = multiprocessing.Manager()
processed_essay = manager.list()
args = [(essay, processed_essay, row_id) for essay, row_id in zip(X_train['essay'].values, X_train.index)]
with multiprocessing.Pool(os.cpu_count()) as p:
    r = list(tqdm_notebook(p.imap(remove_low_idf_words, args), total=X_train.shape[0]))

HBox(children=(IntProgress(value=0, max=88490), HTML(value='')))




In [17]:
filtered_essay_train = pd.DataFrame(processed_essay[:88600], columns = ['id', 'filtered_essay'])
filtered_essay_train.index = filtered_essay_train['id']
filtered_essay_train = filtered_essay_train.drop(['id'], axis = 1)
X_train['id'] = X_train.index
X_train = X_train.merge(filtered_essay_train, on = 'id')

In [35]:
manager = multiprocessing.Manager()
processed_essay_cv = manager.list()
args = [(essay, processed_essay_cv, row_id) for essay, row_id in zip(X_cv['essay'].values, X_cv.index)]
with multiprocessing.Pool(os.cpu_count()) as p:
    r = list(tqdm_notebook(p.imap(remove_low_idf_words, args), total=X_cv.shape[0]))

HBox(children=(IntProgress(value=0, max=9833), HTML(value='')))




In [41]:
filtered_essay_cv = pd.DataFrame(processed_essay_cv[:88600], columns = ['id', 'filtered_essay'])
filtered_essay_cv.index = filtered_essay_cv['id']
filtered_essay_cv = filtered_essay_cv.drop(['id'], axis = 1)
X_cv['id'] = X_cv.index
X_cv = X_cv.merge(filtered_essay_cv, on = 'id')

In [43]:
manager = multiprocessing.Manager()
processed_essay_test = manager.list()
args = [(essay, processed_essay_test, row_id) for essay, row_id in zip(X_test['essay'].values, X_test.index)]
with multiprocessing.Pool(os.cpu_count()) as p:
    r = list(tqdm_notebook(p.imap(remove_low_idf_words, args), total=X_test.shape[0]))

HBox(children=(IntProgress(value=0, max=10925), HTML(value='')))




In [44]:
filtered_essay_test = pd.DataFrame(processed_essay_test[:88600], columns = ['id', 'filtered_essay'])
filtered_essay_test.index = filtered_essay_test['id']
filtered_essay_test = filtered_essay_test.drop(['id'], axis = 1)
X_test['id'] = X_test.index
X_test = X_test.merge(filtered_essay_test, on = 'id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


### Modelling

In [47]:
import pickle
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)
    glove_words = set(model.keys())

In [48]:
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000)
X_train_essays = X_train['filtered_essay'].values

tokenizer.fit_on_texts(X_train_essays)

train_sequences = tokenizer.texts_to_sequences(X_train_essays)
cv_sequences = tokenizer.texts_to_sequences(X_cv['filtered_essay'].values)
test_sequences = tokenizer.texts_to_sequences(X_test['filtered_essay'].values)


MAX_SEQUENCE_LENGTH = np.max([len(essay) for essay in train_sequences])

X_padded_essays_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_padded_essays_cv = pad_sequences(cv_sequences, maxlen=MAX_SEQUENCE_LENGTH)
X_padded_essays_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)



In [49]:
from tqdm import tqdm_notebook
GLOVE_VECTOR_DIMENSION = 300

word_index = tokenizer.word_index

embedding_matrix = np.zeros((len(word_index) + 1, GLOVE_VECTOR_DIMENSION))
for word, i in tqdm_notebook(word_index.items()):
    embedding_vector = model.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

HBox(children=(IntProgress(value=0, max=50666), HTML(value='')))




## Define the LSTM Model 

In [50]:
X_train_school_state = X_train['school_state'].values
state_vocab = list(set(state for state in X_train_school_state ))
vocab_size = len(state_vocab)

X_state_train = [one_hot(state, vocab_size) for state in X_train_school_state]
X_state_cv = [one_hot(state, vocab_size) for state in X_cv['school_state'].values]
X_state_test = [one_hot(state, vocab_size) for state in X_test['school_state'].values]

In [51]:
X_train_project_grade_category = X_train['project_grade_category'].values
grade_vocab = list(set(grade for grade in X_train_project_grade_category ))
vocab_size = len(grade_vocab)

X_grade_train = [one_hot(grade, vocab_size, filters = '') for grade in X_train_project_grade_category]
X_grade_cv = [one_hot(grade, vocab_size, filters = '') for grade in X_cv['project_grade_category'].values]
X_grade_test = [one_hot(grade, vocab_size, filters = '') for grade in X_test['project_grade_category'].values]

In [52]:
X_train_categories = X_train['clean_categories'].values
category_vocab = list(set(category for categories in X_train_categories for category in categories.split()))
vocab_size = len(category_vocab)

X_category_train = [one_hot(category, vocab_size, filters = '') for category in X_train_categories]
X_category_cv = [one_hot(category, vocab_size, filters = '') for category in X_cv['clean_categories'].values]
X_category_test = [one_hot(category, vocab_size, filters = '') for category in X_test['clean_categories'].values]

In [53]:
MAX_SEQUENCE_LENGTH_CATEGORY = np.max([len(essay) for essay in X_category_train])

X_category_train = pad_sequences(X_category_train, maxlen=MAX_SEQUENCE_LENGTH_CATEGORY)
X_category_cv = pad_sequences(X_category_cv, maxlen=MAX_SEQUENCE_LENGTH_CATEGORY)
X_category_test = pad_sequences(X_category_test, maxlen=MAX_SEQUENCE_LENGTH_CATEGORY)

In [54]:
X_train_sub_categories = X_train['clean_subcategories'].values
sub_category_vocab = list(set(category for categories in X_train_sub_categories for category in categories.split()))
vocab_size = len(sub_category_vocab)

X_sub_category_train = [one_hot(category, vocab_size, filters = '') for category in X_train_sub_categories]
X_sub_category_cv = [one_hot(category, vocab_size, filters = '') for category in X_cv['clean_subcategories'].values]
X_sub_category_test = [one_hot(category, vocab_size, filters = '') for category in X_test['clean_subcategories'].values]

In [55]:
MAX_SEQUENCE_LENGTH_SUB_CATEGORY = np.max([len(essay) for essay in X_sub_category_train])

X_sub_category_train = pad_sequences(X_sub_category_train, maxlen=MAX_SEQUENCE_LENGTH_SUB_CATEGORY)
X_sub_category_cv = pad_sequences(X_sub_category_cv, maxlen=MAX_SEQUENCE_LENGTH_SUB_CATEGORY)
X_sub_category_test = pad_sequences(X_sub_category_test, maxlen=MAX_SEQUENCE_LENGTH_SUB_CATEGORY)

In [56]:
X_train_teacher = X_train['teacher_prefix'].values
teacher_vocab = list(set(category for categories in X_train_teacher for category in categories.split()))
vocab_size = len(teacher_vocab)

X_teacher_train = [one_hot(prefix, vocab_size, filters = '') for prefix in X_train_teacher]
X_teacher_cv = [one_hot(prefix, vocab_size, filters = '') for prefix in X_cv['teacher_prefix'].values]
X_teacher_test = [one_hot(prefix, vocab_size, filters = '') for prefix in X_test['teacher_prefix'].values]

In [57]:
X_train_numeric = X_train[['teacher_number_of_previously_posted_projects', 'price']].values
X_cv_numeric = X_cv[['teacher_number_of_previously_posted_projects', 'price']].values
X_test_numeric = X_test[['teacher_number_of_previously_posted_projects', 'price']].values

In [58]:
# ESSAYS LAYER
input_essays = Input(shape=(MAX_SEQUENCE_LENGTH,))
x_essays = Embedding(len(word_index) + 1, 
              GLOVE_VECTOR_DIMENSION, 
              weights=[embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH,
              trainable = False)(input_essays)
x_essays = LSTM(10)(x_essays)
lstm_essay_model = Model(input_essays, x_essays)


EMBEDDING_DIMENSION = 5

# STATES LAYER
input_states = Input(shape=(1,))
x_state = Embedding(len(state_vocab) + 1, EMBEDDING_DIMENSION, input_length=1)(input_states)
x_state = Flatten()(x_state)
state_model = Model(input_states, x_state)

# GRADE LAYER
input_grades = Input(shape=(1,))
x_grade = Embedding(len(grade_vocab) + 1, EMBEDDING_DIMENSION, input_length=1)(input_grades)
x_grade = Flatten()(x_grade)
grade_model = Model(input_grades, x_grade)

# CATEGORY LAYER
input_category = Input(shape=(MAX_SEQUENCE_LENGTH_CATEGORY,))
x_category = Embedding(len(category_vocab) + 1, EMBEDDING_DIMENSION, input_length=MAX_SEQUENCE_LENGTH_CATEGORY)(input_category)
x_category = Flatten()(x_category)
category_model = Model(input_category, x_category)

# SUB CATEGORY LAYER
input_sub_category = Input(shape=(MAX_SEQUENCE_LENGTH_SUB_CATEGORY,))
x_sub_category = Embedding(len(sub_category_vocab) + 1, EMBEDDING_DIMENSION, input_length=MAX_SEQUENCE_LENGTH_SUB_CATEGORY)(input_sub_category)
x_sub_category = Flatten()(x_sub_category)
sub_category_model = Model(input_sub_category, x_sub_category)

# TEACHER PREFIX LAYER
input_teacher = Input(shape=(1,))
x_teacher = Embedding(len(teacher_vocab) + 1, EMBEDDING_DIMENSION, input_length=1)(input_teacher)
x_teacher = Flatten()(x_teacher)
teacher_model = Model(input_teacher, x_teacher)

# NUMERIC LAYER
input_teacher = Input(shape=(2,))
x_numeric = Dense(16, activation='relu')(input_teacher)
numeric_model = Model(input_teacher, x_numeric)


combined = concatenate([lstm_essay_model.output, 
                        state_model.output,
                        grade_model.output,
                        category_model.output,
                        sub_category_model.output,
                        teacher_model.output,
                        numeric_model.output
                       ])


x_combined = Dense(128, activation='relu')(combined)
x_combined = Dropout(0.5)(x_combined)

x_combined = Dense(128, activation='relu')(combined)
x_combined = Dropout(0.5)(x_combined)

x_combined = Dense(128, activation='relu')(combined)
x_combined = Dense(1, activation="sigmoid")(x_combined)

final_model = Model(inputs=[lstm_essay_model.input, 
                            state_model.input,
                            grade_model.input,
                            category_model.input,
                            sub_category_model.input,
                            teacher_model.input,
                            numeric_model.input], 
                    outputs = x_combined)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [59]:
final_model.compile(loss = 'binary_crossentropy', metrics = ['accuracy'], optimizer = Adadelta())
final_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 112)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 3)            0                                            
__________________________________________________________________________________________________
input_5 (I

In [62]:
test_input_length = 80000

input_train = [  np.array(X_padded_essays_train)[:test_input_length], 
                 np.array(X_state_train)[:test_input_length],
                 np.array(X_grade_train)[:test_input_length],
                 np.array(X_category_train)[:test_input_length],
                 np.array(X_sub_category_train)[:test_input_length],
                 np.array(X_teacher_train)[:test_input_length],
                 np.array(X_train_numeric)[:test_input_length],
                ]


input_cv = [  np.array(X_padded_essays_cv)[:test_input_length], 
                 np.array(X_state_cv)[:test_input_length],
                 np.array(X_grade_cv)[:test_input_length],
                 np.array(X_category_cv)[:test_input_length],
                 np.array(X_sub_category_cv)[:test_input_length],
                 np.array(X_teacher_cv)[:test_input_length],
                 np.array(X_cv_numeric)[:test_input_length],
                ]

In [63]:
final_model.fit(input_train, y_train[:test_input_length], 
         batch_size = 128, 
         epochs = 10,
         verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

KeyboardInterrupt: 

In [255]:
from sklearn.metrics import auc, roc_curve

y_cv_proba = final_model.predict(input_cv)
test_fpr, test_tpr, te_thresholds = roc_curve(y_cv, y_cv_proba)
auc(test_fpr, test_tpr)

0.4756647823894884