In [1]:
# Set Working Directory
import os
os.chdir('..')

In [16]:
# Load Requirements
import pandas as pd
import  numpy as np
from keras.preprocessing.text import Tokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Imputer
from keras.preprocessing.sequence import pad_sequences
from scipy import sparse

import zipfile
import re, nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer 

import keras
from keras.layers import Dense, concatenate, Input, Dropout, Embedding, Flatten
from keras.models import Model
from keras.callbacks import EarlyStopping

from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint

from helper import *

In [3]:
# Load Data
data_train, data_test = load_data()
print('data_train shape:', data_train.shape)
print('data_test shape:', data_test.shape)

data_train shape: (400277, 25)
data_test shape: (50064, 16)


In [4]:
# Load Features
data_features = load_features(data_train, data_test)
print('data_features shape:', data_features.shape)

data_features shape: (450341, 16)


In [5]:
def text_processing(phrase):
    """
    Return list processed_phrase: phrase tokens after processing has been completed
    
    param string phrase: phrase to be processed
    
    Required Libraries: re, nltk
    """
    
    # Case Normalization
    processed_phrase = phrase.lower()
    
    # Remove Punctuations
    processed_phrase = re.sub(r"[^a-z0-9-]", " ", processed_phrase)
    
    # Tokenize Phrase
    processed_phrase = processed_phrase.split()
    
    # Remove Stopwords
    processed_phrase = [word for word in processed_phrase if word not in stopwords.words("english")]
    processed_phrase = [word for word in processed_phrase if word != '-']
    
    # Lemmatization
    processed_phrase = [WordNetLemmatizer().lemmatize(word) for word in processed_phrase]
    
    # Recombine list into phrase
    processed_phrase = ' '.join(processed_phrase)
    
    return processed_phrase

def init_prep(data_train, data_test, data_features, label=None):
    """
    Return numpy array X: feature matrix for classification model fitting
    Return numpy array y: labels matrix for classification model fitting
    Return numpy array X_test: feature matrix of test set
    
    Param pandas dataframe data_train: training data (features + labels)
    Param pandas dataframe data_test: test data (features)
    Param pandas dataframe data_features: data in feature columns of data_train and data_test
    
    Required Libraries: pandas, numpy, keras
    Required helper functions: text_processing
    """
    
    # Combined and preprocess text columns
    data_train['combined_text'] = (data_train[data_features.columns]
                                       .drop(columns=['FTE', 'Total'])
                                       .fillna("")
                                       .apply(lambda x: " ".join(x), axis=1)
                                       .apply(lambda x: text_processing(x))
                                  )
    data_test['combined_text'] = (data_test[data_features.columns]
                                       .drop(columns=['FTE', 'Total'])
                                       .fillna("")
                                       .apply(lambda x: " ".join(x), axis=1)
                                       .apply(lambda x: text_processing(x))
                                 )
    data_features['combined_text'] = (data_features
                                          .drop(columns=['FTE', 'Total'])
                                          .fillna("")
                                          .apply(lambda x: " ".join(x), axis=1)
                                          .apply(lambda x: text_processing(x))
                                     )
    
    # Vectorizer text columns in training data
    tokenize = Tokenizer()
    tokenize.fit_on_texts(data_features['combined_text'])
    
    X_text = tokenize.texts_to_sequences(data_train['combined_text'])
    X_text_test = tokenize.texts_to_sequences(data_test['combined_text'])
    
    X_text = pad_sequences(X_text, padding='post', maxlen=50, truncating='post')
    X_text_test = pad_sequences(X_text_test, padding='post', maxlen=50, truncating='post')
    
    # Impute missing numerical data
    imp_total = Imputer(strategy='median')
    imp_total.fit(data_features['Total'].values.reshape(-1, 1))


    total_not_missing = pd.isnull(data_train['Total']).astype(int).values.reshape(-1, 1)
    fte_not_missing = pd.isnull(data_train['FTE']).astype(int).values.reshape(-1, 1)
    total = imp_total.transform(data_train['Total'].values.reshape(-1, 1))
    fte = data_train['FTE'].fillna('0').values.reshape(-1, 1)

    total_not_missing_test = pd.isnull(data_test['Total']).astype(int).values.reshape(-1, 1)
    fte_not_missing_test = pd.isnull(data_test['FTE']).astype(int).values.reshape(-1, 1)
    total_test = imp_total.transform(data_test['Total'].values.reshape(-1, 1))
    fte_test = data_test['FTE'].fillna('0').values.reshape(-1, 1)
    
    # Create feature matrix
    X_numeric = np.concatenate([total, total_not_missing, fte, fte_not_missing], axis=1)
    X_numeric_test = np.concatenate([total_test, total_not_missing_test, fte_test, fte_not_missing_test], axis=1)
    
    # Create labels matrix
    if label:
        y = pd.get_dummies(data_train[label]).values.astype('float64')
    else:
        label = ['Function',
                 'Object_Type',
                 'Operating_Status',
                 'Position_Type',
                 'Pre_K',
                 'Reporting',
                 'Sharing',
                 'Student_Type',
                 'Use']
        y = pd.get_dummies(data_train[label]).values.astype('float64')
    
    return X_numeric, X_text, X_numeric_test, X_text_test, y, tokenize

In [6]:
X_numeric, X_text, X_numeric_test, X_text_test, y, tokenize = init_prep(data_train, data_test, data_features, label=None)
print('X_numeric shape:', X_numeric.shape)
print('X_numeric_test shape:', X_numeric_test.shape)
print('X_text shape:', X_text.shape)
print('X_text_test shape:', X_text_test.shape)
print('y shape:', y.shape)

X_numeric shape: (400277, 4)
X_numeric_test shape: (50064, 4)
X_text shape: (400277, 50)
X_text_test shape: (50064, 50)
y shape: (400277, 104)


In [12]:
print(tokenize.word_index)

{'general': 1, 'fund': 2, 'teacher': 3, 'regular': 4, 'service': 5, 'school': 6, 'employee': 7, 'instruction': 8, 'instructional': 9, 'salary': 10, 'education': 11, 'benefit': 12, 'professional': 13, 'title': 14, 'special': 15, 'support': 16, 'non': 17, 'project': 18, 'staff': 19, 'elementary': 20, 'part': 21, 'wage': 22, 'pay': 23, 'sub': 24, 'operation': 25, 'program': 26, 'food': 27, 'basic': 28, 'supply': 29, 'undesignated': 30, 'extra': 31, 'substitute': 32, 'duty': 33, 'transportation': 34, 'ed': 35, 'personnel': 36, 'undistributed': 37, 'time': 38, 'operating': 39, 'high': 40, 'personal': 41, 'term': 42, 'child': 43, 'training': 44, 'educational': 45, 'disadvantaged': 46, 'retirement': 47, 'k': 48, 'curriculum': 49, 'federal': 50, 'district': 51, 'overtime': 52, 'professi': 53, 'campus': 54, 'activity': 55, 'community': 56, 'student': 57, 'state': 58, 'short': 59, 'middle': 60, 'bus': 61, 'maintenance': 62, 'targeted': 63, 'assistance': 64, 'purpose': 65, 'secondary': 66, '12': 

In [20]:
def build_network(X_numeric, X_text, X_numeric_test, X_text_test, y):
    """
    Return compiled keras-model model
    
    param numpy array X: feature matrix for classification
    param numpy array y: labels matrix for classification
    
    Required Libraries: keras
    """
    
    dropout_value = 0.5
    embedding_vector_length = 400
    
    numeric_input = Input(shape=(X_numeric.shape[1],) , name='numeric_input') 
    text_input = Input(shape=(X_text.shape[1],) , name='text_input')
#     word_embedding = Embedding(input_dim=3804, output_dim=50, mask_zero=True, input_length=50)(text_input)
#     word_embedding = Flatten()(word_embedding)
    
    # Function
    word_embedding_function = Embedding(input_dim=3804, output_dim=embedding_vector_length, mask_zero=False, input_length=50)(text_input)
    word_embedding_function = Flatten()(word_embedding_function)
    text_function_hidden_layer_1 = Dense(200, activation='relu')(word_embedding_function)
    text_function_hidden_layer_1 = Dropout(dropout_value)(text_function_hidden_layer_1)
    text_function_hidden_layer_2 = Dense(100, activation='relu')(text_function_hidden_layer_1)
    text_function_hidden_layer_2 = Dropout(dropout_value)(text_function_hidden_layer_2)
    text_function_hidden_layer_3 = Dense(50, activation='relu')(text_function_hidden_layer_2)
    text_function_hidden_layer_3 = Dropout(dropout_value)(text_function_hidden_layer_3)
    numeric_function_hidden_layer_1 = Dense(4, activation='relu')(numeric_input)
    numeric_function_hidden_layer_1 = Dropout(dropout_value)(numeric_function_hidden_layer_1)
    combined_function_layer = concatenate([numeric_function_hidden_layer_1, text_function_hidden_layer_2])
    function_output_layer = Dense(37, activation='softmax')(combined_function_layer)
    
    # Object_Type
    word_embedding_object_type = Embedding(input_dim=3804, output_dim=embedding_vector_length, mask_zero=False, input_length=50)(text_input)
    word_embedding_object_type = Flatten()(word_embedding_object_type)
    text_object_type_hidden_layer_1 = Dense(200, activation='relu')(word_embedding_object_type)
    text_object_type_hidden_layer_1 = Dropout(dropout_value)(text_object_type_hidden_layer_1)
    text_object_type_hidden_layer_2 = Dense(100, activation='relu')(text_object_type_hidden_layer_1)
    text_object_type_hidden_layer_2 = Dropout(dropout_value)(text_object_type_hidden_layer_2)
    text_object_type_hidden_layer_3 = Dense(50, activation='relu')(text_object_type_hidden_layer_2)
    text_object_type_hidden_layer_3 = Dropout(dropout_value)(text_object_type_hidden_layer_3)
    text_object_type_hidden_layer_4 = Dense(25, activation='relu')(text_object_type_hidden_layer_3)
    text_object_type_hidden_layer_4 = Dropout(dropout_value)(text_object_type_hidden_layer_4)
    numeric_object_type_hidden_layer_1 = Dense(4, activation='relu')(numeric_input)
    numeric_object_type_hidden_layer_1 = Dropout(dropout_value)(numeric_object_type_hidden_layer_1)
    combined_object_type_layer = concatenate([numeric_object_type_hidden_layer_1, text_object_type_hidden_layer_4])
    object_type_output_layer = Dense(11, activation='softmax')(combined_object_type_layer)
    
    # Operating_Status
    word_embedding_operating_status = Embedding(input_dim=3804, output_dim=embedding_vector_length, mask_zero=False, input_length=50)(text_input)
    word_embedding_operating_status = Flatten()(word_embedding_operating_status)
    text_operating_status_hidden_layer_1 = Dense(200, activation='relu')(word_embedding_operating_status)
    text_operating_status_hidden_layer_1 = Dropout(dropout_value)(text_operating_status_hidden_layer_1)
    text_operating_status_hidden_layer_2 = Dense(100, activation='relu')(text_operating_status_hidden_layer_1)
    text_operating_status_hidden_layer_2 = Dropout(dropout_value)(text_operating_status_hidden_layer_2)
    text_operating_status_hidden_layer_3 = Dense(50, activation='relu')(text_operating_status_hidden_layer_2)
    text_operating_status_hidden_layer_3 = Dropout(dropout_value)(text_operating_status_hidden_layer_3)
    text_operating_status_hidden_layer_4 = Dense(25, activation='relu')(text_operating_status_hidden_layer_3)
    text_operating_status_hidden_layer_4 = Dropout(dropout_value)(text_operating_status_hidden_layer_4)
    numeric_operating_status_hidden_layer_1 = Dense(4, activation='relu')(numeric_input)
    numeric_operating_status_hidden_layer_1 = Dropout(dropout_value)(numeric_operating_status_hidden_layer_1)
    combined_operating_status_layer = concatenate([numeric_operating_status_hidden_layer_1, text_operating_status_hidden_layer_4])
    operating_status_output_layer = Dense(3, activation='softmax')(combined_operating_status_layer)
    
    # Position_Type
    word_embedding_position_type = Embedding(input_dim=3804, output_dim=embedding_vector_length, mask_zero=False, input_length=50)(text_input)
    word_embedding_position_type = Flatten()(word_embedding_position_type)
    text_position_type_hidden_layer_1 = Dense(200, activation='relu')(word_embedding_operating_status)
    text_position_type_hidden_layer_1 = Dropout(dropout_value)(text_position_type_hidden_layer_1)
    text_position_type_hidden_layer_2 = Dense(100, activation='relu')(text_position_type_hidden_layer_1)
    text_position_type_hidden_layer_2 = Dropout(dropout_value)(text_position_type_hidden_layer_2)
    text_position_type_hidden_layer_3 = Dense(50, activation='relu')(text_position_type_hidden_layer_2)
    text_position_type_hidden_layer_3 = Dropout(dropout_value)(text_position_type_hidden_layer_3)
    numeric_position_type_hidden_layer_1 = Dense(4, activation='relu')(numeric_input)
    numeric_position_type_hidden_layer_1 = Dropout(dropout_value)(numeric_position_type_hidden_layer_1)
    combined_position_type_layer = concatenate([numeric_position_type_hidden_layer_1, text_position_type_hidden_layer_3])
    position_type_output_layer = Dense(25, activation='softmax')(combined_position_type_layer)
    
    # Pre_K
    word_embedding_pre_k = Embedding(input_dim=3804, output_dim=embedding_vector_length, mask_zero=False, input_length=50)(text_input)
    word_embedding_pre_k = Flatten()(word_embedding_pre_k)
    text_pre_k_hidden_layer_1 = Dense(200, activation='relu')(word_embedding_pre_k)
    text_pre_k_hidden_layer_1 = Dropout(dropout_value)(text_pre_k_hidden_layer_1)
    text_pre_k_hidden_layer_2 = Dense(100, activation='relu')(text_pre_k_hidden_layer_1)
    text_pre_k_hidden_layer_2 = Dropout(dropout_value)(text_pre_k_hidden_layer_2)
    text_pre_k_hidden_layer_3 = Dense(50, activation='relu')(text_pre_k_hidden_layer_2)
    text_pre_k_hidden_layer_3 = Dropout(dropout_value)(text_pre_k_hidden_layer_3)
    text_pre_k_hidden_layer_4 = Dense(25, activation='relu')(text_pre_k_hidden_layer_3)
    text_pre_k_hidden_layer_4 = Dropout(dropout_value)(text_pre_k_hidden_layer_4)
    numeric_pre_k_hidden_layer_1 = Dense(4, activation='relu')(numeric_input)
    numeric_pre_k_hidden_layer_1 = Dropout(dropout_value)(numeric_pre_k_hidden_layer_1)
    combined_pre_k_layer = concatenate([numeric_pre_k_hidden_layer_1, text_pre_k_hidden_layer_4])
    pre_k_output_layer = Dense(3, activation='softmax')(combined_pre_k_layer)
    
    # Reporting
    word_embedding_reporting = Embedding(input_dim=3804, output_dim=embedding_vector_length, mask_zero=False, input_length=50)(text_input)
    word_embedding_reporting = Flatten()(word_embedding_reporting)
    text_reporting_hidden_layer_1 = Dense(200, activation='relu')(word_embedding_reporting)
    text_reporting_hidden_layer_1 = Dropout(dropout_value)(text_reporting_hidden_layer_1)
    text_reporting_hidden_layer_2 = Dense(100, activation='relu')(text_reporting_hidden_layer_1)
    text_reporting_hidden_layer_2 = Dropout(dropout_value)(text_reporting_hidden_layer_2)
    text_reporting_hidden_layer_3 = Dense(50, activation='relu')(text_reporting_hidden_layer_2)
    text_reporting_hidden_layer_3 = Dropout(dropout_value)(text_reporting_hidden_layer_3)
    text_reporting_hidden_layer_4 = Dense(25, activation='relu')(text_reporting_hidden_layer_3)
    text_reporting_hidden_layer_4 = Dropout(dropout_value)(text_reporting_hidden_layer_4)
    numeric_reporting_hidden_layer_1 = Dense(4, activation='relu')(numeric_input)
    numeric_reporting_hidden_layer_1 = Dropout(dropout_value)(numeric_reporting_hidden_layer_1)
    combined_reporting_layer = concatenate([numeric_reporting_hidden_layer_1, text_reporting_hidden_layer_4])
    reporting_output_layer = Dense(3, activation='softmax')(combined_reporting_layer)
    
    # Sharing
    word_embedding_sharing = Embedding(input_dim=3804, output_dim=embedding_vector_length, mask_zero=False, input_length=50)(text_input)
    word_embedding_sharing = Flatten()(word_embedding_sharing)
    text_sharing_hidden_layer_1 = Dense(200, activation='relu')(word_embedding_sharing)
    text_sharing_hidden_layer_1 = Dropout(dropout_value)(text_sharing_hidden_layer_1)
    text_sharing_hidden_layer_2 = Dense(100, activation='relu')(text_sharing_hidden_layer_1)
    text_sharing_hidden_layer_2 = Dropout(dropout_value)(text_sharing_hidden_layer_2)
    text_sharing_hidden_layer_3 = Dense(50, activation='relu')(text_sharing_hidden_layer_2)
    text_sharing_hidden_layer_3 = Dropout(dropout_value)(text_sharing_hidden_layer_3)
    text_sharing_hidden_layer_4 = Dense(25, activation='relu')(text_sharing_hidden_layer_3)
    text_sharing_hidden_layer_4 = Dropout(dropout_value)(text_sharing_hidden_layer_4)
    numeric_sharing_hidden_layer_1 = Dense(4, activation='relu')(numeric_input)
    numeric_sharing_hidden_layer_1 = Dropout(dropout_value)(numeric_sharing_hidden_layer_1)
    combined_sharing_layer = concatenate([numeric_sharing_hidden_layer_1, text_sharing_hidden_layer_4])
    sharing_output_layer = Dense(5, activation='softmax')(combined_sharing_layer)
    
    # Student_Type
    word_embedding_student_type = Embedding(input_dim=3804, output_dim=embedding_vector_length, mask_zero=False, input_length=50)(text_input)
    word_embedding_student_type = Flatten()(word_embedding_student_type)
    text_student_type_hidden_layer_1 = Dense(200, activation='relu')(word_embedding_student_type)
    text_student_type_hidden_layer_1 = Dropout(dropout_value)(text_student_type_hidden_layer_1)
    text_student_type_hidden_layer_2 = Dense(100, activation='relu')(text_student_type_hidden_layer_1)
    text_student_type_hidden_layer_2 = Dropout(dropout_value)(text_student_type_hidden_layer_2)
    text_student_type_hidden_layer_3 = Dense(50, activation='relu')(text_student_type_hidden_layer_2)
    text_student_type_hidden_layer_3 = Dropout(dropout_value)(text_student_type_hidden_layer_3)
    text_student_type_hidden_layer_4 = Dense(25, activation='relu')(text_student_type_hidden_layer_3)
    text_student_type_hidden_layer_4 = Dropout(dropout_value)(text_student_type_hidden_layer_4)
    numeric_student_type_hidden_layer_1 = Dense(4, activation='relu')(numeric_input)
    numeric_student_type_hidden_layer_1 = Dropout(dropout_value)(numeric_student_type_hidden_layer_1)
    combined_student_type_layer = concatenate([numeric_student_type_hidden_layer_1, text_student_type_hidden_layer_4])
    student_type_output_layer = Dense(9, activation='softmax')(combined_student_type_layer)
    
    # Use
    word_embedding_use = Embedding(input_dim=3804, output_dim=embedding_vector_length, mask_zero=False, input_length=50)(text_input)
    word_embedding_use = Flatten()(word_embedding_use)
    text_use_hidden_layer_1 = Dense(200, activation='relu')(word_embedding_use)
    text_use_hidden_layer_1 = Dropout(dropout_value)(text_use_hidden_layer_1)
    text_use_hidden_layer_2 = Dense(100, activation='relu')(text_use_hidden_layer_1)
    text_use_hidden_layer_2 = Dropout(dropout_value)(text_use_hidden_layer_2)
    text_use_hidden_layer_3 = Dense(50, activation='relu')(text_use_hidden_layer_2)
    text_use_hidden_layer_3 = Dropout(dropout_value)(text_use_hidden_layer_3)
    text_use_hidden_layer_4 = Dense(25, activation='relu')(text_use_hidden_layer_3)
    text_use_hidden_layer_4 = Dropout(dropout_value)(text_use_hidden_layer_4)
    numeric_use_hidden_layer_1 = Dense(4, activation='relu')(numeric_input)
    numeric_use_hidden_layer_1 = Dropout(dropout_value)(numeric_use_hidden_layer_1)
    combined_use_layer = concatenate([numeric_use_hidden_layer_1, text_use_hidden_layer_4])
    use_output_layer = Dense(8, activation='softmax')(combined_use_layer)
    
    # Output
    combined_output_layer = concatenate([function_output_layer, 
                                         object_type_output_layer,
                                         operating_status_output_layer,
                                         position_type_output_layer,
                                         pre_k_output_layer,
                                         reporting_output_layer,
                                         sharing_output_layer,
                                         student_type_output_layer,
                                         use_output_layer])
    
    model = Model(inputs=[numeric_input, text_input], outputs=[combined_output_layer])
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [24]:
clf = build_network(X_numeric, X_text, X_numeric_test, X_text_test, y)
# clf.load_weights('embedding_model.h5')

early_stopping_monitor = EarlyStopping(patience=10)
checkpointer = ModelCheckpoint(filepath="embedding_model.h5", verbose=1, save_best_only=True)

history_clf = clf.fit([X_numeric, X_text], y, epochs=150, batch_size=256, validation_split=0.3, callbacks=[early_stopping_monitor, checkpointer])

clf.save_weights("embedding_model.h5")
model_json = clf.to_json()
with open("embedding_model.json", "w") as json_file:
    json_file.write(model_json)

Train on 280193 samples, validate on 120084 samples
Epoch 1/150

Epoch 00001: val_loss improved from inf to 33.31125, saving model to embedding_model.h5
Epoch 2/150

Epoch 00002: val_loss improved from 33.31125 to 31.84376, saving model to embedding_model.h5
Epoch 3/150

Epoch 00003: val_loss improved from 31.84376 to 29.89720, saving model to embedding_model.h5
Epoch 4/150

Epoch 00004: val_loss improved from 29.89720 to 29.49446, saving model to embedding_model.h5
Epoch 5/150

Epoch 00005: val_loss did not improve from 29.49446
Epoch 6/150

Epoch 00006: val_loss improved from 29.49446 to 28.93242, saving model to embedding_model.h5
Epoch 7/150

Epoch 00007: val_loss improved from 28.93242 to 28.15993, saving model to embedding_model.h5
Epoch 8/150

Epoch 00008: val_loss improved from 28.15993 to 27.74094, saving model to embedding_model.h5
Epoch 9/150

Epoch 00009: val_loss did not improve from 27.74094
Epoch 10/150

Epoch 00010: val_loss improved from 27.74094 to 27.64061, saving mo

KeyboardInterrupt: 