In [1]:
# Set Working Directory
import os

# os.chdir('..') is resolved against the *current* directory, so re-running this
# cell would climb one more level every time. Remember where the kernel started
# and always move to that directory's parent, which makes the cell idempotent.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [2]:
# Load Requirements
import pandas as pd
import  numpy as np
from keras.preprocessing.text import Tokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Imputer
from scipy import sparse

import zipfile
import re, nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer 

import keras
from keras.layers import Dense, concatenate, Input, Dropout
from keras.models import Model
from keras.callbacks import EarlyStopping

from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint

from helper import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [9]:
# Load the raw training and test frames and report their dimensions
data_train, data_test = load_data()
for caption, frame in (('data_train shape:', data_train),
                       ('data_test shape:', data_test)):
    print(caption, frame.shape)

data_train shape: (400277, 25)
data_test shape: (50064, 16)


In [10]:
# Build the feature frame from the train and test frames and report its size
data_features = load_features(data_train, data_test)
n_rows, n_cols = data_features.shape
print('data_features shape:', (n_rows, n_cols))

data_features shape: (450341, 16)


# Prepare dataset for classification

In [11]:
# Shared NLTK resources, filled in lazily by the first text_processing call.
_TEXT_PROCESSING_RESOURCES = {}


def _get_text_processing_resources():
    """Return (stop_words, stemmer), creating them once and reusing them afterwards."""
    if not _TEXT_PROCESSING_RESOURCES:
        # Build both before storing either, so a failure leaves the cache empty.
        stop_words = frozenset(stopwords.words("english"))
        stemmer = SnowballStemmer('english')
        _TEXT_PROCESSING_RESOURCES.update(stop_words=stop_words, stemmer=stemmer)
    return (_TEXT_PROCESSING_RESOURCES['stop_words'],
            _TEXT_PROCESSING_RESOURCES['stemmer'])


def text_processing(phrase):
    """
    Return string processed_phrase: the phrase after normalisation, with the
    surviving stemmed tokens joined back together by single spaces

    param string phrase: phrase to be processed

    Processing steps: lower-casing, replacing every character outside a-z0-9
    with a space, whitespace tokenisation, English stopword removal and
    Snowball (English) stemming.

    Required Libraries: re, nltk
    """

    # The stopword list and the stemmer are the same for every call; the
    # original rebuilt them for every single token, which dominated run time.
    stop_words, stemmer = _get_text_processing_resources()

    # Case Normalization
    processed_phrase = phrase.lower()

    # Remove Punctuations (anything that is not a lowercase letter or digit)
    processed_phrase = re.sub(r"[^a-z0-9]", " ", processed_phrase)

    # Tokenize Phrase
    tokens = processed_phrase.split()

    # Remove Stopwords, then stem what is left.
    # (WordNetLemmatizer-based lemmatization is an alternative that is not used here.)
    stemmed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]

    # Recombine list into phrase
    return ' '.join(stemmed_tokens)

def _combine_text_columns(frame, text_columns):
    """Join the given columns of `frame` row-wise into one phrase and run text_processing on it."""
    return (frame[text_columns]
                .fillna("")
                .apply(lambda row: " ".join(row), axis=1)
                .apply(text_processing))


def init_prep(data_train, data_test, data_features, label=None):
    """
    Return numpy array X_numeric: [Total, FTE] columns of the training set
    Return numpy array X_text: tokenizer matrix of the training text
    Return numpy array X_numeric_test: [Total, FTE] columns of the test set
    Return numpy array X_text_test: tokenizer matrix of the test text
    Return numpy array y: one-hot encoded labels (float64) of the training set
    Return keras Tokenizer tokenize: tokenizer fitted on data_features text

    Param pandas dataframe data_train: training data (features + labels)
    Param pandas dataframe data_test: test data (features)
    Param pandas dataframe data_features: data in feature columns of data_train and data_test
    Param label: label column(s) of data_train to one-hot encode; when empty or
                 None the nine default label columns listed below are used

    Side effect: a 'combined_text' column is added to (or overwritten in) all
    three dataframes.

    Required Libraries: pandas, numpy, keras
    Required helper functions: text_processing
    """

    # Every feature column except the two numeric ones is treated as text.
    # 'combined_text' is excluded as well: this function adds that column to
    # data_features, so without the exclusion a second call would feed its own
    # previous output back into the text.
    non_text_columns = ('FTE', 'Total', 'combined_text')
    text_columns = [column for column in data_features.columns
                    if column not in non_text_columns]

    # Combine and preprocess text columns
    for frame in (data_train, data_test, data_features):
        frame['combined_text'] = _combine_text_columns(frame, text_columns)

    # Vectorize text columns (vocabulary is fitted on data_features)
    tokenize = Tokenizer()
    tokenize.fit_on_texts(data_features['combined_text'])

    X_text = tokenize.texts_to_matrix(data_train['combined_text'])
    X_text_test = tokenize.texts_to_matrix(data_test['combined_text'])

    # Impute missing numerical data: Total gets the median of data_features['Total']
    imp_total = Imputer(strategy='median')
    imp_total.fit(data_features['Total'].values.reshape(-1, 1))

    total = imp_total.transform(data_train['Total'].values.reshape(-1, 1))
    total_test = imp_total.transform(data_test['Total'].values.reshape(-1, 1))

    # Missing FTE becomes numeric 0. The original filled with the string '0',
    # which makes the concatenated matrix object-dtype instead of numeric.
    # NOTE(review): assumes FTE is a numeric column -- confirm against load_data.
    fte = data_train['FTE'].fillna(0).values.reshape(-1, 1)
    fte_test = data_test['FTE'].fillna(0).values.reshape(-1, 1)

    # Create feature matrix
    X_numeric = np.concatenate([total, fte], axis=1)
    X_numeric_test = np.concatenate([total_test, fte_test], axis=1)

    # Create labels matrix
    if not label:
        label = ['Function',
                 'Object_Type',
                 'Operating_Status',
                 'Position_Type',
                 'Pre_K',
                 'Reporting',
                 'Sharing',
                 'Student_Type',
                 'Use']
    y = pd.get_dummies(data_train[label]).values.astype('float64')

    return X_numeric, X_text, X_numeric_test, X_text_test, y, tokenize

In [12]:
# Build the model inputs and labels (default label columns), then report their shapes
prepared = init_prep(data_train, data_test, data_features)
X_numeric, X_text, X_numeric_test, X_text_test, y, tokenize = prepared

for caption, matrix in (('X_numeric shape:', X_numeric),
                        ('X_numeric_test shape:', X_numeric_test),
                        ('X_text shape:', X_text),
                        ('X_text_test shape:', X_text_test),
                        ('y shape:', y)):
    print(caption, matrix.shape)

X_numeric shape: (400277, 2)
X_numeric_test shape: (50064, 2)
X_text shape: (400277, 3363)
X_text_test shape: (50064, 3363)
y shape: (400277, 104)


In [13]:
# Persist every prepared matrix as a comma-separated text file
for csv_name, matrix in (('X_numeric.csv', X_numeric),
                         ('X_numeric_test.csv', X_numeric_test),
                         ('X_text.csv', X_text),
                         ('X_text_test.csv', X_text_test),
                         ('y.csv', y)):
    np.savetxt(csv_name, matrix, fmt='%5s', delimiter=",")

In [14]:
def _read_matrix(csv_name):
    """Read a headerless CSV file and return its contents as a numpy array."""
    return pd.read_csv(csv_name, header=None).values


# Reload the prepared matrices from disk
X_numeric = _read_matrix('X_numeric.csv')
X_text = _read_matrix('X_text.csv')
X_numeric_test = _read_matrix('X_numeric_test.csv')
X_text_test = _read_matrix('X_text_test.csv')
y = _read_matrix('y.csv')

for caption, matrix in (('X_numeric shape:', X_numeric),
                        ('X_numeric_test shape:', X_numeric_test),
                        ('X_text shape:', X_text),
                        ('X_text_test shape:', X_text_test),
                        ('y shape:', y)):
    print(caption, matrix.shape)

X_numeric shape: (400277, 2)
X_numeric_test shape: (50064, 2)
X_text shape: (400277, 3363)
X_text_test shape: (50064, 3363)
y shape: (400277, 104)


In [15]:
# Label columns to predict; `label` is reused later to name the submission columns
label = [
    'Function', 'Object_Type', 'Operating_Status',
    'Position_Type', 'Pre_K', 'Reporting',
    'Sharing', 'Student_Type', 'Use',
]

# One-hot encode the label columns of the training frame into a float matrix
label_dummies = pd.get_dummies(data_train[label])
y = label_dummies.values.astype('float64')
y.shape

(400277, 104)

# Build Model

In [16]:
def _build_label_head(text_input, numeric_input, hidden_units, n_classes, dropout_rate=0.2):
    """Build one label branch: a Dense(relu)+Dropout stack on the text input, concatenated with the numeric input, then a softmax layer."""
    hidden = text_input
    for units in hidden_units:
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(dropout_rate)(hidden)
    combined = concatenate([numeric_input, hidden])
    return Dense(n_classes, activation='softmax')(combined)


def build_network(X_numeric, X_text, X_numeric_test, X_text_test, y):
    """
    Return compiled keras-model model

    The model has two inputs ('numeric_input', 'text_input') and one output:
    the concatenation of nine softmax heads, one per label column.

    param numpy array X_numeric: numeric feature matrix (only its column count is used)
    param numpy array X_text: text feature matrix (only its column count is used)
    param numpy array X_numeric_test: unused, kept for interface compatibility
    param numpy array X_text_test: unused, kept for interface compatibility
    param numpy array y: labels matrix; when it is 2-D its column count is
                         checked against the total width of the output heads

    Raises ValueError: if y is 2-D and its column count differs from the
                       combined width of the nine heads

    Required Libraries: keras
    """

    # (label name, hidden layer widths on the text branch, number of classes).
    # The order defines the column order of the concatenated output.
    head_specs = [
        ('Function', (37, 37), 37),
        ('Object_Type', (37, 20), 11),
        ('Operating_Status', (30, 15, 7), 3),
        ('Position_Type', (37, 37), 25),
        ('Pre_K', (30, 15, 7), 3),
        ('Reporting', (30, 15, 7), 3),
        ('Sharing', (30, 15, 7), 5),
        ('Student_Type', (30, 15), 9),
        ('Use', (30, 15), 8),
    ]

    # The head widths are hard-coded, so fail early with a clear message when
    # they do not line up with the label matrix.
    total_classes = sum(n_classes for _, _, n_classes in head_specs)
    if y is not None and getattr(y, 'ndim', None) == 2 and y.shape[1] != total_classes:
        raise ValueError(
            'y has {} columns but the network outputs {} classes'.format(
                y.shape[1], total_classes))

    numeric_input = Input(shape=(X_numeric.shape[1],), name='numeric_input')
    text_input = Input(shape=(X_text.shape[1],), name='text_input')

    # Heads are created one after another in spec order, which keeps the layer
    # creation order of the original hand-written version.
    output_layers = [
        _build_label_head(text_input, numeric_input, hidden_units, n_classes)
        for _, hidden_units, n_classes in head_specs
    ]

    # Output
    combined_output_layer = concatenate(output_layers)

    model = Model(inputs=[numeric_input, text_input], outputs=[combined_output_layer])

    # NOTE(review): with a single concatenated output, 'accuracy' presumably
    # compares one argmax over all output columns rather than one per label,
    # so it is unlikely to reflect per-label accuracy -- confirm intended metric.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [19]:
clf = build_network(X_numeric, X_text, X_numeric_test, X_text_test, y)

weights_path = 'outputs/functional_model.h5'
architecture_path = 'outputs/functional_model.json'

# The checkpoint callback fails at the end of the first epoch if the target
# directory is missing, so create it up front.
os.makedirs('outputs', exist_ok=True)

early_stopping_monitor = EarlyStopping(patience=3)
# Only the weights of the best epoch are written. Previously a final
# clf.save_weights() to this same path replaced the best checkpoint with the
# weights of the last epoch, which can be up to `patience` epochs past the best.
checkpointer = ModelCheckpoint(filepath=weights_path, verbose=1,
                               save_best_only=True, save_weights_only=True)

clf.fit([X_numeric, X_text], y, epochs=30, batch_size=1024, validation_split=0.3,
        callbacks=[early_stopping_monitor, checkpointer])

# Bring the in-memory model back to the best checkpoint so it matches the file.
clf.load_weights(weights_path)

model_json = clf.to_json()
with open(architecture_path, 'w') as json_file:
    json_file.write(model_json)

Train on 280193 samples, validate on 120084 samples
Epoch 1/30
  3072/280193 [..............................] - ETA: 1:07:39 - loss: 108.4906 - acc: 0.0641

KeyboardInterrupt: 

In [16]:
# Rebuild the architecture from its JSON description; the context manager
# closes the file even if reading fails.
with open('outputs/functional_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Restore the trained weights into the rebuilt model
loaded_model.load_weights('outputs/functional_model.h5')
print("Loaded model from disk")

Loaded model from disk


In [17]:
# Column names come from one-hot encoding the training labels with a '__' separator
submission_columns = pd.get_dummies(data_train[label], prefix_sep='__').columns

# Score the test set with the reloaded model
predictions = loaded_model.predict([X_numeric_test, X_text_test])

# One row per test record, indexed like the test frame
submission = pd.DataFrame(data=predictions,
                          index=data_test.index,
                          columns=submission_columns)

submission.to_csv('submissions/functional_model.csv')