In [1]:
# Set Working Directory
# Move one level up from the directory the kernel started in, so the relative
# paths used below ('outputs/...') resolve from there (presumably the project
# root, which would also hold the local `helper` module -- confirm).
# The starting directory is remembered the first time this cell runs, so
# re-executing the cell does not climb one directory higher on every run.
import os

if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '..'))

In [2]:
# Load Requirements
import pandas as pd
import  numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping

from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint


from helper import *

Using TensorFlow backend.


In [3]:
# Load Data
# load_data is not defined in this notebook; presumably it is provided by the
# star-import of `helper` -- confirm. It returns two objects that are unpacked
# as the training data and the test data (init_prep below documents both as
# pandas dataframes).
data_train, data_test = load_data()
# Report (rows, columns) of each set as a quick sanity check.
print('data_train shape:', data_train.shape)
print('data_test shape:', data_test.shape)

data_train shape: (400277, 25)
data_test shape: (50064, 16)


In [4]:
# Load Features
# load_features is not defined in this notebook; presumably it is provided by
# the star-import of `helper` -- confirm. init_prep below describes its result
# as the data in the feature columns of data_train and data_test.
data_features = load_features(data_train, data_test)
# Report (rows, columns) of the feature frame as a quick sanity check.
print('data_features shape:', data_features.shape)

data_features shape: (450341, 16)


# Prepare dataset for classification

In [5]:
def _combine_text(frame, text_columns):
    """Return a Series holding, per row, the lower-cased space-joined text of text_columns."""
    return (frame[text_columns]
                .fillna("")
                .apply(lambda row: " ".join(row), axis=1)
                .str.lower())


def init_prep(data_train, data_test, data_features, label=None):
    """
    Return numpy array X: feature matrix for classification model fitting
    Return numpy array y: labels matrix for classification model fitting
    Return numpy array X_test: feature matrix of test set
    
    Param pandas dataframe data_train: training data (features + labels)
    Param pandas dataframe data_test: test data (features)
    Param pandas dataframe data_features: data in feature columns of data_train and data_test
    Param str or list label: label column(s) of data_train to one-hot encode;
        when omitted (or empty) the nine default label columns listed below are used
    
    X and X_test are laid out as [Total, FTE, word counts...]. The input
    dataframes are not modified: the combined text is kept in local Series, so
    calling this function repeatedly yields the same matrices.
    
    Required Libraries: pandas, numpy, sklearn
    """
    
    # Text columns are all feature columns except the two numeric ones.
    # 'combined_text' is excluded as well, in case an earlier run already
    # stored such a column on the frames; otherwise it would be joined into
    # the text a second time.
    text_columns = [col for col in data_features.columns
                    if col not in ('FTE', 'Total', 'combined_text')]
    
    # Combined text columns
    text_train = _combine_text(data_train, text_columns)
    text_test = _combine_text(data_test, text_columns)
    text_all = _combine_text(data_features, text_columns)
    
    # Vectorize text columns
    # NOTE(review): the lookahead (?=\s+) only matches tokens that are followed
    # by whitespace, so the last token of each combined string is not counted.
    # The pattern is kept unchanged because altering it would change the
    # vocabulary and therefore the width of X.
    vec = CountVectorizer(stop_words='english',
                          token_pattern='[A-Za-z0-9]+(?=\\s+)') # initialize CountVectorizer
    vec.fit(text_all) # fit CountVectorizer to all data (labeled and unlabeled)
    
    # apply fitted CountVectorizer to the training (labeled) data and convert from sparse to regular matrix
    word_matrix = vec.transform(text_train).toarray()
    # apply fitted CountVectorizer to the test (unlabeled) data and convert from sparse to regular matrix
    word_matrix_test = vec.transform(text_test).toarray()
    
    # Impute missing numerical data: median for 'Total', mean for 'FTE',
    # both statistics computed on all data (data_features)
    # NOTE(review): `Imputer` is no longer available in recent scikit-learn
    # releases (SimpleImputer replaces it) -- confirm the installed version.
    imp_total = Imputer(strategy='median')
    imp_fte = Imputer(strategy='mean')
    
    imp_total.fit(data_features['Total'].values.reshape(-1, 1))
    imp_fte.fit(data_features['FTE'].values.reshape(-1, 1))
    
    total = imp_total.transform(data_train['Total'].values.reshape(-1, 1))
    fte = imp_fte.transform(data_train['FTE'].values.reshape(-1, 1))
    
    total_test = imp_total.transform(data_test['Total'].values.reshape(-1, 1))
    fte_test = imp_fte.transform(data_test['FTE'].values.reshape(-1, 1))
    
    # Create feature matrix
    X = np.concatenate([total, fte, word_matrix], axis=1)
    X_test = np.concatenate([total_test, fte_test, word_matrix_test], axis=1)
    
    # Create labels matrix (one-hot encoding of the label columns)
    if not label:
        label = ['Function',
                 'Object_Type',
                 'Operating_Status',
                 'Position_Type',
                 'Pre_K',
                 'Reporting',
                 'Sharing',
                 'Student_Type',
                 'Use']
    y = pd.get_dummies(data_train[label]).values.astype('float64')
    
    return X, y, X_test

In [6]:
# Build the training feature matrix, the label matrix and the test feature
# matrix. `label` is left at its default, so init_prep one-hot encodes its
# built-in list of nine label columns.
# NOTE: init_prep converts the word counts with .toarray(), so X and X_test
# are dense arrays.
X, y, X_test = init_prep(data_train, data_test, data_features)
# X and X_test must have the same number of columns; y has one column per
# one-hot encoded label value.
print('X shape:', X.shape)
print('y shape:', y.shape)
print('X_test shape:', X_test.shape)

X shape: (400277, 3550)
y shape: (400277, 104)
X_test shape: (50064, 3550)


# Build Neural Network

In [7]:
def build_network(X, y):
    """
    Return compiled keras-model model: a fully connected network with three
    relu hidden layers (2000, 1000, 500 units) and a softmax output layer
    
    param numpy array X: feature matrix for classification (only X.shape[1] is used, as input width)
    param numpy array y: labels matrix for classification (only y.shape[1] is used, as output width)
    
    Required Libraries: keras
    """
    
    n_inputs = X.shape[1]
    n_outputs = y.shape[1]
    
    layers = [
        Dense(2000, activation='relu', input_shape=(n_inputs,)), # input layer
        Dense(1000, activation='relu'),
        Dense(500, activation='relu'),
        Dense(n_outputs, activation='softmax'), # output layer
    ]
    model = Sequential(layers)
    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    return model

In [None]:
# Make sure the output directory exists before anything is written to it.
os.makedirs("outputs", exist_ok=True)

clf = build_network(X, y)

# Store the architecture before training, so it is on disk even if the
# (long) fit below is interrupted.
model_json = clf.to_json()
with open("outputs/base_model.json", "w") as json_file:
    json_file.write(model_json)

# Stop once val_loss has not improved for 3 epochs; the checkpoint keeps only
# the weights of the best epoch seen so far.
early_stopping_monitor = EarlyStopping(patience=3)
checkpointer = ModelCheckpoint(filepath="outputs/base_model.h5", verbose=1,
                               save_best_only=True, save_weights_only=True)
clf.fit(X, y, epochs=30, validation_split=0.1, callbacks=[early_stopping_monitor, checkpointer])

# After early stopping the in-memory weights are from the last epoch, not the
# best one. Reload the checkpointed best weights instead of overwriting the
# checkpoint file with the last-epoch weights.
clf.load_weights("outputs/base_model.h5")

Train on 360249 samples, validate on 40028 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 29.12430, saving model to outputs/base_model.h5
Epoch 2/30
  7008/360249 [..............................] - ETA: 32:01 - loss: 29.9073 - acc: 0.0265

In [None]:
# Label columns whose one-hot encodings name the submission columns
# (the same nine columns init_prep encodes by default).
label = ['Function', 'Object_Type', 'Operating_Status',
         'Position_Type', 'Pre_K', 'Reporting',
         'Sharing', 'Student_Type', 'Use']

# Score the test set and report the shapes as a sanity check.
prediction = clf.predict(X_test)
print("X_test shape:", X_test.shape)
print("predictions shape:", prediction.shape)

# One row per test record, one column per encoded label value; the column
# names are built with a '__' separator between label column and value.
submission = pd.DataFrame(
    data=prediction,
    columns=pd.get_dummies(data_train[label], prefix_sep='__').columns,
    index=data_test.index,
)
submission.to_csv('outputs/submissions/base_model.csv')

# Load model

In [8]:
# Rebuild the network from its stored JSON architecture. The context manager
# closes the file even if reading fails.
with open('outputs/base_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
clf = model_from_json(loaded_model_json)

# Restore the trained weights into the rebuilt architecture.
clf.load_weights('outputs/base_model.h5')

# Compile with the same settings build_network uses.
clf.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [9]:
# Label columns whose one-hot encodings name the submission columns
# (the same nine columns init_prep encodes by default).
label = ['Function', 'Object_Type', 'Operating_Status',
         'Position_Type', 'Pre_K', 'Reporting',
         'Sharing', 'Student_Type', 'Use']

# Score the test set with the loaded model and report the shapes as a sanity check.
prediction = clf.predict(X_test)
print("X_test shape:", X_test.shape)
print("predictions shape:", prediction.shape)

# One row per test record, one column per encoded label value; the column
# names are built with a '__' separator between label column and value.
submission = pd.DataFrame(
    data=prediction,
    columns=pd.get_dummies(data_train[label], prefix_sep='__').columns,
    index=data_test.index,
)
submission.to_csv('outputs/submissions/base_model.csv')

X_test shape: (50064, 3550)
predictions shape: (50064, 104)


# Basic CNN

In [7]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from keras.callbacks import EarlyStopping

from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint

In [38]:
def build_network(X, y):
    """
    Return compiled keras-model model: two Conv1D + MaxPooling1D stages,
    followed by a 500-unit relu layer and a softmax output layer
    
    NOTE: this definition replaces the fully connected build_network defined
    earlier in the notebook once this cell has run.
    
    param numpy array X: feature matrix for classification (only X.shape[1] is used;
                         the model itself expects input of shape (X.shape[1], 1))
    param numpy array y: labels matrix for classification (only y.shape[1] is used, as output width)
    
    Required Libraries: keras
    """
    
    n_steps = X.shape[1]
    n_outputs = y.shape[1]
    
    layers = [
        Conv1D(filters=2000, kernel_size=100, strides=10, padding='valid',
               activation='relu', input_shape=(n_steps, 1)), # input layer
        MaxPooling1D(pool_size=2, padding='valid'),
        Conv1D(filters=1000, kernel_size=50, strides=5, padding='valid',
               activation='relu'),
        MaxPooling1D(pool_size=2, padding='valid'),
        Flatten(),
        Dense(500, activation='relu'),
        Dense(n_outputs, activation='softmax'), # output layer
    ]
    model = Sequential(layers)
    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    return model

In [None]:
# Make sure the output directory exists before anything is written to it.
os.makedirs('outputs', exist_ok=True)

clf = build_network(X, y)

# Store the architecture before training, so it is on disk even if the
# (long) fit below is interrupted.
model_json = clf.to_json()
with open('outputs/base_cnn_model.json', "w") as json_file:
    json_file.write(model_json)

# Stop once val_loss has not improved for 3 epochs; the checkpoint keeps only
# the weights of the best epoch seen so far.
early_stopping_monitor = EarlyStopping(patience=3)
checkpointer = ModelCheckpoint(filepath='outputs/base_cnn_model.h5', verbose=1,
                               save_best_only=True, save_weights_only=True)

# The network expects input of shape (features, 1), so a trailing axis is
# added to the 2-D feature matrix.
clf.fit(np.expand_dims(X, axis=2), y, epochs=30, validation_split=0.1, callbacks=[early_stopping_monitor, checkpointer])

# After early stopping the in-memory weights are from the last epoch, not the
# best one. Reload the checkpointed best weights instead of overwriting the
# checkpoint file with the last-epoch weights.
clf.load_weights('outputs/base_cnn_model.h5')

Train on 360249 samples, validate on 40028 samples
Epoch 1/30
