In [1]:
import collections
import os, shutil
import numpy as np
import pandas as pd

import tempfile
import logging

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import tensorflow as tf
print("Tensorflow is installed and is version: ",  tf.__version__)

%matplotlib inline

Tensorflow is installed and is version:  1.3.0


In [2]:
def readData():
    test = pd.read_csv('test_data.csv')
    train = pd.read_csv('train_data.csv')
    return train, test

In [3]:
train_df, test_df = readData()

In [4]:
print('Training data has %d observations with %d features' % train_df.shape)
train_df.head()

Training data has 4584 observations with 670 features


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,z217,z218,z219,z220,z221,z222,subject,phase,state,output
0,0,0,0,1,-300.361218,0.88636,-2.590886,225.001899,0.006204,3.7e-05,...,0.005242,0.024971,-1017.620978,-382.850838,-48.275711,-2.040336,A,3,B,0
1,0,0,0,1,-297.12609,0.622211,-3.96094,220.179017,0.006167,-1.4e-05,...,0.001722,0.023595,91.229094,24.80223,1.78395,0.02262,A,3,C,0
2,0,0,0,1,-236.460253,0.42364,-12.656341,139.453445,0.006276,-2.8e-05,...,-0.010894,-0.036318,-188.232347,-17.474861,-1.005571,-0.021628,A,3,B,0
3,0,0,0,1,33.411458,2.854415,-1.962432,3.208911,0.009752,-0.000273,...,-0.034184,-0.047734,185.122907,-549.282067,542.193381,-178.049926,A,3,A,0
4,0,0,0,1,-118.125214,2.009809,-3.291637,34.874176,0.007598,1e-06,...,0.001963,0.004084,35.207794,-78.143166,57.084208,-13.700212,A,4,C,0


In [5]:
print('Test data has %d observations with %d features' % test_df.shape)
test_df.head()

Test data has 1732 observations with 669 features


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,z216,z217,z218,z219,z220,z221,z222,subject,phase,state
0,0,0,0,1,-18.839131,2.306584,-4.655889,1.013324,0.007153,5.5e-05,...,0.002809,0.014684,0.008373,325.428102,-821.094825,689.695558,-192.867397,B,1,B
1,0,0,0,1,-21.203051,2.112956,-2.503654,1.233657,0.007674,2.5e-05,...,0.00145,-0.012349,-0.029579,377.365602,-943.446587,785.687687,-217.952016,B,1,C
2,0,0,0,1,-83.843508,2.097191,-2.625699,17.607247,0.00729,7.7e-05,...,-0.024253,-0.014029,-0.01509,564.866863,-1445.004242,1231.30459,-349.493698,B,1,C
3,0,0,0,1,-192.522878,0.605105,-2.468908,92.456149,0.006284,7e-06,...,0.019957,-0.03749,0.002925,-552.39872,1163.146256,-816.945306,191.395611,B,1,C
4,0,0,0,1,-188.095799,0.390008,-4.870923,88.257844,0.007859,-5.8e-05,...,0.007063,-0.007101,0.025921,-860.687432,1838.968578,-1310.372483,311.393705,B,1,C


Split into training / validation data sets

In [6]:
train_df, valid_df = train_test_split(train_df, stratify = train_df.output, test_size = 0.10, random_state=100)

In [7]:
train_df = train_df.copy().reset_index(drop = True)
y_train = train_df.output
train_df.drop('output', axis = 1, inplace=True)

In [8]:
valid_df = valid_df.copy().reset_index(drop = True)
y_valid = valid_df.output
valid_df.drop('output', axis = 1, inplace=True)

Scale the numerical features to zero mean and unit variance

In [9]:
scaler = StandardScaler()
scaler = scaler.fit(train_df.iloc[:, :-3])

In [10]:
X_train = scaler.transform(train_df.iloc[:, :-3])
X_train = pd.DataFrame(X_train, columns = train_df.columns[:-3])
X_train['subject'] = train_df.subject
X_train['phase'] = train_df.phase
X_train['state'] = train_df.state

In [11]:
X_valid = scaler.transform(valid_df.iloc[:, :-3])
X_valid = pd.DataFrame(X_valid, columns = valid_df.columns[:-3])
X_valid['subject'] = valid_df.subject
X_valid['phase'] = valid_df.phase
X_valid['state'] = valid_df.state

In [12]:
X_test = scaler.transform(test_df.iloc[:, :-3])
X_test = pd.DataFrame(X_test, columns = test_df.columns[:-3])
X_test['subject'] = test_df.subject
X_test['phase'] = test_df.phase
X_test['state'] = test_df.state

Prepare feature definitions for the tensorflow models

In [13]:
# categorical features
subject = tf.feature_column.categorical_column_with_vocabulary_list(
    key = "subject", 
    vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'k', 'L', 'M'))
phase = tf.feature_column.categorical_column_with_identity(key='phase', num_buckets=10)
state = tf.feature_column.categorical_column_with_vocabulary_list(
    key='state', 
    vocabulary_list=('A', 'B', 'C', 'D', 'E'))
categorical_features = [
    tf.feature_column.indicator_column(subject), 
    tf.feature_column.indicator_column(phase), 
    tf.feature_column.indicator_column(state)
]

In [14]:
# numerical features
numerical_features = []
for i in np.arange(1, 223):
    numerical_features.append(tf.feature_column.numeric_column('x%d'%i))
    numerical_features.append(tf.feature_column.numeric_column('y%d'%i))
    numerical_features.append(tf.feature_column.numeric_column('z%d'%i))

In [15]:
# crossed features
subject_phase = tf.feature_column.crossed_column(["subject", "phase"], hash_bucket_size=100)
crossed_features = [
    tf.feature_column.indicator_column(subject_phase) 
]

In [16]:
print('Training data has %d observations with %d features' % X_train.shape)
print('Validation data has %d observations with %d features' % X_valid.shape)

Training data has 4125 observations with 669 features
Validation data has 459 observations with 669 features


In [17]:
def input_fn(df, labels, num_epochs, shuffle, num_):
    return tf.estimator.inputs.pandas_input_fn(
      x = df,
      y  = labels,
      batch_size = 256,
      num_epochs = num_epochs,
      shuffle = shuffle,
      num_threads = 5)

#### Logistic regression tensorflow style

In [None]:
shutil.rmtree('log_model',  ignore_errors=True)

log_model = tf.estimator.LinearClassifier(
    model_dir = 'log_model',
    # L1 regularization
    optimizer=tf.train.FtrlOptimizer(learning_rate=0.1, l1_regularization_strength=0.001),
    feature_columns = numerical_features + categorical_features + crossed_features
)

In [None]:
log_model.train(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x = X_train, y = y_train, batch_size = 256, num_epochs = None, shuffle = True, num_threads = 5),
    steps=100)

In [None]:
log_results = log_model.evaluate(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x = X_valid, y = y_valid, batch_size = 256, num_epochs = 1, shuffle = False, num_threads = 1),
    steps = None
)

In [None]:
for key in sorted(log_results):
    print("%s: %s" % (key, log_results[key]))

In [None]:
log_predict = log_model.predict(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x = X_test, y = None, batch_size = 256, num_epochs = 1, shuffle = False, num_threads = 1),
)

In [None]:
log_results = list(log_predict)

#### Prepare the Kaggle submission

In [None]:
p_hat = [r['probabilities'][1] for r in log_results]
test_df['output'] = p_hat
test_df[['output']].to_csv('log_solution.csv', index_label = 'id')
test_df.drop('output', inplace = True, axis = 1)

#### NN tensorflow style
Using high level DNNClassifier model

#### Selecting the model

Try different dropouts 

In [None]:
dropouts = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
results = []
for dropout in dropouts:
    
    shutil.rmtree('tmp_model',  ignore_errors=True)
    
    m = tf.estimator.DNNClassifier(
        model_dir = 'tmp_model',
        feature_columns=numerical_features + categorical_features + crossed_features,
        hidden_units=[1024, 512, 256],
        dropout=dropout,
        optimizer=tf.train.AdamOptimizer())
    m = m.train(
        input_fn=tf.estimator.inputs.pandas_input_fn(
            x = X_train, y = y_train, batch_size = 256, num_epochs = None, shuffle = True, num_threads = 5),
        steps = 200)
    r = m.evaluate(
        input_fn=tf.estimator.inputs.pandas_input_fn(
            x = X_valid, y = y_valid, batch_size = 256, num_epochs = 1, shuffle = False, num_threads = 1),
        steps = None)
    results.append(r)

In [None]:
accuracy = [r['accuracy'] for r in results]
auc = [r['auc'] for r in results]

In [None]:
plt.plot(dropouts, accuracy, label='accuracy')
plt.plot(dropouts, auc, label='auc')
plt.legend()

Looks like drooupout 0.5 produces a good model, better than the logistic regression. 

In [None]:
shutil.rmtree('dnn_model',  ignore_errors=True)

nn_model = tf.estimator.DNNClassifier(
    model_dir = 'dnn_model',
    feature_columns=numerical_features + categorical_features + crossed_features,
    hidden_units=[512, 256, 64],
    dropout=0.5,
    optimizer=tf.train.AdamOptimizer()
)

In [None]:
nn_model = nn_model.train(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x = X_train, y = y_train, batch_size = 256, num_epochs = None, shuffle = True, num_threads = 5),
    max_steps=300)

In [None]:
nn_results = nn_model.evaluate(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x = X_valid, y = y_valid, batch_size = 256, num_epochs = 1, shuffle = False, num_threads = 1),
    steps = None
)

In [None]:
for key in sorted(nn_results):
    print("%s: %s" % (key, nn_results[key]))

Out of the box NN classifier performance is not as good as the logistic regression model.

In [None]:
nn_predict = nn_model.predict(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x = X_test, y = None, batch_size = 256, num_epochs = 1, shuffle = False, num_threads = 1),
)

In [None]:
nn_results = list(nn_predict)

#### Prepare the Kaggle submission

In [None]:
p_hat = [r['probabilities'][1] for r in nn_results]
test_df['output'] = p_hat
test_df[['output']].to_csv('nn_solution.csv', index_label = 'id')
test_df.drop('output', inplace = True, axis = 1)

Another approach would be to try regularization (instead of the dropouts).

In [None]:
results = []
regularization = [0.001, 0.01, 0.1, 1.0, 10.0]
for l1_r in regularization:
    
    shutil.rmtree('tmp_model',  ignore_errors=True)
    
    m = tf.estimator.DNNClassifier(
        model_dir = 'tmp_model',
        feature_columns=numerical_features + categorical_features + crossed_features,
        hidden_units=[256, 128, 64],
        dropout=dropout,
        optimizer=tf.train.ProximalAdagradOptimizer(
            learning_rate = 0.01,
            l1_regularization_strength = l1_r
        )
    )
    m = m.train(
        input_fn=tf.estimator.inputs.pandas_input_fn(
            x = X_train, y = y_train, batch_size = 512, num_epochs = None, shuffle = True, num_threads = 5),
        steps = 200)
    r = m.evaluate(
        input_fn=tf.estimator.inputs.pandas_input_fn(
            x = X_valid, y = y_valid, batch_size = 512, num_epochs = 1, shuffle = False, num_threads = 1),
        steps = None)
    results.append(r)

In [None]:
accuracy = [r['accuracy'] for r in results]
auc = [r['auc'] for r in results]

In [None]:
plt.plot(regularization, accuracy, label='accuracy')
plt.plot(regularization, auc, label='auc')
plt.xscale('log')
plt.legend()

Smaller net and regularization, more training steps

In [None]:
results = []
regularization = [0.00001, 0.0001, 0.001, 0.01, 0.1]
for l1_r in regularization:
    
    shutil.rmtree('tmp_model',  ignore_errors=True)

    m = tf.estimator.DNNClassifier(
        model_dir = 'tmp_model',
        feature_columns=numerical_features + categorical_features + crossed_features,
        hidden_units=[256, 128],
        dropout=dropout,
        optimizer=tf.train.ProximalAdagradOptimizer(
            learning_rate = 0.01,
            l1_regularization_strength = l1_r
        )
    )
    m = m.train(
        input_fn=tf.estimator.inputs.pandas_input_fn(
            x = X_train, y = y_train, batch_size = 512, num_epochs = None, shuffle = True, num_threads = 5),
        steps = 300)
    r = m.evaluate(
        input_fn=tf.estimator.inputs.pandas_input_fn(
            x = X_valid, y = y_valid, batch_size = 512, num_epochs = 1, shuffle = False, num_threads = 1),
        steps = None)
    results.append(r)

In [None]:
accuracy = [r['accuracy'] for r in results]
auc = [r['auc'] for r in results]

In [None]:
plt.plot(regularization, accuracy, label='accuracy')
plt.plot(regularization, auc, label='auc')
plt.xscale('log')
plt.legend()

it looks like the dropout approach worked better on this data.

#### Custom NN model
Custom model with same structure as the winning dropout DNN but with a custom loss function to deal with the class imbalance. The positive class loss is reduced proportionally.

In [18]:
def custom_model_fn(features, labels, mode, params, config):

    input_layer = tf.feature_column.input_layer(
        features = features,
        feature_columns = numerical_features + categorical_features + crossed_features
    )

    global_step = tf.contrib.framework.get_or_create_global_step()

    x = tf.layers.dense(
        inputs=input_layer,
        units=1024,
        activation=tf.nn.relu,
        name="layer1"
    )

    x = tf.layers.dropout(
        inputs=x,
        rate = params.dropout,
        training = (mode == tf.estimator.ModeKeys.TRAIN),
        name="dropout12"
    )

    x = tf.layers.dense(
        inputs=x,
        units=512,
        activation=tf.nn.relu,
        name="layer2"
    )

    x = tf.layers.dropout(
        inputs=x,
        rate = params.dropout,
        training = (mode == tf.estimator.ModeKeys.TRAIN),
        name="dropout23"
    )

    x = tf.layers.dense(
        inputs=x,
        units=256,
        activation=tf.nn.relu,
        name="layer3"
    )

    # logits for two classes
    logits = tf.layers.dense(
        inputs=x, 
        units=2, 
        activation=None
    )

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # convert labels into one-hot encodinf array
    # label = o, array element = [1, 0]
    # label = 1, array element = [0, 1]
    targets = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=2)
    
    loss = tf.reduce_mean(
        tf.nn.weighted_cross_entropy_with_logits(
            targets = targets,
            logits = logits,
            pos_weight = 0.2
        )
    )

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate = params.learning_rate)
        train_op = optimizer.minimize(loss=loss, global_step = tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op = train_op)

    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"]),
        "auc": tf.metrics.auc(labels=labels, predictions=predictions["probabilities"][:, 1])
    }
    
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

In [19]:
hparams = tf.contrib.training.HParams(
    learning_rate=.01,
    dropout = 0.5
)

shutil.rmtree('custom_model',  ignore_errors=True)

custom_model = tf.estimator.Estimator(
    model_dir = 'custom_model',
    model_fn=custom_model_fn,
    params=hparams
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_model_dir': 'custom_model', '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 600, '_session_config': None, '_tf_random_seed': 1, '_log_step_count_steps': 100}


In [20]:
logging_hook = tf.train.LoggingTensorHook(tensors = {"probabilities": "softmax_tensor"}, every_n_iter=250)

In [21]:
custom_model.train(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x = X_train, y = y_train, batch_size = 256, num_epochs = None, shuffle = True, num_threads = 5),
    steps=1001,
    hooks=[logging_hook]
)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into custom_model\model.ckpt.
INFO:tensorflow:probabilities = [[ 0.80205649  0.19794346]
 [ 0.96779668  0.03220335]
 [ 0.83222896  0.1677711 ]
 [ 0.73630959  0.26369038]
 [ 0.89389467  0.10610532]
 [ 0.81621438  0.18378562]
 [ 0.81042671  0.18957326]
 [ 0.95622182  0.04377822]
 [ 0.80781138  0.19218867]
 [ 0.9196173   0.08038273]
 [ 0.42867419  0.57132584]
 [ 0.75594819  0.24405184]
 [ 0.83246768  0.16753232]
 [ 0.79267824  0.20732169]
 [ 0.85376978  0.14623024]
 [ 0.85229534  0.14770468]
 [ 0.70796371  0.29203635]
 [ 0.99395084  0.00604916]
 [ 0.83096862  0.16903137]
 [ 0.92108417  0.07891586]
 [ 0.71087784  0.28912213]
 [ 0.88390619  0.11609384]
 [ 0.93511361  0.06488635]
 [ 0.94885617  0.05114384]
 [ 0.65646315  0.34353685]
 [ 0.98698026  0.01301973]
 [ 0.44641349  0.55358648]
 [ 0.95663011  0.04336995]
 [ 0.66858798  0.33141202]
 [ 0.75499356  0.24500647]
 [ 0.82918572  0.17081434]
 [ 0.95945716  0

INFO:tensorflow:global_step/sec: 4.27158
INFO:tensorflow:step = 301, loss = 0.083393 (23.385 sec)
INFO:tensorflow:global_step/sec: 4.48842
INFO:tensorflow:step = 401, loss = 0.0659096 (22.325 sec)
INFO:tensorflow:global_step/sec: 4.45047
INFO:tensorflow:probabilities = [[ 0.16611175  0.83388823]
 [ 0.          1.        ]
 [ 0.00003611  0.99996388]
 [ 0.00115155  0.99884844]
 [ 0.00000015  0.99999988]
 [ 0.01334421  0.98665583]
 [ 0.          1.        ]
 [ 0.16611175  0.83388823]
 [ 0.00001767  0.99998236]
 [ 0.63332218  0.36667788]
 [ 0.00000028  0.99999976]
 [ 0.16611175  0.83388823]
 [ 0.          1.        ]
 [ 0.00914151  0.9908585 ]
 [ 0.          1.        ]
 [ 0.03651742  0.96348256]
 [ 0.00140649  0.99859351]
 [ 0.0000036   0.99999642]
 [ 0.0000584   0.99994159]
 [ 0.00002415  0.9999758 ]
 [ 0.0000105   0.99998951]
 [ 0.00411035  0.99588972]
 [ 0.45132065  0.54867935]
 [ 0.99995148  0.00004857]
 [ 0.0265502   0.97344977]
 [ 0.16611175  0.83388823]
 [ 0.          1.        ]
 

INFO:tensorflow:global_step/sec: 4.44078
INFO:tensorflow:step = 801, loss = 0.0728116 (22.523 sec)
INFO:tensorflow:global_step/sec: 4.43153
INFO:tensorflow:step = 901, loss = 0.0562863 (22.568 sec)
INFO:tensorflow:global_step/sec: 4.48258
INFO:tensorflow:probabilities = [[ 0.18340282  0.81659716]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.09155475  0.90844524]
 [ 0.18340282  0.81659716]
 [ 0.00000184  0.99999821]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.00007254  0.9999274 ]
 [ 0.          1.        ]
 [ 0.00178141  0.9982186 ]
 [ 0.          1.        ]
 [ 0.83210289  0.16789709]
 [ 0.18340282  0.81659716]
 [ 0.18340282  0.81659716]
 [ 0.00002306  0.99997699]
 [ 0.40039104  0.59960902]
 [ 0.          1.        ]
 [ 0.00000013  0.99999988]
 [ 0.18340282  0.81659716]
 [ 0.18340282  0.81659716]
 [ 0.          1.        ]
 [ 0.          1.        ]
 [ 0.          1.        ]


<tensorflow.python.estimator.estimator.Estimator at 0x1b02c2b6e48>

In [22]:
custom_results = custom_model.evaluate(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x = X_valid, y = y_valid, batch_size = 256, num_epochs = 1, shuffle = False, num_threads = 1),
    steps = None
)

INFO:tensorflow:Starting evaluation at 2017-10-30-16:06:53
INFO:tensorflow:Restoring parameters from custom_model\model.ckpt-1001
INFO:tensorflow:Finished evaluation at 2017-10-30-16:06:58
INFO:tensorflow:Saving dict for global step 1001: accuracy = 0.847495, auc = 0.737349, global_step = 1001, loss = 0.175602


In [23]:
custom_predict = custom_model.predict(
    input_fn=tf.estimator.inputs.pandas_input_fn(
        x = X_test, y = None, batch_size = 256, num_epochs = 1, shuffle = False, num_threads = 1),
)

In [24]:
custom_results = list(custom_predict)

INFO:tensorflow:Restoring parameters from custom_model\model.ckpt-1001


In [25]:
p_hat = [r['probabilities'][1] for r in custom_results]
test_df['output'] = p_hat
test_df[['output']].to_csv('custom_solution.csv', index_label = 'id')
test_df.drop('output', inplace = True, axis = 1)