In [25]:
import pandas as pd

In [92]:
# Parameters
TRAIN_STEPS = 1000
TRAIN_SAMPLES = 33600
TRAIN_CSV = 'data/train.csv'
TEST_CSV = 'data/test.csv'
OUTPUT_DIR = 'data'

In [64]:
# Import data
def import_data(csv_filepath, nosplit=False, **kwargs):
    df = pd.read_csv(csv_filepath, **kwargs)
    if nosplit:
        return df
    
    dfx = df.ix[:, 1:]
    dfy = df.ix[:, 0]
    
    return dfx, dfy

# Train data: 42K samples
# Split this into train and validation sets
dfx, dfy = import_data(TRAIN_CSV, nrows=TRAIN_SAMPLES)
dfx_cv, dfy_cv = import_data(TRAIN_CSV, skiprows=TRAIN_SAMPLES)

# Test data: 28K samples
df_test = import_data(TEST_CSV, nosplit=True)

In [65]:
# Preprocess the data
import numpy as np
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()

def preproc_x(dfx):
    x = min_max_scaler.fit_transform(dfx.values).astype(np.float32)
    return x

def preproc_y(dfy):
    y = dfy.values.astype(np.int32)
    return y

x = preproc_x(dfx)
y = preproc_y(dfy)
x_cv = preproc_x(dfx_cv)
y_cv = preproc_y(dfy_cv)
x_test = preproc_x(df_test)

In [57]:
# Build the classifier
import tensorflow as tf

feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(x)

classifier = tf.contrib.learn.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[10, 20, 10],
    n_classes=len(dfy_.unique()),
    model_dir='/tmp/mnist_model')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_tf_random_seed': None, '_task_type': None, '_environment': 'local', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x122644890>, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_task_id': 0, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_evaluation_master': '', '_keep_checkpoint_every_n_hours': 10000, '_master': ''}


In [59]:
# Train classifier
classifier.fit(
    input_fn=lambda: (
        tf.constant(x),
        tf.constant(y)
    ),
    steps=TRAIN_STEPS)

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 2002 into /tmp/mnist_model/model.ckpt.
INFO:tensorflow:loss = 1.09114, step = 2002
INFO:tensorflow:global_step/sec: 6.47893
INFO:tensorflow:loss = 0.474986, step = 2102
INFO:tensorflow:global_step/sec: 6.2635
INFO:tensorflow:loss = 0.392118, step = 2202
INFO:tensorflow:global_step/sec: 6.64297
INFO:tensorflow:loss = 0.349716, step = 2302
INFO:tensorflow:global_step/sec: 6.60319
INFO:tensorflow:loss = 0.324104, step = 2402
INFO:tensorflow:global_step/sec: 6.73643
INFO:tensorflow:loss = 0.305742, step = 2502
INFO:tensorflow:global_step/sec: 6.58223
INFO:tensorflow:loss = 0.291197, st

DNNClassifier(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._MultiClassHead object at 0x11193af50>, 'hidden_units': [10, 20, 10], 'feature_columns': (_RealValuedColumn(column_name='', dimension=784, default_value=None, dtype=tf.float32, normalizer=None),), 'embedding_lr_multipliers': None, 'optimizer': None, 'dropout': None, 'gradient_clip_norm': None, 'activation_fn': <function relu at 0x11b328d70>, 'input_layer_min_slice_size': None})

In [60]:
# Evaluate accuracy
accuracy_score = classifier.evaluate(
    input_fn=lambda: (
        tf.constant(x_cv),
        tf.constant(y_cv)
    ),
    steps=1)['accuracy']
print("\nCross validation accuracy: {0:f}\n".format(accuracy_score))     

Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Starting evaluation at 2017-04-13-15:18:51
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2017-04-13-15:18:53
INFO:tensorflow:Saving dict for global step 3001: accuracy = 0.916429, auc = 0.991998, global_step = 3001, loss = 0.289948

Cross validation accuracy: 0.916429



In [104]:
# Generate predictions using test data
y_test = classifier.predict(input_fn=lambda: tf.constant(x_test))

In [114]:
# Generate the output file
import csv
import datetime
from os import path

timestamp_str = (str(datetime.datetime.now())
                 .replace(' ', '_')
                 .replace(':', '-'))
filename = 'output_{}.csv'.format(timestamp_str)
filepath = path.join(OUTPUT_DIR, filename)
with open(filepath, 'w') as f:
    w = csv.writer(f)
    w.writerow(['ImageId', 'Label'])
    for i, result in enumerate(y_test, 1):
        w.writerow([i, result])

In [110]:
# Check output file
out = pd.read_csv(filepath)
assert len(out) == 28000