In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from copy import deepcopy

  from ._conv import register_converters as _register_converters


In [2]:
%%time
df = pd.read_csv("../input/blocks.csv")
df["minute"] = df["quarter"]

Wall time: 3.51 s


In [3]:
print(df.shape)
df.head()

(10046491, 9)


Unnamed: 0,Level,latBlock,lngBlock,month,day,hour,quarter,count,minute
0,0,0,17,4,14,16,0,2,0
1,0,0,17,4,21,16,2,1,2
2,0,0,17,5,11,11,0,2,0
3,0,0,17,5,11,12,0,11,0
4,0,0,17,5,11,12,1,11,1


In [4]:
df_val = df.loc[df.month>=6]
df = df.loc[df.month<6]
print(df_val.shape)
print(df.shape)

(3858677, 9)
(6187814, 9)


In [5]:
%%time
# format the input and output
def matrixTrans(df, level, between=7):
    tf = deepcopy(df.loc[df['Level'] == level])
    latMax = tf.latBlock.max()+1
    lngMax = tf.lngBlock.max()+1
    tf["order"] = tf.month * 10**2+ tf.day
    orders = sorted(tf["order"].unique())

    def generator():
        for hour in tf.hour.unique():
            for minute in tf.minute.unique():
                yield hour,minute

    g = generator()
    num = 0
    for h,m in g:
        num+= len(orders)-between
    features = np.zeros((num,lngMax,latMax,between))
    labels = np.zeros((num,lngMax,latMax))

    g = generator()
    index = 0
    for hour,minute in g:
        temp = tf.loc[(tf.hour==hour)&(tf.minute==minute)]
        for i in range(between, len(orders)):
            next_ = temp.loc[tf.order==orders[i]]
            for _,row in next_.iterrows():
                labels[index, row['lngBlock'],row['latBlock']] = row['count']
            for b in range(between):
                prev = temp.loc[tf.order==orders[i-1-b]]
                for _, row in prev.iterrows():
                    features[index, row['lngBlock'],row['latBlock'],b] = row['count']
            index += 1
            if index==num:
                return features, labels

    del tf
    features = features[:index,:,:,:]
    labels = labels[:index,:,:]
    return features, labels

features, labels = matrixTrans(df, 0)
features_val, labels_val = matrixTrans(df_val, 0)
print(features.shape)
print(labels.shape)

(4896, 50, 50, 7)
(4896, 50, 50)
Wall time: 21min 51s


In [78]:
def cnn_model(features, labels, mode):
    # Input Layer
    size = features.shape
    kernel_size = [(3,3),(2,2),(2,2),(1,1)]
    #pool_size = parameters.get("pool_size", [(2,2),(3,3),(2,2)])
    strides = [1,1,1]
    filters = [32, 16, 16,1]

    input_layer = tf.reshape(features, [-1, size[1], size[2], size[3]])
    label_layer = tf.reshape(labels, [-1, size[1]*size[2]])

    # We only add conv layers with 'same' padding in our model
    # since It's a n*n -> n*n prediction (previous distribution -> future prediction)
    # dimensions should be kept same

    # Convolutional Layer and Pooling Layer#1
    conv1 = tf.layers.conv2d(
      inputs=input_layer,
      filters=filters[0],
      kernel_size=kernel_size[0],
      padding="same",
      activation=tf.nn.elu)

    # Convolutional Layer #2 and Pooling Layer #2
    conv2 = tf.layers.conv2d(
      inputs=conv1,
      filters=filters[1],
      kernel_size=kernel_size[1],
      padding="same",
      activation=tf.nn.elu)
    
    # Convolutional Layer #2 and Pooling Layer #2
    conv3 = tf.layers.conv2d(
      inputs=conv2,
      filters=filters[2],
      kernel_size=kernel_size[2],
      padding="same",
      activation=tf.nn.elu)
    
    conv4 = tf.layers.conv2d(
      inputs=conv3,
      filters=filters[3],
      kernel_size=kernel_size[3],
      padding="same",
      activation=tf.nn.relu)

    # Dense Layer
    conv4_flat = tf.reshape(conv4, [-1, size[1]*size[2]*filters[3]])
    #dense = tf.layers.dense(inputs=conv3_flat, units=1024)
    #dropout = tf.layers.dropout(inputs=dense, rate=0.8, training= mode == tf.estimator.ModeKeys.TRAIN)

    # Logits Layer
    logits = conv4_flat # tf.layers.dense(inputs=dropout, units= size[1]*size[2], activation=tf.nn.relu)

    # Loss
    loss = tf.losses.mean_squared_error(labels=label_layer, predictions=logits)
    
    predictions = {
      # Generate predictions (for PREDICT and EVAL mode)
      "distribution": logits
      # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
      # `logging_hook`.
    }
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    
    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
        train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
      "mse": tf.metrics.mean_squared_error(
          labels=label_layer, predictions=logits)
    }
    return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

In [79]:
# Build the estimator
distribution_estimator = tf.estimator.Estimator(
    model_fn=cnn_model)
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=features,
    y=labels,
    num_epochs=None,
    shuffle=True)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\yuhan\\AppData\\Local\\Temp\\tmp9wrleth5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001E490C01A20>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [80]:
# Train the model
distribution_estimator.train(
    input_fn=train_input_fn,
    steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into C:\Users\yuhan\AppData\Local\Temp\tmp9wrleth5\model.ckpt.
INFO:tensorflow:loss = 487.6178, step = 0
INFO:tensorflow:global_step/sec: 11.1428
INFO:tensorflow:loss = 788.13086, step = 100 (8.974 sec)
INFO:tensorflow:global_step/sec: 11.2284
INFO:tensorflow:loss = 244.38185, step = 200 (8.906 sec)
INFO:tensorflow:global_step/sec: 11.2298
INFO:tensorflow:loss = 665.4614, step = 300 (8.905 sec)
INFO:tensorflow:global_step/sec: 11.1675
INFO:tensorflow:loss = 162.47899, step = 400 (8.955 sec)
INFO:tensorflow:global_step/sec: 11.1343
INFO:tensorflow:loss = 289.26343, step = 500 (8.997 sec)
INFO:tensorflow:global_step/sec: 11.1075
INFO:tensorflow:loss = 669.86676, step = 600 (8.987 sec)
INFO:tensorflow:global_step/se

<tensorflow.python.estimator.estimator.Estimator at 0x1e490c01908>

In [81]:
# error on train dataset
# print out mse=401, then rmse = 20
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=features,
    y=labels,
    num_epochs=1,
    shuffle=False)

eval_results = distribution_estimator.evaluate(input_fn=eval_input_fn)
print(eval_results)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-11-04:42:47
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\yuhan\AppData\Local\Temp\tmp9wrleth5\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-11-04:42:49
INFO:tensorflow:Saving dict for global step 1000: global_step = 1000, loss = 402.3088, mse = 410.09708
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: C:\Users\yuhan\AppData\Local\Temp\tmp9wrleth5\model.ckpt-1000
{'loss': 402.3088, 'mse': 410.09708, 'global_step': 1000}


In [82]:
# error on validation dataset
# print out mse=253, then rmse = 16
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=features_val,
    y=labels_val,
    num_epochs=1,
    shuffle=False)

eval_results = distribution_estimator.evaluate(input_fn=eval_input_fn)
print(eval_results)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-11-04:42:49
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\yuhan\AppData\Local\Temp\tmp9wrleth5\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-11-04:42:51
INFO:tensorflow:Saving dict for global step 1000: global_step = 1000, loss = 253.90901, mse = 256.74518
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: C:\Users\yuhan\AppData\Local\Temp\tmp9wrleth5\model.ckpt-1000
{'loss': 253.90901, 'mse': 256.74518, 'global_step': 1000}


In [None]:
# the aim of rmse is 15, for now the rmse from cnn model is 20 

In [55]:
np.sqrt(np.mean((labels_val-np.mean(labels_val))**2))

21.220634339470006

In [56]:
np.sqrt(np.mean((labels-np.mean(labels))**2))

26.23598973610478

In [84]:
256.74518**0.5

16.023269953414626