In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

  from ._conv import register_converters as _register_converters


In [2]:
n_rows = 10000000

train = pd.read_csv(
    '../input/train.csv.zip', 
    index_col='key', 
    compression='infer', 
    nrows=n_rows)

valid = pd.read_csv(
    '../input/train.csv.zip', 
    index_col='key', 
    compression='infer', 
    nrows=n_rows/10, 
    skiprows=list(range(1,n_rows+1)))

infer = pd.read_csv('../input/test.csv', index_col='key')

In [3]:
def feature_engineering(df):
    df['pickup'] = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S %Z')
    df['weekday'] = df['pickup'].dt.dayofweek
    df['month'] = df['pickup'].dt.month
    df['hour'] = df['pickup'].dt.hour
    df['year'] = df['pickup'].dt.year
        
    df['lat_diff'] = abs(df['pickup_latitude'] - df['dropoff_latitude'])
    df['lon_diff'] = abs(df['pickup_longitude'] - df['dropoff_longitude'])
    df['taxicab'] = df['lat_diff'] + df['lon_diff']
        
    df.drop(['pickup_datetime','pickup'], axis=1, inplace=True)
    
feature_engineering(train)
feature_engineering(valid)
feature_engineering(infer)

train.dropna(how='any', axis=0, inplace=True)
valid.dropna(how='any', axis=0, inplace=True)

In [4]:
y_train = train['fare_amount']
y_valid = valid['fare_amount']

X_train = train.drop('fare_amount', axis=1, inplace=True)
X_valid = valid.drop('fare_amount', axis=1, inplace=True)

In [5]:
# definition of the numeric columns
p_lat = tf.feature_column.numeric_column('pickup_latitude')
p_lon = tf.feature_column.numeric_column('pickup_longitude')
d_lat = tf.feature_column.numeric_column('dropoff_latitude')
d_lon = tf.feature_column.numeric_column('dropoff_longitude')
p_cnt = tf.feature_column.numeric_column('passenger_count')
lat_diff = tf.feature_column.numeric_column('lat_diff')
lon_diff = tf.feature_column.numeric_column('lon_diff')
taxicab = tf.feature_column.numeric_column('taxicab')

# definition of the categorical columns
weekday = tf.feature_column.categorical_column_with_identity('weekday', num_buckets = 7)
hour = tf.feature_column.categorical_column_with_identity('hour', num_buckets = 24)
month = tf.feature_column.categorical_column_with_identity('month', num_buckets = 13)
year = tf.feature_column.categorical_column_with_identity('year', num_buckets = 2016)

# definition of the bucketized columns
numbuckets = 16

latbuckets = np.linspace(38,42,numbuckets).tolist()
lonbuckets = np.linspace(-76,-72,numbuckets).tolist()

p_latB = tf.feature_column.bucketized_column(p_lat, latbuckets)
d_latB = tf.feature_column.bucketized_column(d_lat, latbuckets)
p_lonB = tf.feature_column.bucketized_column(p_lon, lonbuckets)
d_lonB = tf.feature_column.bucketized_column(d_lon, lonbuckets)

# definition of the feature crosses
ploc = tf.feature_column.crossed_column([p_latB, p_lonB], numbuckets * numbuckets)
dloc = tf.feature_column.crossed_column([d_latB, d_lonB], numbuckets * numbuckets)
pd_pair = tf.feature_column.crossed_column([ploc, dloc], numbuckets**4)
day_hr = tf.feature_column.crossed_column([weekday, hour], 7 * 24)
mth_yr = tf.feature_column.crossed_column([month, year], 12 * 10)

In [6]:
wide_columns = [
    # Feature crosses
    ploc, dloc, pd_pair, day_hr, mth_yr,

    # Sparse columns
    weekday, hour, month, year,

    # Anything with a linear relationship
    p_cnt 
]

In [7]:
deep_columns = [
    # Embedding_column to "group" together ...
    tf.feature_column.embedding_column(pd_pair, 10),
    tf.feature_column.embedding_column(day_hr, 10),
    tf.feature_column.embedding_column(mth_yr, 10),

    # Numeric columns
    p_lat, p_lon, d_lat, d_lon, lat_diff, lon_diff, taxicab
]

In [8]:
def train_input_fn(features, labels, batch_size):
    return tf.estimator.inputs.pandas_input_fn(
        x = features,
        y = labels,
        num_epochs = 1,
        shuffle = True,
        batch_size = batch_size)

def eval_input_fn(features, labels):
    return tf.estimator.inputs.pandas_input_fn(
        x = features,
        y = labels,
        num_epochs = 1,
        shuffle = True)

def pred_input_fn(features):
    return tf.estimator.inputs.pandas_input_fn(
        x = features,
        num_epochs = 1,
        shuffle = False)

In [9]:
model_dir = './DNN_Linear_Combined_Regressor'

file_writer = tf.summary.FileWriter(model_dir)

#sess_config = tf.ConfigProto(log_device_placement=True)
#run_config = tf.estimator.RunConfig(session_config = sess_config)

estimator = tf.estimator.DNNLinearCombinedRegressor(
    model_dir = model_dir,
    linear_feature_columns = wide_columns,
    dnn_feature_columns = deep_columns,
    dnn_optimizer = tf.train.AdamOptimizer(learning_rate=0.0001),
    dnn_hidden_units = [512,256,256,64], 
    batch_norm = True,
    dnn_dropout = 0.1,
    #config=run_config
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './DNN_Linear_Combined_Regressor', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000014AF1CE4630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [10]:
train_spec = tf.estimator.TrainSpec(input_fn = train_input_fn(train, y_train, batch_size=10000))
eval_spec = tf.estimator.EvalSpec(input_fn = eval_input_fn(valid, y_valid))

In [None]:
for i in range(500):
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 2283168.2, step = 0
INFO:tensorflow:global_step/sec: 4.2606
INFO:tensorflow:loss = 1766495.6, step = 100 (23.471 sec)
INFO:tensorflow:global_step/sec: 4.48384
INFO:tensorflow:loss = 1563717.8, step = 200 (22.304 sec)
INFO:tensorflow:global_step/sec: 4.63105
INFO:tensorflow:loss = 1310638.8, step = 300 (21.591 sec)
INFO:tensorflow:global_step/sec: 4.60398

INFO:tensorflow:Saving 'checkpoint_path' summary for global step 3000: ./DNN_Linear_Combined_Regressor\model.ckpt-3000
INFO:tensorflow:Loss for final step: 262951.56.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-3000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 3000 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 280387.56, step = 3000
INFO:tensorflow:global_step/sec: 4.30732
INFO:tensorflow:loss = 282364.3,

INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-08-14-03:24:09
INFO:tensorflow:Saving dict for global step 6000: average_loss = 76.79139, global_step = 6000, label/mean = 11.258042, loss = 9829.298, prediction/mean = 11.278569
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 6000: ./DNN_Linear_Combined_Regressor\model.ckpt-6000
INFO:tensorflow:Loss for final step: 211897.53.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linea

INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-08-14-03:35:43
INFO:tensorflow:Saving dict for global step 9000: average_loss = 53.942432, global_step = 9000, label/mean = 11.355141, loss = 6904.6313, prediction/mean = 10.770954
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 9000: ./DNN_Linear_Combined_Regressor\model.ckpt-9000
INFO:tensorflow:Loss for final step: 256528.94.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every check

INFO:tensorflow:Saving checkpoints for 12000 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-14-03:47:13
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-12000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-08-14-03:47:16
INFO:tensorflow:Saving dict for global step 12000: average_loss = 64.74303, global_step = 12000, label/mean = 11.556066, loss = 8287.107, predi

INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-14000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 14000 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 211386.6, step = 14000
INFO:tensorflow:global_step/sec: 4.22034
INFO:tensorflow:loss = 262157.66, step = 14100 (23.696 sec)
INFO:tensorflow:global_step/sec: 4.53254
INFO:tensorflow:loss = 188885.66, step = 14200 (22.063 sec)
INFO:tensorflow:global_step/sec: 4.43149
INFO:tensorflow:loss = 201192.66, step = 14300 (22.566 sec)
INFO:tenso

INFO:tensorflow:Saving 'checkpoint_path' summary for global step 17000: ./DNN_Linear_Combined_Regressor\model.ckpt-17000
INFO:tensorflow:Loss for final step: 256859.19.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-17000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 17000 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 249233.14, step = 17000
INFO:tensorflow:global_step/sec: 4.24791
INFO:tensorflow:loss = 2069

INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-08-14-04:25:31
INFO:tensorflow:Saving dict for global step 20000: average_loss = 46.397964, global_step = 20000, label/mean = 11.335374, loss = 5938.9395, prediction/mean = 11.421063
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 20000: ./DNN_Linear_Combined_Regressor\model.ckpt-20000
INFO:tensorflow:Loss for final step: 223572.34.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensor

INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-23000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-08-14-04:37:19
INFO:tensorflow:Saving dict for global step 23000: average_loss = 41.669067, global_step = 23000, label/mean = 11.288266, loss = 5333.6406, prediction/mean = 10.426871
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 23000: ./DNN_Linear_Combined_Regressor\model.ckpt-23000
INFO:tensorflow:Loss for final step: 261212.1.
INFO:tensorflow:Running training and evaluation locally (non-distribut

INFO:tensorflow:global_step/sec: 4.48016
INFO:tensorflow:loss = 207896.38, step = 25900 (22.328 sec)
INFO:tensorflow:Saving checkpoints for 26000 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-14-04:49:05
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-26000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-08-14-04:49:07
INFO:tensorflow:Saving dict for global ste

INFO:tensorflow:loss = 181433.4, step = 28500 (22.354 sec)
INFO:tensorflow:global_step/sec: 4.41699
INFO:tensorflow:loss = 314841.56, step = 28600 (22.648 sec)
INFO:tensorflow:global_step/sec: 4.52059
INFO:tensorflow:loss = 242851.0, step = 28700 (22.113 sec)
INFO:tensorflow:global_step/sec: 4.47116
INFO:tensorflow:loss = 384935.0, step = 28800 (22.366 sec)
INFO:tensorflow:global_step/sec: 4.46336
INFO:tensorflow:loss = 190052.6, step = 28900 (22.405 sec)
INFO:tensorflow:Saving checkpoints for 29000 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-14-05:00:53
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-29000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/

In [None]:
generator = estimator.predict(input_fn=pred_input_fn(infer))
predictions = [next(generator) for i in range(len(infer))]
values = [val['predictions'].tolist()[0] for val in predictions]

In [None]:
ids = infer.index

submission = pd.DataFrame()
submission['fare_amount'] = values
submission['key'] = ids
submission.set_index('key', inplace=True)

submission.to_csv('../output/03.dnn_linear_combined_regressor_r2.csv')