In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
def read_csv(TRAIN_PATH):
    chunksize = 5_000_000 # batch size
    
    # defining the columns datatypes (optimize memory usage)
    traintypes = {
        'fare': 'float32',
        'num_pass': 'uint8', 
        'year': 'uint8',
        'hour': 'uint8',
        'day': 'uint8',
        'month': 'uint8',
        'weekday': 'uint8',
        'x0': 'float32',
        'x1': 'float32',
        'y0': 'float32',
        'y1': 'float32',
        'dist_e': 'float32',
        'dist_t': 'float32'
    }

    cols = list(traintypes.keys())
    
    # loading the dataframe into list of small dataframes
    df_list = []

    for df_chunk in pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize):
        df_list.append(df_chunk) 
        df = pd.concat(df_list)
        
    return df

In [3]:
df = read_csv('../input/train_cleaned.csv')

def norm_params(df):
    params = {}
    columns = [
        'x0', 'x1', 'y0', 'y1', 'dist_e', 'dist_t']
    
    for column in columns:
        #params[column] = {'mean': df[column].mean(), 'std': df[column].std()}
        params[column] = {'min': df[column].min(), 'max': df[column].max()}
        
    return params

params = norm_params(df)

def normalize(df, params):
    for key in params:
        #mean = params[key]['mean']
        #std = params[key]['std']
        #df[key+'_norm'] = (df[key] - mean)/std
            
        minimum = params[key]['min']
        maximum = params[key]['max']

        df[key+'_norm'] = (df[key] - minimum) / (maximum - minimum)
            
        
normalize(df, params)

In [4]:
y = df['fare']
X = df.drop(['fare'], axis=1)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.25, random_state=379582)

del X,y

In [5]:
# definition of the numeric columns
x0 = tf.feature_column.numeric_column('x0')
x1 = tf.feature_column.numeric_column('x1')
y0 = tf.feature_column.numeric_column('y0')
y1 = tf.feature_column.numeric_column('y1')
num_pass = tf.feature_column.numeric_column('num_pass')

# definition of the categorical columns
weekday = tf.feature_column.categorical_column_with_identity('weekday', num_buckets = 7)
hour = tf.feature_column.categorical_column_with_identity('hour', num_buckets = 24)
month = tf.feature_column.categorical_column_with_identity('month', num_buckets = 13)
year = tf.feature_column.categorical_column_with_identity('year', num_buckets = 7)

# definition of the bucketized columns
numbuckets = 15

x = np.linspace(-74.27, -72.98, numbuckets).tolist()
y = np.linspace(40.56, 41.71, numbuckets).tolist()

x0_B = tf.feature_column.bucketized_column(x0, x)
x1_B = tf.feature_column.bucketized_column(x1, x)
y0_B = tf.feature_column.bucketized_column(y0, y)
y1_B = tf.feature_column.bucketized_column(y1, y)

# definition of the feature crosses
xy_0 = tf.feature_column.crossed_column([x0_B, y0_B], numbuckets * numbuckets)
xy_1 = tf.feature_column.crossed_column([x1_B, y1_B], numbuckets * numbuckets)
xy_pair = tf.feature_column.crossed_column([xy_0, xy_1], numbuckets**4)
day_hr = tf.feature_column.crossed_column([weekday, hour], 7 * 24)
mth_yr = tf.feature_column.crossed_column([month, year], 12 * 10)
day_yr = tf.feature_column.crossed_column([weekday, year], 7 * 7)
space_time = tf.feature_column.crossed_column([xy_pair, month], numbuckets**4 * 13)

x0_norm = tf.feature_column.numeric_column('x0_norm')
x1_norm = tf.feature_column.numeric_column('x1_norm')
y0_norm = tf.feature_column.numeric_column('y0_norm')
y1_norm = tf.feature_column.numeric_column('y1_norm')

taxicab = tf.feature_column.numeric_column('dist_t_norm')
euclidean = tf.feature_column.numeric_column('dist_e_norm')

In [6]:
wide_columns = [
    # Feature crosses
    xy_0, xy_1, xy_pair, day_hr, mth_yr, day_yr, space_time,

    # Sparse columns
    weekday, hour, year, month,

    # Anything with a linear relationship
    num_pass 
]

deep_columns = [
    # Embedding_column to "group" together ...
    tf.feature_column.embedding_column(xy_pair, 10),
    tf.feature_column.embedding_column(day_hr, 10),
    tf.feature_column.embedding_column(mth_yr, 10),
    tf.feature_column.embedding_column(space_time, 20),

    # Numeric columns
    x0_norm, x1_norm, y0_norm, y1_norm, taxicab, euclidean
]

In [7]:
def train_input_fn(features, labels, batch_size):
    return tf.estimator.inputs.pandas_input_fn(
        x = features,
        y = labels,
        num_epochs = 1,
        shuffle = True,
        batch_size = batch_size)

def eval_input_fn(features, labels):
    return tf.estimator.inputs.pandas_input_fn(
        x = features,
        y = labels,
        num_epochs = 1,
        shuffle = True)

def pred_input_fn(features):
    return tf.estimator.inputs.pandas_input_fn(
        x = features,
        num_epochs = 1,
        shuffle = False)

In [8]:
model_dir = './DNN_Linear_Combined_Regressor'

file_writer = tf.summary.FileWriter(model_dir)

estimator = tf.estimator.DNNLinearCombinedRegressor(
    model_dir = model_dir,
    linear_feature_columns = wide_columns,
    dnn_feature_columns = deep_columns,
    dnn_optimizer = tf.train.AdamOptimizer(learning_rate=0.0001),
    dnn_hidden_units = [128,128,64,32,16], 
    batch_norm = True,
    dnn_dropout = 0.1,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './DNN_Linear_Combined_Regressor', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002BF93DE07F0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [9]:
train_spec = tf.estimator.TrainSpec(input_fn = train_input_fn(X_train, y_train, batch_size=2500))
eval_spec = tf.estimator.EvalSpec(input_fn = eval_input_fn(X_eval, y_eval))

In [33]:
for i in range(40):
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-370148
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 370148 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 29236.234, step = 370148
INFO:tensorflow:global_step/sec: 9.51535
INFO:tensorflow:loss = 36649.543, step = 370248 (10.509 sec)
INFO:tensorflow:global_step/sec: 11.484
INFO:tensorflow:loss = 36764.523, step = 370348 (8.710 sec)
INFO:tensorflow:global_step/sec:

INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-08-17-11:49:58
INFO:tensorflow:Saving dict for global step 376963: average_loss = 11.316337, global_step = 376963, label/mean = 11.2290535, loss = 1448.4911, prediction/mean = 11.164203
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 376963: ./DNN_Linear_Combined_Regressor\model.ckpt-376963
INFO:tensorflow:global_step/sec: 6.38222
INFO:tensorflow:loss = 29331.646, step = 377048 (15.674 sec)
INFO:tensorflow:global_step/sec: 11.359
INFO:tensorflow:loss = 37350.793, step = 377148 (8.796 sec)
INFO:tensorflow:global_step/sec: 11.166
INFO:tensorflow:loss = 30713.717, step = 377248 (8.958 sec)
INFO:tensorflow:global_step/sec: 11.3557
INFO:tensorflow:loss = 26054.154, step = 377348 (8.804 sec)
INFO:tensorflow:global_step

INFO:tensorflow:Finished evaluation at 2018-08-17-11:59:58
INFO:tensorflow:Saving dict for global step 383813: average_loss = 11.971196, global_step = 383813, label/mean = 11.241219, loss = 1532.3131, prediction/mean = 11.128868
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 383813: ./DNN_Linear_Combined_Regressor\model.ckpt-383813
INFO:tensorflow:global_step/sec: 6.66028
INFO:tensorflow:loss = 49544.652, step = 383848 (15.014 sec)
INFO:tensorflow:global_step/sec: 11.4629
INFO:tensorflow:loss = 27516.447, step = 383948 (8.724 sec)
INFO:tensorflow:global_step/sec: 11.5699
INFO:tensorflow:loss = 26621.0, step = 384048 (8.643 sec)
INFO:tensorflow:global_step/sec: 11.6111
INFO:tensorflow:loss = 33194.195, step = 384148 (8.612 sec)
INFO:tensorflow:global_step/sec: 11.6128
INFO:tensorflow:loss = 31537.227, step = 384248 (8.627 sec)
INFO:tensorflow:global_step/sec: 11.5423
INFO:tensorflow:loss = 27406.453, step = 384348 (8.648 sec)
INFO:tensorflow:global_step/sec: 11.6188
IN

INFO:tensorflow:global_step/sec: 10.4167
INFO:tensorflow:loss = 22623.6, step = 389652 (9.599 sec)
INFO:tensorflow:global_step/sec: 10.6073
INFO:tensorflow:loss = 32708.271, step = 389752 (9.426 sec)
INFO:tensorflow:global_step/sec: 12.177
INFO:tensorflow:loss = 37946.01, step = 389852 (8.212 sec)
INFO:tensorflow:global_step/sec: 12.2763
INFO:tensorflow:loss = 27062.127, step = 389952 (8.146 sec)
INFO:tensorflow:global_step/sec: 12.2605
INFO:tensorflow:loss = 29348.3, step = 390052 (8.156 sec)
INFO:tensorflow:global_step/sec: 12.2405
INFO:tensorflow:loss = 29728.828, step = 390152 (8.170 sec)
INFO:tensorflow:global_step/sec: 12.0306
INFO:tensorflow:loss = 28682.02, step = 390252 (8.312 sec)
INFO:tensorflow:global_step/sec: 11.6377
INFO:tensorflow:loss = 27456.297, step = 390352 (8.593 sec)
INFO:tensorflow:global_step/sec: 11.5808
INFO:tensorflow:loss = 32251.72, step = 390452 (8.637 sec)
INFO:tensorflow:global_step/sec: 11.3816
INFO:tensorflow:loss = 28131.484, step = 390552 (8.786 sec

INFO:tensorflow:global_step/sec: 11.6353
INFO:tensorflow:loss = 26620.19, step = 396652 (8.595 sec)
INFO:tensorflow:global_step/sec: 11.6608
INFO:tensorflow:loss = 35768.305, step = 396752 (8.576 sec)
INFO:tensorflow:global_step/sec: 11.6852
INFO:tensorflow:loss = 54854.027, step = 396852 (8.558 sec)
INFO:tensorflow:global_step/sec: 11.658
INFO:tensorflow:loss = 27932.947, step = 396952 (8.578 sec)
INFO:tensorflow:global_step/sec: 11.629
INFO:tensorflow:loss = 43168.844, step = 397052 (8.615 sec)
INFO:tensorflow:global_step/sec: 11.6303
INFO:tensorflow:loss = 55369.438, step = 397152 (8.583 sec)
INFO:tensorflow:global_step/sec: 11.6695
INFO:tensorflow:loss = 26832.064, step = 397252 (8.569 sec)
INFO:tensorflow:global_step/sec: 11.631
INFO:tensorflow:loss = 33692.74, step = 397352 (8.600 sec)
INFO:tensorflow:global_step/sec: 11.6705
INFO:tensorflow:loss = 31564.332, step = 397452 (8.567 sec)
INFO:tensorflow:global_step/sec: 11.6588
INFO:tensorflow:loss = 32979.72, step = 397552 (8.577 s

INFO:tensorflow:Saving 'checkpoint_path' summary for global step 402556: ./DNN_Linear_Combined_Regressor\model.ckpt-402556
INFO:tensorflow:Loss for final step: 4726.453.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-402556
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 402556 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 39482.69, step = 402556
INFO:tensorflow:global_step/sec: 9.73354
INFO:tensorflow:loss = 3

INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-08-17-12:38:11
INFO:tensorflow:Saving dict for global step 409366: average_loss = 12.504503, global_step = 409366, label/mean = 11.326251, loss = 1600.5764, prediction/mean = 11.21415
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 409366: ./DNN_Linear_Combined_Regressor\model.ckpt-409366
INFO:tensorflow:global_step/sec: 4.83095
INFO:tensorflow:loss = 33989.03, step = 409456 (20.698 sec)
INFO:tensorflow:global_step/sec: 11.6215
INFO:tensorflow:loss = 22229.727, step = 409556 (8.605 sec)
INFO:tensorflow:global_step/sec: 11.5743
INFO:tensorflow:loss = 31294.203, step = 4096

INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-08-17-12:48:06
INFO:tensorflow:Saving dict for global step 416193: average_loss = 10.979837, global_step = 416193, label/mean = 11.289923, loss = 1405.4192, prediction/mean = 11.258632
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 416193: ./DNN_Linear_Combined_Regressor\model.ckpt-416193
INFO:tensorflow:global_step/sec: 6.65528
INFO:tensorflow:loss = 27330.584, step = 416256 (15.026 sec)
INFO:tensorflow:global_step/sec: 11.6187
INFO:tensorflow:loss = 49282.094, step = 416356 (8.607 sec)
INFO:tensorflow:global_step/sec: 11.6366
INFO:tensorflow:loss = 36844.047, step = 416456 (8.609 sec)
INFO:tensorflow:global_step/sec: 11.5538
INFO:tensorflow:loss = 20911.508, step = 416556 (8.640 sec)
INFO:tensorflow:global_step/sec: 11.5892
INFO:tensorflow:loss = 35428.836, step = 416656 (8.629 se

INFO:tensorflow:loss = 30976.875, step = 421860 (8.760 sec)
INFO:tensorflow:global_step/sec: 11.5493
INFO:tensorflow:loss = 25847.104, step = 421960 (8.645 sec)
INFO:tensorflow:global_step/sec: 11.4583
INFO:tensorflow:loss = 37796.46, step = 422060 (8.725 sec)
INFO:tensorflow:global_step/sec: 11.3867
INFO:tensorflow:loss = 27902.201, step = 422160 (8.782 sec)
INFO:tensorflow:global_step/sec: 11.3378
INFO:tensorflow:loss = 35169.727, step = 422260 (8.822 sec)
INFO:tensorflow:global_step/sec: 11.3882
INFO:tensorflow:loss = 26356.287, step = 422360 (8.779 sec)
INFO:tensorflow:global_step/sec: 11.45
INFO:tensorflow:loss = 33024.832, step = 422460 (8.747 sec)
INFO:tensorflow:global_step/sec: 11.3721
INFO:tensorflow:loss = 32358.438, step = 422560 (8.783 sec)
INFO:tensorflow:global_step/sec: 11.456
INFO:tensorflow:loss = 25864.473, step = 422660 (8.728 sec)
INFO:tensorflow:global_step/sec: 11.5183
INFO:tensorflow:loss = 40287.652, step = 422760 (8.680 sec)
INFO:tensorflow:global_step/sec: 11

INFO:tensorflow:global_step/sec: 11.5829
INFO:tensorflow:loss = 31691.002, step = 428960 (8.618 sec)
INFO:tensorflow:global_step/sec: 11.5171
INFO:tensorflow:loss = 24484.379, step = 429060 (8.698 sec)
INFO:tensorflow:global_step/sec: 11.5788
INFO:tensorflow:loss = 38082.145, step = 429160 (8.623 sec)
INFO:tensorflow:global_step/sec: 11.0697
INFO:tensorflow:loss = 31218.545, step = 429260 (9.034 sec)
INFO:tensorflow:global_step/sec: 11.249
INFO:tensorflow:loss = 30928.234, step = 429360 (8.890 sec)
INFO:tensorflow:global_step/sec: 11.5076
INFO:tensorflow:loss = 39665.062, step = 429460 (8.701 sec)
INFO:tensorflow:global_step/sec: 11.4512
INFO:tensorflow:loss = 39071.516, step = 429560 (8.721 sec)
INFO:tensorflow:global_step/sec: 11.4956
INFO:tensorflow:loss = 38759.266, step = 429660 (8.700 sec)
INFO:tensorflow:global_step/sec: 11.5236
INFO:tensorflow:loss = 25885.736, step = 429760 (8.676 sec)
INFO:tensorflow:global_step/sec: 11.6827
INFO:tensorflow:loss = 24040.914, step = 429860 (8.

KeyboardInterrupt: 

In [26]:
infer = pd.read_csv('../input/test.csv', index_col='key')

In [27]:
def process_datetime(df):
    df['pickup_datetime'] = df['pickup_datetime'].str.slice(0, 16)
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
    df['hour'] = df['pickup_datetime'].dt.hour
    df['day'] = df['pickup_datetime'].dt.day
    df['month'] = df['pickup_datetime'].dt.month
    df['year'] = df['pickup_datetime'].dt.year - 2009
    df['weekday'] = df['pickup_datetime'].dt.dayofweek
    df.drop('pickup_datetime', axis=1, inplace=True)
    
process_datetime(infer)

In [28]:
def rename_columns(df):
    cols = [
        'x0',
        'y0',
        'x1',
        'y1',
        'num_pass',
        'hour',
        'day',
        'month',
        'year',
        'weekday'
    ]

    df.columns = cols
    
def distances(df):
    df.loc[:, 'dist_e'] = np.sqrt((df.loc[:,'x1'] - df.loc[:,'x0'])**2 + (df.loc[:,'y1'] - df.loc[:,'y0'])**2)
    df.loc[:, 'dist_t'] = abs(df.loc[:,'x1'] - df.loc[:,'x0']) + abs(df.loc[:,'y1'] - df.loc[:,'y0'])

rename_columns(infer)
distances(infer)

In [29]:
normalize(infer, params)

In [31]:
generator = estimator.predict(input_fn=pred_input_fn(infer))
predictions = [next(generator) for i in range(len(infer))]
values = [val['predictions'].tolist()[0] for val in predictions]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-370148
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [32]:
ids = infer.index

submission = pd.DataFrame()
submission['fare_amount'] = values
submission['key'] = ids
submission.set_index('key', inplace=True)

submission.to_csv('../output/17.dnn_linear_combined_regressor_rmse_3.22.csv')