In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
def read_csv(TRAIN_PATH, chunksize=5_000_000):
    """Load the cleaned training CSV into one DataFrame using compact dtypes.

    The file is read in chunks of ``chunksize`` rows and the chunks are
    concatenated once at the end, so the accumulated data is copied a single
    time instead of once per chunk.

    Parameters
    ----------
    TRAIN_PATH : str or path-like
        Location of the CSV file. Only the columns listed in ``traintypes``
        are read; any other column in the file is ignored.
    chunksize : int, optional
        Number of rows parsed per chunk (default 5,000,000).

    Returns
    -------
    pandas.DataFrame
        All rows of the file with the dtypes declared in ``traintypes``.
        The index is left as produced by the chunked reader (not reset).
        If the reader yields no chunk at all, an empty frame with the
        expected columns and dtypes is returned.
    """
    # Column dtypes, chosen to keep memory usage low.
    traintypes = {
        'fare': 'float32',
        'num_pass': 'uint8',
        'year': 'uint8',
        'hour': 'uint8',
        'day': 'uint8',
        'month': 'uint8',
        'weekday': 'uint8',
        'x0': 'float32',
        'x1': 'float32',
        'y0': 'float32',
        'y1': 'float32',
        'dist_e': 'float32',
        'dist_t': 'float32'
    }

    cols = list(traintypes.keys())

    # Materialise every chunk first; concatenating inside the loop would
    # rebuild the whole accumulated frame on each iteration.
    df_list = list(
        pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize)
    )

    if not df_list:
        # Nothing was yielded: return a typed empty frame rather than failing.
        return pd.DataFrame(columns=cols).astype(traintypes)

    return pd.concat(df_list)

In [3]:
df = read_csv('../input/train_cleaned.csv')

# Discard implausible rides in a single pass. A row is removed when any of
# the following holds:
#   * fare below 2.5
#   * fare exactly 2.5 while either distance measure exceeds 0.01
#   * fare above 100 while either distance measure is below 0.1
#   * fare above 250
df.drop(
    index=df.index[
        (df['fare'] < 2.5)
        | ((df['fare'] == 2.5) & ((df['dist_e'] > 0.01) | (df['dist_t'] > 0.01)))
        | ((df['fare'] > 100) & ((df['dist_e'] < 0.1) | (df['dist_t'] < 0.1)))
        | (df['fare'] > 250)
    ],
    inplace=True,
)

In [None]:
def norm_params(df):
    """Collect the min/max of every column that is to be min-max scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'x0', 'x1', 'y0', 'y1', 'dist_e'
        and 'dist_t'.

    Returns
    -------
    dict
        ``{column: {'min': <column minimum>, 'max': <column maximum>}}``
        with one entry per scaled column, in the order listed above.
    """
    scaled_columns = ('x0', 'x1', 'y0', 'y1', 'dist_e', 'dist_t')

    return {
        name: {'min': df[name].min(), 'max': df[name].max()}
        for name in scaled_columns
    }

def normalize(df, params):
    """Add a min-max scaled ``<column>_norm`` column for every key in ``params``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place; nothing is returned.
    params : dict
        Mapping ``{column: {'min': ..., 'max': ...}}`` as produced by
        ``norm_params``.
    """
    for key, bounds in params.items():
        minimum = bounds['min']
        maximum = bounds['max']

        span = maximum - minimum
        shifted = df[key] - minimum

        # A constant column has span 0; dividing would turn every value into
        # NaN. In that case keep the shifted values (all zero) instead.
        df[key + '_norm'] = shifted / span if span != 0 else shifted


# The cleaned frame in this notebook is `df`; the previous `train` name is not
# defined in any visible cell, so these calls raised NameError on a fresh run.
# NOTE(review): the feature columns defined further down read the raw columns,
# not the `*_norm` ones added here - confirm whether the scaled inputs were
# meant to be used by the model.
params = norm_params(df)
normalize(df, params)

In [138]:
# df = read_csv('../input/train_cleaned.csv')

from sklearn.model_selection import train_test_split

# Separate the target ('fare') from the predictors.
X = df.drop(columns=['fare'])
y = df['fare']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, test_size=0.25, random_state=379582
)

# The unsplit copies are no longer needed.
del X, y

In [139]:
# Numeric columns: raw coordinates and passenger count ...
x0, x1, y0, y1, num_pass = (
    tf.feature_column.numeric_column(key)
    for key in ('x0', 'x1', 'y0', 'y1', 'num_pass')
)

# ... and the two distance measures.
taxicab = tf.feature_column.numeric_column('dist_t')
euclidean = tf.feature_column.numeric_column('dist_e')

# Categorical columns: integer-valued calendar features used as their own ids.
weekday, hour, month, year = (
    tf.feature_column.categorical_column_with_identity(key, num_buckets=size)
    for key, size in (('weekday', 7), ('hour', 24), ('month', 13), ('year', 7))
)

# Bucketized columns: the coordinates are discretised on evenly spaced
# boundaries (one set for the x columns, one for the y columns).
numbuckets = 20

x = np.linspace(-76, -70, numbuckets).tolist()
y = np.linspace(38, 42, numbuckets).tolist()

x0_B, x1_B = (tf.feature_column.bucketized_column(column, x) for column in (x0, x1))
y0_B, y1_B = (tf.feature_column.bucketized_column(column, y) for column in (y0, y1))

# Feature crosses: a cell per endpoint, the pair of endpoint cells,
# weekday x hour, and month x year.
xy_0 = tf.feature_column.crossed_column(
    keys=[x0_B, y0_B], hash_bucket_size=numbuckets * numbuckets)
xy_1 = tf.feature_column.crossed_column(
    keys=[x1_B, y1_B], hash_bucket_size=numbuckets * numbuckets)
xy_pair = tf.feature_column.crossed_column(
    keys=[xy_0, xy_1], hash_bucket_size=numbuckets**4)
day_hr = tf.feature_column.crossed_column(
    keys=[weekday, hour], hash_bucket_size=7 * 24)
mth_yr = tf.feature_column.crossed_column(
    keys=[month, year], hash_bucket_size=12 * 10)

In [140]:
# Inputs of the linear ("wide") part of the model.
wide_columns = (
    # Feature crosses
    [xy_0, xy_1, xy_pair, day_hr, mth_yr]
    # Sparse columns
    + [weekday, hour, year]
    # Anything with a linear relationship
    + [num_pass]
)

# Inputs of the DNN ("deep") part of the model.
deep_columns = (
    # Crossed columns are passed through a 10-dimensional embedding each
    [
        tf.feature_column.embedding_column(crossed, 10)
        for crossed in (xy_pair, day_hr, mth_yr)
    ]
    # Numeric columns
    + [x0, x1, y0, y1, taxicab, euclidean]
)

In [141]:
def train_input_fn(features, labels, batch_size):
    """Build the training input function from pandas objects.

    Parameters
    ----------
    features : passed to ``pandas_input_fn`` as ``x``.
    labels : passed to ``pandas_input_fn`` as ``y``.
    batch_size : passed through as ``batch_size``.

    Returns
    -------
    The value returned by ``tf.estimator.inputs.pandas_input_fn``,
    configured for one shuffled epoch.
    """
    make_input_fn = tf.estimator.inputs.pandas_input_fn
    return make_input_fn(
        x=features, y=labels, batch_size=batch_size, num_epochs=1, shuffle=True)

def eval_input_fn(features, labels):
    """Build the evaluation input function from pandas objects.

    Parameters
    ----------
    features : passed to ``pandas_input_fn`` as ``x``.
    labels : passed to ``pandas_input_fn`` as ``y``.

    Returns
    -------
    The value returned by ``tf.estimator.inputs.pandas_input_fn``,
    configured for one shuffled epoch (no explicit batch size is passed).
    """
    make_input_fn = tf.estimator.inputs.pandas_input_fn
    return make_input_fn(x=features, y=labels, num_epochs=1, shuffle=True)

def pred_input_fn(features):
    """Build the prediction input function from a pandas object.

    Parameters
    ----------
    features : passed to ``pandas_input_fn`` as ``x``; no labels are given.

    Returns
    -------
    The value returned by ``tf.estimator.inputs.pandas_input_fn``,
    configured for one epoch with shuffling turned off.
    """
    make_input_fn = tf.estimator.inputs.pandas_input_fn
    return make_input_fn(x=features, num_epochs=1, shuffle=False)

In [142]:
# Directory that receives the estimator's checkpoints.
model_dir = './DNN_Linear_Combined_Regressor'

# NOTE(review): `file_writer` is not referenced again in the visible cells -
# confirm whether it is still needed.
file_writer = tf.summary.FileWriter(logdir=model_dir)

# Wide & deep regressor: linear part on `wide_columns`, DNN part on
# `deep_columns`.
estimator = tf.estimator.DNNLinearCombinedRegressor(
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[128, 64, 32, 8],
    dnn_optimizer=tf.train.AdamOptimizer(learning_rate=0.0001),
    dnn_dropout=0.1,
    batch_norm=True,
    model_dir=model_dir,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './DNN_Linear_Combined_Regressor', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000237014E66D8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [143]:
# Training reads X_train / y_train in batches of 2500 rows.
train_spec = tf.estimator.TrainSpec(
    input_fn=train_input_fn(features=X_train, labels=y_train, batch_size=2500),
)

# NOTE(review): no `steps` is passed to EvalSpec and `label/mean` differs
# between the saved evaluations - check the documented default to confirm how
# much of the evaluation set each evaluation actually covers.
eval_spec = tf.estimator.EvalSpec(
    input_fn=eval_input_fn(features=X_eval, labels=y_eval),
)

In [144]:
# First training pass; the returned (metrics, export results) pair is shown
# as the cell output.
tf.estimator.train_and_evaluate(
    estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 563877.2, step = 0
INFO:tensorflow:global_step/sec: 14.3573
INFO:tensorflow:loss = 534694.2, step = 100 (6.965 sec)
INFO:tensorflow:global_step/sec: 18.1312
INFO:tensorflow:loss = 482370.56, step = 200 (5.515 sec)
INFO:tensorflow:global_step/sec: 18.548
INFO:tensorflow:loss = 410422.88, step = 300 (5.391 sec)
INFO:tensorflow:global_step/sec: 17.9743
INFO

INFO:tensorflow:global_step/sec: 17.6678
INFO:tensorflow:loss = 46176.77, step = 7800 (5.660 sec)
INFO:tensorflow:global_step/sec: 17.5118
INFO:tensorflow:loss = 71977.555, step = 7900 (5.710 sec)
INFO:tensorflow:global_step/sec: 17.2305
INFO:tensorflow:loss = 44964.715, step = 8000 (5.804 sec)
INFO:tensorflow:global_step/sec: 17.7722
INFO:tensorflow:loss = 63945.855, step = 8100 (5.625 sec)
INFO:tensorflow:global_step/sec: 17.4702
INFO:tensorflow:loss = 76568.125, step = 8200 (5.725 sec)
INFO:tensorflow:global_step/sec: 17.8313
INFO:tensorflow:loss = 88840.79, step = 8300 (5.607 sec)
INFO:tensorflow:global_step/sec: 17.1846
INFO:tensorflow:loss = 49348.277, step = 8400 (5.822 sec)
INFO:tensorflow:global_step/sec: 17.969
INFO:tensorflow:loss = 49068.93, step = 8500 (5.578 sec)
INFO:tensorflow:global_step/sec: 17.5147
INFO:tensorflow:loss = 63086.273, step = 8600 (5.694 sec)
INFO:tensorflow:global_step/sec: 17.485
INFO:tensorflow:loss = 37840.285, step = 8700 (5.720 sec)
INFO:tensorflow

INFO:tensorflow:loss = 53396.37, step = 14900 (5.794 sec)
INFO:tensorflow:global_step/sec: 17.5442
INFO:tensorflow:loss = 57095.812, step = 15000 (5.700 sec)
INFO:tensorflow:global_step/sec: 17.6071
INFO:tensorflow:loss = 53851.17, step = 15100 (5.693 sec)
INFO:tensorflow:global_step/sec: 17.8751
INFO:tensorflow:loss = 39146.887, step = 15200 (5.581 sec)
INFO:tensorflow:global_step/sec: 17.5979
INFO:tensorflow:loss = 47229.11, step = 15300 (5.681 sec)
INFO:tensorflow:global_step/sec: 18.3006
INFO:tensorflow:loss = 55170.3, step = 15400 (5.464 sec)
INFO:tensorflow:global_step/sec: 17.3407
INFO:tensorflow:loss = 58816.39, step = 15500 (5.769 sec)
INFO:tensorflow:global_step/sec: 17.2911
INFO:tensorflow:loss = 42860.78, step = 15600 (5.783 sec)
INFO:tensorflow:global_step/sec: 17.4245
INFO:tensorflow:loss = 37069.234, step = 15700 (5.738 sec)
INFO:tensorflow:global_step/sec: 17.0916
INFO:tensorflow:loss = 78384.34, step = 15800 (5.852 sec)
INFO:tensorflow:global_step/sec: 16.7774
INFO:ten

({'average_loss': 19.19928,
  'label/mean': 11.728828,
  'loss': 2457.5078,
  'prediction/mean': 11.162024,
  'global_step': 16216},
 [])

In [150]:
# Another training pass; per the saved log it resumes from the latest
# checkpoint in `model_dir`.
tf.estimator.train_and_evaluate(
    estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-16216
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 16216 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 53154.285, step = 16216
INFO:tensorflow:global_step/sec: 14.0237
INFO:tensorflow:loss = 48935.008, step = 16316 (7.133 sec)
INFO:tensorflow:global_step/sec: 17.0037
INFO:tensorflow:loss = 53313.82, step = 16416 (5.879 sec)
INFO:tensorflow:global_step/sec: 16.96

INFO:tensorflow:global_step/sec: 17.433
INFO:tensorflow:loss = 52369.613, step = 23816 (5.734 sec)
INFO:tensorflow:global_step/sec: 16.9906
INFO:tensorflow:loss = 47218.027, step = 23916 (5.886 sec)
INFO:tensorflow:global_step/sec: 17.3755
INFO:tensorflow:loss = 42201.156, step = 24016 (5.771 sec)
INFO:tensorflow:global_step/sec: 18.0504
INFO:tensorflow:loss = 57636.742, step = 24116 (5.540 sec)
INFO:tensorflow:global_step/sec: 17.547
INFO:tensorflow:loss = 44821.465, step = 24216 (5.685 sec)
INFO:tensorflow:global_step/sec: 17.4908
INFO:tensorflow:loss = 63261.793, step = 24316 (5.715 sec)
INFO:tensorflow:global_step/sec: 16.6738
INFO:tensorflow:loss = 42498.72, step = 24416 (5.997 sec)
INFO:tensorflow:global_step/sec: 18.1315
INFO:tensorflow:loss = 50037.91, step = 24516 (5.517 sec)
INFO:tensorflow:global_step/sec: 17.905
INFO:tensorflow:loss = 52415.855, step = 24616 (5.583 sec)
INFO:tensorflow:global_step/sec: 18.0812
INFO:tensorflow:loss = 41531.5, step = 24716 (5.546 sec)
INFO:te

INFO:tensorflow:global_step/sec: 17.549
INFO:tensorflow:loss = 73751.89, step = 30916 (5.698 sec)
INFO:tensorflow:global_step/sec: 16.816
INFO:tensorflow:loss = 35412.22, step = 31016 (5.949 sec)
INFO:tensorflow:global_step/sec: 17.2209
INFO:tensorflow:loss = 58586.37, step = 31116 (5.817 sec)
INFO:tensorflow:global_step/sec: 17.547
INFO:tensorflow:loss = 44155.78, step = 31216 (5.690 sec)
INFO:tensorflow:global_step/sec: 17.0449
INFO:tensorflow:loss = 85833.95, step = 31316 (5.866 sec)
INFO:tensorflow:global_step/sec: 17.9926
INFO:tensorflow:loss = 47449.863, step = 31416 (5.556 sec)
INFO:tensorflow:global_step/sec: 17.9791
INFO:tensorflow:loss = 52522.914, step = 31516 (5.562 sec)
INFO:tensorflow:global_step/sec: 18.1485
INFO:tensorflow:loss = 42670.22, step = 31616 (5.510 sec)
INFO:tensorflow:global_step/sec: 17.0209
INFO:tensorflow:loss = 67549.16, step = 31716 (5.876 sec)
INFO:tensorflow:global_step/sec: 17.0283
INFO:tensorflow:loss = 45795.324, step = 31816 (5.874 sec)
INFO:tenso

({'average_loss': 21.135492,
  'label/mean': 11.673906,
  'loss': 2705.343,
  'prediction/mean': 11.954746,
  'global_step': 32432},
 [])

In [154]:
# Further training pass. The saved log resumes at step 81080 rather than at
# the 32432 reached by the previous cell, so this cell appears to have been
# run more than once.
tf.estimator.train_and_evaluate(
    estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-81080
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 81080 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 38766.61, step = 81080
INFO:tensorflow:global_step/sec: 14.6171
INFO:tensorflow:loss = 58231.414, step = 81180 (6.841 sec)
INFO:tensorflow:global_step/sec: 17.9806
INFO:tensorflow:loss = 53925.73, step = 81280 (5.577 sec)
INFO:tensorflow:global_step/sec: 17.914

INFO:tensorflow:global_step/sec: 17.673
INFO:tensorflow:loss = 42100.434, step = 88680 (5.658 sec)
INFO:tensorflow:global_step/sec: 17.6228
INFO:tensorflow:loss = 43621.254, step = 88780 (5.674 sec)
INFO:tensorflow:global_step/sec: 17.6945
INFO:tensorflow:loss = 41492.547, step = 88880 (5.650 sec)
INFO:tensorflow:global_step/sec: 17.6127
INFO:tensorflow:loss = 37311.223, step = 88980 (5.679 sec)
INFO:tensorflow:global_step/sec: 17.6938
INFO:tensorflow:loss = 50022.367, step = 89080 (5.652 sec)
INFO:tensorflow:global_step/sec: 17.6862
INFO:tensorflow:loss = 38755.48, step = 89180 (5.653 sec)
INFO:tensorflow:global_step/sec: 17.6282
INFO:tensorflow:loss = 49325.04, step = 89280 (5.674 sec)
INFO:tensorflow:global_step/sec: 17.6745
INFO:tensorflow:loss = 41170.438, step = 89380 (5.658 sec)
INFO:tensorflow:global_step/sec: 17.6981
INFO:tensorflow:loss = 44972.434, step = 89480 (5.650 sec)
INFO:tensorflow:global_step/sec: 17.6551
INFO:tensorflow:loss = 49601.8, step = 89580 (5.663 sec)
INFO:

INFO:tensorflow:global_step/sec: 17.5342
INFO:tensorflow:loss = 52606.46, step = 95780 (5.704 sec)
INFO:tensorflow:global_step/sec: 17.5859
INFO:tensorflow:loss = 52745.395, step = 95880 (5.686 sec)
INFO:tensorflow:global_step/sec: 17.6196
INFO:tensorflow:loss = 40330.77, step = 95980 (5.674 sec)
INFO:tensorflow:global_step/sec: 17.5309
INFO:tensorflow:loss = 56768.51, step = 96080 (5.713 sec)
INFO:tensorflow:global_step/sec: 17.5922
INFO:tensorflow:loss = 46040.45, step = 96180 (5.675 sec)
INFO:tensorflow:global_step/sec: 17.542
INFO:tensorflow:loss = 68732.53, step = 96280 (5.702 sec)
INFO:tensorflow:global_step/sec: 17.522
INFO:tensorflow:loss = 41779.613, step = 96380 (5.707 sec)
INFO:tensorflow:global_step/sec: 17.6297
INFO:tensorflow:loss = 57549.266, step = 96480 (5.672 sec)
INFO:tensorflow:global_step/sec: 17.5789
INFO:tensorflow:loss = 54674.758, step = 96580 (5.689 sec)
INFO:tensorflow:global_step/sec: 17.6043
INFO:tensorflow:loss = 48831.676, step = 96680 (5.680 sec)
INFO:te

({'average_loss': 18.057217,
  'label/mean': 11.570937,
  'loss': 2311.3237,
  'prediction/mean': 12.261183,
  'global_step': 97296},
 [])

In [145]:
# Rows to predict on, indexed by the 'key' column.
infer = pd.read_csv(
    filepath_or_buffer='../input/test.csv',
    index_col='key',
)

In [146]:
def process_datetime(df):
    """Replace 'pickup_datetime' with integer calendar features, in place.

    The first 16 characters of each 'pickup_datetime' string are parsed as
    ``'%Y-%m-%d %H:%M'`` (UTC). The columns 'hour', 'day', 'month', 'year'
    and 'weekday' are appended in that order, where 'year' is stored as an
    offset from 2009 and 'weekday' is pandas' ``dayofweek``. The source
    column is then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'pickup_datetime'. Modified in place;
        nothing is returned.
    """
    stamps = pd.to_datetime(
        df['pickup_datetime'].str.slice(0, 16),
        utc=True,
        format='%Y-%m-%d %H:%M',
    )

    # Append order matters: later code renames columns by position.
    for part in ('hour', 'day', 'month'):
        df[part] = getattr(stamps.dt, part)
    df['year'] = stamps.dt.year - 2009
    df['weekday'] = stamps.dt.dayofweek

    df.drop(columns='pickup_datetime', inplace=True)
    
# Expand the pickup timestamp of the inference frame in place.
process_datetime(df=infer)

In [147]:
def rename_columns(df):
    """Overwrite the frame's column labels, by position, with the short names.

    The frame must have exactly ten columns; they are relabelled in their
    current order as x0, y0, x1, y1, num_pass, hour, day, month, year,
    weekday. The frame is modified in place and nothing is returned.
    """
    coordinate_names = ('x0', 'y0', 'x1', 'y1')
    calendar_names = ('hour', 'day', 'month', 'year', 'weekday')

    df.columns = list(coordinate_names + ('num_pass',) + calendar_names)
    
def distances(df):
    """Add Euclidean ('dist_e') and taxicab ('dist_t') distance columns in place.

    Both are computed from the coordinate differences x1 - x0 and y1 - y0.
    Nothing is returned.
    """
    dx = df.loc[:, 'x1'] - df.loc[:, 'x0']
    dy = df.loc[:, 'y1'] - df.loc[:, 'y0']

    df.loc[:, 'dist_e'] = np.sqrt(dx**2 + dy**2)
    df.loc[:, 'dist_t'] = dx.abs() + dy.abs()

# Relabel the inference columns, then derive the two distance features
# (the distances are computed from the renamed coordinate columns).
rename_columns(df=infer)
distances(df=infer)

In [155]:
# `predict` returns a generator; pull exactly one result per inference row.
generator = estimator.predict(input_fn=pred_input_fn(features=infer))
predictions = [next(generator) for _ in range(len(infer))]

# Each result is a dict; keep the first element of its 'predictions' array.
values = [result['predictions'].tolist()[0] for result in predictions]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-97296
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [156]:
ids = infer.index

# One predicted fare per inference key, with 'key' as the index.
submission = pd.DataFrame({'fare_amount': values, 'key': ids}).set_index('key')

submission.to_csv('../output/06.dnn_linear_combined_regressor_r5.csv')