In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) in inches for every plot in this notebook.
plt.rcParams['figure.figsize'] = (16,8)

In [2]:
def read_csv(TRAIN_PATH, chunksize=5_000_000):
    """Load the cleaned training CSV into one DataFrame with compact dtypes.

    The file is read in chunks of ``chunksize`` rows and the chunks are
    concatenated exactly once after the whole file has been read.

    Parameters
    ----------
    TRAIN_PATH : str or path-like
        Location of the CSV file. Only the columns listed in ``traintypes``
        are read; any other column in the file is ignored.
    chunksize : int, optional
        Number of rows per chunk handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All rows of the file, with the dtypes declared in ``traintypes``.
        An empty frame with those columns is returned when the file has no
        data rows.
    """
    # Narrow dtypes keep the memory footprint of the large training set down.
    traintypes = {
        'fare': 'float32',
        'num_pass': 'uint8',
        'year': 'uint8',
        'hour': 'uint8',
        'day': 'uint8',
        'month': 'uint8',
        'weekday': 'uint8',
        'x0': 'float32',
        'x1': 'float32',
        'y0': 'float32',
        'y1': 'float32',
        'dist_e': 'float32',
        'dist_t': 'float32'
    }

    cols = list(traintypes.keys())

    # Materialize every chunk first; concatenating inside the loop would copy
    # all previously read rows again on each iteration.
    reader = pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize)
    df_list = list(reader)

    if not df_list:
        # pd.concat rejects an empty list, so build the empty frame by hand.
        return pd.DataFrame({name: pd.Series(dtype=kind) for name, kind in traintypes.items()})

    return pd.concat(df_list)

In [3]:
# Load the cleaned training set; the path is relative to the notebook's working directory.
df = read_csv('../input/train_cleaned.csv')

In [4]:
# Preview the first rows of the loaded training frame.
df.head()

Unnamed: 0,fare,num_pass,year,hour,day,month,weekday,x0,x1,y0,y1,dist_e,dist_t
0,4.5,1,0,17,15,6,0,-73.844315,-73.841614,40.721317,40.712276,0.009436,0.011742
1,16.9,1,1,16,5,1,1,-74.016045,-73.979271,40.711304,40.782005,0.079693,0.107475
2,5.7,2,2,0,18,8,3,-73.982735,-73.991241,40.761269,40.750561,0.013676,0.019215
3,7.7,1,3,4,21,4,5,-73.987129,-73.99157,40.733143,40.758091,0.02534,0.029388
4,5.3,1,1,7,9,3,1,-73.968094,-73.956657,40.768009,40.783764,0.019468,0.027191


In [5]:
def norm_params(df):
    """Collect the minimum and maximum of each coordinate/distance column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns 'x0', 'x1', 'y0', 'y1', 'dist_e' and
        'dist_t'.

    Returns
    -------
    dict
        Maps each of those column names to ``{'min': ..., 'max': ...}``.
    """
    scaled_columns = ('x0', 'x1', 'y0', 'y1', 'dist_e', 'dist_t')

    return {
        name: {'min': df[name].min(), 'max': df[name].max()}
        for name in scaled_columns
    }

# Min/max scaling parameters taken from the whole training frame
# (computed before the train/eval split further down).
params = norm_params(df)

def normalize(df, params):
    """Add min-max scaled ``<column>_norm`` columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding every column named in ``params``. It is modified in
        place: one ``<column>_norm`` column is added (or overwritten) per key.
    params : dict
        Maps a column name to ``{'min': ..., 'max': ...}``, as produced by
        ``norm_params``.

    Returns
    -------
    None
        Nothing is returned, so calling this as the last statement of a
        notebook cell prints no output.
    """
    for key, bounds in params.items():
        minimum = bounds['min']
        maximum = bounds['max']
        span = maximum - minimum
        shifted = df[key] - minimum

        if span == 0:
            # A constant reference column has no range to scale by; dividing
            # would fill the feature with NaN/inf. Map finite values to 0
            # instead (missing values stay missing).
            df[key+'_norm'] = shifted * 0.0
        else:
            df[key+'_norm'] = shifted / span
            
# Add the *_norm columns to the training frame in place.
normalize(df, params)

In [6]:
# Separate the regression target (fare) from the feature columns.
y = df['fare']
X = df.drop(['fare'], axis=1)

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical across notebook re-runs. train_test_split is imported in the
# notebook's top import cell.
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.25, random_state=379582)

# The unsplit copies are no longer needed once the train/eval parts exist.
del X,y

In [7]:
# Shorthand for the feature-column factory module used throughout this cell.
fc = tf.feature_column

# --- numeric columns: raw coordinates, passenger count, scaled inputs ---
x0 = fc.numeric_column(key='x0')
x1 = fc.numeric_column(key='x1')
y0 = fc.numeric_column(key='y0')
y1 = fc.numeric_column(key='y1')
num_pass = fc.numeric_column(key='num_pass')
x0_norm = fc.numeric_column(key='x0_norm')
x1_norm = fc.numeric_column(key='x1_norm')
y0_norm = fc.numeric_column(key='y0_norm')
y1_norm = fc.numeric_column(key='y1_norm')
taxicab = fc.numeric_column(key='dist_t_norm')
euclidean = fc.numeric_column(key='dist_e_norm')

# --- categorical columns: the integer value itself is the category id ---
weekday = fc.categorical_column_with_identity(key='weekday', num_buckets=7)
hour = fc.categorical_column_with_identity(key='hour', num_buckets=24)
# 13 buckets so that ids up to 12 are valid
# (presumably month is 1-based, leaving id 0 unused - TODO confirm).
month = fc.categorical_column_with_identity(key='month', num_buckets=13)
year = fc.categorical_column_with_identity(key='year', num_buckets=7)

# --- bucketized columns: coordinates cut along evenly spaced boundaries ---
numbuckets = 15

x = np.linspace(-74.27, -72.98, num=numbuckets).tolist()
y = np.linspace(40.56, 41.71, num=numbuckets).tolist()

x0_B = fc.bucketized_column(source_column=x0, boundaries=x)
x1_B = fc.bucketized_column(source_column=x1, boundaries=x)
y0_B = fc.bucketized_column(source_column=y0, boundaries=y)
y1_B = fc.bucketized_column(source_column=y1, boundaries=y)

# --- feature crosses: hashed combinations of the columns above ---
# Start cell and end cell of the trip on the coordinate grid.
xy_0 = fc.crossed_column(keys=[x0_B, y0_B], hash_bucket_size=numbuckets * numbuckets)
xy_1 = fc.crossed_column(keys=[x1_B, y1_B], hash_bucket_size=numbuckets * numbuckets)
# Start cell combined with end cell.
xy_pair = fc.crossed_column(keys=[xy_0, xy_1], hash_bucket_size=numbuckets**4)
# Calendar interactions.
day_hr = fc.crossed_column(keys=[weekday, hour], hash_bucket_size=7 * 24)
mth_yr = fc.crossed_column(keys=[month, year], hash_bucket_size=12 * 10)
day_yr = fc.crossed_column(keys=[weekday, year], hash_bucket_size=7 * 7)
# Start/end cell pair combined with the month.
space_time = fc.crossed_column(keys=[xy_pair, month], hash_bucket_size=numbuckets**4 * 13)

In [8]:
# Inputs of the DNN, assembled in three groups:
#   1. 40-dimensional embeddings of the hashed feature crosses,
#   2. indicator columns for the smaller categorical features,
#   3. the numeric columns fed through unchanged.
deep_columns = [
    *[
        tf.feature_column.embedding_column(crossed, 40)
        for crossed in (xy_pair, day_hr, mth_yr, space_time, day_yr)
    ],
    *[
        tf.feature_column.indicator_column(categorical)
        for categorical in (xy_0, xy_1, weekday, hour, year, month)
    ],
    taxicab,
    euclidean,
    num_pass,
]

In [9]:
def train_input_fn(features, labels, batch_size):
    """Build the training input function: one shuffled pass over the data.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns, passed to ``pandas_input_fn`` as ``x``.
    labels : pandas.Series
        Target values, passed to ``pandas_input_fn`` as ``y``.
    batch_size : int
        Number of rows per batch.

    Returns
    -------
    The object returned by ``tf.estimator.inputs.pandas_input_fn``.
    """
    options = dict(
        x=features,
        y=labels,
        batch_size=batch_size,
        num_epochs=1,
        shuffle=True,
    )
    return tf.estimator.inputs.pandas_input_fn(**options)

def eval_input_fn(features, labels):
    """Build the evaluation input function: one shuffled pass over the data.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns, passed to ``pandas_input_fn`` as ``x``.
    labels : pandas.Series
        Target values, passed to ``pandas_input_fn`` as ``y``.

    Returns
    -------
    The object returned by ``tf.estimator.inputs.pandas_input_fn``; the batch
    size is left at that function's default.
    """
    options = dict(x=features, y=labels, num_epochs=1, shuffle=True)
    return tf.estimator.inputs.pandas_input_fn(**options)

def pred_input_fn(features):
    """Build the prediction input function: one ordered pass, no labels.

    Shuffling is disabled so predictions come back in the row order of
    ``features``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns, passed to ``pandas_input_fn`` as ``x``.

    Returns
    -------
    The object returned by ``tf.estimator.inputs.pandas_input_fn``.
    """
    options = dict(x=features, num_epochs=1, shuffle=False)
    return tf.estimator.inputs.pandas_input_fn(**options)

In [10]:
# Directory handed to both the summary writer and the estimator below.
model_dir = './DNN_Regressor'

# NOTE(review): file_writer is not referenced again in the visible cells -
# confirm it is still needed.
file_writer = tf.summary.FileWriter(model_dir)

# Fully connected regressor over deep_columns.
estimator = tf.estimator.DNNRegressor(
    model_dir = model_dir,
    feature_columns = deep_columns,
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001),
    # five hidden layers, narrowing from 128 to 16 units
    hidden_units = [128,128,64,32,16], 
    batch_norm = True,
    dropout = 0.1,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './DNN_Regressor', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001DA0AD9E240>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [11]:
# Training spec: batches of 2500 rows drawn from the training split.
train_spec = tf.estimator.TrainSpec(input_fn = train_input_fn(X_train, y_train, batch_size=2500))
# Evaluation spec: runs over the held-out evaluation split.
eval_spec = tf.estimator.EvalSpec(input_fn = eval_input_fn(X_eval, y_eval))

In [12]:
# train_input_fn is built with num_epochs=1, so each call covers a single pass
# over the training split; repeating the call 40 times gives up to 40 passes.
# Training state is kept in the estimator's model_dir between calls.
for i in range(40):
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./DNN_Regressor\model.ckpt.
INFO:tensorflow:loss = 558917.0, step = 0
INFO:tensorflow:global_step/sec: 9.1959
INFO:tensorflow:loss = 510157.28, step = 100 (10.859 sec)
INFO:tensorflow:global_step/sec: 10.4168
INFO:tensorflow:loss = 484865.28, step = 200 (9.600 sec)
INFO:tensorflow:global_step/sec: 10.3789
INFO:tensorflow:loss = 482961.72, step = 300 (9.635 sec)
INFO:tensorflow:global_step/sec: 10.5065
INFO:tensorflow:lo

INFO:tensorflow:global_step/sec: 10.9352
INFO:tensorflow:loss = 46425.938, step = 6700 (9.147 sec)
INFO:tensorflow:global_step/sec: 10.9559
INFO:tensorflow:loss = 46135.547, step = 6800 (9.126 sec)
INFO:tensorflow:global_step/sec: 10.9777
INFO:tensorflow:loss = 40904.457, step = 6900 (9.109 sec)
INFO:tensorflow:global_step/sec: 10.9405
INFO:tensorflow:loss = 43738.047, step = 7000 (9.140 sec)
INFO:tensorflow:global_step/sec: 10.9782
INFO:tensorflow:loss = 42332.54, step = 7100 (9.109 sec)
INFO:tensorflow:global_step/sec: 10.9388
INFO:tensorflow:loss = 45564.445, step = 7200 (9.142 sec)
INFO:tensorflow:global_step/sec: 10.9203
INFO:tensorflow:loss = 43414.363, step = 7300 (9.157 sec)
INFO:tensorflow:global_step/sec: 10.9413
INFO:tensorflow:loss = 47684.89, step = 7400 (9.140 sec)
INFO:tensorflow:global_step/sec: 10.9602
INFO:tensorflow:loss = 54108.824, step = 7500 (9.124 sec)
INFO:tensorflow:global_step/sec: 10.9575
INFO:tensorflow:loss = 50794.508, step = 7600 (9.142 sec)
INFO:tensorf

INFO:tensorflow:loss = 33812.36, step = 14800 (9.446 sec)
INFO:tensorflow:global_step/sec: 10.7379
INFO:tensorflow:loss = 42382.297, step = 14900 (9.300 sec)
INFO:tensorflow:global_step/sec: 10.3706
INFO:tensorflow:loss = 36253.52, step = 15000 (9.642 sec)
INFO:tensorflow:global_step/sec: 10.0286
INFO:tensorflow:loss = 45652.367, step = 15100 (9.972 sec)
INFO:tensorflow:global_step/sec: 10.2231
INFO:tensorflow:loss = 32385.658, step = 15200 (9.783 sec)
INFO:tensorflow:global_step/sec: 10.334
INFO:tensorflow:loss = 28594.879, step = 15300 (9.676 sec)
INFO:tensorflow:global_step/sec: 9.61795
INFO:tensorflow:loss = 39139.098, step = 15400 (10.397 sec)
INFO:tensorflow:global_step/sec: 9.69415
INFO:tensorflow:loss = 39998.0, step = 15500 (10.315 sec)
INFO:tensorflow:global_step/sec: 10.1965
INFO:tensorflow:loss = 38893.344, step = 15600 (9.807 sec)
INFO:tensorflow:global_step/sec: 10.1391
INFO:tensorflow:loss = 25545.125, step = 15700 (9.861 sec)
INFO:tensorflow:global_step/sec: 10.2961
INF

INFO:tensorflow:global_step/sec: 10.5052
INFO:tensorflow:loss = 38129.363, step = 21204 (9.510 sec)
INFO:tensorflow:global_step/sec: 10.8807
INFO:tensorflow:loss = 41659.64, step = 21304 (9.206 sec)
INFO:tensorflow:global_step/sec: 10.7902
INFO:tensorflow:loss = 40547.656, step = 21404 (9.252 sec)
INFO:tensorflow:global_step/sec: 10.885
INFO:tensorflow:loss = 37177.586, step = 21504 (9.187 sec)
INFO:tensorflow:global_step/sec: 10.718
INFO:tensorflow:loss = 44504.92, step = 21604 (9.330 sec)
INFO:tensorflow:global_step/sec: 10.2575
INFO:tensorflow:loss = 36161.84, step = 21704 (9.751 sec)
INFO:tensorflow:global_step/sec: 10.5976
INFO:tensorflow:loss = 36449.117, step = 21804 (9.436 sec)
INFO:tensorflow:global_step/sec: 10.5591
INFO:tensorflow:loss = 35504.28, step = 21904 (9.471 sec)
INFO:tensorflow:global_step/sec: 10.5562
INFO:tensorflow:loss = 38610.29, step = 22004 (9.470 sec)
INFO:tensorflow:global_step/sec: 10.8013
INFO:tensorflow:loss = 36218.062, step = 22104 (9.258 sec)
INFO:te

INFO:tensorflow:loss = 36242.05, step = 28304 (9.157 sec)
INFO:tensorflow:global_step/sec: 10.8474
INFO:tensorflow:loss = 36348.86, step = 28404 (9.219 sec)
INFO:tensorflow:global_step/sec: 10.9408
INFO:tensorflow:loss = 37920.414, step = 28504 (9.140 sec)
INFO:tensorflow:global_step/sec: 10.9213
INFO:tensorflow:loss = 36236.74, step = 28604 (9.156 sec)
INFO:tensorflow:global_step/sec: 10.9034
INFO:tensorflow:loss = 27186.135, step = 28704 (9.171 sec)
INFO:tensorflow:global_step/sec: 10.927
INFO:tensorflow:loss = 38772.184, step = 28804 (9.152 sec)
INFO:tensorflow:global_step/sec: 10.9162
INFO:tensorflow:loss = 37807.86, step = 28904 (9.176 sec)
INFO:tensorflow:global_step/sec: 10.9218
INFO:tensorflow:loss = 42267.918, step = 29004 (9.141 sec)
INFO:tensorflow:Saving checkpoints for 29049 into ./DNN_Regressor\model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-27-06:54:56
INFO:tensorflow:Graph was finalized.

KeyboardInterrupt: 

In [None]:
# Load the test set used for the submission; the 'key' column becomes the index.
infer = pd.read_csv('../input/test.csv', index_col='key')

In [None]:
def process_datetime(df):
    """Replace the 'pickup_datetime' column with integer calendar columns.

    The first 16 characters of each value ('YYYY-MM-DD HH:MM') are parsed as
    a UTC timestamp. The columns 'hour', 'day', 'month', 'year' and 'weekday'
    are then added in that order, and 'pickup_datetime' is removed. 'year' is
    stored as an offset from 2009.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'pickup_datetime'; modified in place.

    Returns
    -------
    None
    """
    minute_text = df['pickup_datetime'].str.slice(0, 16)
    stamps = pd.to_datetime(minute_text, utc=True, format='%Y-%m-%d %H:%M')
    parts = stamps.dt

    # Keep this insertion order: rename_columns later relabels columns by position.
    df['hour'] = parts.hour
    df['day'] = parts.day
    df['month'] = parts.month
    df['year'] = parts.year - 2009
    df['weekday'] = parts.dayofweek

    df.drop(columns=['pickup_datetime'], inplace=True)
    
# Expand pickup_datetime on the test frame in place.
process_datetime(infer)

In [None]:
def rename_columns(df):
    """Relabel the ten columns of ``df`` with the short training-set names.

    Names are assigned by position, so ``df`` must hold exactly ten columns
    in the order: x0, y0, x1, y1, num_pass, hour, day, month, year, weekday.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to relabel; modified in place.

    Returns
    -------
    None
    """
    coordinates = ['x0', 'y0', 'x1', 'y1']
    passengers = ['num_pass']
    calendar = ['hour', 'day', 'month', 'year', 'weekday']

    df.columns = coordinates + passengers + calendar
    
def distances(df):
    """Add Euclidean ('dist_e') and taxicab ('dist_t') distance columns in place.

    Both distances are computed between the points (x0, y0) and (x1, y1),
    directly on the stored values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns 'x0', 'x1', 'y0' and 'y1'; modified in place.

    Returns
    -------
    None
    """
    dx = df.loc[:, 'x1'] - df.loc[:, 'x0']
    dy = df.loc[:, 'y1'] - df.loc[:, 'y0']

    df.loc[:, 'dist_e'] = np.sqrt(dx**2 + dy**2)
    df.loc[:, 'dist_t'] = abs(dx) + abs(dy)

# NOTE(review): rename_columns assigns names by position, so this relies on the
# column order of infer at this point - confirm against test.csv.
rename_columns(infer)
# Add the dist_e / dist_t columns to the test frame.
distances(infer)

In [None]:
# Scale the test frame with the min/max parameters computed from the training
# data, so both frames share one scale.
normalize(infer, params)

In [None]:
# estimator.predict returns a generator that yields one dict per input row.
generator = estimator.predict(input_fn=pred_input_fn(infer))
# Drain the generator completely: pred_input_fn makes a single unshuffled pass,
# and running it to exhaustion lets predict() finish its own cleanup instead of
# leaving it suspended after the last row.
predictions = list(generator)
# Each 'predictions' entry is a one-element array; keep the scalar value.
values = [val['predictions'].tolist()[0] for val in predictions]

# Every test row must have exactly one prediction before building the submission.
assert len(values) == len(infer), 'prediction count does not match the number of test rows'

In [None]:
# Row keys of the test set, in the same order as the predictions.
ids = infer.index

# One row per test key, indexed by 'key', with the predicted fare as the only column.
submission = pd.DataFrame({'fare_amount': values, 'key': ids}).set_index('key')

submission.to_csv('../output/17.dnn_linear_combined_regressor_rmse_3.22.csv')