In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Emit TensorFlow log records at INFO level so the estimator's progress
# messages (loss, step, checkpoint saves) appear in the cell output.
tf.logging.set_verbosity(tf.logging.INFO)

  from ._conv import register_converters as _register_converters


In [2]:
# Number of rows read for training.
n_rows = 5000000
# The validation split is one tenth of the training size. Integer division
# keeps `nrows` a whole number even when n_rows is not a multiple of 10
# (true division would hand pandas a float).
n_valid_rows = n_rows // 10

# Training split: the first n_rows data rows of the archive, indexed by 'key'.
train = pd.read_csv(
    '../input/train.csv.zip',
    index_col='key',
    compression='infer',
    nrows=n_rows)

# Validation split: the n_valid_rows data rows that follow the training rows.
# File lines 1..n_rows are skipped; line 0 (the header) is kept. A lazy
# `range` is passed so no 5M-element list has to be materialised first.
valid = pd.read_csv(
    '../input/train.csv.zip',
    index_col='key',
    compression='infer',
    nrows=n_valid_rows,
    skiprows=range(1, n_rows + 1))

# Rows to predict on (no label column is used from this frame below).
infer = pd.read_csv('../input/test.csv', index_col='key')

In [3]:
def feature_engineering(df):
    """Add calendar and distance features to ``df`` in place.

    Parses ``pickup_datetime`` (format ``%Y-%m-%d %H:%M:%S %Z``) and derives
    the columns ``weekday``, ``month``, ``hour`` and ``year`` from it, then
    adds ``lat_diff``, ``lon_diff`` (absolute pickup/dropoff coordinate
    differences) and ``taxicab`` (their sum).  The raw ``pickup_datetime``
    column and the temporary ``pickup`` column are removed afterwards.

    Args:
        df: DataFrame with the columns ``pickup_datetime``,
            ``pickup_latitude``, ``pickup_longitude``, ``dropoff_latitude``
            and ``dropoff_longitude``.  It is modified in place.

    Returns:
        None.
    """
    df['pickup'] = pd.to_datetime(
        df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S %Z')

    # (new column name, attribute of the .dt accessor it is read from)
    calendar_parts = (
        ('weekday', 'dayofweek'),
        ('month', 'month'),
        ('hour', 'hour'),
        ('year', 'year'),
    )
    for column_name, dt_attribute in calendar_parts:
        df[column_name] = getattr(df['pickup'].dt, dt_attribute)

    # Per-axis absolute displacement and their sum (L1 / "taxicab" distance).
    df['lat_diff'] = (df['pickup_latitude'] - df['dropoff_latitude']).abs()
    df['lon_diff'] = (df['pickup_longitude'] - df['dropoff_longitude']).abs()
    df['taxicab'] = df['lat_diff'] + df['lon_diff']

    # The timestamp columns are no longer needed once the parts are extracted.
    df.drop(columns=['pickup_datetime', 'pickup'], inplace=True)
    
# Derive the engineered columns on every split (each frame is modified in place).
for frame in (train, valid, infer):
    feature_engineering(frame)

# Drop incomplete rows from the labelled splits only; `infer` is left intact,
# so every test key still has a row.
for frame in (train, valid):
    frame.dropna(how='any', axis=0, inplace=True)

In [4]:
# Regression target for each labelled split.
y_train = train['fare_amount']
y_valid = valid['fare_amount']

# Remove the target from the feature frames.  The drop stays in place because
# later cells pass `train` / `valid` themselves as the feature frames.
train.drop(columns='fare_amount', inplace=True)
valid.drop(columns='fare_amount', inplace=True)

# `drop(..., inplace=True)` returns None, so its result must not be assigned;
# bind the feature names to the (now label-free) frames instead.
X_train = train
X_valid = valid

In [5]:
# ---------------------------------------------------------------------------
# Numeric columns: raw coordinates, passenger count and engineered distances.
# ---------------------------------------------------------------------------
p_lat = tf.feature_column.numeric_column(key='pickup_latitude')
p_lon = tf.feature_column.numeric_column(key='pickup_longitude')
d_lat = tf.feature_column.numeric_column(key='dropoff_latitude')
d_lon = tf.feature_column.numeric_column(key='dropoff_longitude')
p_cnt = tf.feature_column.numeric_column(key='passenger_count')
lat_diff = tf.feature_column.numeric_column(key='lat_diff')
lon_diff = tf.feature_column.numeric_column(key='lon_diff')
taxicab = tf.feature_column.numeric_column(key='taxicab')

# ---------------------------------------------------------------------------
# Categorical columns: the integer value itself is used as the category id.
# NOTE(review): `month` has 13 buckets and `year` 2016, presumably so the raw
# values (month 1..12, calendar year) fit without re-basing -- confirm.
# ---------------------------------------------------------------------------
weekday = tf.feature_column.categorical_column_with_identity(
    key='weekday', num_buckets=7)
hour = tf.feature_column.categorical_column_with_identity(
    key='hour', num_buckets=24)
month = tf.feature_column.categorical_column_with_identity(
    key='month', num_buckets=13)
year = tf.feature_column.categorical_column_with_identity(
    key='year', num_buckets=2016)

# ---------------------------------------------------------------------------
# Bucketized coordinates: `numbuckets` evenly spaced boundaries per axis,
# latitude over [38, 42] and longitude over [-76, -72].
# ---------------------------------------------------------------------------
numbuckets = 16

latbuckets = np.linspace(38, 42, numbuckets).tolist()
lonbuckets = np.linspace(-76, -72, numbuckets).tolist()

p_latB = tf.feature_column.bucketized_column(p_lat, boundaries=latbuckets)
d_latB = tf.feature_column.bucketized_column(d_lat, boundaries=latbuckets)
p_lonB = tf.feature_column.bucketized_column(p_lon, boundaries=lonbuckets)
d_lonB = tf.feature_column.bucketized_column(d_lon, boundaries=lonbuckets)

# ---------------------------------------------------------------------------
# Feature crosses (second argument is the hash bucket size of the cross).
# ---------------------------------------------------------------------------
# Pickup cell and dropoff cell on the lat/lon grid.
ploc = tf.feature_column.crossed_column(
    [p_latB, p_lonB], hash_bucket_size=numbuckets * numbuckets)
dloc = tf.feature_column.crossed_column(
    [d_latB, d_lonB], hash_bucket_size=numbuckets * numbuckets)
# Pickup cell x dropoff cell.
pd_pair = tf.feature_column.crossed_column(
    [ploc, dloc], hash_bucket_size=numbuckets**4)
# Weekday x hour, and month x year.
day_hr = tf.feature_column.crossed_column(
    [weekday, hour], hash_bucket_size=7 * 24)
mth_yr = tf.feature_column.crossed_column(
    [month, year], hash_bucket_size=12 * 10)

In [6]:
# Columns fed to the linear ("wide") part of the model, in three groups.
wide_columns = (
    # Feature crosses
    [ploc, dloc, pd_pair, day_hr, mth_yr]
    # Sparse columns
    + [weekday, hour]
    # Anything with a linear relationship
    + [p_cnt]
)

In [7]:
# Columns fed to the DNN ("deep") part of the model.
deep_columns = (
    # 10-dimensional embeddings that "group" together the crossed categories.
    [
        tf.feature_column.embedding_column(crossed, 10)
        for crossed in (pd_pair, day_hr, mth_yr)
    ]
    # Numeric columns
    + [p_lat, p_lon, d_lat, d_lon, lat_diff, lon_diff, taxicab]
)

In [8]:
def train_input_fn(features, labels, batch_size):
    """Build the training input function from pandas objects.

    Args:
        features: DataFrame of feature columns, passed as ``x``.
        labels: Series of targets, passed as ``y``.
        batch_size: Number of rows per batch.

    Returns:
        The input function created by ``pandas_input_fn``; it shuffles the
        rows and makes a single pass (``num_epochs=1``) over them.
    """
    make_input_fn = tf.estimator.inputs.pandas_input_fn
    return make_input_fn(
        x=features,
        y=labels,
        batch_size=batch_size,
        num_epochs=1,
        shuffle=True,
    )

def eval_input_fn(features, labels):
    """Build the evaluation input function from pandas objects.

    Args:
        features: DataFrame of feature columns, passed as ``x``.
        labels: Series of targets, passed as ``y``.

    Returns:
        The input function created by ``pandas_input_fn``; it makes a single
        pass (``num_epochs=1``) over the rows with ``pandas_input_fn``'s
        default batch size.
    """
    make_input_fn = tf.estimator.inputs.pandas_input_fn
    # NOTE(review): rows are shuffled here even though this is evaluation --
    # confirm that is intended.
    return make_input_fn(
        x=features,
        y=labels,
        num_epochs=1,
        shuffle=True,
    )

def pred_input_fn(features):
    """Build the prediction input function from a pandas DataFrame.

    Args:
        features: DataFrame of feature columns, passed as ``x``; no labels
            are supplied.

    Returns:
        The input function created by ``pandas_input_fn``; it makes a single
        unshuffled pass (``num_epochs=1``, ``shuffle=False``) over the rows.
    """
    make_input_fn = tf.estimator.inputs.pandas_input_fn
    return make_input_fn(
        x=features,
        num_epochs=1,
        shuffle=False,
    )

In [9]:
# Directory used by the estimator for its checkpoints and summaries.
model_dir = './DNN_Linear_Combined_Regressor'

# Summary writer opened on the same directory; it is not referenced again in
# the cells shown here.
file_writer = tf.summary.FileWriter(model_dir)

# Hyper-parameters of the deep part of the model.
hidden_layers = [64, 64, 64, 8]
adam = tf.train.AdamOptimizer(learning_rate=0.0006)

# Wide & deep regressor: linear model on `wide_columns` combined with a DNN
# on `deep_columns`.
estimator = tf.estimator.DNNLinearCombinedRegressor(
    model_dir=model_dir,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=hidden_layers,
    dnn_optimizer=adam,
    dnn_dropout=0.1,
    batch_norm=True,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './DNN_Linear_Combined_Regressor', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000021C8C9A6BA8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [10]:
# `train` / `valid` are passed as the feature frames: their 'fare_amount'
# column was dropped in place when the labels were split off.
train_spec = tf.estimator.TrainSpec(
    input_fn=train_input_fn(features=train, labels=y_train, batch_size=7000))

# NOTE(review): no `steps` is given to EvalSpec, so its default applies --
# confirm that the resulting number of evaluation batches is what is wanted.
eval_spec = tf.estimator.EvalSpec(
    input_fn=eval_input_fn(features=valid, labels=y_valid))

In [12]:
# The training input function makes one pass over the data per call, so the
# train/evaluate cycle is repeated; each call picks up from the checkpoint in
# `model_dir` (see the "Restoring parameters" lines in the log).
n_rounds = 5
for i in range(n_rounds):
    tf.estimator.train_and_evaluate(
        estimator,
        train_spec,
        eval_spec,
    )

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-3575
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 3575 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 223667.08, step = 3576
INFO:tensorflow:global_step/sec: 5.48724
INFO:tensorflow:loss = 203906.42, step = 3676 (18.223 sec)
INFO:tensorflow:global_step/sec: 5.74242
INFO:tensorflow:loss = 195380.6, step = 3776 (17.416 sec)
INFO:tensorflow:global_step/sec: 5.36057


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Linear_Combined_Regressor\model.ckpt-5720
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 5720 into ./DNN_Linear_Combined_Regressor\model.ckpt.
INFO:tensorflow:loss = 980149.8, step = 5721
INFO:tensorflow:global_step/sec: 4.98049
INFO:tensorflow:loss = 184993.22, step = 5821 (20.080 sec)
INFO:tensorflow:global_step/sec: 5.31006
INFO:tensorflow:loss = 169220.89, step = 5921 (18.831 sec)
INFO:tensorflow:global_step/sec: 5.96735
INFO:tensorflow:loss = 159484.3, step = 6021 (16.756 sec)
INFO:tensorflow:global_step/sec: 5.79617
INFO:tensorflow:loss = 189417.11, step = 6121 (17.255 sec)
INFO:tensorflow:global_step/sec: 5.39282
INFO:tensorflow:loss = 177598.66, step = 6221 (18.541 sec)
INFO:tensorflow:global_step/sec: 5.2

In [None]:
# Lazily yields one prediction dict per row of `infer`, in row order
# (pred_input_fn does a single unshuffled pass).
generator = estimator.predict(input_fn=pred_input_fn(infer))

# Drain the generator directly rather than calling next() a fixed number of
# times, then check the count explicitly so a mismatch is reported clearly
# instead of surfacing as a bare StopIteration.
predictions = list(generator)
if len(predictions) != len(infer):
    raise RuntimeError(
        'expected %d predictions, got %d' % (len(infer), len(predictions)))

# Each dict holds a length-1 array under 'predictions'; unwrap it to a float.
values = [val['predictions'].tolist()[0] for val in predictions]

In [None]:
# Keys of the test rows, in the same order as the predicted `values`.
ids = infer.index

# One row per test key: 'key' as the index, predicted fare as the only column.
submission = pd.DataFrame({'fare_amount': values, 'key': ids}).set_index('key')

submission.to_csv('../output/02.dnn_linear_combined_regressor_r1.csv')