In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default (width, height) used for every figure created after this point.
plt.rcParams['figure.figsize'] = (16,10)

In [2]:
def read_csv(TRAIN_PATH, chunksize=5_000_000):
    """Load the cleaned training CSV into a single, compactly typed DataFrame.

    The file is parsed chunk by chunk and the chunks are concatenated exactly
    once after the whole file has been read. Concatenating inside the loop
    would re-copy every previously read row for each new chunk.

    Parameters
    ----------
    TRAIN_PATH : str or path-like
        Location of the CSV file. Only the columns listed in ``traintypes``
        are read; any other column in the file is ignored.
    chunksize : int, default 5_000_000
        Number of rows parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        Every row of the file, restricted to the ``traintypes`` columns and
        cast to the dtypes given there. If the reader yields no chunk at all,
        an empty frame with those columns and dtypes is returned.
    """
    # Narrow dtypes keep the in-memory footprint of the frame small.
    traintypes = {
        'fare': 'float32',
        'from': 'str',
        'to': 'str',
        'p_count': 'uint8',
        'hour': 'uint8',
        'weekday': 'uint8',
        'week': 'uint8',
        'year': 'int16',
        'dist_e': 'float32',
        'dist_t': 'float32',
        'x0': 'float32',
        'x1': 'float32',
        'y0': 'float32',
        'y1': 'float32',
    }

    cols = list(traintypes.keys())

    # Materialise the chunks first, then concatenate once.
    df_list = list(
        pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize)
    )

    if not df_list:
        # pd.concat([]) raises, so return a typed empty frame instead.
        return pd.DataFrame(columns=cols).astype(traintypes)

    return pd.concat(df_list)

In [3]:
# Load the cleaned training set through read_csv (chunked read, compact dtypes).
train = read_csv('../input/train_cleaned_2.csv')

In [4]:
# Keep only the rows whose 'from' and 'to' values are both 'Manhattan';
# the first model below is trained on this subset alone.
train = train.loc[train['from'].eq('Manhattan') & train['to'].eq('Manhattan')]

In [5]:
# Sanity check: (rows, columns) left after the Manhattan-only filter.
train.shape

(46123235, 14)

In [6]:
# Separate the regression target ('fare') from the remaining feature columns.
y = train['fare']
X = train.drop(['fare'], axis=1)
# The combined frame is not used again; release the name before splitting.
del train

from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.25, random_state=379582)

# Only the four split pieces are needed from here on.
del X,y

In [5]:
import tensorflow as tf

In [17]:
# Feature columns for the Manhattan-only model.
# Note for the identity columns below: valid ids lie in the half-open
# range [0, num_buckets).

# Plain numeric inputs. 'dist_t' / 'dist_e' are bound to the names
# taxicab / euclidean (presumably taxicab and Euclidean trip distance).
p_count = tf.feature_column.numeric_column('p_count')
taxicab = tf.feature_column.numeric_column('dist_t')
euclidean = tf.feature_column.numeric_column('dist_e')

# Raw coordinates; presumably (x0, y0) is the pickup and (x1, y1) the dropoff
# longitude/latitude -- TODO confirm against the preprocessing notebook.
x0 = tf.feature_column.numeric_column('x0')
x1 = tf.feature_column.numeric_column('x1')
y0 = tf.feature_column.numeric_column('y0')
y1 = tf.feature_column.numeric_column('y1')

# 30 evenly spaced boundaries per axis over a fixed bounding box.
# NOTE(review): bucketized_column yields len(boundaries) + 1 buckets, so the
# crosses below hash up to 31 * 31 cells into numbuckets * numbuckets slots.
# Do not change these sizes without retraining: they determine the variable
# shapes stored in existing checkpoints.
numbuckets = 30
dx = np.linspace(-74.019108, -73.928684, numbuckets).tolist()
dy = np.linspace(40.700316, 40.877544, numbuckets).tolist()

x0_B = tf.feature_column.bucketized_column(x0, dx)
x1_B = tf.feature_column.bucketized_column(x1, dx)
y0_B = tf.feature_column.bucketized_column(y0, dy)
y1_B = tf.feature_column.bucketized_column(y1, dy)

# Grid cell of each endpoint, then the (start cell, end cell) pair.
xy_0 = tf.feature_column.crossed_column([x0_B, y0_B], numbuckets * numbuckets)
xy_1 = tf.feature_column.crossed_column([x1_B, y1_B], numbuckets * numbuckets)
xy_pair = tf.feature_column.crossed_column([xy_0, xy_1], numbuckets**4)

# definition of the categorical columns
hour = tf.feature_column.categorical_column_with_identity('hour', num_buckets = 24)
weekday = tf.feature_column.categorical_column_with_identity('weekday', num_buckets = 7)
week = tf.feature_column.categorical_column_with_identity('week', num_buckets = 54)
year = tf.feature_column.categorical_column_with_vocabulary_list('year', [2009, 2010, 2011, 2012, 2013, 2014, 2015])

# definition of the feature crosses
# Each hash_bucket_size is the product of the crossed columns' cardinalities
# (24 hours, 7 weekdays, 54 weeks, 7 years).
hour_weekday = tf.feature_column.crossed_column([hour, weekday], 24 * 7)
weekday_week_year = tf.feature_column.crossed_column([weekday, week, year], 7 * 54 * 7)
week_year = tf.feature_column.crossed_column([week, year], 54 * 7)
weekday_year = tf.feature_column.crossed_column([weekday, year], 7 * 7)
hour_weekday_year = tf.feature_column.crossed_column([hour, weekday, year], 24 * 7 * 7)

In [18]:
# Inputs of the Manhattan-only DNN, assembled from three groups in a fixed order.
deep_columns = (
    # crossed features, each mapped to a learned 50-dimensional embedding
    [
        tf.feature_column.embedding_column(crossed, 50)
        for crossed in (
            hour_weekday,
            weekday_week_year,
            week_year,
            weekday_year,
            hour_weekday_year,
            xy_pair,
        )
    ]
    # categorical features fed to the network as indicator vectors
    + [
        tf.feature_column.indicator_column(categorical)
        for categorical in (hour, weekday, week, year, xy_0, xy_1)
    ]
    # plain numeric features
    + [taxicab, euclidean, p_count]
)

In [19]:
def train_input_fn(features, labels, batch_size):
    """Return the input function used for training.

    ``features`` and ``labels`` are handed to
    ``tf.estimator.inputs.pandas_input_fn`` and served for a single shuffled
    pass (``num_epochs=1``) in batches of ``batch_size`` rows.
    """
    input_options = dict(
        x=features,
        y=labels,
        num_epochs=1,
        shuffle=True,
        batch_size=batch_size,
    )
    return tf.estimator.inputs.pandas_input_fn(**input_options)

def eval_input_fn(features, labels, batch_size=128):
    """Return the input function used for evaluation.

    ``features`` and ``labels`` are handed to
    ``tf.estimator.inputs.pandas_input_fn`` and served for a single shuffled
    pass (``num_epochs=1``).

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns of the evaluation rows.
    labels : pandas.Series
        Target values aligned with ``features``.
    batch_size : int, default 128
        Rows per evaluation batch. The default equals the default of
        ``pandas_input_fn``, so existing two-argument calls behave exactly as
        before; the parameter mirrors ``train_input_fn``.

    Notes
    -----
    Shuffling is kept on purpose: when evaluation is capped at a fixed number
    of steps, it makes each evaluation see a random sample of the rows rather
    than always the first ones.
    """
    return tf.estimator.inputs.pandas_input_fn(
        x = features,
        y = labels,
        batch_size = batch_size,
        num_epochs = 1,
        shuffle = True)

def pred_input_fn(features):
    """Return the input function used for inference.

    ``features`` is handed to ``tf.estimator.inputs.pandas_input_fn`` without
    labels, for one pass (``num_epochs=1``) and with shuffling disabled so the
    predictions come back in the row order of ``features``.
    """
    input_options = dict(x=features, num_epochs=1, shuffle=False)
    return tf.estimator.inputs.pandas_input_fn(**input_options)

In [20]:
# Directory handed to both the summary writer and the estimator below.
model_dir = './DNN_Regressor_manhattan'

# NOTE(review): file_writer is not referenced again in the visible cells --
# confirm whether it is still needed.
file_writer = tf.summary.FileWriter(model_dir)

# Manhattan-only regressor: four hidden layers of 128 units with batch
# normalisation, optimised with Adam at a learning rate of 1e-4.
estimator = tf.estimator.DNNRegressor(
    model_dir = model_dir,
    feature_columns = deep_columns,
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001),
    hidden_units = [128,128,128,128], 
    batch_norm = True,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './DNN_Regressor_manhattan', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001F8559F1198>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
# Training serves X_train / y_train once (num_epochs=1 inside train_input_fn)
# in batches of 2500 rows; no max_steps is given.
train_spec = tf.estimator.TrainSpec(input_fn = train_input_fn(X_train, y_train, batch_size=2500))
# No `steps` is passed here; the captured log shows each evaluation running
# 100 batches ("Evaluation [100/100]").
eval_spec = tf.estimator.EvalSpec(input_fn = eval_input_fn(X_eval, y_eval))

In [14]:
# range(1) runs the body exactly once.
# NOTE(review): presumably the range is a knob for repeating the single-pass
# training several times -- confirm before simplifying.
for i in range(1):
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./DNN_Regressor_manhattan\model.ckpt.
INFO:tensorflow:loss = 268322.38, step = 0
INFO:tensorflow:global_step/sec: 6.71512
INFO:tensorflow:loss = 216159.89, step = 100 (14.894 sec)
INFO:tensorflow:global_step/sec: 7.23428
INFO:tensorflow:loss = 195271.53, step = 200 (13.822 sec)
INFO:tensorflow:global_step/sec: 7.23445
INFO:tensorflow:loss = 192898.98, step = 300 (13.822 sec)
INFO:tensorflow:global_step/sec: 7.08391
INFO

INFO:tensorflow:global_step/sec: 7.40088
INFO:tensorflow:loss = 19065.648, step = 6600 (13.512 sec)
INFO:tensorflow:global_step/sec: 7.39325
INFO:tensorflow:loss = 16822.85, step = 6700 (13.527 sec)
INFO:tensorflow:global_step/sec: 7.36253
INFO:tensorflow:loss = 19988.008, step = 6800 (13.581 sec)
INFO:tensorflow:global_step/sec: 7.37446
INFO:tensorflow:loss = 13018.806, step = 6900 (13.560 sec)
INFO:tensorflow:global_step/sec: 7.3457
INFO:tensorflow:loss = 15894.525, step = 7000 (13.613 sec)
INFO:tensorflow:global_step/sec: 7.30883
INFO:tensorflow:loss = 14577.035, step = 7100 (13.690 sec)
INFO:tensorflow:global_step/sec: 7.35868
INFO:tensorflow:loss = 13697.546, step = 7200 (13.581 sec)
INFO:tensorflow:global_step/sec: 7.26922
INFO:tensorflow:loss = 11699.647, step = 7300 (13.757 sec)
INFO:tensorflow:global_step/sec: 7.39065
INFO:tensorflow:loss = 26525.164, step = 7400 (13.531 sec)
INFO:tensorflow:global_step/sec: 7.35033
INFO:tensorflow:loss = 12724.57, step = 7500 (13.605 sec)
INF

INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-09-04-21:21:32
INFO:tensorflow:Saving dict for global step 12961: average_loss = 7.100344, global_step = 12961, label/mean = 8.933156, loss = 908.84406, prediction/mean = 9.02547
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 12961: ./DNN_Regressor_manhattan\model.ckpt-12961
INFO:tensorflow:global_step/sec: 4.62697
INFO:tensorflow:loss = 16569.594, step = 13000 (21.612 sec)
INFO:tensorflow:global_step/sec: 7.30489
INFO:tensorflow:loss = 17635.031, step = 13100 (13.689 sec)
INFO:tensorflow:global_step/sec: 7.31135
INFO:tensorflow:loss = 14980.781, step = 13200 (13.677 sec)
INFO:tensorflow:global_step/sec: 7.33378
INFO:tensorflow:loss = 17465.24, step = 13300 (13.636 sec)
INFO:tensorflow:global_step/sec: 7.31403
INFO:tensorflow:loss = 21117.553, step = 13400 (13.680 sec)
INFO:tensorflow:global_step/sec: 7.01845
INFO:ten

In [21]:
# Score the processed test set with the Manhattan-only model.
infer = pd.read_csv('../input/test_processed.csv', index_col='key')

# Exhaust the prediction generator instead of calling next() a fixed number
# of times: this cannot over- or under-read, and it lets predict() finish
# and release its session.
generator = estimator.predict(input_fn=pred_input_fn(infer))
predictions = list(generator)
assert len(predictions) == len(infer), 'expected exactly one prediction per test row'
values = [val['predictions'].tolist()[0] for val in predictions]

# pred_input_fn does not shuffle, so predictions line up with infer's rows.
# Build the submission in one step, indexed by the test keys.
ids = infer.index
submission = pd.DataFrame({'fare_amount': values}, index=pd.Index(ids, name='key'))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Regressor_manhattan\model.ckpt-13837
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [34]:
# Vocabulary for the 'from' / 'to' region columns of the all-regions model.
# The strings are matched literally against the data, so they must stay
# exactly as written.
# NOTE(review): 'LaGardia', 'Brooklin' and 'WStaton' look like misspellings --
# confirm they match the values in the input files before touching them.
locations = [
    'Manhattan', 'JFK', 'LaGardia', 'Newark', 'Teterboro', 'Brooklin', 'Bronx', 'Staten', 'Queens', 'Westchester',
    'Nassau', 'Jersey', 'Bayonne', 'WStaton', 'West', 'WNewYork', 'North'        
]

# Feature columns for the all-regions model. This rebinds the names used for
# the Manhattan-only model earlier in the notebook.
# Note for the identity columns below: valid ids lie in the half-open
# range [0, num_buckets).

# Plain numeric inputs. 'dist_t' / 'dist_e' are bound to the names
# taxicab / euclidean (presumably taxicab and Euclidean trip distance).
p_count = tf.feature_column.numeric_column('p_count')
taxicab = tf.feature_column.numeric_column('dist_t')
euclidean = tf.feature_column.numeric_column('dist_e')

# Raw coordinates; presumably (x0, y0) is the pickup and (x1, y1) the dropoff
# longitude/latitude -- TODO confirm against the preprocessing notebook.
x0 = tf.feature_column.numeric_column('x0')
x1 = tf.feature_column.numeric_column('x1')
y0 = tf.feature_column.numeric_column('y0')
y1 = tf.feature_column.numeric_column('y1')

# 30 evenly spaced boundaries per axis over a fixed bounding box.
# NOTE(review): bucketized_column yields len(boundaries) + 1 buckets, so the
# crosses below hash up to 31 * 31 cells into numbuckets * numbuckets slots.
# Do not change these sizes: the estimator built from these columns restores
# an existing checkpoint whose variable shapes depend on them.
numbuckets = 30
dx = np.linspace(-74.019108, -73.928684, numbuckets).tolist()
dy = np.linspace(40.700316, 40.877544, numbuckets).tolist()

x0_B = tf.feature_column.bucketized_column(x0, dx)
x1_B = tf.feature_column.bucketized_column(x1, dx)
y0_B = tf.feature_column.bucketized_column(y0, dy)
y1_B = tf.feature_column.bucketized_column(y1, dy)

# Grid cell of each endpoint, then the (start cell, end cell) pair.
xy_0 = tf.feature_column.crossed_column([x0_B, y0_B], numbuckets * numbuckets)
xy_1 = tf.feature_column.crossed_column([x1_B, y1_B], numbuckets * numbuckets)
xy_pair = tf.feature_column.crossed_column([xy_0, xy_1], numbuckets**4)

# definition of the categorical columns
hour = tf.feature_column.categorical_column_with_identity('hour', num_buckets = 24)
weekday = tf.feature_column.categorical_column_with_identity('weekday', num_buckets = 7)
week = tf.feature_column.categorical_column_with_identity('week', num_buckets = 54)
year = tf.feature_column.categorical_column_with_vocabulary_list('year', [2009, 2010, 2011, 2012, 2013, 2014, 2015])

# Region of each trip endpoint, drawn from the `locations` vocabulary.
loc_from = tf.feature_column.categorical_column_with_vocabulary_list('from', locations)
loc_to = tf.feature_column.categorical_column_with_vocabulary_list('to', locations)

# definition of the feature crosses
# Each hash_bucket_size is the product of the crossed columns' cardinalities
# (len(locations) regions per endpoint, 7 years, 54 weeks, 24 hours, 7 weekdays).
from_to = tf.feature_column.crossed_column([loc_from, loc_to], len(locations)**2)
from_to_year = tf.feature_column.crossed_column([from_to, year], len(locations)**2 * 7)
from_to_year_week = tf.feature_column.crossed_column([from_to_year, week], len(locations)**2 * 7 * 54)

weekday_year = tf.feature_column.crossed_column([weekday, year], 7 * 7)
hour_weekday_year = tf.feature_column.crossed_column([hour, weekday, year], 24 * 7 * 7)

# Inputs of the all-regions DNN, assembled from three groups in a fixed order.
deep_columns = (
    # crossed features, each mapped to a learned 50-dimensional embedding
    [
        tf.feature_column.embedding_column(crossed, 50)
        for crossed in (
            from_to,
            from_to_year,
            from_to_year_week,
            weekday_year,
            hour_weekday_year,
            xy_pair,
        )
    ]
    # categorical features fed to the network as indicator vectors
    + [
        tf.feature_column.indicator_column(categorical)
        for categorical in (loc_from, loc_to, hour, weekday, week, year, xy_0, xy_1)
    ]
    # plain numeric features
    + [taxicab, euclidean, p_count]
)

# Directory of the all-regions model. No training call for this estimator
# appears in the visible cells; the captured log shows prediction restoring
# parameters from an existing checkpoint in this directory.
model_dir = './DNN_Regressor_regions_coords_overfit'

# NOTE(review): file_writer is not referenced again in the visible cells --
# confirm whether it is still needed.
file_writer = tf.summary.FileWriter(model_dir)

# Same architecture and optimiser settings as the Manhattan-only estimator,
# built on the all-regions feature columns.
estimator_other = tf.estimator.DNNRegressor(
    model_dir = model_dir,
    feature_columns = deep_columns,
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001),
    hidden_units = [128,128,128,128], 
    batch_norm = True,
)

# Score the same test frame with the all-regions model.
# Exhaust the prediction generator instead of calling next() a fixed number
# of times: this cannot over- or under-read, and it lets predict() finish
# and release its session.
generator = estimator_other.predict(input_fn=pred_input_fn(infer))
predictions = list(generator)
assert len(predictions) == len(infer), 'expected exactly one prediction per test row'
values = [val['predictions'].tolist()[0] for val in predictions]

# pred_input_fn does not shuffle, so predictions line up with infer's rows.
# Build the submission in one step, indexed by the test keys.
ids = infer.index
submission_other = pd.DataFrame({'fare_amount': values}, index=pd.Index(ids, name='key'))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './DNN_Regressor_regions_coords_overfit', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001F856272710>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Regressor_regions_coords_overfit\model.ckpt-177188
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [27]:
# Keys of the trips that the Manhattan-only model must not score.
# That model was trained on rows with from == 'Manhattan' AND to == 'Manhattan',
# so its complement is from != 'Manhattan' OR to != 'Manhattan'. Joining the
# two tests with `&` would leave trips with exactly one endpoint in Manhattan
# scored by a model that never saw such trips.
other = infer[(infer['from'] != 'Manhattan') | (infer['to'] != 'Manhattan')].index

In [35]:
# For the keys in `other`, overwrite the Manhattan-model fare with the
# all-regions model's fare; every other row keeps its Manhattan-model value.
submission.loc[other,'fare_amount'] = submission_other.loc[other,'fare_amount']

In [36]:
# Quick look at the first rows of the merged submission.
submission.head()

Unnamed: 0_level_0,fare_amount
key,Unnamed: 1_level_1
2015-01-27 13:08:24.0000002,10.159225
2015-01-27 13:08:24.0000003,11.004674
2011-10-08 11:53:44.0000002,4.678533
2012-12-01 21:12:12.0000002,8.184221
2012-12-01 21:12:12.0000003,17.117945


In [37]:
# Write the merged predictions; the 'key' index becomes the first CSV column.
# NOTE(review): the "(2.43+3.01)" in the file name presumably records the two
# models' RMSE values -- confirm.
submission.to_csv('../output/20.dnn_regressor_manhattan_other_best_rmse_(2.43+3.01).csv')