In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (16,10)

In [4]:
def read_csv(TRAIN_PATH):
    chunksize = 5_000_000 # batch size
    
    traintypes = {
        'fare': 'float32',
        'from': 'str',
        'to': 'str',
        'p_count': 'uint8', 
        'year': 'uint8',
        'hour': 'uint8',
        'weekday': 'uint8',
        'week': 'uint8',
        'dist_e': 'float32',
        'dist_t': 'float32',
        'x0': 'float32',
        'x1': 'float32',
        'y0': 'float32',
        'y1': 'float32',
    }
    
    cols = list(traintypes.keys())
    
    # loading the dataframe into list of small dataframes
    df_list = []

    for df_chunk in pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize):
        df_list.append(df_chunk) 
        df = pd.concat(df_list)
        
    return df

In [5]:
train = read_csv('../input/train_cleaned_2.csv')

In [6]:
train.head()

Unnamed: 0,fare,from,to,hour,weekday,week,year,p_count,dist_e,dist_t,x0,x1,y0,y1
0,4.5,Queens,Queens,17,0,25,217,1,0.638322,0.759079,-73.844315,-73.841614,40.721317,40.712276
1,16.9,Manhattan,Manhattan,16,1,1,218,1,5.213395,6.717119,-74.016045,-73.979271,40.711304,40.782005
2,5.7,Manhattan,Manhattan,0,3,33,219,2,0.85261,1.16424,-73.982735,-73.991241,40.761269,40.750561
3,7.7,Manhattan,Manhattan,4,5,16,220,1,1.7356,1.943343,-73.987129,-73.99157,40.733143,40.758091
4,5.3,Manhattan,Manhattan,7,1,10,218,1,1.228196,1.658726,-73.968094,-73.956657,40.768009,40.783764


In [7]:
y = train['fare']
X = train.drop(['fare'], axis=1)
del train

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.25, random_state=379582)

del X,y

In [8]:
import tensorflow as tf

In [9]:
locations = [
    'Manhattan', 'JFK', 'LaGardia', 'Newark', 'Teterboro', 'Brooklin', 'Bronx', 'Staten', 'Queens', 'Westchester',
    'Nassau', 'Jersey', 'Bayonne', 'WStaton', 'West', 'WNewYork', 'North'        
]

p_count = tf.feature_column.numeric_column('p_count')
taxicab = tf.feature_column.numeric_column('dist_t')
euclidean = tf.feature_column.numeric_column('dist_e')

# definition of the categorical columns
hour = tf.feature_column.categorical_column_with_identity('hour', num_buckets = 24)
weekday = tf.feature_column.categorical_column_with_identity('weekday', num_buckets = 7)
week = tf.feature_column.categorical_column_with_identity('week', num_buckets = 54)
year = tf.feature_column.categorical_column_with_identity('year', num_buckets = 7)

loc_from = tf.feature_column.categorical_column_with_vocabulary_list('from', locations)
loc_to = tf.feature_column.categorical_column_with_vocabulary_list('to', locations)

# definition of the feature crosses
from_to = tf.feature_column.crossed_column([loc_from, loc_to], len(locations)**2)
from_to_year = tf.feature_column.crossed_column([from_to, year], len(locations)**2 * 7)
from_to_year_week = tf.feature_column.crossed_column([from_to_year, week], len(locations)**2 * 7 * 54)

weekday_year = tf.feature_column.crossed_column([weekday, year], 7 * 7)
hour_weekday_year = tf.feature_column.crossed_column([hour, weekday, year], 24*7*7)

In [10]:
deep_columns = [
    # Embedding_column to "group" together ...
    tf.feature_column.embedding_column(from_to, 50),
    tf.feature_column.embedding_column(from_to_year, 50),
    tf.feature_column.embedding_column(from_to_year_week, 50),
    tf.feature_column.embedding_column(weekday_year, 50),
    tf.feature_column.embedding_column(hour_weekday_year, 50),

    # indicator columns
    tf.feature_column.indicator_column(loc_from),
    tf.feature_column.indicator_column(loc_to),
    tf.feature_column.indicator_column(hour),
    tf.feature_column.indicator_column(weekday),
    tf.feature_column.indicator_column(week),
    tf.feature_column.indicator_column(year),

    # numeric columns
    taxicab, euclidean, p_count
]

In [11]:
def train_input_fn(features, labels, batch_size):
    return tf.estimator.inputs.pandas_input_fn(
        x = features,
        y = labels,
        num_epochs = 1,
        shuffle = True,
        batch_size = batch_size)

def eval_input_fn(features, labels):
    return tf.estimator.inputs.pandas_input_fn(
        x = features,
        y = labels,
        num_epochs = 1,
        shuffle = True)

def pred_input_fn(features):
    return tf.estimator.inputs.pandas_input_fn(
        x = features,
        num_epochs = 1,
        shuffle = False)

In [12]:
model_dir = './DNN_Regressor_regions'

file_writer = tf.summary.FileWriter(model_dir)

estimator = tf.estimator.DNNRegressor(
    model_dir = model_dir,
    feature_columns = deep_columns,
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001),
    hidden_units = [128,128,128,128], 
    batch_norm = True,
    dropout = 0.1,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './DNN_Regressor_regions', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002428329E8D0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [13]:
train_spec = tf.estimator.TrainSpec(input_fn = train_input_fn(X_train, y_train, batch_size=2500))
eval_spec = tf.estimator.EvalSpec(input_fn = eval_input_fn(X_eval, y_eval))

In [14]:
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./DNN_Regressor_regions\model.ckpt-0
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./DNN_Regressor_regions\model.ckpt.


InvalidArgumentError: assertion failed: [222 218 219...] [7]
	 [[Node: dnn/input_from_feature_columns/input_layer/from_X_to_X_week_X_year_embedding/assert_less_than_num_buckets/Assert/AssertGuard/Assert = Assert[T=[DT_INT64, DT_INT64], summarize=3, _device="/job:localhost/replica:0/task:0/device:CPU:0"](dnn/input_from_feature_columns/input_layer/from_X_to_X_week_X_year_embedding/assert_less_than_num_buckets/Assert/AssertGuard/Assert/Switch/_749, dnn/input_from_feature_columns/input_layer/from_X_to_X_week_X_year_embedding/assert_less_than_num_buckets/Assert/AssertGuard/Assert/Switch_1/_751, dnn/input_from_feature_columns/input_layer/from_X_to_X_week_X_year_embedding/assert_less_than_num_buckets/Assert/AssertGuard/Assert/Switch_2/_753)]]

Caused by op 'dnn/input_from_feature_columns/input_layer/from_X_to_X_week_X_year_embedding/assert_less_than_num_buckets/Assert/AssertGuard/Assert', defined at:
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 486, in start
    self.io_loop.start()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\asyncio\base_events.py", line 422, in run_forever
    self._run_once()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\asyncio\base_events.py", line 1434, in _run_once
    handle._run()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
    ret = callback()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 536, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2907, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-30df9d6a3d21>", line 1, in <module>
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\training.py", line 451, in train_and_evaluate
    return executor.run()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\training.py", line 590, in run
    return self.run_local()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\training.py", line 691, in run_local
    saving_listeners=saving_listeners)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 376, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 1145, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 1170, in _train_model_default
    features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 1133, in _call_model_fn
    model_fn_results = self._model_fn(features=features, **kwargs)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn.py", line 549, in _model_fn
    batch_norm=batch_norm)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn.py", line 204, in _dnn_model_fn
    logits = logit_fn(features=features, mode=mode)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\canned\dnn.py", line 94, in dnn_logit_fn
    features=features, feature_columns=feature_columns)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 277, in input_layer
    trainable, cols_to_vars)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 202, in _internal_input_layer
    trainable=trainable)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 2620, in _get_dense_tensor
    trainable=trainable)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 2580, in _get_dense_tensor_internal
    inputs, weight_collections=weight_collections, trainable=trainable)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 3210, in _get_sparse_tensors
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 2263, in get
    transformed = column._transform_feature(self)  # pylint: disable=protected-access
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 3189, in _transform_feature
    ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 3104, in _get_sparse_tensors
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 2263, in get
    transformed = column._transform_feature(self)  # pylint: disable=protected-access
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 3075, in _transform_feature
    name='assert_less_than_num_buckets')
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\check_ops.py", line 559, in assert_less
    return control_flow_ops.Assert(condition, data, summarize=summarize)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\util\tf_should_use.py", line 118, in wrapped
    return _add_should_use_warning(fn(*args, **kwargs))
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 157, in Assert
    guarded_assert = cond(condition, no_op, true_assert, name="AssertGuard")
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\util\deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 2057, in cond
    orig_res_f, res_f = context_f.BuildCondBranch(false_fn)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 1895, in BuildCondBranch
    original_result = fn()
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py", line 155, in true_assert
    condition, data, summarize, name="Assert")
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\ops\gen_logging_ops.py", line 51, in _assert
    name=name)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\util\deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3155, in create_op
    op_def=op_def)
  File "C:\Users\vasil\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): assertion failed: [222 218 219...] [7]
	 [[Node: dnn/input_from_feature_columns/input_layer/from_X_to_X_week_X_year_embedding/assert_less_than_num_buckets/Assert/AssertGuard/Assert = Assert[T=[DT_INT64, DT_INT64], summarize=3, _device="/job:localhost/replica:0/task:0/device:CPU:0"](dnn/input_from_feature_columns/input_layer/from_X_to_X_week_X_year_embedding/assert_less_than_num_buckets/Assert/AssertGuard/Assert/Switch/_749, dnn/input_from_feature_columns/input_layer/from_X_to_X_week_X_year_embedding/assert_less_than_num_buckets/Assert/AssertGuard/Assert/Switch_1/_751, dnn/input_from_feature_columns/input_layer/from_X_to_X_week_X_year_embedding/assert_less_than_num_buckets/Assert/AssertGuard/Assert/Switch_2/_753)]]


In [None]:
del estimator