In [3]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

The NY taxi trip data is already clean. It includes January 2019.

This is my first neural network model so I am using extensivly code from Geron's "Hands on machine learning".

The task is to check if sequential MLP model will does better then Random Forest model I done before in other file. The results were:

- R^2 of Random Forest model:
 0.9961362204519756

- MSE of Random Forest model:
 0.2427981302570017

- RMSE of Random Forest model:
 0.4927455025233632

- MAE of Random Forest model:
 0.2427981302570017

- 95% confidence interval of Random Forest model:
 [0.47478443 0.51007451]

- Some train predictions:
 [ 4.695 17.595 12.815 29.825 14.495]

- Some y_train:
 [4.5, 17.5, 13.0, 30.0, 14.5]


In [13]:
data = pd.read_csv('D:\\Coding_data\\yellow_tripdata_2019-1_V2.csv')

irrelevant_features = ['RatecodeID', 'payment_type']
data.drop(irrelevant_features, inplace=True, axis=1)
pd.set_option('use_inf_as_na', True)
data = data.replace([np.inf, -np.inf], 0).dropna(subset=data.columns, how="all")
data.dropna()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,PULocationID,DOLocationID,fare_amount,trip_time,distance/time
0,1546303600,1546304000,2.41,151,239,7.0,400,0.006025
1,1546304387,1546305539,4.18,239,246,14.0,1152,0.003628
2,1546302088,1546302517,2.09,163,229,6.5,429,0.004872
3,1546302721,1546303539,5.95,229,7,13.5,818,0.007274
4,1546304252,1546304972,3.38,141,234,10.0,720,0.004694
...,...,...,...,...,...,...,...,...
7094954,1548975865,1548976629,6.68,186,13,14.5,764,0.008743
7094955,1548977554,1548978100,2.16,68,233,8.0,546,0.003956
7094956,1548978468,1548979268,2.33,161,229,10.5,800,0.002913
7094957,1548977514,1548978636,6.89,186,262,17.0,1122,0.006141


In [14]:
data[data['trip_time'] == 0]

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,PULocationID,DOLocationID,fare_amount,trip_time,distance/time


In [15]:
data.shape

(7094807, 8)

In [16]:
data.isna().sum()

tpep_pickup_datetime     0
tpep_dropoff_datetime    0
trip_distance            0
PULocationID             0
DOLocationID             0
fare_amount              0
trip_time                0
distance/time            0
dtype: int64

In [17]:
data.notnull().values.all()

True

In [18]:
np.isfinite(data).sum()

tpep_pickup_datetime     7094807
tpep_dropoff_datetime    7094807
trip_distance            7094807
PULocationID             7094807
DOLocationID             7094807
fare_amount              7094807
trip_time                7094807
distance/time            7094807
dtype: int64

In [19]:
np.isnan(data).all()

tpep_pickup_datetime     False
tpep_dropoff_datetime    False
trip_distance            False
PULocationID             False
DOLocationID             False
fare_amount              False
trip_time                False
distance/time            False
dtype: bool

In [20]:
data.isnull().values.all()

False

In [21]:
print(data.head())

   tpep_pickup_datetime  tpep_dropoff_datetime  trip_distance  PULocationID  \
0            1546303600             1546304000           2.41           151   
1            1546304387             1546305539           4.18           239   
2            1546302088             1546302517           2.09           163   
3            1546302721             1546303539           5.95           229   
4            1546304252             1546304972           3.38           141   

   DOLocationID  fare_amount  trip_time  distance/time  
0           239          7.0        400       0.006025  
1           246         14.0       1152       0.003628  
2           229          6.5        429       0.004872  
3             7         13.5        818       0.007274  
4           234         10.0        720       0.004694  


Splitting data and calculating mean and scale of each feature in the training set.

In [22]:
predict = 'fare_amount'

X = data.drop(predict, axis=1)
y = data[predict]

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, random_state=42)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

In [23]:
X_valid.shape

(1330277, 7)

Here I split the data into total of 40 sub files, just for practice how to do it with a very big data set.

In [24]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    ny_dir = os.path.join("datasets", "ny_taxi")
    os.makedirs(ny_dir, exist_ok=True)
    path_format = os.path.join(ny_dir, "my_{}_{:02d}.csv")

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([repr(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

In [25]:
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = data.columns
header = ",".join(header_cols)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

Here is the head of the first train file

In [26]:
pd.read_csv(train_filepaths[0]).head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,PULocationID,DOLocationID,fare_amount,trip_time,distance/time
0,1546898000.0,1546900000.0,7.56,161.0,231.0,1186.0,0.006374,17.5
1,1547062000.0,1547063000.0,1.13,246.0,68.0,217.0,0.005207,4.5
2,1547571000.0,1547575000.0,17.64,48.0,63.0,3837.0,0.004597,43.5
3,1546332000.0,1546333000.0,5.63,239.0,74.0,639.0,0.008811,12.0
4,1546695000.0,1546696000.0,1.45,113.0,79.0,404.0,0.003589,6.5


Here head of the first train file in text mode

In [27]:
with open(train_filepaths[0]) as f:
    for i in range(5):
        print(f.readline(), end="")

tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,PULocationID,DOLocationID,fare_amount,trip_time,distance/time
1546898373.0,1546899559.0,7.56,161.0,231.0,1186.0,0.006374367622259696,17.5
1547062335.0,1547062552.0,1.13,246.0,68.0,217.0,0.0052073732718894,4.5
1547570868.0,1547574705.0,17.64,48.0,63.0,3837.0,0.0045973416731821745,43.5
1546332386.0,1546333025.0,5.63,239.0,74.0,639.0,0.008810641627543036,12.0


In [28]:
train_filepaths

['datasets\\ny_taxi\\my_train_00.csv',
 'datasets\\ny_taxi\\my_train_01.csv',
 'datasets\\ny_taxi\\my_train_02.csv',
 'datasets\\ny_taxi\\my_train_03.csv',
 'datasets\\ny_taxi\\my_train_04.csv',
 'datasets\\ny_taxi\\my_train_05.csv',
 'datasets\\ny_taxi\\my_train_06.csv',
 'datasets\\ny_taxi\\my_train_07.csv',
 'datasets\\ny_taxi\\my_train_08.csv',
 'datasets\\ny_taxi\\my_train_09.csv',
 'datasets\\ny_taxi\\my_train_10.csv',
 'datasets\\ny_taxi\\my_train_11.csv',
 'datasets\\ny_taxi\\my_train_12.csv',
 'datasets\\ny_taxi\\my_train_13.csv',
 'datasets\\ny_taxi\\my_train_14.csv',
 'datasets\\ny_taxi\\my_train_15.csv',
 'datasets\\ny_taxi\\my_train_16.csv',
 'datasets\\ny_taxi\\my_train_17.csv',
 'datasets\\ny_taxi\\my_train_18.csv',
 'datasets\\ny_taxi\\my_train_19.csv']

In [29]:
valid_filepaths

['datasets\\ny_taxi\\my_valid_00.csv',
 'datasets\\ny_taxi\\my_valid_01.csv',
 'datasets\\ny_taxi\\my_valid_02.csv',
 'datasets\\ny_taxi\\my_valid_03.csv',
 'datasets\\ny_taxi\\my_valid_04.csv',
 'datasets\\ny_taxi\\my_valid_05.csv',
 'datasets\\ny_taxi\\my_valid_06.csv',
 'datasets\\ny_taxi\\my_valid_07.csv',
 'datasets\\ny_taxi\\my_valid_08.csv',
 'datasets\\ny_taxi\\my_valid_09.csv']

In [30]:
test_filepaths

['datasets\\ny_taxi\\my_test_00.csv',
 'datasets\\ny_taxi\\my_test_01.csv',
 'datasets\\ny_taxi\\my_test_02.csv',
 'datasets\\ny_taxi\\my_test_03.csv',
 'datasets\\ny_taxi\\my_test_04.csv',
 'datasets\\ny_taxi\\my_test_05.csv',
 'datasets\\ny_taxi\\my_test_06.csv',
 'datasets\\ny_taxi\\my_test_07.csv',
 'datasets\\ny_taxi\\my_test_08.csv',
 'datasets\\ny_taxi\\my_test_09.csv']

--- Building an Input Pipeline

In [31]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [32]:
for filepath in filepath_dataset:
    print(filepath)

tf.Tensor(b'datasets\\ny_taxi\\my_train_05.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_16.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_17.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_14.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_10.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_02.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_12.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_19.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_09.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_13.csv', shape=(), dtype=string)
tf.Tensor(b'datasets\\ny_taxi\\my_train_15.csv', sh

Next, we can call the interleave() method to read from 5 files at a time and interleave their lines (skipping the first line of each file, which is the header row, using the skip() method):

In [33]:
n_readers = 5
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=n_readers)

These are the first rows (ignoring the header row) of 5 CSV files, chosen randomly.

In [34]:
for line in dataset.take(5):
    print(line.numpy())

b'1547033768.0,1547034426.0,1.75,230.0,142.0,658.0,0.0026595744680851068,7.5'
b'1547844428.0,1547844654.0,1.3,239.0,238.0,226.0,0.005752212389380531,4.5'
b'1546625809.0,1546626707.0,4.51,211.0,256.0,898.0,0.0050222717149220495,13.0'
b'1547369685.0,1547369824.0,0.64,142.0,142.0,139.0,0.00460431654676259,4.0'
b'1548260637.0,1548261897.0,3.06,249.0,170.0,1260.0,0.002428571428571429,14.0'


In [35]:
n_inputs = 7 # X_train.shape[-1]

@tf.function
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - X_mean) / X_std, y

In [36]:
preprocess(b'1546324359.0,1546325409.0,9.38,100.0,87.0,1050.0,0.008933333333333335,19.5')

(<tf.Tensor: shape=(7,), dtype=float32, numpy=
 array([-0.86244404, -0.86244094,  1.2159067 , -0.9744824 , -1.0811594 ,
         0.02102305,  0.11924608], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([19.5], dtype=float32)>)

Lets put all preproecessing stuff into one function, it will create and return a dataset that will efficiently load NY taxi data from multiple CSV files, then shuffle it, preprocess it and batch it:

In [37]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(1)

In [38]:
train_set = csv_reader_dataset(train_filepaths, batch_size=3)
for X_batch, y_batch in train_set.take(2):
    print("X =", X_batch)
    print("y =", y_batch)
    print()

Cause: could not parse the source code:

        lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers, num_parallel_calls=n_read_threads)

This error may be avoided by creating the lambda in a standalone statement.

Cause: could not parse the source code:

        lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers, num_parallel_calls=n_read_threads)

This error may be avoided by creating the lambda in a standalone statement.

X = tf.Tensor(
[[-0.8085261  -0.808847   -0.27938288 -1.7654948   0.0126943  -0.1040526
   0.00435072]
 [-0.12686096 -0.12726529 -0.6363665  -0.320376   -0.28955474 -0.15032601
  -0.04942088]
 [ 0.764566    0.7642394  -0.31306058 -0.7615175  -0.7933032  -0.11733903
   0.01622542]], shape=(3, 7), dtype=float32)
y = tf.Tensor(
[[8. ]
 [5.5]
 [7.5]], shape=(3, 1), dtype=float32)

X = tf.Tensor(
[[ 0.75930375  0.7590581  -0.09527817  1.2920723  -0.01609133 -0.0799996
   0.01786532]
 [-0.06792364 -0.0684901 

Now we can use the csv_reader_dataset() function to create a dataset for the training set (ensuring it repeats the data forever), the validation set and the test set:

In [39]:
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

Cause: could not parse the source code:

        lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers, num_parallel_calls=n_read_threads)

This error may be avoided by creating the lambda in a standalone statement.

Cause: could not parse the source code:

        lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers, num_parallel_calls=n_read_threads)

This error may be avoided by creating the lambda in a standalone statement.

Cause: could not parse the source code:

        lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers, num_parallel_calls=n_read_threads)

This error may be avoided by creating the lambda in a standalone statement.

Cause: could not parse the source code:

        lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers, num_parallel_calls=n_read_threads)

This error may be avoided by creating the lambda in a standalone statement.

Cause: could not

Model 0

Sequential MLP with one hidden layer of 30 perceptrons with Nadam optimizer and 10 epochs. First I used SGD optimizer but the loss was falling into NaN (but there are no Nans in the dataset).

The evaluation on the test set resulted with a loss of 0.848.

In [40]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1),
])

In [42]:
model.compile(loss="mse", optimizer=keras.optimizers.Nadam())

In [43]:
batch_size = 32
model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,
          validation_data=valid_set)

Train for 124713 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x25910d5df08>

In [45]:
model.evaluate(test_set, steps=len(X_test) // batch_size)



0.8483280139496583

Model 1

Like model 0 Sequential MLP with one hidden layer of 30 perceptrons with Nadam optimizer but with 30 epochs instead of 10.

The evaluation on the test set resulted with a loss of 0.515 better than model 0.

In [47]:
model1 = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1),
])

In [48]:
model1.compile(loss="mse", optimizer=keras.optimizers.Nadam())

In [49]:
batch_size = 32
model1.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=30,
          validation_data=valid_set)

Train for 124713 steps
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x2599a884988>

In [50]:
model1.evaluate(test_set, steps=len(X_test) // batch_size)



0.5156511533886443

Model 2

Sequential MLP with two hidden layers of 50 and 20 perceptrons with Nadam optimizer with 10 epochs.

The evaluation on the test set resulted with a loss of 0.244 better than model 0 and model 1.

This model reached the MSE from Random Forest model I did before. So on the end, due to the very long computing time for MLP the Random Forest would be the better choice.

In [51]:
model2 = keras.models.Sequential([
    keras.layers.Dense(50, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(20, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1),
])

In [52]:
model2.compile(loss="mse", optimizer=keras.optimizers.Nadam())

In [53]:
batch_size = 32
model2.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,
          validation_data=valid_set)

Train for 124713 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x259ca63a408>

In [54]:
model2.evaluate(test_set, steps=len(X_test) // batch_size)



0.2445206579807311