# NYC Taxi Fare Prediction - Model Training and Deployment

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fgithub.com%2Fapache%2Fbeam%2Fblob%2Fmaster%2Fsdks%2Fpython%2Fapache_beam%2Fyaml%2Fexamples%2Ftransforms%2Fml%2Finference%2Ftaxi-fare%2Fmodel_garden_custom_nyc_taxifare_prediction.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/apache/beam/blob/master/sdks/python/apache_beam/yaml/examples/transforms/ml/inference/taxi-fare/model_garden_custom_nyc_taxifare_prediction.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>


## Overview

...

## Outline
1. Dataset

2. Training

3. Evaluation

4. Deployment

In [1]:
!pip3 install --quiet --upgrade \
  opendatasets \
  google-cloud-bigquery \
  google-cloud-storage \
  google-cloud-aiplatform \
  scikit-learn \
  xgboost \
  pandas

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.1 which is incompatible.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.0 which is incompatible.
dask-cudf-cu12 25.

In [2]:
import opendatasets as od
import pandas as pd
import random
import time
import pickle
import os

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

import google.cloud.bigquery as bq
import google.cloud.storage as storage
import google.cloud.aiplatform as vertex

## Dataset

...


In [3]:
# Download the Kaggle competition archive and extract it into a local
# directory named after the competition. The call prompts interactively
# for a Kaggle username and API key.
dataset_url = 'https://www.kaggle.com/c/new-york-city-taxi-fare-prediction'
od.download(dataset_url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: chrlsng
Your Kaggle Key: ··········
Downloading new-york-city-taxi-fare-prediction.zip to ./new-york-city-taxi-fare-prediction


100%|██████████| 1.56G/1.56G [00:01<00:00, 996MB/s]



Extracting archive ./new-york-city-taxi-fare-prediction/new-york-city-taxi-fare-prediction.zip to ./new-york-city-taxi-fare-prediction


In [4]:
# Directory the dataset was extracted into; list its files with their sizes.
data_dir = 'new-york-city-taxi-fare-prediction'
!dir -l {data_dir}

total 5564956
-rw-r--r-- 1 root root        486 Jul 11 04:00 GCP-Coupons-Instructions.rtf
-rw-r--r-- 1 root root     343271 Jul 11 04:00 sample_submission.csv
-rw-r--r-- 1 root root     983020 Jul 11 04:00 test.csv
-rw-r--r-- 1 root root 5697178298 Jul 11 04:01 train.csv


In [5]:
p = 0.01  # fraction of data rows to keep (1%)

# A dedicated, seeded generator makes the sample -- and therefore every metric
# computed later in the notebook -- reproducible across kernel restarts.
sampling_rng = random.Random(42)

df_train_val = pd.read_csv(
    data_dir + "/train.csv",
    header=0,
    parse_dates=['pickup_datetime'],
    # Row 0 is the header and is always kept. Every other row is skipped
    # whenever its draw from [0, 1) exceeds p, so about p of the rows survive.
    skiprows=lambda i: i > 0 and sampling_rng.random() > p,
)
df_train_val.shape

(554290, 8)

In [6]:
# Column names of the sampled training data.
df_train_val.columns

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count'],
      dtype='object')

In [7]:
# Per-column dtypes and non-null counts of the sampled training data.
df_train_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554290 entries, 0 to 554289
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   key                554290 non-null  object             
 1   fare_amount        554290 non-null  float64            
 2   pickup_datetime    554290 non-null  datetime64[ns, UTC]
 3   pickup_longitude   554290 non-null  float64            
 4   pickup_latitude    554290 non-null  float64            
 5   dropoff_longitude  554289 non-null  float64            
 6   dropoff_latitude   554289 non-null  float64            
 7   passenger_count    554290 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(5), int64(1), object(1)
memory usage: 33.8+ MB


In [8]:
# Preview the sampled rows.
df_train_val

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2012-01-02 14:54:00.00000078,4.90,2012-01-02 14:54:00+00:00,-74.003985,40.731677,-73.999765,40.723512,1
1,2013-03-10 15:44:00.00000040,5.00,2013-03-10 15:44:00+00:00,-74.013360,40.702272,-74.016990,40.709245,1
2,2014-09-12 23:10:00.000000213,16.50,2014-09-12 23:10:00+00:00,-73.983837,40.676052,-74.002522,40.723145,1
3,2011-10-08 00:24:00.000000173,16.10,2011-10-08 00:24:00+00:00,-73.988568,40.732097,-73.957498,40.742657,1
4,2013-06-22 13:48:00.000000188,8.50,2013-06-22 13:48:00+00:00,-73.976877,40.759192,-73.991685,40.749810,1
...,...,...,...,...,...,...,...,...
554285,2013-02-22 08:20:00.00000097,5.00,2013-02-22 08:20:00+00:00,-73.985343,40.769115,-73.982817,40.764910,4
554286,2011-04-13 08:22:17.0000005,9.70,2011-04-13 08:22:17+00:00,-73.950372,40.779776,-73.966151,40.765232,1
554287,2015-02-01 13:16:23.0000001,22.50,2015-02-01 13:16:23+00:00,-73.963020,40.775162,-74.002525,40.723854,1
554288,2010-10-05 16:55:00.00000058,26.27,2010-10-05 16:55:00+00:00,-73.981073,40.730682,-73.872077,40.774333,1


In [9]:
# Load the Kaggle test file, parsing pickup_datetime as datetimes, and list its columns.
df_test = pd.read_csv(data_dir + "/test.csv", parse_dates = ['pickup_datetime'])
df_test.columns

Index(['key', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [10]:
# Preview the test rows.
df_test

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982524,40.751260,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981160,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6


In [11]:
# Hold out 20% of the sampled rows for validation; the fixed random_state keeps the split repeatable.
df_train, df_val = train_test_split(df_train_val, test_size=0.2, random_state=42)

# Report the row/column counts of both splits.
print("Training dataset's shape: ", df_train.shape)
print("Validation dataset's shape: ", df_val.shape)

Training dataset's shape:  (443432, 8)
Validation dataset's shape:  (110858, 8)


## Training

...


### Simple Feature Engineering

...


In [12]:
def add_dateparts(df, col):
    """
    Expand a datetime column into separate calendar-component columns.

    For each of year, month, day, weekday and hour, a column named
    ``<col>_<part>`` is written onto ``df`` itself.

    :param df: dataframe that receives the new columns (modified in place)
    :param col: name of the column holding datetime values
    :return: None
    """
    timestamps = df[col].dt
    for part in ('year', 'month', 'day', 'weekday', 'hour'):
        df[col + '_' + part] = getattr(timestamps, part)

In [13]:
# Derive the same pickup date-part columns on every split so they share one feature schema.
for split_frame in (df_train, df_val, df_test):
    add_dateparts(split_frame, 'pickup_datetime')

In [14]:
# Confirm the date-part columns were added to the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 443432 entries, 382025 to 121958
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype              
---  ------                   --------------   -----              
 0   key                      443432 non-null  object             
 1   fare_amount              443432 non-null  float64            
 2   pickup_datetime          443432 non-null  datetime64[ns, UTC]
 3   pickup_longitude         443432 non-null  float64            
 4   pickup_latitude          443432 non-null  float64            
 5   dropoff_longitude        443431 non-null  float64            
 6   dropoff_latitude         443431 non-null  float64            
 7   passenger_count          443432 non-null  int64              
 8   pickup_datetime_year     443432 non-null  int32              
 9   pickup_datetime_month    443432 non-null  int32              
 10  pickup_datetime_day      443432 non-null  int32              
 11  pickup_dateti

In [15]:
# Preview the first training rows including the derived date parts.
df_train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour
382025,2011-01-06 14:57:00.00000027,7.3,2011-01-06 14:57:00+00:00,-73.955125,40.783235,-73.968733,40.763933,1,2011,1,6,3,14
80065,2014-11-26 08:54:00.000000195,7.0,2014-11-26 08:54:00+00:00,-73.983637,40.746717,-73.977325,40.752482,1,2014,11,26,2,8
205482,2012-07-05 12:04:00.000000166,20.9,2012-07-05 12:04:00+00:00,-73.980992,40.68911,-73.990185,40.756382,1,2012,7,5,3,12
484314,2011-04-20 02:37:38.0000001,3.7,2011-04-20 02:37:38+00:00,-73.952419,40.776888,-73.950749,40.771211,1,2011,4,20,2,2
37406,2013-08-29 10:27:21.0000002,19.5,2013-08-29 10:27:21+00:00,-73.980567,40.752691,-74.00873,40.710692,1,2013,8,29,3,10


### Gradient Boosting

...


In [16]:
# Model features: the raw coordinates and passenger count, followed by the
# derived pickup date parts.
input_cols = [
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
] + ['pickup_datetime_' + part for part in ('year', 'month', 'day', 'weekday', 'hour')]

# A single column name, so selecting it yields a Series rather than a frame.
target_cols = 'fare_amount'

train_inputs, train_targets = df_train[input_cols], df_train[target_cols]
val_inputs, val_targets = df_val[input_cols], df_val[target_cols]

# The test file carries no fare column, so only features are selected.
test_inputs = df_test[input_cols]

In [17]:
xgb_model = XGBRegressor(
    # Learning task
    objective='reg:squarederror',
    # Ensemble size and tree shape
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    # Row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # Execution settings
    tree_method='hist',
    n_jobs=-1,
    random_state=42,
)

In [18]:
# Train on the underlying NumPy arrays (.values) of the training features and targets.
xgb_model.fit(train_inputs.values, train_targets.values)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


## Evaluation

...


In [19]:
def evaluate(model):
    """
    Score ``model`` on the notebook's training and validation splits.

    Reads the module-level ``train_inputs``/``train_targets`` and
    ``val_inputs``/``val_targets``.

    :param model: fitted estimator exposing ``predict``
    :return: tuple ``(train_rmse, val_rmse, train_preds, val_preds)``
    """
    train_preds, val_preds = (
        model.predict(features) for features in (train_inputs, val_inputs)
    )
    return (
        root_mean_squared_error(train_targets, train_preds),
        root_mean_squared_error(val_targets, val_preds),
        train_preds,
        val_preds,
    )

evaluate(xgb_model)

(4.287849307717088,
 4.394384114914265,
 array([ 7.6404986,  7.9646816, 18.075298 , ..., 17.755816 ,  6.6315913,
        14.081218 ], dtype=float32),
 array([8.176593 , 7.770482 , 9.076797 , ..., 7.2958746, 8.455482 ,
        6.7657313], dtype=float32))

In [20]:
def test_inference(model):
    test_preds = model.predict(test_inputs)
    result_df = df_test
    result_df['predicted_fare_amount'] = test_preds
    return result_df

# Show the predicted fares next to the Kaggle test rows.
test_inference(xgb_model)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_datetime_year,pickup_datetime_month,pickup_datetime_day,pickup_datetime_weekday,pickup_datetime_hour,predicted_fare_amount
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.973320,40.763805,-73.981430,40.743835,1,2015,1,27,1,13,9.456256
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1,2015,1,27,1,13,9.413404
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982524,40.751260,-73.979654,40.746139,1,2011,10,8,5,11,6.084315
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981160,40.767807,-73.990448,40.751635,1,2012,12,1,5,21,8.354719
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966046,40.789775,-73.988565,40.744427,1,2012,12,1,5,21,14.150781
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51+00:00,-73.968124,40.796997,-73.955643,40.780388,6,2015,5,10,6,12,8.947151
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51+00:00,-73.945511,40.803600,-73.960213,40.776371,6,2015,1,12,0,17,10.637887
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15+00:00,-73.991600,40.726608,-73.789742,40.647011,6,2015,4,19,6,20,51.095383
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19+00:00,-73.985573,40.735432,-73.939178,40.801731,6,2015,1,31,5,1,18.889997


## Deployment

...


In [21]:
# Project and region come from the environment; a missing variable raises KeyError here.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
REGION     = os.environ["GOOGLE_CLOUD_REGION"]
# Cloud Storage bucket used for staging and for the model artifact.
# NOTE(review): hard-coded bucket name -- replace with a bucket you own.
BUCKET_URI = "gs://my-warehouse"

# Set the defaults used by the subsequent Vertex AI SDK calls.
vertex.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

print(f"Project: {PROJECT_ID} | Region: {REGION}")

Project: silicon-synapse-460717-a0 | Region: us-central1


In [None]:
# Serialize the trained model to a local file.
# NOTE(review): the prebuilt Vertex AI XGBoost serving container is documented
# to look for an artifact named model.bst -- confirm before renaming the file.
FILE_NAME = "model.bst"
xgb_model.save_model(FILE_NAME)

# Upload the saved model file to Cloud Storage
BLOB_PATH = "taxifare_prediction/"

BLOB_NAME = BLOB_PATH + FILE_NAME

# The bucket name is the URI without its scheme. removeprefix only strips
# "gs://" when it is actually present, unlike a blind [5:] slice, which would
# chop the first five characters off a bare bucket name.
bucket = storage.Client().bucket(BUCKET_URI.removeprefix("gs://"))
blob = bucket.blob(BLOB_NAME)
blob.upload_from_filename(FILE_NAME)

  self.get_booster().save_model(fname)


In [None]:
# Name shown for the uploaded model (and, with a suffix, its endpoint).
MODEL_DISPLAY_NAME = "custom/xgb-model-nyc-taxifare"

# Cloud Storage directory that holds the uploaded model file.
ARTIFACT_GCS_PATH = BUCKET_URI + "/" + BLOB_PATH

# Serving container image, selected by framework version.
DEPLOY_VERSION = "xgboost-cpu.2-0"
DEPLOY_IMAGE = f"us-docker.pkg.dev/vertex-ai/prediction/{DEPLOY_VERSION}:latest"

# Machine type for the deployed model, assembled from family and vCPU count.
MACHINE_TYPE = "n1-standard"
VCPU = "4"
DEPLOY_COMPUTE = f"{MACHINE_TYPE}-{VCPU}"
print("Deploy machine type", DEPLOY_COMPUTE)

Deploy machine type n1-standard-4


In [None]:
# Register the uploaded artifact with Vertex AI, pairing it with the serving image chosen above.
MODEL_OBJ = vertex.Model.upload(
    display_name = MODEL_DISPLAY_NAME,
    artifact_uri = ARTIFACT_GCS_PATH,  # Cloud Storage directory containing the model file
    serving_container_image_uri = DEPLOY_IMAGE,
    serving_container_predict_route = "/predict",  # HTTP path for prediction requests
    serving_container_health_route  = "/ping",  # HTTP path for health checks
    labels = {"framework":"xgboost","demo":"nyc_taxi"}
)

print("Model resource:", MODEL_OBJ.resource_name)

Model resource: projects/946006276398/locations/us-central1/models/2298052969330900992


In [None]:
# Create the endpoint that will serve the model; its display name is derived from the model's.
ENDPOINT = vertex.Endpoint.create(
    display_name=f"{MODEL_DISPLAY_NAME}-endpoint",
    dedicated_endpoint_enabled=True,  # NOTE(review): presumably requests a dedicated endpoint -- confirm against the SDK docs
)

In [None]:
# Deploy the registered model to the endpoint and route all traffic to it.
MODEL_OBJ.deploy(
    endpoint = ENDPOINT,
    machine_type = DEPLOY_COMPUTE,
    deploy_request_timeout=1800,  # NOTE(review): presumably seconds (30 minutes) -- confirm against the SDK docs
    traffic_percentage=100
)

print("Endpoint:", ENDPOINT.resource_name)

Endpoint: projects/946006276398/locations/us-central1/endpoints/5903336203645616128


In [29]:
# Query the ENDPOINT object created earlier in this notebook. No endpoint ID is
# hard-coded here: an ID is specific to one project, so pinning it would break
# Restart & Run All for anyone else. To reattach to an existing deployment
# after a kernel restart, bind ENDPOINT = vertex.Endpoint("<your-endpoint-id>").

# The first three validation rows, each as a plain list of floats.
instances = val_inputs.iloc[:3].values.tolist()
print(instances)
predictions = ENDPOINT.predict(instances)
print("Predicted fares: ", predictions.predictions)
print("Actual fares: ", val_targets.iloc[0:3].to_list())

[[-73.957416, 40.774681, -73.978929, 40.77232, 1.0, 2011.0, 8.0, 4.0, 3.0, 8.0], [-73.993245, 40.742157, -73.984952, 40.751928, 1.0, 2014.0, 12.0, 20.0, 5.0, 21.0], [-73.980045, 40.726955, -73.972173, 40.75674, 5.0, 2010.0, 2.0, 14.0, 6.0, 1.0]]
Predicted fares:  [7.947899341583252, 7.781887531280518, 9.431431770324707]
Actual fares:  [8.1, 11.0, 8.9]
