In [2]:
# from holidays.countries.united_states import US
# import holidays
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import isnan, when, count, col, lit, udf
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.context import SparkContext
import pandas as pd 
import numpy as np 
import re
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy import distance
import pickle
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import datetime
from dateutil.relativedelta import relativedelta
from functools import reduce
from pyspark.ml.feature import Bucketizer, StringIndexer, OneHotEncoder
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator, RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import GBTRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import partial_dependence, plot_partial_dependence, permutation_importance
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import ssl
from urllib import request
from PIL import Image

spark = SparkSession.builder.appName("tripDataCSVLoad").getOrCreate()

In [3]:
df_trip_fp =spark.read.parquet("spark-warehouse/trip_fare_tip_predict2")
print(df_trip_fp.count())
print(df_trip_fp.schema.names)

14620414
['trip_id', 'driver_id', 'pickup_latitude', 'pickup_longitude', 'pickup_area', 'dropoff_latitude', 'dropoff_longitude', 'geo_distance', 'pickup_hour', 'pickup_period', 'pickup_tow', 'fare_amount', 'tolls_amount', 'tip_amount', 'payment_type', 'trip_time_in_secs']


In [4]:
flgweekday = udf(lambda x: 0 if x in [0,6] else 1,IntegerType())

df_trip_fp = df_trip_fp.withColumn('pickup_tow_epoch_seconds', F.unix_timestamp('pickup_tow'))\
    .withColumn('flg_weekday',flgweekday(F.dayofweek(F.to_timestamp(col('pickup_tow')))))

## A) Fare Model

In [5]:
### Set features
non_feature_cols = ['trip_id','driver_id','fare_amount','tip_amount','pickup_tow',
                    'geo_distance','pickup_tow_epoch_seconds','pickup_period','pickup_area',
                    'payment_type','tolls_amount','trip_time_in_secs']
                    
features_input = [i for i in df_trip_fp.columns if i not in non_feature_cols]

assembler = VectorAssembler(inputCols= features_input, outputCol="features")
assembled_data = assembler.transform(df_trip_fp)
scale=StandardScaler(inputCol='features',outputCol='standardized')
data_scale=scale.fit(assembled_data.select('features'))
data_scale_output=data_scale.transform(assembled_data)

### Train/Test Split
splits = data_scale_output.randomSplit([0.7, 0.3],seed=9000)
train_df = splits[0]
test_df = splits[1]

target = 'fare_amount'
gbt = GBTRegressor(featuresCol = 'standardized', labelCol=target)
fitted = gbt.fit(train_df)
pred = fitted.transform(test_df)

pred.select(F.round(col("prediction"),2), col('fare_amount'), ).show(10)

evaluator = RegressionEvaluator(labelCol= target, predictionCol= "prediction", metricName="rmse")
rmse = evaluator.evaluate(pred)
print('rmse is %.2f', {rmse})
mae = evaluator.evaluate(pred, {evaluator.metricName: "mae"})
print('mae is %.2f', {mae})
r2 = evaluator.evaluate(pred, {evaluator.metricName: "r2"})
print('R² is %.2f', {r2})

+--------------------+-----------+
|round(prediction, 2)|fare_amount|
+--------------------+-----------+
|                7.97|        6.0|
|                7.32|        8.5|
|                25.9|       20.0|
|               32.75|       29.0|
|               14.44|       12.0|
|               31.78|       27.5|
|                7.41|        5.5|
|                7.77|        8.5|
|               18.22|       19.5|
|                7.32|        4.5|
+--------------------+-----------+
only showing top 10 rows

rmse is %.2f {3.735992954937598}
mae is %.2f {2.5851915152340914}
R² is %.2f {0.8100678129111804}


In [7]:
### Feature Importances

fi = fitted.featureImportances.toArray()

feat_imp = (pd.DataFrame(dict(zip(features_input, fi)), range(1))
  .T.rename(columns={0:'Score'})
  .sort_values("Score", ascending =0)
  )
feat_imp

Unnamed: 0,Score
pickup_latitude,0.316297
dropoff_latitude,0.243198
pickup_longitude,0.218228
dropoff_longitude,0.211112
pickup_hour,0.010868
flg_weekday,0.000297


In [11]:
### Save model
fitted.save('models/fare_predictor')

## B) Tip Model

In [10]:
df_pickup_area = df_trip_fp.groupBy('pickup_area').count().sort(col('count').desc()).toPandas()
pickup_area_list = [i for i in df_pickup_area.iloc[:6]['pickup_area'].unique()]

consol_pickup_area = udf(lambda x: x if x in pickup_area_list else 'others',StringType())
df_trip_fp = df_trip_fp.withColumn('pickup_area',consol_pickup_area(col('pickup_area')))

In [11]:
### Transform features pickup_area, payment_type

pickup_indexer = StringIndexer(inputCol="pickup_area", outputCol="pickup_area_index")
payment_indexer = StringIndexer(inputCol="payment_type", outputCol="payment_type_index")
df_trip_fp_trfm = pickup_indexer.fit(df_trip_fp).transform(df_trip_fp)
df_trip_fp_trfm = payment_indexer.fit(df_trip_fp_trfm).transform(df_trip_fp_trfm)

#onehotencoder to pickup_area_index, payment_type_index
onehotencoder_pickuparea_vector = OneHotEncoder(inputCol="pickup_area_index", outputCol="pickup_area_vector")
onehotencoder_paymenttype_vector = OneHotEncoder(inputCol="payment_type_index", outputCol="payment_type_vector")
df_trip_fp_trfm = onehotencoder_pickuparea_vector.fit(df_trip_fp_trfm).transform(df_trip_fp_trfm)
df_trip_fp_trfm = onehotencoder_paymenttype_vector.fit(df_trip_fp_trfm).transform(df_trip_fp_trfm)

### Set features
non_feature_cols = ['trip_id','tip_amount','pickup_tow','pickup_period','pickup_area','payment_type',
                    'pickup_area_index','payment_type_index','pickup_tow_epoch_seconds',
                    'dropoff_latitude','dropoff_longitude']
features_input = [i for i in df_trip_fp_trfm.columns if i not in non_feature_cols]

assembler = VectorAssembler(inputCols= features_input, outputCol="features")
assembled_data = assembler.transform(df_trip_fp_trfm)
scale=StandardScaler(inputCol='features',outputCol='standardized')
data_scale=scale.fit(assembled_data.select('features'))
data_scale_output=data_scale.transform(assembled_data)

### Train/Test Split
splits = data_scale_output.randomSplit([0.7, 0.3],seed=9000)
train_df = splits[0]
test_df = splits[1]

target = 'tip_amount'
gbt = GBTRegressor(featuresCol = 'standardized', labelCol=target)
fitted = gbt.fit(train_df)
pred = fitted.transform(test_df)

pred.select(F.round(col("prediction"),2), col('tip_amount'), ).show(10)

evaluator = RegressionEvaluator(labelCol= target, predictionCol= "prediction", metricName="rmse")
rmse = evaluator.evaluate(pred)
print('rmse is %.2f', {rmse})
mae = evaluator.evaluate(pred, {evaluator.metricName: "mae"})
print('mae is %.2f', {mae})
r2 = evaluator.evaluate(pred, {evaluator.metricName: "r2"})
print('R² is %.2f', {r2})

+--------------------+----------+
|round(prediction, 2)|tip_amount|
+--------------------+----------+
|                 0.0|       0.0|
|                 0.0|       0.0|
|                3.74|       1.0|
|                6.36|       5.9|
|                 0.0|       0.0|
|                6.36|       5.6|
|                 0.0|       0.0|
|                 0.0|       0.0|
|                0.03|       0.0|
|                 0.0|       0.0|
+--------------------+----------+
only showing top 10 rows

rmse is %.2f {0.7897225262991144}
mae is %.2f {0.3621675142386616}
R² is %.2f {0.8054525748643373}


In [12]:
### Feature Importances
fi = fitted.featureImportances.toArray()

feat_imp = (pd.DataFrame(dict(zip(features_input, fi)), range(1))
  .T.rename(columns={0:'Score'})
  .sort_values("Score", ascending =0)
  )
feat_imp

Unnamed: 0,Score
fare_amount,0.440315
tolls_amount,0.012304
pickup_hour,0.005092
pickup_latitude,0.004293
pickup_longitude,0.002924
geo_distance,0.002037
payment_type_vector,0.000869
pickup_area_vector,0.000835
trip_time_in_secs,0.000777
driver_id,7e-06


In [13]:
### Save model
fitted.save('models/tip_predictor')