This script was run with the full dataset only after the script with 1% of the data was run and confirmed as working and complete. Scripts were separated as a precautionary measure, especially considering potential running time. In general results were no better than the 1% dataset - and in some models, they were marginally worse

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import isnan, when, count, col, lit, udf, month, year, date_format, datediff, from_unixtime, unix_timestamp
from pyspark.sql.functions import date_trunc
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql.window import Window
from pyspark.sql.types import (StructType, StructField, DateType, BooleanType,
                               DoubleType, IntegerType, StringType, TimestampType)
from functools import reduce
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
!hdfs dfs -ls /user/alphan/data/

Found 4 items
-rw-r--r--   3 alphan alphan 1824926642 2019-05-25 04:44 /user/alphan/data/chicago_crimes.csv
drwxr-xr-x   - alphan alphan          0 2019-06-04 02:09 /user/alphan/data/df.csv
drwxr-xr-x   - alphan alphan          0 2019-06-04 02:12 /user/alphan/data/final_project_df.csv
-rw-r--r--   3 alphan alphan  208276005 2019-04-30 20:10 /user/alphan/data/food-inspections.csv


In [3]:
spark = SparkSession.builder.appName('RideShare').getOrCreate()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '15g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','20g')])
df = spark.read.csv("/user/alphan/data/final_project_df.csv", inferSchema=True, header=True)

In [4]:
df.printSchema()

root
 |-- seconds: integer (nullable = true)
 |-- miles: double (nullable = true)
 |-- communityPickup: integer (nullable = true)
 |-- communityDropoff: integer (nullable = true)
 |-- fare: double (nullable = true)
 |-- shared: boolean (nullable = true)
 |-- pickupLat: double (nullable = true)
 |-- pickupLong: double (nullable = true)
 |-- dropoffLat: double (nullable = true)
 |-- dropoffLong: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- precipIntensity: double (nullable = true)
 |-- precipProbability: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- Cloudy: double (nullable = true)
 |-- Rainy: double (nullable = true)
 |-- Snowy: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)



In [5]:
spark = SparkSession.builder.appName('RideShare').getOrCreate()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '256g'),
                                        ('spark.app.name', 'Spark Updated Conf'),
                                        ('spark.executor.cores', '16'),
                                        ('spark.cores.max', '16'),
                                        ('spark.driver.memory','256g'),
                                        ('spark.sql.AutoBroadcastJoinThreshold', -1),
                                        ('mapreduce.reduce.memory.mb',-1),
                                        ('spark.yarn.executor.memoryOverhead', -1),
                                        ('spark.kryoserializer.buffer.max.mb', '5g')])



In [6]:
#WRITE to HDFS
#fulldf = rides.join(spark_weather, rides.startTime == spark_weather.time, how='inner')
#res_path = '/user/alphan/data/final_project_df.csv'
#df.write.csv(path=res_path, header=True, compression='gzip')

### Preliminary Modelling

In [6]:
df.printSchema()

root
 |-- seconds: integer (nullable = true)
 |-- miles: double (nullable = true)
 |-- communityPickup: integer (nullable = true)
 |-- communityDropoff: integer (nullable = true)
 |-- fare: double (nullable = true)
 |-- shared: boolean (nullable = true)
 |-- pickupLat: double (nullable = true)
 |-- pickupLong: double (nullable = true)
 |-- dropoffLat: double (nullable = true)
 |-- dropoffLong: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- precipIntensity: double (nullable = true)
 |-- precipProbability: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- Cloudy: double (nullable = true)
 |-- Rainy: double (nullable = true)
 |-- Snowy: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)



In [7]:
#Assemble vectors and Scale:
#After iterations examining individual impact, final variable 'columns' below does not contain Longitude & Latitude for easier running

columns = ['seconds','miles','shared','communityPickup','communityDropoff','humidity', 
'apparentTemperature','precipIntensity',
'precipProbability', 'temperature', 'Cloudy','Rainy', 'Snowy','month','day', 'hour']

from pyspark.ml.feature import VectorAssembler,StandardScaler
vectorAssembler = VectorAssembler(inputCols = columns, outputCol = 'features')# 'fare', 'addCharge', 'tripTotal'
ml_data=vectorAssembler.transform(df)
ml_data.select("features").show(5)

+--------------------+
|            features|
+--------------------+
|[430.0,3.0,0.0,3....|
|[368.0,1.9,1.0,44...|
|[1142.0,14.7,1.0,...|
|[1288.0,3.9,1.0,4...|
|[205.0,1.2,0.0,10...|
+--------------------+
only showing top 5 rows



In [8]:
standardscaler=StandardScaler().setInputCol('features').setOutputCol('scaled_features')
scaled_data=standardscaler.fit(ml_data).transform(ml_data)
scaled_data.select('features','scaled_features').show(5)

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[430.0,3.0,0.0,3....|[0.65134636695971...|
|[368.0,1.9,1.0,44...|[0.55743130939807...|
|[1142.0,14.7,1.0,...|[1.72985476992555...|
|[1288.0,3.9,1.0,4...|[1.95100958289327...|
|[205.0,1.2,0.0,10...|[0.31052559355055...|
+--------------------+--------------------+
only showing top 5 rows



In [10]:
vfull_df = scaled_data.select(['scaled_features', 'fare'])
vfull_df.show(3)

+--------------------+----+
|     scaled_features|fare|
+--------------------+----+
|[0.65134636695971...| 7.5|
|[0.55743130939807...| 5.0|
|[1.72985476992555...|17.5|
+--------------------+----+
only showing top 3 rows



The results of some of the models below changed noticeably for the better after setting the seed, and thus may differ slightly from the results from the presentation slide. The previous script did not have a seed setting. The rest of the script remains as before.

In [15]:
SEED = 5

In [16]:
#Split the data
#splits = scaled_data.randomSplit([0.99, 0.01], seed=SEED)
#small_df = splits[1]

#Data is not split, all of it is used
small_split = scaled_data.randomSplit([0.7, 0.3],seed=SEED)
train_df = small_split[0]
test_df = small_split[1]

#### Decision Tree

In [17]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'fare')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)


dt_evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")
dt_rmse = dt_evaluator.evaluate(dt_predictions)
dt_evaluator2 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="r2")
dt_r2 = dt_evaluator2.evaluate(dt_predictions)
dt_evaluator3 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="mae")
dt_mae= dt_evaluator3.evaluate(dt_predictions)

In [18]:
print("Root Mean Squared Error (RMSE) on test data = %g" % dt_rmse)
print("R-Squared (R2) on test data = %g" % dt_r2)
print("Mean Absolute Error (MAE) on test data = %g" % dt_mae)

Root Mean Squared Error (RMSE) on test data = 3.18569
R-Squared (R2) on test data = 0.783109
Mean Absolute Error (MAE) on test data = 1.6739


from FeatureImportanceSelector import ExtractFeatureImp, FeatureImpSelector
ExtractFeatureImp(mod.stages[-1].featureImportances, dt_predictions, "features_subset")

In [19]:
model = pd.DataFrame(dt_model.featureImportances.toArray(), columns=["values"])
features_col = pd.Series(columns)
model["features"] = features_col
model.sort_values("values", ascending = False)

Unnamed: 0,values,features
1,0.788758,miles
2,0.11,shared
0,0.098925,seconds
4,0.001299,communityDropoff
3,0.000845,communityPickup
15,0.000173,hour
5,0.0,humidity
6,0.0,apparentTemperature
7,0.0,precipIntensity
8,0.0,precipProbability


#### Gradient Boost

In [20]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'fare', maxIter=10)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'fare', 'features').show(5)

+------------------+----+--------------------+
|        prediction|fare|            features|
+------------------+----+--------------------+
| 4.418849079762522| 5.0|[3.0,1.8,1.0,2.0,...|
| 4.410442506890022| 5.0|[3.0,2.1,0.0,25.0...|
| 4.429378898540901| 5.0|[3.0,2.4,1.0,69.0...|
| 4.517293369824973| 7.5|[3.0,2.9,1.0,7.0,...|
|3.1454906855994476| 7.5|[4.0,0.1,1.0,32.0...|
+------------------+----+--------------------+
only showing top 5 rows



In [21]:
gbt_evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")
gbt_rmse = gbt_evaluator.evaluate(gbt_predictions)

gbt_evaluator2 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="r2")
gbt_r2 = gbt_evaluator2.evaluate(gbt_predictions)
gbt_evaluator3 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="mae")
gbt_mae = gbt_evaluator3.evaluate(gbt_predictions)

In [22]:
print("Root Mean Squared Error (RMSE) on test data = %g" % gbt_rmse)
print("R-Squared (R2) on test data = %g" % gbt_r2)
print("Mean Absolute Error (MAE) on test data = %g" % gbt_mae)

Root Mean Squared Error (RMSE) on test data = 3.05265
R-Squared (R2) on test data = 0.800673
Mean Absolute Error (MAE) on test data = 1.58199


In [23]:
model = pd.DataFrame(gbt_model.featureImportances.toArray(), columns=["values"])
features_col = pd.Series(columns)
model["features"] = features_col
model.sort_values("values", ascending = False)

Unnamed: 0,values,features
1,0.348159,miles
0,0.330531,seconds
2,0.110286,shared
3,0.08152,communityPickup
15,0.043315,hour
4,0.035082,communityDropoff
14,0.019788,day
5,0.016873,humidity
9,0.008103,temperature
8,0.004432,precipProbability


#### Random Forest

In [24]:
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol = 'features', labelCol = 'fare')
rfModel = rf.fit(train_df)
rf_predictions = rfModel.transform(test_df)
rf_predictions.select('prediction', 'fare', 'features').show(5)

+-----------------+----+--------------------+
|       prediction|fare|            features|
+-----------------+----+--------------------+
|4.914244067686214| 5.0|[3.0,1.8,1.0,2.0,...|
| 5.28916467276556| 5.0|[3.0,2.1,0.0,25.0...|
|4.851192640180555| 5.0|[3.0,2.4,1.0,69.0...|
|4.952220950046641| 7.5|[3.0,2.9,1.0,7.0,...|
|4.534225149458597| 7.5|[4.0,0.1,1.0,32.0...|
+-----------------+----+--------------------+
only showing top 5 rows



In [25]:
rf_evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")
rf_rmse = rf_evaluator.evaluate(rf_predictions)

rf_evaluator2 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="r2")
rf_r2 = rf_evaluator2.evaluate(rf_predictions)
rf_evaluator3 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="mae")
rf_mae = rf_evaluator3.evaluate(rf_predictions)

In [26]:
print("Root Mean Squared Error (RMSE) on test data = %g" % rf_rmse)
print("R-Squared (R2) on test data = %g" % rf_r2)
print("Mean Absolute Error (MAE) on test data = %g" % rf_mae)

Root Mean Squared Error (RMSE) on test data = 3.27499
R-Squared (R2) on test data = 0.770778
Mean Absolute Error (MAE) on test data = 1.75036


In [27]:
model = pd.DataFrame(rfModel.featureImportances.toArray(), columns=["values"])
features_col = pd.Series(columns)
model["features"] = features_col
model.sort_values("values", ascending = False)

Unnamed: 0,values,features
1,0.4371706,miles
0,0.3725979,seconds
2,0.08543686,shared
4,0.07461566,communityDropoff
3,0.02849393,communityPickup
15,0.001312816,hour
14,0.0002041749,day
13,0.0001189574,month
5,2.667602e-05,humidity
6,1.697081e-05,apparentTemperature


#### Linear Regression

In [28]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='fare', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.002341637431339443,0.9493864709339118,-3.430047867004976,0.0,4.679378168085919e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
Intercept: 3.6258245818048502


In [29]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
print("MAE: %f" % trainingSummary.meanAbsoluteError)

RMSE: 3.318745
r2: 0.764394


### Models with Grid Search

#### Random Forest

In [30]:
lr_data = test_df.select(col("fare").alias("label"), *columns)  
lr_data.printSchema()  

root
 |-- label: double (nullable = true)
 |-- seconds: integer (nullable = true)
 |-- miles: double (nullable = true)
 |-- shared: boolean (nullable = true)
 |-- communityPickup: integer (nullable = true)
 |-- communityDropoff: integer (nullable = true)
 |-- humidity: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- precipIntensity: double (nullable = true)
 |-- precipProbability: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- Cloudy: double (nullable = true)
 |-- Rainy: double (nullable = true)
 |-- Snowy: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)



In [31]:
from pyspark.ml import Pipeline 
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
rfr = RandomForestRegressor(labelCol="label", featuresCol="scaled_features")
stages = [vectorAssembler, standardscaler, rfr]
pipe = Pipeline(stages=stages)

In [32]:
estimatorParam = ParamGridBuilder() \
.addGrid(rfr.maxDepth, [4, 6,8]) \
.addGrid(rfr.maxBins, [5, 10, 15]) \
.addGrid(rfr.impurity, ["variance"]) \
.build()

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

In [33]:
crossval = CrossValidator(estimator=pipe,
                         estimatorParamMaps=estimatorParam,
                         evaluator=evaluator,
                         numFolds=3)

cvmodel = crossval.fit(lr_data)

In [34]:
modelEvaluator=RegressionEvaluator()
eval_rmse = RegressionEvaluator(metricName="rmse")
eval_r2 = RegressionEvaluator(metricName="r2")
eval_mae = RegressionEvaluator(metricName="mae") 

In [35]:
#Not sure it matters what data we use here
print(eval_rmse.evaluate(cvmodel.transform(lr_data)))
print(eval_r2.evaluate(cvmodel.transform(lr_data)))
print(eval_mae.evaluate(cvmodel.transform(lr_data)))

3.1604885811601906
0.786526112532769
1.5975821815077242


#### Gradient Boost

In [36]:
gbt = GBTRegressor(labelCol="label", featuresCol="scaled_features")
stages = [vectorAssembler, standardscaler, gbt]
pipe = Pipeline(stages=stages)

In [37]:
estimatorParam = ParamGridBuilder() \
.addGrid(rfr.maxDepth, [4, 6,8]) \
.addGrid(rfr.maxBins, [5, 10, 15]) \
.addGrid(rfr.impurity, ["variance"]) \
.build()

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

In [38]:
crossval = CrossValidator(estimator=pipe,
                         estimatorParamMaps=estimatorParam,
                         evaluator=evaluator,
                         numFolds=3)

cvmodel = crossval.fit(lr_data)

In [39]:
modelEvaluator=RegressionEvaluator()
eval_rmse = RegressionEvaluator(metricName="rmse")
eval_r2 = RegressionEvaluator(metricName="r2")
eval_mae = RegressionEvaluator(metricName="mae") 

In [40]:
print(eval_rmse.evaluate(cvmodel.transform(lr_data)))
print(eval_r2.evaluate(cvmodel.transform(lr_data)))
print(eval_mae.evaluate(cvmodel.transform(lr_data)))

3.0066228031720823
0.806805747672977
1.5554689718064223
