In [13]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import isnan, when, count, col, lit, udf, month, year, date_format, datediff, from_unixtime, unix_timestamp
from pyspark.sql.functions import date_trunc
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql.window import Window
from pyspark.sql.types import (StructType, StructField, DateType, BooleanType,
                               DoubleType, IntegerType, StringType, TimestampType)
from functools import reduce
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
!hdfs dfs -ls /user/alphan/data/

Found 4 items
-rw-r--r--   3 alphan alphan 1824926642 2019-05-25 04:44 /user/alphan/data/chicago_crimes.csv
drwxr-xr-x   - alphan alphan          0 2019-06-04 02:09 /user/alphan/data/df.csv
drwxr-xr-x   - alphan alphan          0 2019-06-04 02:12 /user/alphan/data/final_project_df.csv
-rw-r--r--   3 alphan alphan  208276005 2019-04-30 20:10 /user/alphan/data/food-inspections.csv


In [15]:
spark2 = SparkSession.builder.appName('RideShare2').getOrCreate()
conf2 = spark2.sparkContext._conf.setAll([('spark.executor.memory', '15g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','20g')])
df = spark2.read.csv("/user/alphan/data/final_project_df.csv", inferSchema=True, header=True)

In [16]:
df.printSchema()

root
 |-- seconds: integer (nullable = true)
 |-- miles: double (nullable = true)
 |-- communityPickup: integer (nullable = true)
 |-- communityDropoff: integer (nullable = true)
 |-- fare: double (nullable = true)
 |-- shared: boolean (nullable = true)
 |-- pickupLat: double (nullable = true)
 |-- pickupLong: double (nullable = true)
 |-- dropoffLat: double (nullable = true)
 |-- dropoffLong: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- precipIntensity: double (nullable = true)
 |-- precipProbability: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- Cloudy: double (nullable = true)
 |-- Rainy: double (nullable = true)
 |-- Snowy: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)



In [17]:
spark2 = SparkSession.builder.appName('RideShare2').getOrCreate()
conf = spark2.sparkContext._conf.setAll([('spark.executor.memory', '256g'),
                                        ('spark.app.name', 'Spark Updated Conf'),
                                        ('spark.executor.cores', '16'),
                                        ('spark.cores.max', '16'),
                                        ('spark.driver.memory','256g'),
                                        ('spark.sql.AutoBroadcastJoinThreshold', -1),
                                        ('mapreduce.reduce.memory.mb',-1),
                                        ('spark.yarn.executor.memoryOverhead', -1),
                                        ('spark.kryoserializer.buffer.max.mb', '5g')])



In [6]:
#WRITE to HDFS
#fulldf = rides.join(spark_weather, rides.startTime == spark_weather.time, how='inner')
#res_path = '/user/alphan/data/final_project_df.csv'
#df.write.csv(path=res_path, header=True, compression='gzip')

### Preliminary Modelling

In [18]:
df.printSchema()

root
 |-- seconds: integer (nullable = true)
 |-- miles: double (nullable = true)
 |-- communityPickup: integer (nullable = true)
 |-- communityDropoff: integer (nullable = true)
 |-- fare: double (nullable = true)
 |-- shared: boolean (nullable = true)
 |-- pickupLat: double (nullable = true)
 |-- pickupLong: double (nullable = true)
 |-- dropoffLat: double (nullable = true)
 |-- dropoffLong: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- precipIntensity: double (nullable = true)
 |-- precipProbability: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- Cloudy: double (nullable = true)
 |-- Rainy: double (nullable = true)
 |-- Snowy: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)



In [19]:
#Assemble vectors and Scale:
#After iterations examining individual impact, final variable 'columns' below does not contain Longitude & Latitude for easier running

columns = ['seconds','miles','shared','communityPickup','communityDropoff','humidity', 
'apparentTemperature','precipIntensity',
'precipProbability', 'temperature', 'Cloudy','Rainy', 'Snowy','month','day', 'hour']

from pyspark.ml.feature import VectorAssembler,StandardScaler
vectorAssembler = VectorAssembler(inputCols = columns, outputCol = 'features')# 'fare', 'addCharge', 'tripTotal'
ml_data=vectorAssembler.transform(df)
ml_data.select("features").show(5)

+--------------------+
|            features|
+--------------------+
|[430.0,3.0,0.0,3....|
|[368.0,1.9,1.0,44...|
|[1142.0,14.7,1.0,...|
|[1288.0,3.9,1.0,4...|
|[205.0,1.2,0.0,10...|
+--------------------+
only showing top 5 rows



In [20]:
standardscaler=StandardScaler().setInputCol('features').setOutputCol('scaled_features')
scaled_data=standardscaler.fit(ml_data).transform(ml_data)
scaled_data.select('features','scaled_features').show(5)

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[430.0,3.0,0.0,3....|[0.65134636695970...|
|[368.0,1.9,1.0,44...|[0.55743130939807...|
|[1142.0,14.7,1.0,...|[1.72985476992555...|
|[1288.0,3.9,1.0,4...|[1.95100958289326...|
|[205.0,1.2,0.0,10...|[0.31052559355055...|
+--------------------+--------------------+
only showing top 5 rows



In [21]:
vfull_df = scaled_data.select(['scaled_features', 'fare'])
vfull_df.show(3)

+--------------------+----+
|     scaled_features|fare|
+--------------------+----+
|[0.65134636695970...| 7.5|
|[0.55743130939807...| 5.0|
|[1.72985476992555...|17.5|
+--------------------+----+
only showing top 3 rows



The results of some of the models below changed noticeably for the better after setting the seed, and thus may differ slightly from the results from the presentation slide. The previous script did not have a seed setting. The rest of the script remains as before.

In [25]:
SEED = 5

In [26]:
#Split the data
splits = scaled_data.randomSplit([0.99, 0.01], seed=SEED)
small_df = splits[1]
small_split = small_df.randomSplit([0.7, 0.3],seed=SEED)
train_df = small_split[0]
test_df = small_split[1]

#### Decision Tree

In [27]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'fare')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)


dt_evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")
dt_rmse = dt_evaluator.evaluate(dt_predictions)
dt_evaluator2 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="r2")
dt_r2 = dt_evaluator2.evaluate(dt_predictions)
dt_evaluator3 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="mae")
dt_mae= dt_evaluator3.evaluate(dt_predictions)

In [28]:
print("Root Mean Squared Error (RMSE) on test data = %g" % dt_rmse)
print("R-Squared (R2) on test data = %g" % dt_r2)
print("Mean Absolute Error (MAE) on test data = %g" % dt_mae)

Root Mean Squared Error (RMSE) on test data = 3.08866
R-Squared (R2) on test data = 0.792706
Mean Absolute Error (MAE) on test data = 1.64751


from FeatureImportanceSelector import ExtractFeatureImp, FeatureImpSelector
ExtractFeatureImp(mod.stages[-1].featureImportances, dt_predictions, "features_subset")

In [29]:
model = pd.DataFrame(dt_model.featureImportances.toArray(), columns=["values"])
features_col = pd.Series(columns)
model["features"] = features_col
model.sort_values("values", ascending = False)

Unnamed: 0,values,features
1,0.792715,miles
2,0.106965,shared
0,0.097684,seconds
4,0.001424,communityDropoff
3,0.00103,communityPickup
9,0.000182,temperature
5,0.0,humidity
6,0.0,apparentTemperature
7,0.0,precipIntensity
8,0.0,precipProbability


#### Gradient Boost

In [30]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'fare', maxIter=10)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'fare', 'features').show(5)

+------------------+----+--------------------+
|        prediction|fare|            features|
+------------------+----+--------------------+
| 3.117913782281579| 2.5|[51.0,0.1,0.0,8.0...|
|3.0618915535250624| 2.5|[57.0,0.2,1.0,23....|
|3.0246523233774725| 2.5|[58.0,0.2,0.0,32....|
| 3.036138878680659| 2.5|[61.0,0.2,0.0,8.0...|
| 3.036138878680659| 2.5|[73.0,0.2,0.0,26....|
+------------------+----+--------------------+
only showing top 5 rows



In [31]:
gbt_evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")
gbt_rmse = gbt_evaluator.evaluate(gbt_predictions)

gbt_evaluator2 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="r2")
gbt_r2 = gbt_evaluator2.evaluate(gbt_predictions)
gbt_evaluator3 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="mae")
gbt_mae = gbt_evaluator3.evaluate(gbt_predictions)

In [32]:
print("Root Mean Squared Error (RMSE) on test data = %g" % gbt_rmse)
print("R-Squared (R2) on test data = %g" % gbt_r2)
print("Mean Absolute Error (MAE) on test data = %g" % gbt_mae)

Root Mean Squared Error (RMSE) on test data = 2.94064
R-Squared (R2) on test data = 0.810466
Mean Absolute Error (MAE) on test data = 1.56672


In [33]:
model = pd.DataFrame(gbt_model.featureImportances.toArray(), columns=["values"])
features_col = pd.Series(columns)
model["features"] = features_col
model.sort_values("values", ascending = False)

Unnamed: 0,values,features
1,0.393758,miles
0,0.321471,seconds
2,0.103609,shared
3,0.046849,communityPickup
15,0.034539,hour
4,0.027559,communityDropoff
14,0.021082,day
9,0.014972,temperature
6,0.010923,apparentTemperature
5,0.008051,humidity


#### Random Forest

In [34]:
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol = 'features', labelCol = 'fare')
rfModel = rf.fit(train_df)
rf_predictions = rfModel.transform(test_df)
rf_predictions.select('prediction', 'fare', 'features').show(5)

+-----------------+----+--------------------+
|       prediction|fare|            features|
+-----------------+----+--------------------+
|4.890041596394282| 2.5|[51.0,0.1,0.0,8.0...|
| 4.72350579959502| 2.5|[57.0,0.2,1.0,23....|
|4.823513771121112| 2.5|[58.0,0.2,0.0,32....|
|4.741040431993048| 2.5|[61.0,0.2,0.0,8.0...|
|4.714938375021649| 2.5|[73.0,0.2,0.0,26....|
+-----------------+----+--------------------+
only showing top 5 rows



In [35]:
rf_evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")
rf_rmse = rf_evaluator.evaluate(rf_predictions)

rf_evaluator2 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="r2")
rf_r2 = rf_evaluator2.evaluate(rf_predictions)
rf_evaluator3 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="mae")
rf_mae = rf_evaluator3.evaluate(rf_predictions)

In [36]:
print("Root Mean Squared Error (RMSE) on test data = %g" % rf_rmse)
print("R-Squared (R2) on test data = %g" % rf_r2)
print("Mean Absolute Error (MAE) on test data = %g" % rf_mae)

Root Mean Squared Error (RMSE) on test data = 3.07921
R-Squared (R2) on test data = 0.791748
Mean Absolute Error (MAE) on test data = 1.66351


In [37]:
model = pd.DataFrame(rfModel.featureImportances.toArray(), columns=["values"])
features_col = pd.Series(columns)
model["features"] = features_col
model.sort_values("values", ascending = False)

Unnamed: 0,values,features
1,0.6176079,miles
0,0.2293197,seconds
4,0.07009469,communityDropoff
2,0.06391204,shared
3,0.01745107,communityPickup
15,0.00116429,hour
14,0.0002772842,day
9,5.638586e-05,temperature
13,4.275432e-05,month
8,3.183826e-05,precipProbability


#### Linear Regression

In [38]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='fare', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.0020188080253334038,0.9925907827753835,-3.3895853600260293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
Intercept: 3.722830854394592


In [53]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
print("MAE: %f" % trainingSummary.meanAbsoluteError)

RMSE: 3.361395
r2: 0.761311
MAE: 1.877613


### Models with Grid Search

#### Random Forest

In [40]:
lr_data = test_df.select(col("fare").alias("label"), *columns)  
lr_data.printSchema()  

root
 |-- label: double (nullable = true)
 |-- seconds: integer (nullable = true)
 |-- miles: double (nullable = true)
 |-- shared: boolean (nullable = true)
 |-- communityPickup: integer (nullable = true)
 |-- communityDropoff: integer (nullable = true)
 |-- humidity: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- precipIntensity: double (nullable = true)
 |-- precipProbability: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- Cloudy: double (nullable = true)
 |-- Rainy: double (nullable = true)
 |-- Snowy: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)



In [41]:
from pyspark.ml import Pipeline 
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
rfr = RandomForestRegressor(labelCol="label", featuresCol="scaled_features")
stages = [vectorAssembler, standardscaler, rfr]
pipe = Pipeline(stages=stages)

In [42]:
estimatorParam = ParamGridBuilder() \
.addGrid(rfr.maxDepth, [4, 6,8]) \
.addGrid(rfr.maxBins, [5, 10, 15]) \
.addGrid(rfr.impurity, ["variance"]) \
.build()

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

In [43]:
crossval = CrossValidator(estimator=pipe,
                         estimatorParamMaps=estimatorParam,
                         evaluator=evaluator,
                         numFolds=3)

cvmodel = crossval.fit(lr_data)

In [44]:
modelEvaluator=RegressionEvaluator()
eval_rmse = RegressionEvaluator(metricName="rmse")
eval_r2 = RegressionEvaluator(metricName="r2")
eval_mae = RegressionEvaluator(metricName="mae") 

In [45]:
#Not sure it matters what data we use here
print(eval_rmse.evaluate(cvmodel.transform(lr_data)))
print(eval_r2.evaluate(cvmodel.transform(lr_data)))
print(eval_mae.evaluate(cvmodel.transform(lr_data)))

2.8894086933239036
0.8166299271058671
1.534910661838085


#### Gradient Boost

In [46]:
gbt = GBTRegressor(labelCol="label", featuresCol="scaled_features")
stages = [vectorAssembler, standardscaler, gbt]
pipe = Pipeline(stages=stages)

In [47]:
estimatorParam = ParamGridBuilder() \
.addGrid(rfr.maxDepth, [4, 6,8]) \
.addGrid(rfr.maxBins, [5, 10, 15]) \
.addGrid(rfr.impurity, ["variance"]) \
.build()

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

In [48]:
crossval = CrossValidator(estimator=pipe,
                         estimatorParamMaps=estimatorParam,
                         evaluator=evaluator,
                         numFolds=3)

cvmodel = crossval.fit(lr_data)

In [49]:
modelEvaluator=RegressionEvaluator()
eval_rmse = RegressionEvaluator(metricName="rmse")
eval_r2 = RegressionEvaluator(metricName="r2")
eval_mae = RegressionEvaluator(metricName="mae") 

In [50]:
print(eval_rmse.evaluate(cvmodel.transform(lr_data)))
print(eval_r2.evaluate(cvmodel.transform(lr_data)))
print(eval_mae.evaluate(cvmodel.transform(lr_data)))

2.804763596021153
0.8272161959633803
1.5185055819271953
