In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import isnan, when, count, col, lit, udf, month, year, date_format, datediff, from_unixtime, unix_timestamp
from pyspark.sql.functions import date_trunc
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql.window import Window
from pyspark.sql.types import (StructType, StructField, DateType, BooleanType,
                               DoubleType, IntegerType, StringType, TimestampType)
from functools import reduce
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [68]:
#!hdfs dfs -ls /user/mechols/data/
!hdfs dfs -ls /user/alphan/data/

Found 4 items
-rw-r--r--   3 alphan alphan 1824926642 2019-05-25 04:44 /user/alphan/data/chicago_crimes.csv
drwxr-xr-x   - alphan alphan          0 2019-06-04 02:09 /user/alphan/data/df.csv
drwxr-xr-x   - alphan alphan          0 2019-06-04 02:12 /user/alphan/data/final_project_df.csv
-rw-r--r--   3 alphan alphan  208276005 2019-04-30 20:10 /user/alphan/data/food-inspections.csv


In [None]:
spark = SparkSession.builder.appName('RideShare').getOrCreate()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '15g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','20g')])
#df = spark.read.csv("/user/mechols/data/fulldf.csv", inferSchema=True, header=True)
df = spark.read.csv("/user/alphan/data/final_project_df.csv", inferSchema=True, header=True)

In [None]:
df.printSchema()

In [None]:
spark = SparkSession.builder.appName('RideShare').getOrCreate()
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '256g'),
                                        ('spark.app.name', 'Spark Updated Conf'),
                                        ('spark.executor.cores', '16'),
                                        ('spark.cores.max', '16'),
                                        ('spark.driver.memory','256g'),
                                        ('spark.sql.AutoBroadcastJoinThreshold', -1),
                                        ('mapreduce.reduce.memory.mb',-1),
                                        ('spark.yarn.executor.memoryOverhead', -1),
                                        ('spark.kryoserializer.buffer.max.mb', '5g')])



In [67]:
#WRITE to HDFS
#fulldf = rides.join(spark_weather, rides.startTime == spark_weather.time, how='inner')

#res_path = '/user/alphan/data/final_project_df.csv'
#df.write.csv(path=res_path, header=True, compression='gzip')

### Preliminary Modelling

In [None]:
df.printSchema()

In [None]:
#Is there a linear correlation between columns

from pyspark.ml.feature import StandardScaler
standardscaler=StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
raw_data=standardscaler.fit(raw_data).transform(raw_data)
raw_data.select("features","Scaled_features").show(5)

In [69]:
#Assemble vectors and Scale:

columns = ['seconds','miles','shared','communityPickup','communityDropoff','humidity', 
'apparentTemperature','precipIntensity',
'precipProbability', 'temperature', 'Cloudy','Rainy', 'Snowy','month','day', 'hour']

from pyspark.ml.feature import VectorAssembler,StandardScaler
vectorAssembler = VectorAssembler(inputCols = columns, outputCol = 'features')# 'fare', 'addCharge', 'tripTotal'
ml_data=vectorAssembler.transform(df)
ml_data.select("features").show(5)

+--------------------+
|            features|
+--------------------+
|[430.0,3.0,0.0,3....|
|[368.0,1.9,1.0,44...|
|[1142.0,14.7,1.0,...|
|[1288.0,3.9,1.0,4...|
|[205.0,1.2,0.0,10...|
+--------------------+
only showing top 5 rows



In [18]:
standardscaler=StandardScaler().setInputCol('features').setOutputCol('scaled_features')
scaled_data=standardscaler.fit(ml_data).transform(ml_data)
scaled_data.select('features','scaled_features').show(5)

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[430.0,3.0,0.0,3....|[0.65134636695970...|
|[368.0,1.9,1.0,44...|[0.55743130939807...|
|[1142.0,14.7,1.0,...|[1.72985476992555...|
|[1288.0,3.9,1.0,4...|[1.95100958289326...|
|[205.0,1.2,0.0,10...|[0.31052559355055...|
+--------------------+--------------------+
only showing top 5 rows



In [19]:
#stages = [vector, scaler, ]
#pipe = Pipeline(stages=stages)

vfull_df = scaled_data.select(['scaled_features', 'fare'])
vfull_df.show(3)

+--------------------+----+
|     scaled_features|fare|
+--------------------+----+
|[0.65134636695970...| 7.5|
|[0.55743130939807...| 5.0|
|[1.72985476992555...|17.5|
+--------------------+----+
only showing top 3 rows



In [20]:
#Split the data
splits = scaled_data.randomSplit([0.99, 0.01])
small_df = splits[1]
small_split = small_df.randomSplit([0.7, 0.3])
train_df = small_split[0]
test_df = small_split[1]

#### Decision Tree

In [21]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'fare')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 3.39705


from FeatureImportanceSelector import ExtractFeatureImp, FeatureImpSelector
ExtractFeatureImp(mod.stages[-1].featureImportances, dt_predictions, "features_subset")

In [None]:
model = pd.DataFrame(dt_model.featureImportances.toArray(), columns=["values"])
features_col = pd.Series(features)
model["features"] = features_col
model.sort_values("values", ascending = False)

#### Gradient Boost

In [22]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'fare', maxIter=10)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'fare', 'features').show(5)

+------------------+----+--------------------+
|        prediction|fare|            features|
+------------------+----+--------------------+
| 3.191120111559453| 5.0|[35.0,0.1,0.0,61....|
|3.2407646102402516| 5.0|[48.0,0.3,1.0,28....|
|3.1876104336031577| 2.5|[63.0,0.1,0.0,39....|
|3.0995382207737494| 2.5|[73.0,0.1,0.0,33....|
|3.1183621383751605| 2.5|[82.0,0.3,0.0,1.0...|
+------------------+----+--------------------+
only showing top 5 rows



In [73]:
gbt_evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")
gbt_rmse = gbt_evaluator.evaluate(gbt_predictions)

gbt_evaluator2 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="r2")
gbt_r2 = gbt_evaluator2.evaluate(gbt_predictions)

Root Mean Squared Error (RMSE) on test data = 3.14563
Root Mean Squared Error (RMSE) on test data = 0.758795


In [74]:
print("Root Mean Squared Error (RMSE) on test data = %g" % gbt_rmse)
print("Root Mean Squared Error (RMSE) on test data = %g" % gbt_r2)

Root Mean Squared Error (RMSE) on test data = 2.94831
Root Mean Squared Error (RMSE) on test data = 0.782302


In [None]:
model = pd.DataFrame(gbt_model.featureImportances.toArray(), columns=["values"])
features_col = pd.Series(features)
model["features"] = features_col
model.sort_values("values", ascending = False)

#### Random Forest

In [24]:
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol = 'features', labelCol = 'fare')
rfModel = rf.fit(train_df)
rf_predictions = rfModel.transform(test_df)
rf_predictions.select('prediction', 'fare', 'features').show(5)

+-----------------+----+--------------------+
|       prediction|fare|            features|
+-----------------+----+--------------------+
|4.931155508072363| 2.5|[39.0,0.2,0.0,3.0...|
|4.869704038876839| 5.0|[50.0,0.1,1.0,16....|
|4.935776284030992| 2.5|[64.0,0.2,0.0,8.0...|
|4.833982624632548| 2.5|[73.0,0.3,1.0,60....|
|4.864347998313905| 2.5|[84.0,0.2,0.0,32....|
+-----------------+----+--------------------+
only showing top 5 rows



In [70]:
rf_evaluator = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="rmse")
rmse = rf_evaluator.evaluate(rf_predictions)

rf_evaluator2 = RegressionEvaluator(
    labelCol="fare", predictionCol="prediction", metricName="r2")
r2 = rf_evaluator2.evaluate(rf_predictions)

print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
print("Root Mean Squared Error (RMSE) on test data = %g" % r2)

Root Mean Squared Error (RMSE) on test data = 3.14563
Root Mean Squared Error (RMSE) on test data = 0.758795


In [None]:
model = pd.DataFrame(rfModel.featureImportances.toArray(), columns=["values"])
features_col = pd.Series(features)
model["features"] = features_col
model.sort_values("values", ascending = False)

#### Linear Regression

In [26]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='fare', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.0019992517787194124,0.9857414716629734,-3.3667961022972794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
Intercept: 3.7666711227798704


In [27]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 3.297317
r2: 0.765130


### Models with Gradient Search

#### Random Forest

In [36]:
lr_data = train_df.select(col("fare").alias("label"), *columns)  
lr_data.printSchema()  

root
 |-- label: double (nullable = true)
 |-- seconds: integer (nullable = true)
 |-- miles: double (nullable = true)
 |-- shared: boolean (nullable = true)
 |-- communityPickup: integer (nullable = true)
 |-- communityDropoff: integer (nullable = true)
 |-- humidity: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- precipIntensity: double (nullable = true)
 |-- precipProbability: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- Cloudy: double (nullable = true)
 |-- Rainy: double (nullable = true)
 |-- Snowy: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)



In [37]:
from pyspark.ml import Pipeline 
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
rfr = RandomForestRegressor(labelCol="label", featuresCol="scaled_features")
stages = [vectorAssembler, standardscaler, rfr]
pipe = Pipeline(stages=stages)

In [38]:
estimatorParam = ParamGridBuilder() \
.addGrid(rfr.maxDepth, [4, 6]) \
.addGrid(rfr.maxBins, [5, 10]) \
.addGrid(rfr.impurity, ["variance"]) \
.build()

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

In [40]:
crossval = CrossValidator(estimator=pipe,
                         estimatorParamMaps=estimatorParam,
                         evaluator=evaluator,
                         numFolds=3)

cvmodel = crossval.fit(lr_data)

In [41]:
modelEvaluator=RegressionEvaluator()
eval_rmse = RegressionEvaluator(metricName="rmse")
eval_r2 = RegressionEvaluator(metricName="r2")

In [47]:
#Not sure it matters what data we use here
print(eval_rmse.evaluate(cvmodel.transform(lr_data)))
print(eval_r2.evaluate(cvmodel.transform(lr_data)))

3.2565578698190656
0.7710192256352868


#### Gradient Boost

In [55]:
gbt = GBTRegressor(labelCol="label", featuresCol="scaled_features")
stages = [vectorAssembler, standardscaler, gbt]
pipe = Pipeline(stages=stages)

In [56]:
estimatorParam = ParamGridBuilder() \
.addGrid(rfr.maxDepth, [4, 6]) \
.addGrid(rfr.maxBins, [5, 10]) \
.addGrid(rfr.impurity, ["variance"]) \
.build()

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

In [57]:
crossval = CrossValidator(estimator=pipe,
                         estimatorParamMaps=estimatorParam,
                         evaluator=evaluator,
                         numFolds=3)

cvmodel = crossval.fit(lr_data)

In [58]:
modelEvaluator=RegressionEvaluator()
eval_rmse = RegressionEvaluator(metricName="rmse")
eval_r2 = RegressionEvaluator(metricName="r2")

In [59]:
print(eval_rmse.evaluate(cvmodel.transform(lr_data)))
print(eval_r2.evaluate(cvmodel.transform(lr_data)))

3.009552922333908
0.8058905579571947


In [61]:
df.printSchema()

root
 |-- seconds: integer (nullable = true)
 |-- miles: double (nullable = true)
 |-- communityPickup: integer (nullable = true)
 |-- communityDropoff: integer (nullable = true)
 |-- fare: double (nullable = true)
 |-- shared: boolean (nullable = true)
 |-- pickupLat: double (nullable = true)
 |-- pickupLong: double (nullable = true)
 |-- dropoffLat: double (nullable = true)
 |-- dropoffLong: double (nullable = true)
 |-- apparentTemperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- precipIntensity: double (nullable = true)
 |-- precipProbability: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- Cloudy: double (nullable = true)
 |-- Rainy: double (nullable = true)
 |-- Snowy: double (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)

