In [2]:
# Import linear regression library
from pyspark.ml.regression import LinearRegression


In [3]:
# Starting Spark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .config('spark.master', 'local[4]') \
    .appName('Test') \
    .getOrCreate()

sc = spark.sparkContext
spark

In [4]:
# Start with seed 1
seed=1

In [5]:
Green_Taxi_DF = spark.read.csv("Green_Taxi_B7.csv",header=True,inferSchema=True)

In [6]:
Green_Taxi_DF.limit(3).toPandas()

Unnamed: 0,Pickup_longitude,Pickup_latitude,Dropoff_longitude,Dropoff_latitude,month,week,day,hour,Trip_distance,Final_Fare_amount,Trip_Duration,speed,log_Trip_distance,log_Trip_Duration
0,-75.231,39.928,-75.227,39.93,6.0,23.0,8.0,20.0,0.25,4.3,1.12,13.43,-1.39,0.11
1,-75.165,39.954,-75.165,39.954,2.0,7.0,15.0,12.0,0.2,3.3,0.7,17.14,-1.61,-0.36
2,-74.466,40.368,-74.444,40.416,1.0,4.0,25.0,16.0,3.7,20.8,19.27,11.52,1.31,2.96


In [7]:
# Dorp the unneeded columns
Green_Taxi_DF= Green_Taxi_DF.drop('speed')
Green_Taxi_DF= Green_Taxi_DF.drop('log_Trip_Duration')

In [8]:
# Split the data
trainData,testData = Green_Taxi_DF.randomSplit([0.8,0.2],seed=seed)

In [9]:
inputCols = trainData.columns

In [10]:
inputCols

['Pickup_longitude',
 'Pickup_latitude',
 'Dropoff_longitude',
 'Dropoff_latitude',
 'month',
 'week',
 'day',
 'hour',
 'Trip_distance',
 'Final_Fare_amount',
 'Trip_Duration',
 'log_Trip_distance']

In [11]:
# Convert input columns to a list
inputCols = list(filter(lambda x: not any(s in x for s in ['Trip_Duration','Final_Fare_amount']),inputCols))


In [12]:
inputCols

['Pickup_longitude',
 'Pickup_latitude',
 'Dropoff_longitude',
 'Dropoff_latitude',
 'month',
 'week',
 'day',
 'hour',
 'Trip_distance',
 'log_Trip_distance']

In [13]:
trainData.limit(1).toPandas()

Unnamed: 0,Pickup_longitude,Pickup_latitude,Dropoff_longitude,Dropoff_latitude,month,week,day,hour,Trip_distance,Final_Fare_amount,Trip_Duration,log_Trip_distance
0,-75.231,39.928,-75.227,39.93,6.0,23.0,8.0,20.0,0.25,4.3,1.12,-1.39


In [14]:
############### Machine Learning
# Use vector assmbelter to convert the data to featurevector and required output label
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler()\
        .setInputCols(inputCols)\
        .setOutputCol("featureVector")

In [15]:
# Predict Fare
# Transform the data using the Vector assembler
TripTrainData = va.transform(trainData)
TripTrainData.select("featureVector", 'Trip_Duration').show(truncate=False, n=2)


+----------------------------------------------------------------------------------+-------------+
|featureVector                                                                     |Trip_Duration|
+----------------------------------------------------------------------------------+-------------+
|[-75.23100000000001,39.928000000000004,-75.227,39.93,6.0,23.0,8.0,20.0,0.25,-1.39]|1.12         |
|[-75.165,39.954,-75.165,39.954,2.0,7.0,15.0,12.0,0.2,-1.61]                       |0.7          |
+----------------------------------------------------------------------------------+-------------+
only showing top 2 rows



In [16]:
TripTestData = va.transform(testData)
TripTestData.select("featureVector", 'Trip_Duration').show(truncate=False, n=2)

+------------------------------------------------------------+-------------+
|featureVector                                               |Trip_Duration|
+------------------------------------------------------------+-------------+
|[-74.285,40.519,-74.293,40.521,4.0,15.0,16.0,7.0,0.52,-0.65]|2.83         |
|[-74.23,40.77,-74.188,40.765,6.0,23.0,6.0,10.0,2.59,0.95]   |11.05        |
+------------------------------------------------------------+-------------+
only showing top 2 rows



In [17]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'featureVector', labelCol='Trip_Duration', maxIter=10, regParam=0.3, elasticNetParam=0.8)


In [18]:
# Fit the train data with the linear regression and build the model
ModelA = lr.fit(TripTrainData)


In [19]:
# Calculate the RMSE for the train model
trainingSummary = ModelA.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 3.144892
r2: 0.714153


In [21]:
# Do the prediction for the test data
Trip_lr_predictions = ModelA.transform(TripTestData)
Trip_lr_predictions.select("prediction","Final_Fare_amount","featureVector").show(5)


+------------------+-----------------+--------------------+
|        prediction|Final_Fare_amount|       featureVector|
+------------------+-----------------+--------------------+
| 4.611552086878177|              4.8|[-74.285,40.519,-...|
|13.270932397746364|             11.8|[-74.23,40.77,-74...|
| 6.167635178976099|5.799999999999998|[-74.195,40.702,-...|
|19.456683411262404|             16.3|[-74.185,40.564,-...|
|14.598487240152565|             12.3|[-74.179,40.607,-...|
+------------------+-----------------+--------------------+
only showing top 5 rows



In [22]:
# Evaluate the model
from pyspark.ml.evaluation import RegressionEvaluator

Trip_lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="Trip_Duration",metricName="r2")
print("R Squared (R2) on test data = %g" % Trip_lr_evaluator.evaluate(Trip_lr_predictions))


R Squared (R2) on test data = 0.714194


<br><B>Now we will do the same for the Taxi Fare <br><br><br>

In [23]:
# Predict Fare
FareTrainData = va.transform(trainData)
FareTrainData.select("featureVector", 'Final_Fare_amount').show(truncate=False, n=2)

+----------------------------------------------------------------------------------+-----------------+
|featureVector                                                                     |Final_Fare_amount|
+----------------------------------------------------------------------------------+-----------------+
|[-75.23100000000001,39.928000000000004,-75.227,39.93,6.0,23.0,8.0,20.0,0.25,-1.39]|4.3              |
|[-75.165,39.954,-75.165,39.954,2.0,7.0,15.0,12.0,0.2,-1.61]                       |3.3              |
+----------------------------------------------------------------------------------+-----------------+
only showing top 2 rows



In [24]:
FareTestData = va.transform(testData)
FareTestData.select("featureVector", 'Final_Fare_amount').show(truncate=False, n=2)

+------------------------------------------------------------+-----------------+
|featureVector                                               |Final_Fare_amount|
+------------------------------------------------------------+-----------------+
|[-74.285,40.519,-74.293,40.521,4.0,15.0,16.0,7.0,0.52,-0.65]|4.8              |
|[-74.23,40.77,-74.188,40.765,6.0,23.0,6.0,10.0,2.59,0.95]   |11.8             |
+------------------------------------------------------------+-----------------+
only showing top 2 rows



In [25]:
from pyspark.ml.regression import LinearRegression
Fare_lr = LinearRegression(featuresCol = 'featureVector', labelCol='Final_Fare_amount', maxIter=10, regParam=0.3, elasticNetParam=0.8)


In [26]:
ModelB = Fare_lr.fit(FareTrainData)

In [27]:
Fare_trainingSummary = ModelB.summary
print("RMSE: %f" % Fare_trainingSummary.rootMeanSquaredError)
print("r2: %f" % Fare_trainingSummary.r2)

RMSE: 1.327268
r2: 0.909894


In [28]:
Fare_lr_predictions = ModelB.transform(FareTestData)

In [29]:
Fare_lr_predictions.select("prediction","Final_Fare_amount","featureVector").show(5)


+------------------+-----------------+--------------------+
|        prediction|Final_Fare_amount|       featureVector|
+------------------+-----------------+--------------------+
| 5.950869475332949|              4.8|[-74.285,40.519,-...|
|12.525673956865889|             11.8|[-74.23,40.77,-74...|
| 6.974178080963257|5.799999999999998|[-74.195,40.702,-...|
|18.383190648653446|             16.3|[-74.185,40.564,-...|
| 13.76861219564039|             12.3|[-74.179,40.607,-...|
+------------------+-----------------+--------------------+
only showing top 5 rows



In [30]:
from pyspark.ml.evaluation import RegressionEvaluator

Fare_lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="Final_Fare_amount",metricName="r2")
print("R Squared (R2) on test data = %g" % Fare_lr_evaluator.evaluate(Fare_lr_predictions))


R Squared (R2) on test data = 0.909869


<b>Save the Model (Pickle)

In [31]:
ModelA.save("modelA.model")


In [32]:
ModelB.save("modelB.model")


<b>Call the Model (Pickle)

In [33]:
from pyspark.ml.regression import LinearRegressionModel

In [72]:
Trip_lr_model = LinearRegressionModel.load("modelA.model")
Fare_lr_model = LinearRegressionModel.load("modelB.model")


In [73]:
trainData.limit(2).toPandas()

Unnamed: 0,Pickup_longitude,Pickup_latitude,Dropoff_longitude,Dropoff_latitude,month,week,day,hour,Trip_distance,Final_Fare_amount,Trip_Duration,log_Trip_distance
0,-75.231,39.928,-75.227,39.93,6.0,23.0,8.0,20.0,0.25,4.3,1.12,-1.39
1,-75.165,39.954,-75.165,39.954,2.0,7.0,15.0,12.0,0.2,3.3,0.7,-1.61


In [74]:
columns = ['Pickup_longitude','Pickup_latitude','Dropoff_longitude','Dropoff_latitude','month','week','day','hour','Trip_distance','log_Trip_distance']
vals = [(-75.231,39.928,-75.227,39.93,6.0,23.0,8.0,20.0,0.25,-1.39)]

df = spark.createDataFrame(vals, columns)
df.toPandas()

Unnamed: 0,Pickup_longitude,Pickup_latitude,Dropoff_longitude,Dropoff_latitude,month,week,day,hour,Trip_distance,log_Trip_distance
0,-75.231,39.928,-75.227,39.93,6.0,23.0,8.0,20.0,0.25,-1.39


In [75]:
rec = va.transform(df)
rec.select("featureVector", 'Trip_distance').show(truncate=False, n=2)


+-----------------------------------------------------------+-------------+
|featureVector                                              |Trip_distance|
+-----------------------------------------------------------+-------------+
|[-75.231,39.928,-75.227,39.93,6.0,23.0,8.0,20.0,0.25,-1.39]|0.25         |
+-----------------------------------------------------------+-------------+



In [76]:
predicted1 = Trip_lr_model.transform(rec)


In [84]:
predicted1.select("featureVector","prediction").show(5)


+--------------------+-------------+------------------+
|       featureVector|Trip_distance|        prediction|
+--------------------+-------------+------------------+
|[-75.231,39.928,-...|         0.25|3.2263917388071235|
+--------------------+-------------+------------------+



In [78]:
predicted2 = Fare_lr_model.transform(rec)


In [83]:
predicted2.select("featureVector","prediction").show(5)


+--------------------+-----------------+
|       featureVector|       prediction|
+--------------------+-----------------+
|[-75.231,39.928,-...|4.566109023316363|
+--------------------+-----------------+

