In [23]:
!pip install pyspark



In [24]:
# Importing libraries

from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [25]:
# Get preprocessed data

spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()
df = spark.read.option("inferSchema", "true").csv("../Preprocessing/NYC Taxi Duration Preprocessed/*.csv", header=True)
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- vendor_id: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- store_and_fwd_flag: integer (nullable = true)
 |-- trip_duration: integer (nullable = true)
 |-- distance: double (nullable = true)
 |-- week_day: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- quarter_of_year: integer (nullable = true)
 |-- hour: integer (nullable = true)



[Stage 3337:>                                                       (0 + 1) / 1]                                                                                

In [26]:
df.show()

+-------+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+
|     id|vendor_id|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|store_and_fwd_flag|trip_duration| distance|week_day|year|month|quarter_of_year|hour|
+-------+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+
|2875421|        2|              1|      -73.982155|      40.767937|        -73.96463|       40.765602|                 0|          455|2.4444735|     Mon|2016|    3|              1|  17|
|2377394|        1|              1|      -73.980415|      40.738564|        -73.99948|        40.73115|                 0|          663|2.6599078|     Sun|2016|    6|              2|   0|
|3858529|        2|              1|       -73.97903|       4

# Preparing df for ML models

In [27]:
# Convert week day string column into index column.
label_index = StringIndexer(inputCol = 'week_day', outputCol = 'week_day_index')
df = label_index.fit(df).transform(df)
df.show()

+-------+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+--------------+
|     id|vendor_id|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|store_and_fwd_flag|trip_duration| distance|week_day|year|month|quarter_of_year|hour|week_day_index|
+-------+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+--------------+
|2875421|        2|              1|      -73.982155|      40.767937|        -73.96463|       40.765602|                 0|          455|2.4444735|     Mon|2016|    3|              1|  17|           6.0|
|2377394|        1|              1|      -73.980415|      40.738564|        -73.99948|        40.73115|                 0|          663|2.6599078|     Sun|2016|    6|              2|   0| 

In [28]:
# Creating a list of input columns headers
input_col = [i[0] for i in df.dtypes if i[1] == "int" or i[1] == "double"]
input_col.remove("trip_duration")
input_col.remove("id")

# Creating a dataframe of input columns.
input_features = VectorAssembler(inputCols = input_col, outputCol = "input_features")
input_column = input_features.transform(df)
input_column.select("input_features").show()

+--------------------+
|      input_features|
+--------------------+
|[2.0,1.0,-73.9821...|
|[1.0,1.0,-73.9804...|
|[2.0,1.0,-73.9790...|
|[1.0,4.0,-73.9690...|
|[2.0,1.0,-73.9692...|
|[1.0,1.0,-73.9994...|
|[2.0,1.0,-73.9826...|
|[2.0,4.0,-73.9915...|
|[2.0,2.0,-73.9629...|
|[2.0,1.0,-73.9921...|
|[1.0,1.0,-74.0039...|
|[1.0,1.0,-73.9803...|
|[2.0,1.0,-73.9795...|
|[1.0,1.0,-73.9935...|
|[2.0,1.0,-73.9552...|
|[2.0,1.0,-73.9565...|
|[1.0,1.0,-73.9837...|
|[2.0,1.0,-73.9942...|
|[1.0,1.0,-73.9821...|
|[1.0,1.0,-73.9709...|
+--------------------+
only showing top 20 rows



In [29]:
data_frame = input_column.select("input_features", "trip_duration")
data_frame.show(5)

+--------------------+-------------+
|      input_features|trip_duration|
+--------------------+-------------+
|[2.0,1.0,-73.9821...|          455|
|[1.0,1.0,-73.9804...|          663|
|[2.0,1.0,-73.9790...|         2124|
|[1.0,4.0,-73.9690...|          341|
|[2.0,1.0,-73.9692...|         1551|
+--------------------+-------------+
only showing top 5 rows



## Linear Regression model

In [30]:
# Split training and testing dataset for the models.
train, test = data_frame.randomSplit([0.7, 0.3])

# linear regression model.
linear_regression = LinearRegression(labelCol = "trip_duration", featuresCol = "input_features")

# Create ParamGrid for Cross Validation.
linear_regression_param_grid = ParamGridBuilder() \
                                .addGrid(linear_regression.regParam, [0.01, 0.1, 0.5, 1.0, 2.0]) \
                                .addGrid(linear_regression.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]) \
                                .addGrid(linear_regression.maxIter, [1, 5, 10, 20, 50]) \
                                .build()

# evaluator with the root mean square error metric.
evaluator = RegressionEvaluator(predictionCol = "prediction", labelCol = "trip_duration", metricName = "rmse")

# cross validator with total 5 folds.
cross_validator = CrossValidator(estimator = linear_regression,
                      estimatorParamMaps = linear_regression_param_grid,
                      evaluator = evaluator,
                      numFolds = 5)

In [31]:
# Training a linear regression model.
model = cross_validator.fit(train)

                                                                                

23/01/14 15:34:55 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

In [36]:
# Print best parameter of the model.
print("Best Regression Param: ", model.bestModel._java_obj.getRegParam())
print("Best Elastic Net Param: ", model.bestModel._java_obj.getElasticNetParam())
print("Best maxIter Param: ", model.bestModel._java_obj.getMaxIter())
print("Intercept: ", model.bestModel.intercept)

# Acuracy and coefficients of the linear equation.
print("Coefficient of Determination(Accuracy on Training data): ", str(round(model.bestModel.summary.r2*100, 2)), "%")
print("Coefficients: ", model.bestModel.coefficients)

Best Regression Param:  0.1
Best Elastic Net Param:  0.75
Best maxIter Param:  50
Intercept:  -16692.383170831203
Coefficient of Determination(Accuracy on Training data):  50.1 %
Coefficients:  [-0.9437615811110158,2.363040263139473,-322.41843037716217,-217.52903235402331,-525.288344268495,-899.6268985849673,85.87879021095615,81.99337823918947,0.0,19.043871597021234,24.458390904352143,5.231982929145022,-12.440055126920804]


In [37]:
# Predict.
prediction = model.transform(test)

prediction.show(5)

[Stage 5225:>                                                       (0 + 1) / 1]

+--------------------+-------------+------------------+
|      input_features|trip_duration|        prediction|
+--------------------+-------------+------------------+
|[1.0,1.0,-74.1793...|         2975| 5131.088150512132|
|[1.0,1.0,-74.0280...|         1959|1126.2990554000062|
|[1.0,1.0,-74.0279...|          834| 956.8710007384761|
|[1.0,1.0,-74.0180...|          306| 771.0854912196191|
|[1.0,1.0,-74.0179...|         1054|1243.7367192554193|
+--------------------+-------------+------------------+
only showing top 5 rows



                                                                                

In [38]:
# Evaluator for testing data.
test_pred_evaluator = RegressionEvaluator(predictionCol = "prediction", labelCol = "trip_duration", metricName = "r2")

# Print RMSE and accuracy of the model on testing data.
print("RMSE: ", evaluator.evaluate(prediction))
print("Coefficient of Determination(Accuracy on Test Data): ", str(round(test_pred_evaluator.evaluate(prediction)*100, 4)), "%")


                                                                                

RMSE:  500.1457526949354


[Stage 5227:>                                                       (0 + 1) / 1]

Coefficient of Determination(Accuracy on Test Data):  50.371 %


                                                                                

## Gradient Boosted Trees Regression

In [39]:
# Gradient Boosted Trees regression instance.
gbtr = GBTRegressor(featuresCol="input_features", labelCol="trip_duration")

# Create ParamGrid for Cross Validation of gradient boosted trees regression.
gradient_boosted_trees_param_grid = ParamGridBuilder() \
                                .addGrid(gbtr.maxDepth, [2, 5, 10]) \
                                .addGrid(gbtr.maxIter, [2, 5, 10, 15]) \
                                .build()

# evaluator with the root mean square error metric for gbtr.
gbt_evaluator = RegressionEvaluator(labelCol="trip_duration", predictionCol="prediction", metricName="rmse")

# Cross validator for gbtr with 5 total folds.
cross_validator_gbtr = CrossValidator(estimator = gbtr,
                      estimatorParamMaps = gradient_boosted_trees_param_grid,
                      evaluator = gbt_evaluator,
                      numFolds = 5)

In [40]:
# Training a gradient boosted trees regression model.
model_gbtr = cross_validator_gbtr.fit(train)

                                                                                

23/01/14 15:38:48 WARN DAGScheduler: Broadcasting large task binary with size 1016.3 KiB
23/01/14 15:38:49 WARN DAGScheduler: Broadcasting large task binary with size 1027.2 KiB
23/01/14 15:38:49 WARN DAGScheduler: Broadcasting large task binary with size 1027.7 KiB
23/01/14 15:38:49 WARN DAGScheduler: Broadcasting large task binary with size 1028.3 KiB
23/01/14 15:38:49 WARN DAGScheduler: Broadcasting large task binary with size 1029.4 KiB
23/01/14 15:38:49 WARN DAGScheduler: Broadcasting large task binary with size 1031.7 KiB
23/01/14 15:38:49 WARN DAGScheduler: Broadcasting large task binary with size 1036.2 KiB
23/01/14 15:38:49 WARN DAGScheduler: Broadcasting large task binary with size 1045.5 KiB
23/01/14 15:38:49 WARN DAGScheduler: Broadcasting large task binary with size 1064.1 KiB
23/01/14 15:38:49 WARN DAGScheduler: Broadcasting large task binary with size 1100.7 KiB
23/01/14 15:38:49 WARN DAGScheduler: Broadcasting large task binary with size 1170.2 KiB
23/01/14 15:38:50 WAR

23/01/14 15:39:04 WARN DAGScheduler: Broadcasting large task binary with size 1805.0 KiB
23/01/14 15:39:04 WARN DAGScheduler: Broadcasting large task binary with size 1805.5 KiB
23/01/14 15:39:04 WARN DAGScheduler: Broadcasting large task binary with size 1806.7 KiB
23/01/14 15:39:04 WARN DAGScheduler: Broadcasting large task binary with size 1809.0 KiB
23/01/14 15:39:04 WARN DAGScheduler: Broadcasting large task binary with size 1813.6 KiB
23/01/14 15:39:04 WARN DAGScheduler: Broadcasting large task binary with size 1823.0 KiB
23/01/14 15:39:04 WARN DAGScheduler: Broadcasting large task binary with size 1842.2 KiB
23/01/14 15:39:05 WARN DAGScheduler: Broadcasting large task binary with size 1878.1 KiB
23/01/14 15:39:05 WARN DAGScheduler: Broadcasting large task binary with size 1945.7 KiB
23/01/14 15:39:05 WARN DAGScheduler: Broadcasting large task binary with size 1956.5 KiB
23/01/14 15:39:05 WARN DAGScheduler: Broadcasting large task binary with size 1956.9 KiB
23/01/14 15:39:05 WAR

                                                                                

23/01/14 15:39:44 WARN DAGScheduler: Broadcasting large task binary with size 1005.1 KiB
23/01/14 15:39:44 WARN DAGScheduler: Broadcasting large task binary with size 1005.6 KiB
23/01/14 15:39:44 WARN DAGScheduler: Broadcasting large task binary with size 1006.2 KiB
23/01/14 15:39:44 WARN DAGScheduler: Broadcasting large task binary with size 1007.5 KiB
23/01/14 15:39:45 WARN DAGScheduler: Broadcasting large task binary with size 1009.7 KiB
23/01/14 15:39:45 WARN DAGScheduler: Broadcasting large task binary with size 1014.3 KiB
23/01/14 15:39:45 WARN DAGScheduler: Broadcasting large task binary with size 1023.6 KiB
23/01/14 15:39:45 WARN DAGScheduler: Broadcasting large task binary with size 1042.1 KiB
23/01/14 15:39:45 WARN DAGScheduler: Broadcasting large task binary with size 1077.6 KiB
23/01/14 15:39:45 WARN DAGScheduler: Broadcasting large task binary with size 1146.3 KiB
23/01/14 15:39:45 WARN DAGScheduler: Broadcasting large task binary with size 1157.8 KiB
23/01/14 15:39:45 WAR

23/01/14 15:40:00 WARN DAGScheduler: Broadcasting large task binary with size 1733.9 KiB
23/01/14 15:40:00 WARN DAGScheduler: Broadcasting large task binary with size 1738.4 KiB
23/01/14 15:40:00 WARN DAGScheduler: Broadcasting large task binary with size 1747.7 KiB
23/01/14 15:40:00 WARN DAGScheduler: Broadcasting large task binary with size 1766.3 KiB
23/01/14 15:40:00 WARN DAGScheduler: Broadcasting large task binary with size 1802.0 KiB
23/01/14 15:40:00 WARN DAGScheduler: Broadcasting large task binary with size 1871.4 KiB
23/01/14 15:40:01 WARN DAGScheduler: Broadcasting large task binary with size 1882.4 KiB
23/01/14 15:40:01 WARN DAGScheduler: Broadcasting large task binary with size 1882.9 KiB
23/01/14 15:40:01 WARN DAGScheduler: Broadcasting large task binary with size 1883.5 KiB
23/01/14 15:40:01 WARN DAGScheduler: Broadcasting large task binary with size 1884.6 KiB
23/01/14 15:40:01 WARN DAGScheduler: Broadcasting large task binary with size 1886.9 KiB
23/01/14 15:40:01 WAR

                                                                                

23/01/14 15:40:39 WARN DAGScheduler: Broadcasting large task binary with size 1010.7 KiB
23/01/14 15:40:40 WARN DAGScheduler: Broadcasting large task binary with size 1024.2 KiB
23/01/14 15:40:40 WARN DAGScheduler: Broadcasting large task binary with size 1024.7 KiB
23/01/14 15:40:40 WARN DAGScheduler: Broadcasting large task binary with size 1025.2 KiB
23/01/14 15:40:40 WARN DAGScheduler: Broadcasting large task binary with size 1026.5 KiB
23/01/14 15:40:40 WARN DAGScheduler: Broadcasting large task binary with size 1029.0 KiB
23/01/14 15:40:40 WARN DAGScheduler: Broadcasting large task binary with size 1033.6 KiB
23/01/14 15:40:40 WARN DAGScheduler: Broadcasting large task binary with size 1043.1 KiB
23/01/14 15:40:40 WARN DAGScheduler: Broadcasting large task binary with size 1061.8 KiB
23/01/14 15:40:40 WARN DAGScheduler: Broadcasting large task binary with size 1098.2 KiB
23/01/14 15:40:40 WARN DAGScheduler: Broadcasting large task binary with size 1167.3 KiB
23/01/14 15:40:40 WAR

23/01/14 15:40:54 WARN DAGScheduler: Broadcasting large task binary with size 1807.6 KiB
23/01/14 15:40:55 WARN DAGScheduler: Broadcasting large task binary with size 1808.9 KiB
23/01/14 15:40:55 WARN DAGScheduler: Broadcasting large task binary with size 1811.2 KiB
23/01/14 15:40:55 WARN DAGScheduler: Broadcasting large task binary with size 1815.9 KiB
23/01/14 15:40:55 WARN DAGScheduler: Broadcasting large task binary with size 1824.9 KiB
23/01/14 15:40:55 WARN DAGScheduler: Broadcasting large task binary with size 1843.1 KiB
23/01/14 15:40:55 WARN DAGScheduler: Broadcasting large task binary with size 1877.6 KiB
23/01/14 15:40:55 WARN DAGScheduler: Broadcasting large task binary with size 1943.5 KiB
23/01/14 15:40:55 WARN DAGScheduler: Broadcasting large task binary with size 1952.9 KiB
23/01/14 15:40:55 WARN DAGScheduler: Broadcasting large task binary with size 1953.3 KiB
23/01/14 15:40:56 WARN DAGScheduler: Broadcasting large task binary with size 1953.9 KiB
23/01/14 15:40:56 WAR

                                                                                

23/01/14 15:41:34 WARN DAGScheduler: Broadcasting large task binary with size 1013.0 KiB
23/01/14 15:41:34 WARN DAGScheduler: Broadcasting large task binary with size 1020.8 KiB
23/01/14 15:41:34 WARN DAGScheduler: Broadcasting large task binary with size 1021.3 KiB
23/01/14 15:41:34 WARN DAGScheduler: Broadcasting large task binary with size 1021.9 KiB
23/01/14 15:41:34 WARN DAGScheduler: Broadcasting large task binary with size 1023.1 KiB
23/01/14 15:41:34 WARN DAGScheduler: Broadcasting large task binary with size 1025.4 KiB
23/01/14 15:41:34 WARN DAGScheduler: Broadcasting large task binary with size 1030.1 KiB
23/01/14 15:41:34 WARN DAGScheduler: Broadcasting large task binary with size 1039.4 KiB
23/01/14 15:41:34 WARN DAGScheduler: Broadcasting large task binary with size 1058.0 KiB
23/01/14 15:41:34 WARN DAGScheduler: Broadcasting large task binary with size 1094.1 KiB
23/01/14 15:41:35 WARN DAGScheduler: Broadcasting large task binary with size 1163.8 KiB
23/01/14 15:41:35 WAR

23/01/14 15:41:49 WARN DAGScheduler: Broadcasting large task binary with size 1772.4 KiB
23/01/14 15:41:49 WARN DAGScheduler: Broadcasting large task binary with size 1772.9 KiB
23/01/14 15:41:49 WARN DAGScheduler: Broadcasting large task binary with size 1774.2 KiB
23/01/14 15:41:49 WARN DAGScheduler: Broadcasting large task binary with size 1776.4 KiB
23/01/14 15:41:49 WARN DAGScheduler: Broadcasting large task binary with size 1781.0 KiB
23/01/14 15:41:49 WARN DAGScheduler: Broadcasting large task binary with size 1790.4 KiB
23/01/14 15:41:49 WARN DAGScheduler: Broadcasting large task binary with size 1809.2 KiB
23/01/14 15:41:49 WARN DAGScheduler: Broadcasting large task binary with size 1843.6 KiB
23/01/14 15:41:49 WARN DAGScheduler: Broadcasting large task binary with size 1909.3 KiB
23/01/14 15:41:49 WARN DAGScheduler: Broadcasting large task binary with size 1922.0 KiB
23/01/14 15:41:50 WARN DAGScheduler: Broadcasting large task binary with size 1922.5 KiB
23/01/14 15:41:50 WAR

                                                                                

23/01/14 15:42:28 WARN DAGScheduler: Broadcasting large task binary with size 1008.0 KiB
23/01/14 15:42:28 WARN DAGScheduler: Broadcasting large task binary with size 1020.0 KiB
23/01/14 15:42:28 WARN DAGScheduler: Broadcasting large task binary with size 1020.5 KiB
23/01/14 15:42:28 WARN DAGScheduler: Broadcasting large task binary with size 1021.2 KiB
23/01/14 15:42:29 WARN DAGScheduler: Broadcasting large task binary with size 1022.4 KiB
23/01/14 15:42:29 WARN DAGScheduler: Broadcasting large task binary with size 1024.7 KiB
23/01/14 15:42:29 WARN DAGScheduler: Broadcasting large task binary with size 1029.3 KiB
23/01/14 15:42:29 WARN DAGScheduler: Broadcasting large task binary with size 1038.5 KiB
23/01/14 15:42:29 WARN DAGScheduler: Broadcasting large task binary with size 1056.9 KiB
23/01/14 15:42:29 WARN DAGScheduler: Broadcasting large task binary with size 1092.5 KiB
23/01/14 15:42:29 WARN DAGScheduler: Broadcasting large task binary with size 1160.2 KiB
23/01/14 15:42:29 WAR

23/01/14 15:42:43 WARN DAGScheduler: Broadcasting large task binary with size 1779.8 KiB
23/01/14 15:42:44 WARN DAGScheduler: Broadcasting large task binary with size 1780.5 KiB
23/01/14 15:42:44 WARN DAGScheduler: Broadcasting large task binary with size 1781.7 KiB
23/01/14 15:42:44 WARN DAGScheduler: Broadcasting large task binary with size 1784.0 KiB
23/01/14 15:42:44 WARN DAGScheduler: Broadcasting large task binary with size 1788.5 KiB
23/01/14 15:42:44 WARN DAGScheduler: Broadcasting large task binary with size 1797.8 KiB
23/01/14 15:42:44 WARN DAGScheduler: Broadcasting large task binary with size 1816.3 KiB
23/01/14 15:42:44 WARN DAGScheduler: Broadcasting large task binary with size 1850.4 KiB
23/01/14 15:42:44 WARN DAGScheduler: Broadcasting large task binary with size 1914.9 KiB
23/01/14 15:42:44 WARN DAGScheduler: Broadcasting large task binary with size 1922.5 KiB
23/01/14 15:42:45 WARN DAGScheduler: Broadcasting large task binary with size 1922.9 KiB
23/01/14 15:42:45 WAR

                                                                                

23/01/14 15:42:59 WARN DAGScheduler: Broadcasting large task binary with size 1015.1 KiB
23/01/14 15:42:59 WARN DAGScheduler: Broadcasting large task binary with size 1026.0 KiB
23/01/14 15:42:59 WARN DAGScheduler: Broadcasting large task binary with size 1026.5 KiB
23/01/14 15:42:59 WARN DAGScheduler: Broadcasting large task binary with size 1027.1 KiB
23/01/14 15:42:59 WARN DAGScheduler: Broadcasting large task binary with size 1028.2 KiB
23/01/14 15:42:59 WARN DAGScheduler: Broadcasting large task binary with size 1030.6 KiB
23/01/14 15:42:59 WARN DAGScheduler: Broadcasting large task binary with size 1035.3 KiB
23/01/14 15:42:59 WARN DAGScheduler: Broadcasting large task binary with size 1044.5 KiB
23/01/14 15:43:00 WARN DAGScheduler: Broadcasting large task binary with size 1063.3 KiB
23/01/14 15:43:00 WARN DAGScheduler: Broadcasting large task binary with size 1100.7 KiB
23/01/14 15:43:00 WARN DAGScheduler: Broadcasting large task binary with size 1171.1 KiB
23/01/14 15:43:00 WAR

In [41]:
# Print best depth, bins, maxIter, impurity and losstype.
print("Best maxDepth Param: ", model_gbtr.bestModel._java_obj.getMaxDepth())
print("Best maxBins Param: ", model_gbtr.bestModel._java_obj.getMaxBins())
print("Best maxIter Param: ", model_gbtr.bestModel._java_obj.getMaxIter())
print("Best impurity Param: ", model_gbtr.bestModel._java_obj.getImpurity())
print("Best lossType Param: ", model_gbtr.bestModel._java_obj.getLossType())

Best maxDepth Param:  10
Best maxBins Param:  32
Best maxIter Param:  15
Best impurity Param:  variance
Best lossType Param:  squared


In [42]:
# Predict.
prediction_gbtr = model_gbtr.transform(test)

prediction_gbtr.show(5)

[Stage 11272:>                                                      (0 + 1) / 1]

+--------------------+-------------+------------------+
|      input_features|trip_duration|        prediction|
+--------------------+-------------+------------------+
|[1.0,1.0,-74.1793...|         2975|1798.9957536456566|
|[1.0,1.0,-74.0280...|         1959|1038.0020308849473|
|[1.0,1.0,-74.0279...|          834| 681.4740747461232|
|[1.0,1.0,-74.0180...|          306|  583.134764486287|
|[1.0,1.0,-74.0179...|         1054|1357.8441686430988|
+--------------------+-------------+------------------+
only showing top 5 rows



                                                                                

In [43]:
# Evaluator for testing data for gradient boosted trees regression model.
test_pred_evaluator_gbtr = RegressionEvaluator(predictionCol = "prediction", labelCol = "trip_duration", metricName = "r2")

# Print RMSE and accuracy of the model on testing data for gradient boosted trees regression.
print("RMSE: ", gbt_evaluator.evaluate(prediction_gbtr))
print("Coefficient of Determination(Accuracy on Test Data): ", str(round(test_pred_evaluator_gbtr.evaluate(prediction_gbtr)*100, 4)), "%")


                                                                                

RMSE:  361.136102638509


[Stage 11274:>                                                      (0 + 1) / 1]

Coefficient of Determination(Accuracy on Test Data):  74.1248 %


                                                                                