In [1]:
!pip install pyspark



In [8]:
# Importing libraries

from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [9]:
# Start a local Spark session and load the preprocessed taxi trips.

spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Read every CSV file matched by the glob; the first row of each file holds
# the column names and the column types are inferred from the values.
df = (
    spark.read
    .option("inferSchema", "true")
    .csv("../Preprocessing/NYC Taxi Duration Preprocessed/*.csv", header=True)
)
df.printSchema()

root
 |-- vendor_id: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- store_and_fwd_flag: integer (nullable = true)
 |-- trip_duration: integer (nullable = true)
 |-- distance: double (nullable = true)
 |-- week_day: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- quarter_of_year: integer (nullable = true)
 |-- hour: integer (nullable = true)



[Stage 8:>                                                          (0 + 1) / 1]                                                                                

In [10]:
# Preview the first rows of the loaded DataFrame as a sanity check.
df.show()

+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+
|vendor_id|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|store_and_fwd_flag|trip_duration| distance|week_day|year|month|quarter_of_year|hour|
+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+
|        2|              1|      -73.982155|      40.767937|        -73.96463|       40.765602|                 0|          455|2.4444735|     Mon|2016|    3|              1|  17|
|        1|              1|      -73.980415|      40.738564|        -73.99948|        40.73115|                 0|          663|2.6599078|     Sun|2016|    6|              2|   0|
|        2|              1|       -73.97903|       40.76394|        -74.00533|       40.710087|     

# Preparing df for ML models

In [11]:
# Convert week day string column into index column.
# Drop any 'week_day_index' left over from a previous run of this cell first:
# DataFrame.drop is a no-op when the column is absent, and without it a re-run
# fails because StringIndexer will not overwrite an existing output column.
# NOTE(review): StringIndexer orders labels by frequency by default, not by
# calendar order, so the index value carries no ordinal meaning — confirm this
# is acceptable for the models that consume it as a numeric feature.
df_without_index = df.drop('week_day_index')
label_index = StringIndexer(inputCol = 'week_day', outputCol = 'week_day_index')
df = label_index.fit(df_without_index).transform(df_without_index)
df.show()

+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+--------------+
|vendor_id|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|store_and_fwd_flag|trip_duration| distance|week_day|year|month|quarter_of_year|hour|week_day_index|
+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+--------------+
|        2|              1|      -73.982155|      40.767937|        -73.96463|       40.765602|                 0|          455|2.4444735|     Mon|2016|    3|              1|  17|           6.0|
|        1|              1|      -73.980415|      40.738564|        -73.99948|        40.73115|                 0|          663|2.6599078|     Sun|2016|    6|              2|   0|           4.0|
|        2|              

In [12]:
# Collect the names of all numeric (int / double) columns as model inputs.
numeric_types = ("int", "double")
input_col = [name for name, dtype in df.dtypes if dtype in numeric_types]
# The label column must not be part of the feature vector.
input_col.remove("trip_duration")

# Assemble the input columns into a single vector column.
input_features = VectorAssembler(
    inputCols = input_col,
    outputCol = "input_features",
)
input_column = input_features.transform(df)
input_column.select("input_features").show()

23/01/15 23:58:49 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+--------------------+
|      input_features|
+--------------------+
|[2.0,1.0,-73.9821...|
|[1.0,1.0,-73.9804...|
|[2.0,1.0,-73.9790...|
|[1.0,4.0,-73.9690...|
|[2.0,1.0,-73.9692...|
|[1.0,1.0,-73.9994...|
|[2.0,1.0,-73.9826...|
|[2.0,4.0,-73.9915...|
|[2.0,2.0,-73.9629...|
|[2.0,1.0,-73.9921...|
|[1.0,1.0,-74.0039...|
|[1.0,1.0,-73.9803...|
|[2.0,1.0,-73.9795...|
|[1.0,1.0,-73.9935...|
|[2.0,1.0,-73.9552...|
|[2.0,1.0,-73.9565...|
|[1.0,1.0,-73.9837...|
|[2.0,1.0,-73.9942...|
|[1.0,1.0,-73.9821...|
|[1.0,1.0,-73.9709...|
+--------------------+
only showing top 20 rows



In [13]:
# Keep only the assembled feature vector and the label for modelling.
model_columns = ["input_features", "trip_duration"]
data_frame = input_column.select(*model_columns)
data_frame.show(n = 5)

+--------------------+-------------+
|      input_features|trip_duration|
+--------------------+-------------+
|[2.0,1.0,-73.9821...|          455|
|[1.0,1.0,-73.9804...|          663|
|[2.0,1.0,-73.9790...|         2124|
|[1.0,4.0,-73.9690...|          341|
|[2.0,1.0,-73.9692...|         1551|
+--------------------+-------------+
only showing top 5 rows



## Linear Regression model

In [19]:
# Split training and testing dataset for the models.
# A fixed seed keeps the 70/30 split the same on every run, so the metrics
# reported further down are reproducible.
train, test = data_frame.randomSplit([0.7, 0.3], seed = 42)

# linear regression model.
linear_regression = LinearRegression(labelCol = "trip_duration", featuresCol = "input_features")

# Create ParamGrid for Cross Validation.
# 5 * 5 * 5 * 2 = 250 parameter combinations; with 5 folds that is 1250 fits,
# so expect this search to be slow.
linear_regression_param_grid = ParamGridBuilder() \
                                .addGrid(linear_regression.regParam, [0.01, 0.1, 0.5, 1.0, 2.0]) \
                                .addGrid(linear_regression.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]) \
                                .addGrid(linear_regression.maxIter, [1, 5, 10, 20, 50]) \
                                .addGrid(linear_regression.fitIntercept, [True, False]) \
                                .build()

# evaluator with the root mean square error metric.
evaluator = RegressionEvaluator(predictionCol = "prediction", labelCol = "trip_duration", metricName = "rmse")

# cross validator with total 5 folds.
# The seed fixes the random fold assignment so the selected parameters do not
# change between runs.
cross_validator = CrossValidator(estimator = linear_regression,
                      estimatorParamMaps = linear_regression_param_grid,
                      evaluator = evaluator,
                      numFolds = 5,
                      seed = 42)

In [20]:
# Training a linear regression model.
# Runs the cross-validated grid search on the training split; the winning
# model is read back later through `model.bestModel`.
model = cross_validator.fit(train)

                                                                                

In [21]:
# Print best parameter of the model.
# The tuned values are read through the model's public param getters instead
# of the private py4j handle (`_java_obj`), which is an implementation detail.
best_linear_model = model.bestModel
print("Best Regression Param: ", best_linear_model.getRegParam())
print("Best Elastic Net Param: ", best_linear_model.getElasticNetParam())
print("Best maxIter Param: ", best_linear_model.getMaxIter())
print("Best fitIntercept Param: ", best_linear_model.getFitIntercept())
print("Intercept: ", best_linear_model.intercept)

# R^2 on the training data (a goodness-of-fit measure, not a classification
# accuracy) and coefficients of the linear equation.
print("Coefficient of Determination(Accuracy on Training data): ", str(round(best_linear_model.summary.r2*100, 2)), "%")
print("Coefficients: ", best_linear_model.coefficients)

Best Regression Param:  0.01
Best Elastic Net Param:  0.25
Best maxIter Param:  20
Best fitIntercept Param:  True
Intercept:  -19235.005975346867
Coefficient of Determination(Accuracy on Training data):  50.12 %
Coefficients:  [0.749325910707641,2.2572463385882204,-325.336075706018,-217.67552819562678,-536.4434332987057,-862.7021553118498,75.32756296525925,82.03011447964624,0.0,18.879690528403284,22.082995444444144,5.383600297732522,-12.106268796570658]


In [22]:
# Score the held-out test split with the best linear regression model.
prediction = model.transform(test)

# Show a few rows to compare the label with the prediction.
prediction.show(n = 5)

[Stage 7523:>                                                       (0 + 1) / 1]

+--------------------+-------------+------------------+
|      input_features|trip_duration|        prediction|
+--------------------+-------------+------------------+
|[1.0,1.0,-74.2310...|         1880| 4318.493897415774|
|[1.0,1.0,-74.0215...|          340| 885.8448166915368|
|[1.0,1.0,-74.0184...|          667|1244.3454351175642|
|[1.0,1.0,-74.0180...|         3553|1487.2242294263888|
|[1.0,1.0,-74.0179...|         1054|1244.1507614153415|
+--------------------+-------------+------------------+
only showing top 5 rows



                                                                                

In [23]:
# R^2 evaluator for the predictions on the testing data.
test_pred_evaluator = RegressionEvaluator(
    metricName = "r2",
    labelCol = "trip_duration",
    predictionCol = "prediction",
)

# Report RMSE (via the existing rmse `evaluator`) and R^2 on the testing data.
test_rmse = evaluator.evaluate(prediction)
print("RMSE: ", test_rmse)
test_r2_percent = round(test_pred_evaluator.evaluate(prediction)*100, 4)
print("Coefficient of Determination(Accuracy on Test Data): ", str(test_r2_percent), "%")


                                                                                

RMSE:  500.76489579072614


[Stage 7525:>                                                       (0 + 1) / 1]

Coefficient of Determination(Accuracy on Test Data):  50.3203 %


                                                                                

## Gradient Boosted Trees Regression

In [25]:
# Gradient Boosted Trees regression instance.
# The seed fixes the estimator's internal randomness so repeated runs build
# the same trees.
gbtr = GBTRegressor(featuresCol="input_features", labelCol="trip_duration", seed=42)

# Create ParamGrid for Cross Validation of gradient boosted trees regression.
# 2 * 3 * 2 = 12 parameter combinations; with 5 folds that is 60 fits.
gradient_boosted_trees_param_grid = ParamGridBuilder() \
                                .addGrid(gbtr.maxDepth, [5, 10]) \
                                .addGrid(gbtr.maxIter, [5, 10, 20]) \
                                .addGrid(gbtr.maxBins, [32, 64]) \
                                .build()

# evaluator with the root mean square error metric for gbtr.
gbt_evaluator = RegressionEvaluator(labelCol="trip_duration", predictionCol="prediction", metricName="rmse")

# Cross validator for gbtr with 5 total folds.
# The seed fixes the random fold assignment so the selected parameters do not
# change between runs.
cross_validator_gbtr = CrossValidator(estimator = gbtr,
                      estimatorParamMaps = gradient_boosted_trees_param_grid,
                      evaluator = gbt_evaluator,
                      numFolds = 5,
                      seed = 42)

In [26]:
# Training a gradient boosted trees regression model.
# Runs the cross-validated grid search on the training split; the winning
# model is read back later through `model_gbtr.bestModel`.
model_gbtr = cross_validator_gbtr.fit(train)

                                                                                

23/01/16 00:12:46 WARN DAGScheduler: Broadcasting large task binary with size 1028.4 KiB
23/01/16 00:12:46 WARN DAGScheduler: Broadcasting large task binary with size 1044.1 KiB
23/01/16 00:12:46 WARN DAGScheduler: Broadcasting large task binary with size 1044.6 KiB
23/01/16 00:12:46 WARN DAGScheduler: Broadcasting large task binary with size 1045.1 KiB
23/01/16 00:12:46 WARN DAGScheduler: Broadcasting large task binary with size 1046.4 KiB
23/01/16 00:12:47 WARN DAGScheduler: Broadcasting large task binary with size 1048.7 KiB
23/01/16 00:12:47 WARN DAGScheduler: Broadcasting large task binary with size 1053.2 KiB
23/01/16 00:12:47 WARN DAGScheduler: Broadcasting large task binary with size 1062.7 KiB
23/01/16 00:12:47 WARN DAGScheduler: Broadcasting large task binary with size 1081.1 KiB
23/01/16 00:12:47 WARN DAGScheduler: Broadcasting large task binary with size 1115.3 KiB
23/01/16 00:12:47 WARN DAGScheduler: Broadcasting large task binary with size 1177.8 KiB
23/01/16 00:12:47 WAR

23/01/16 00:13:07 WARN DAGScheduler: Broadcasting large task binary with size 1187.6 KiB
23/01/16 00:13:07 WARN DAGScheduler: Broadcasting large task binary with size 1190.0 KiB
23/01/16 00:13:07 WARN DAGScheduler: Broadcasting large task binary with size 1194.5 KiB
23/01/16 00:13:07 WARN DAGScheduler: Broadcasting large task binary with size 1204.0 KiB
23/01/16 00:13:07 WARN DAGScheduler: Broadcasting large task binary with size 1222.9 KiB
23/01/16 00:13:07 WARN DAGScheduler: Broadcasting large task binary with size 1258.7 KiB
23/01/16 00:13:07 WARN DAGScheduler: Broadcasting large task binary with size 1325.5 KiB
23/01/16 00:13:08 WARN DAGScheduler: Broadcasting large task binary with size 1337.2 KiB
23/01/16 00:13:08 WARN DAGScheduler: Broadcasting large task binary with size 1337.7 KiB
23/01/16 00:13:08 WARN DAGScheduler: Broadcasting large task binary with size 1338.3 KiB
23/01/16 00:13:08 WARN DAGScheduler: Broadcasting large task binary with size 1339.4 KiB
23/01/16 00:13:08 WAR

23/01/16 00:13:17 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:13:17 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:13:17 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:13:17 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/01/16 00:13:17 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/01/16 00:13:17 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/01/16 00:13:17 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/01/16 00:13:18 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/01/16 00:13:18 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/01/16 00:13:18 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/01/16 00:13:18 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/01/16 00:13:18 WARN DAGScheduler: Broadcasting larg

23/01/16 00:13:33 WARN DAGScheduler: Broadcasting large task binary with size 1877.6 KiB
23/01/16 00:13:34 WARN DAGScheduler: Broadcasting large task binary with size 1882.3 KiB
23/01/16 00:13:34 WARN DAGScheduler: Broadcasting large task binary with size 1891.4 KiB
23/01/16 00:13:34 WARN DAGScheduler: Broadcasting large task binary with size 1909.2 KiB
23/01/16 00:13:34 WARN DAGScheduler: Broadcasting large task binary with size 1942.1 KiB
23/01/16 00:13:34 WARN DAGScheduler: Broadcasting large task binary with size 1999.2 KiB
23/01/16 00:13:34 WARN DAGScheduler: Broadcasting large task binary with size 2007.3 KiB
23/01/16 00:13:34 WARN DAGScheduler: Broadcasting large task binary with size 2007.8 KiB
23/01/16 00:13:34 WARN DAGScheduler: Broadcasting large task binary with size 2008.4 KiB
23/01/16 00:13:34 WARN DAGScheduler: Broadcasting large task binary with size 2009.6 KiB
23/01/16 00:13:35 WARN DAGScheduler: Broadcasting large task binary with size 2011.9 KiB
23/01/16 00:13:35 WAR

                                                                                

23/01/16 00:14:28 WARN DAGScheduler: Broadcasting large task binary with size 1001.9 KiB
23/01/16 00:14:29 WARN DAGScheduler: Broadcasting large task binary with size 1013.1 KiB
23/01/16 00:14:29 WARN DAGScheduler: Broadcasting large task binary with size 1013.5 KiB
23/01/16 00:14:29 WARN DAGScheduler: Broadcasting large task binary with size 1014.1 KiB
23/01/16 00:14:29 WARN DAGScheduler: Broadcasting large task binary with size 1015.3 KiB
23/01/16 00:14:29 WARN DAGScheduler: Broadcasting large task binary with size 1017.6 KiB
23/01/16 00:14:29 WARN DAGScheduler: Broadcasting large task binary with size 1022.1 KiB
23/01/16 00:14:29 WARN DAGScheduler: Broadcasting large task binary with size 1031.5 KiB
23/01/16 00:14:29 WARN DAGScheduler: Broadcasting large task binary with size 1050.1 KiB
23/01/16 00:14:29 WARN DAGScheduler: Broadcasting large task binary with size 1085.9 KiB
23/01/16 00:14:29 WARN DAGScheduler: Broadcasting large task binary with size 1151.5 KiB
23/01/16 00:14:30 WAR

23/01/16 00:14:50 WARN DAGScheduler: Broadcasting large task binary with size 1163.5 KiB
23/01/16 00:14:50 WARN DAGScheduler: Broadcasting large task binary with size 1164.3 KiB
23/01/16 00:14:50 WARN DAGScheduler: Broadcasting large task binary with size 1165.4 KiB
23/01/16 00:14:50 WARN DAGScheduler: Broadcasting large task binary with size 1167.7 KiB
23/01/16 00:14:50 WARN DAGScheduler: Broadcasting large task binary with size 1172.3 KiB
23/01/16 00:14:50 WARN DAGScheduler: Broadcasting large task binary with size 1181.6 KiB
23/01/16 00:14:50 WARN DAGScheduler: Broadcasting large task binary with size 1199.5 KiB
23/01/16 00:14:50 WARN DAGScheduler: Broadcasting large task binary with size 1234.4 KiB
23/01/16 00:14:50 WARN DAGScheduler: Broadcasting large task binary with size 1302.2 KiB
23/01/16 00:14:51 WARN DAGScheduler: Broadcasting large task binary with size 1316.3 KiB
23/01/16 00:14:51 WARN DAGScheduler: Broadcasting large task binary with size 1316.8 KiB
23/01/16 00:14:51 WAR

23/01/16 00:15:00 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:15:00 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:15:00 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:15:00 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:15:00 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:15:00 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:15:00 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:15:01 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:15:01 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:15:01 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:15:01 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
23/01/16 00:15:01 WARN DAGScheduler: Broadcasting larg

23/01/16 00:15:16 WARN DAGScheduler: Broadcasting large task binary with size 1898.4 KiB
23/01/16 00:15:16 WARN DAGScheduler: Broadcasting large task binary with size 1899.0 KiB
23/01/16 00:15:16 WARN DAGScheduler: Broadcasting large task binary with size 1900.3 KiB
23/01/16 00:15:16 WARN DAGScheduler: Broadcasting large task binary with size 1902.6 KiB
23/01/16 00:15:16 WARN DAGScheduler: Broadcasting large task binary with size 1907.3 KiB
23/01/16 00:15:16 WARN DAGScheduler: Broadcasting large task binary with size 1916.6 KiB
23/01/16 00:15:16 WARN DAGScheduler: Broadcasting large task binary with size 1935.3 KiB
23/01/16 00:15:17 WARN DAGScheduler: Broadcasting large task binary with size 1970.8 KiB
23/01/16 00:15:17 WARN DAGScheduler: Broadcasting large task binary with size 2035.6 KiB
23/01/16 00:15:17 WARN DAGScheduler: Broadcasting large task binary with size 2044.0 KiB
23/01/16 00:15:17 WARN DAGScheduler: Broadcasting large task binary with size 2044.5 KiB
23/01/16 00:15:17 WAR

                                                                                

23/01/16 00:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1017.6 KiB
23/01/16 00:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1032.6 KiB
23/01/16 00:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1033.1 KiB
23/01/16 00:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1033.7 KiB
23/01/16 00:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1034.9 KiB
23/01/16 00:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1037.2 KiB
23/01/16 00:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1041.7 KiB
23/01/16 00:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1051.2 KiB
23/01/16 00:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1069.3 KiB
23/01/16 00:16:11 WARN DAGScheduler: Broadcasting large task binary with size 1103.8 KiB
23/01/16 00:16:12 WARN DAGScheduler: Broadcasting large task binary with size 1168.3 KiB
23/01/16 00:16:12 WAR

23/01/16 00:16:32 WARN DAGScheduler: Broadcasting large task binary with size 1180.3 KiB
23/01/16 00:16:32 WARN DAGScheduler: Broadcasting large task binary with size 1180.9 KiB
23/01/16 00:16:32 WARN DAGScheduler: Broadcasting large task binary with size 1182.2 KiB
23/01/16 00:16:32 WARN DAGScheduler: Broadcasting large task binary with size 1184.5 KiB
23/01/16 00:16:32 WARN DAGScheduler: Broadcasting large task binary with size 1189.0 KiB
23/01/16 00:16:32 WARN DAGScheduler: Broadcasting large task binary with size 1198.3 KiB
23/01/16 00:16:32 WARN DAGScheduler: Broadcasting large task binary with size 1216.7 KiB
23/01/16 00:16:32 WARN DAGScheduler: Broadcasting large task binary with size 1251.8 KiB
23/01/16 00:16:33 WARN DAGScheduler: Broadcasting large task binary with size 1317.9 KiB
23/01/16 00:16:33 WARN DAGScheduler: Broadcasting large task binary with size 1326.7 KiB
23/01/16 00:16:33 WARN DAGScheduler: Broadcasting large task binary with size 1327.2 KiB
23/01/16 00:16:33 WAR

23/01/16 00:16:42 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:16:42 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:16:42 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:16:43 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:16:43 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:16:43 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:16:43 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:16:43 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:16:43 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:16:43 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:16:43 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:16:43 WARN DAGScheduler: Broadcasting larg

23/01/16 00:16:59 WARN DAGScheduler: Broadcasting large task binary with size 1877.8 KiB
23/01/16 00:16:59 WARN DAGScheduler: Broadcasting large task binary with size 1878.4 KiB
23/01/16 00:16:59 WARN DAGScheduler: Broadcasting large task binary with size 1879.5 KiB
23/01/16 00:16:59 WARN DAGScheduler: Broadcasting large task binary with size 1881.8 KiB
23/01/16 00:16:59 WARN DAGScheduler: Broadcasting large task binary with size 1886.4 KiB
23/01/16 00:16:59 WARN DAGScheduler: Broadcasting large task binary with size 1895.8 KiB
23/01/16 00:16:59 WARN DAGScheduler: Broadcasting large task binary with size 1913.8 KiB
23/01/16 00:16:59 WARN DAGScheduler: Broadcasting large task binary with size 1946.8 KiB
23/01/16 00:17:00 WARN DAGScheduler: Broadcasting large task binary with size 2008.0 KiB
23/01/16 00:17:00 WARN DAGScheduler: Broadcasting large task binary with size 2015.8 KiB
23/01/16 00:17:00 WARN DAGScheduler: Broadcasting large task binary with size 2016.2 KiB
23/01/16 00:17:00 WAR

                                                                                

23/01/16 00:17:54 WARN DAGScheduler: Broadcasting large task binary with size 1009.7 KiB
23/01/16 00:17:54 WARN DAGScheduler: Broadcasting large task binary with size 1021.0 KiB
23/01/16 00:17:54 WARN DAGScheduler: Broadcasting large task binary with size 1021.5 KiB
23/01/16 00:17:54 WARN DAGScheduler: Broadcasting large task binary with size 1022.1 KiB
23/01/16 00:17:54 WARN DAGScheduler: Broadcasting large task binary with size 1023.3 KiB
23/01/16 00:17:54 WARN DAGScheduler: Broadcasting large task binary with size 1025.6 KiB
23/01/16 00:17:54 WARN DAGScheduler: Broadcasting large task binary with size 1030.1 KiB
23/01/16 00:17:55 WARN DAGScheduler: Broadcasting large task binary with size 1039.3 KiB
23/01/16 00:17:55 WARN DAGScheduler: Broadcasting large task binary with size 1057.2 KiB
23/01/16 00:17:55 WARN DAGScheduler: Broadcasting large task binary with size 1092.6 KiB
23/01/16 00:17:55 WARN DAGScheduler: Broadcasting large task binary with size 1160.4 KiB
23/01/16 00:17:55 WAR

23/01/16 00:18:27 WARN DAGScheduler: Broadcasting large task binary with size 1173.2 KiB
23/01/16 00:18:27 WARN DAGScheduler: Broadcasting large task binary with size 1173.8 KiB
23/01/16 00:18:27 WARN DAGScheduler: Broadcasting large task binary with size 1175.0 KiB
23/01/16 00:18:28 WARN DAGScheduler: Broadcasting large task binary with size 1177.3 KiB
23/01/16 00:18:28 WARN DAGScheduler: Broadcasting large task binary with size 1181.9 KiB
23/01/16 00:18:28 WARN DAGScheduler: Broadcasting large task binary with size 1191.4 KiB
23/01/16 00:18:28 WARN DAGScheduler: Broadcasting large task binary with size 1211.0 KiB
23/01/16 00:18:28 WARN DAGScheduler: Broadcasting large task binary with size 1247.8 KiB
23/01/16 00:18:28 WARN DAGScheduler: Broadcasting large task binary with size 1317.1 KiB
23/01/16 00:18:29 WARN DAGScheduler: Broadcasting large task binary with size 1327.0 KiB
23/01/16 00:18:29 WARN DAGScheduler: Broadcasting large task binary with size 1327.5 KiB
23/01/16 00:18:29 WAR

23/01/16 00:18:44 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:18:45 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:18:45 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:18:45 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:18:45 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:18:45 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:18:46 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:18:46 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:18:46 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:18:46 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:18:46 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:18:46 WARN DAGScheduler: Broadcasting larg

23/01/16 00:19:03 WARN DAGScheduler: Broadcasting large task binary with size 1869.1 KiB
23/01/16 00:19:03 WARN DAGScheduler: Broadcasting large task binary with size 1869.6 KiB
23/01/16 00:19:03 WARN DAGScheduler: Broadcasting large task binary with size 1870.2 KiB
23/01/16 00:19:03 WARN DAGScheduler: Broadcasting large task binary with size 1871.3 KiB
23/01/16 00:19:04 WARN DAGScheduler: Broadcasting large task binary with size 1873.7 KiB
23/01/16 00:19:04 WARN DAGScheduler: Broadcasting large task binary with size 1878.3 KiB
23/01/16 00:19:04 WARN DAGScheduler: Broadcasting large task binary with size 1887.7 KiB
23/01/16 00:19:04 WARN DAGScheduler: Broadcasting large task binary with size 1906.7 KiB
23/01/16 00:19:04 WARN DAGScheduler: Broadcasting large task binary with size 1942.9 KiB
23/01/16 00:19:04 WARN DAGScheduler: Broadcasting large task binary with size 2009.2 KiB
23/01/16 00:19:04 WARN DAGScheduler: Broadcasting large task binary with size 2018.0 KiB
23/01/16 00:19:04 WAR

                                                                                

23/01/16 00:19:59 WARN DAGScheduler: Broadcasting large task binary with size 1020.0 KiB
23/01/16 00:19:59 WARN DAGScheduler: Broadcasting large task binary with size 1032.2 KiB
23/01/16 00:19:59 WARN DAGScheduler: Broadcasting large task binary with size 1032.7 KiB
23/01/16 00:19:59 WARN DAGScheduler: Broadcasting large task binary with size 1033.3 KiB
23/01/16 00:19:59 WARN DAGScheduler: Broadcasting large task binary with size 1034.6 KiB
23/01/16 00:19:59 WARN DAGScheduler: Broadcasting large task binary with size 1036.8 KiB
23/01/16 00:19:59 WARN DAGScheduler: Broadcasting large task binary with size 1041.5 KiB
23/01/16 00:19:59 WARN DAGScheduler: Broadcasting large task binary with size 1050.8 KiB
23/01/16 00:20:00 WARN DAGScheduler: Broadcasting large task binary with size 1069.3 KiB
23/01/16 00:20:00 WARN DAGScheduler: Broadcasting large task binary with size 1104.5 KiB
23/01/16 00:20:00 WARN DAGScheduler: Broadcasting large task binary with size 1172.0 KiB
23/01/16 00:20:00 WAR

23/01/16 00:20:20 WARN DAGScheduler: Broadcasting large task binary with size 1183.6 KiB
23/01/16 00:20:20 WARN DAGScheduler: Broadcasting large task binary with size 1184.3 KiB
23/01/16 00:20:20 WARN DAGScheduler: Broadcasting large task binary with size 1185.4 KiB
23/01/16 00:20:20 WARN DAGScheduler: Broadcasting large task binary with size 1187.8 KiB
23/01/16 00:20:20 WARN DAGScheduler: Broadcasting large task binary with size 1192.3 KiB
23/01/16 00:20:20 WARN DAGScheduler: Broadcasting large task binary with size 1201.6 KiB
23/01/16 00:20:20 WARN DAGScheduler: Broadcasting large task binary with size 1220.2 KiB
23/01/16 00:20:21 WARN DAGScheduler: Broadcasting large task binary with size 1256.6 KiB
23/01/16 00:20:21 WARN DAGScheduler: Broadcasting large task binary with size 1326.2 KiB
23/01/16 00:20:21 WARN DAGScheduler: Broadcasting large task binary with size 1337.2 KiB
23/01/16 00:20:21 WARN DAGScheduler: Broadcasting large task binary with size 1337.6 KiB
23/01/16 00:20:21 WAR

23/01/16 00:20:30 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:20:30 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB
23/01/16 00:20:30 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:20:30 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:20:30 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:20:31 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:20:31 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:20:31 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:20:31 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:20:31 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:20:31 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
23/01/16 00:20:31 WARN DAGScheduler: Broadcasting larg

23/01/16 00:20:47 WARN DAGScheduler: Broadcasting large task binary with size 1911.7 KiB
23/01/16 00:20:47 WARN DAGScheduler: Broadcasting large task binary with size 1912.1 KiB
23/01/16 00:20:47 WARN DAGScheduler: Broadcasting large task binary with size 1912.8 KiB
23/01/16 00:20:47 WARN DAGScheduler: Broadcasting large task binary with size 1913.9 KiB
23/01/16 00:20:47 WARN DAGScheduler: Broadcasting large task binary with size 1916.2 KiB
23/01/16 00:20:47 WARN DAGScheduler: Broadcasting large task binary with size 1920.8 KiB
23/01/16 00:20:47 WARN DAGScheduler: Broadcasting large task binary with size 1930.0 KiB
23/01/16 00:20:47 WARN DAGScheduler: Broadcasting large task binary with size 1948.5 KiB
23/01/16 00:20:47 WARN DAGScheduler: Broadcasting large task binary with size 1983.9 KiB
23/01/16 00:20:47 WARN DAGScheduler: Broadcasting large task binary with size 2046.1 KiB
23/01/16 00:20:48 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB
23/01/16 00:20:48 WARN D

                                                                                

23/01/16 00:21:08 WARN DAGScheduler: Broadcasting large task binary with size 1002.5 KiB
23/01/16 00:21:08 WARN DAGScheduler: Broadcasting large task binary with size 1003.0 KiB
23/01/16 00:21:08 WARN DAGScheduler: Broadcasting large task binary with size 1003.6 KiB
23/01/16 00:21:08 WARN DAGScheduler: Broadcasting large task binary with size 1004.8 KiB
23/01/16 00:21:08 WARN DAGScheduler: Broadcasting large task binary with size 1007.2 KiB
23/01/16 00:21:08 WARN DAGScheduler: Broadcasting large task binary with size 1011.8 KiB
23/01/16 00:21:08 WARN DAGScheduler: Broadcasting large task binary with size 1021.0 KiB
23/01/16 00:21:08 WARN DAGScheduler: Broadcasting large task binary with size 1039.9 KiB
23/01/16 00:21:08 WARN DAGScheduler: Broadcasting large task binary with size 1075.4 KiB
23/01/16 00:21:09 WARN DAGScheduler: Broadcasting large task binary with size 1140.8 KiB
23/01/16 00:21:09 WARN DAGScheduler: Broadcasting large task binary with size 1150.4 KiB
23/01/16 00:21:09 WAR

23/01/16 00:21:19 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
23/01/16 00:21:19 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
23/01/16 00:21:19 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
23/01/16 00:21:19 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
23/01/16 00:21:19 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
23/01/16 00:21:20 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
23/01/16 00:21:20 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
23/01/16 00:21:20 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
23/01/16 00:21:20 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
23/01/16 00:21:20 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
23/01/16 00:21:20 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
23/01/16 00:21:20 WARN DAGScheduler: Broadcasting larg

In [27]:
# Print best depth, bins, maxIter, impurity and losstype.
# The values are read through the model's public param getters instead of the
# private py4j handle (`_java_obj`), which is an implementation detail.
best_gbt_model = model_gbtr.bestModel
print("Best maxDepth Param: ", best_gbt_model.getMaxDepth())
print("Best maxBins Param: ", best_gbt_model.getMaxBins())
print("Best maxIter Param: ", best_gbt_model.getMaxIter())
print("Best impurity Param: ", best_gbt_model.getImpurity())
print("Best lossType Param: ", best_gbt_model.getLossType())

Best maxDepth Param:  10
Best maxBins Param:  64
Best maxIter Param:  20
Best impurity Param:  variance
Best lossType Param:  squared


In [28]:
# Score the held-out test split with the best gradient boosted trees model.
prediction_gbtr = model_gbtr.transform(test)

# Show a few rows to compare the label with the prediction.
prediction_gbtr.show(n = 5)

[Stage 18730:>                                                      (0 + 1) / 1]

+--------------------+-------------+-----------------+
|      input_features|trip_duration|       prediction|
+--------------------+-------------+-----------------+
|[1.0,1.0,-74.2310...|         1880|2920.810981015648|
|[1.0,1.0,-74.0215...|          340|782.3941378648464|
|[1.0,1.0,-74.0184...|          667|952.3842015242665|
|[1.0,1.0,-74.0180...|         3553|2206.326296134674|
|[1.0,1.0,-74.0179...|         1054|1365.704378409708|
+--------------------+-------------+-----------------+
only showing top 5 rows



                                                                                

In [29]:
# R^2 evaluator for the gradient boosted trees predictions on the testing data.
test_pred_evaluator_gbtr = RegressionEvaluator(
    metricName = "r2",
    labelCol = "trip_duration",
    predictionCol = "prediction",
)

# Report RMSE (via the existing rmse `gbt_evaluator`) and R^2 on the testing data.
gbtr_test_rmse = gbt_evaluator.evaluate(prediction_gbtr)
print("RMSE: ", gbtr_test_rmse)
gbtr_test_r2_percent = round(test_pred_evaluator_gbtr.evaluate(prediction_gbtr)*100, 4)
print("Coefficient of Determination(Accuracy on Test Data): ", str(gbtr_test_r2_percent), "%")


                                                                                

RMSE:  362.0912865178779


[Stage 18732:>                                                      (0 + 1) / 1]

Coefficient of Determination(Accuracy on Test Data):  74.0255 %


                                                                                