# Install required libraries

In [1]:
# Mount Google Drive at /content/drive (Colab-only API); the project paths
# configured later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip3 install pyspark jupyter_contrib_nbextensions sparkmonitor
!jupyter contrib nbextension install --user
!jupyter nbextension enable varInspector/main
!jupyter nbextension install sparkmonitor --py --user
!jupyter nbextension enable  sparkmonitor --py --user
!jupyter serverextension enable --py --system sparkmonitor  --user
!jupyter lab build
!ipython profile create

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 69 kB/s 
[?25hCollecting jupyter_contrib_nbextensions
  Downloading jupyter_contrib_nbextensions-0.5.1-py2.py3-none-any.whl (20.9 MB)
[K     |████████████████████████████████| 20.9 MB 1.2 MB/s 
[?25hCollecting sparkmonitor
  Downloading sparkmonitor-1.1.1-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 59.3 MB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 48.1 MB/s 
Collecting jupyter-latex-envs>=1.3.8
  Downloading jupyter_latex_envs-1.4.6.tar.gz (861 kB)
[K     |████████████████████████████████| 861 kB 49.4 MB/s 
[?25hCollecting jupyter-contrib-core>=0.3.3
  Downloading jupyter_contrib_core-0.3.3-py2.py3-none-any.whl (18 kB)
Collecting jupyter-nbextensions-configurator>=0.4.0
  Downloading jupyter_nbextensions_configurator-0.4.1.tar.gz (479 kB)


# Import required libraries

In [8]:
import os
import pyspark
import pyspark.sql.functions as F
# Project root on the mounted Google Drive; every other location is a sub-folder of it.
root_folder = "/content/drive/MyDrive/MAST30024/"

# Derive the data, SQL-output and plot folders from the root in one pass.
data_dir, SQLOutput_dir, plot_dir = (
    os.path.join(root_folder, sub_folder)
    for sub_folder in ("Data", "code/SparkSQL_Output", "Plots")
)

In [9]:
import pyspark.sql.functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [10]:
from pyspark.sql import SparkSession
import warnings
# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")

# Reuse the active Spark session if there is one, otherwise start a new one;
# all Spark jobs in this notebook run through it.
spark = SparkSession.builder.getOrCreate()

# Loading data

In [45]:
def _read_model_csv(file_name):
    """Read <data_dir>/Model/<file_name> as CSV (header row, inferred types) and drop the "Key" column."""
    csv_path = os.path.join(data_dir, "Model", file_name)
    reader = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
    )
    return reader.load(csv_path).drop("Key")


# Both splits are read with exactly the same options.
train = _read_model_csv("train1.csv")
test = _read_model_csv("test1.csv")

In [42]:
# Columns assembled into the 'conti_features' vector and passed to the univariate selector.
# NOTE(review): 'payment_type' looks like a category code rather than a measurement --
# confirm it belongs in this list.
continuous_columns = [
    'passenger_count',
    'trip_distance',
    'payment_type',
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount',
    'tempMax',
    'tempMin',
    'tempAvg',
    'tempDeparture',
    'hdd',
    'cdd',
    'precipitation',
    'newSnow',
    'snowDepth',
]

In [49]:
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import VectorAssembler

# NOTE(review): no earlier cell in this notebook defines `categorical_columns`, so a fresh
# "Restart & Run All" would stop here with a NameError. When the name is missing, fall back
# to every input column that is neither in `continuous_columns` nor the label ("duration").
# That fallback is an assumption about the intended feature set -- confirm it and replace it
# with an explicit list defined next to `continuous_columns`.
if "categorical_columns" not in globals():
    categorical_columns = [
        column for column in train.columns
        if column not in continuous_columns and column != "duration"
    ]

# One assembler per feature group, each with its own name so neither shadows the other.
conti_assembler = VectorAssembler(inputCols=continuous_columns, outputCol='conti_features')
cate_assembler = VectorAssembler(inputCols=categorical_columns, outputCol='cate_features')

# Add both vector columns to each split: 'conti_features' first, then 'cate_features'.
train_trans = cate_assembler.transform(conti_assembler.transform(train))
test_trans = cate_assembler.transform(conti_assembler.transform(test))

In [50]:
# Preview the first 5 training rows, now including the assembled
# 'conti_features' and 'cate_features' vector columns.
train_trans.show(5)

+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------+-------+-------+-------+-------------+----+----+-------------+-------+---------+---------+-------+-----+----+-----------+--------------------+--------------------+
|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|duration|tempMax|tempMin|tempAvg|tempDeparture| hdd| cdd|precipitation|newSnow|snowDepth|DayofWeek|Weekend|Month|Hour|WorkingHour|      conti_features|       cate_features|
+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------+-------+-------+-------+-------------+----+----+-------------+-------+---------+---------+-------+-----+----+-----------+--------------------+-----

In [53]:
from pyspark.ml.feature import UnivariateFeatureSelector

# Configure the selector in one chained expression: continuous features scored against the
# continuous label "duration", selection mode "fpr" with a threshold of 0.05.
selector_estimator = (
    UnivariateFeatureSelector(
        featuresCol="conti_features",
        outputCol="contiselectedFeatures",
        labelCol="duration",
        selectionMode="fpr",
    )
    .setFeatureType("continuous")
    .setLabelType("continuous")
    .setSelectionThreshold(0.05)
)

# `selector1` is the fitted model (fitted on the training split only).
selector1 = selector_estimator.fit(train_trans)

In [54]:
# Apply the fitted selector to both splits; each result gains a 'contiselectedFeatures' column.
selected_train, selected_test = (
    selector1.transform(frame) for frame in (train_trans, test_trans)
)
selected_train.show(5)

+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------+-------+-------+-------+-------------+----+----+-------------+-------+---------+---------+-------+-----+----+-----------+--------------------+--------------------+---------------------+
|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|duration|tempMax|tempMin|tempAvg|tempDeparture| hdd| cdd|precipitation|newSnow|snowDepth|DayofWeek|Weekend|Month|Hour|WorkingHour|      conti_features|       cate_features|contiselectedFeatures|
+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------+-------+-------+-------+-------------+----+----+-------------+-------+---------+---------+-------+-----

In [55]:
# Preview the first 5 test rows after feature selection.
selected_test.show(5)

+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------+-------+-------+-------+-------------+----+---+-------------+-------+---------+---------+-------+-----+----+-----------+--------------------+--------------------+---------------------+
|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|duration|tempMax|tempMin|tempAvg|tempDeparture| hdd|cdd|precipitation|newSnow|snowDepth|DayofWeek|Weekend|Month|Hour|WorkingHour|      conti_features|       cate_features|contiselectedFeatures|
+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------+-------+-------+-------+-------------+----+---+-------------+-------+---------+---------+-------+-----+--

# Training the model

In [57]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

# Join the categorical vector and the selected continuous vector into the single
# 'concat_features' column the regressor consumes. No VectorIndexer or Pipeline is
# applied in this cell, although both are imported.
featuresCol = ["cate_features", "contiselectedFeatures"]
assembler = VectorAssembler(inputCols=featuresCol, outputCol='concat_features')

# NOTE(review): this rebinds `train_trans` / `test_trans`, which earlier cells also assign;
# re-running those earlier cells afterwards will see these frames instead.
train_trans, test_trans = (
    assembler.transform(frame) for frame in (selected_train, selected_test)
)

# Gradient-boosted trees regressor: 30 boosting iterations, absolute-error loss,
# predicting "duration".
gbt = GBTRegressor(
    featuresCol="concat_features",
    labelCol="duration",
    maxIter=30,
    lossType="absolute",
)

# Fit on the training split, then score the test split (adds a 'prediction' column).
model = gbt.fit(train_trans)
predict = model.transform(test_trans)

In [58]:
# Only the 'prediction' column is read from this frame afterwards, so select it before
# converting: toPandas() collects the data to the driver, and the remaining columns
# (including the assembled feature vectors) would be transferred for nothing.
y_predict = predict.select("prediction").toPandas()

In [None]:
# `y_predict` is rebound here from a pandas DataFrame to a plain list of predictions.
# The guard keeps the cell safe to re-run: without it a second run would index the
# list with a string and raise TypeError.
if not isinstance(y_predict, list):
    y_predict = list(y_predict["prediction"])

In [65]:
from sklearn.metrics import mean_squared_log_error

# Collect the true test durations to the driver as a flat Python list.
# NOTE(review): assumes these rows come back in the same order as the rows of `predict` -- confirm.
y_test = [row['duration'] for row in test.select('duration').collect()]

In [66]:
# mean_squared_log_error cannot be computed on negative values, so clamp negative
# predictions to 0. This is a single in-place pass; the previous version called
# list.index() for every negative value, which rescans the list each time (quadratic).
# A conditional expression is used instead of max() because the earlier
# `from pyspark.sql.functions import *` rebinds the name `max` to Spark's column function.
y_predict[:] = [0 if value < 0 else value for value in y_predict]

In [67]:
# GBT maxIter 30 loss absolute with feature selection
# Mean squared logarithmic error of the clamped predictions against the true test durations.
mean_squared_log_error(y_test, y_predict )

0.025886957201085508

In [None]:
# GBT maxIter 30 loss absolute without feature selection
# NOTE(review): this call is identical to the previous cell's and nothing here turns feature
# selection off, so the recorded value presumably came from re-running the earlier cells in a
# different configuration. A fresh top-to-bottom run would just repeat the previous result --
# confirm and make the "without selection" pipeline explicit.
mean_squared_log_error(y_test, y_predict )

0.025836632066295415

## Performance does not change much with or without feature selection
The mean squared log error is about 0.02589 with feature selection and 0.02584 without it, so we continue without feature selection.