# Merging datasets

In [57]:
root_folder = "/home/trungdc/unimelb/MAST30024/asm/mast30034_2021_s2_project_1-alexdang02-1/"
data_dir = os.path.join(root_folder, "Data")


import os
import shutil
import pyspark.sql.functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [7]:
from pyspark.sql import SparkSession
import warnings
warnings.filterwarnings("ignore")
spark = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

from pyspark.sql.functions import to_timestamp


In [16]:
sdf = spark.read.format("csv").option("header", "true").load(os.path.join(data_dir,"Trip", "yellow_concat.csv"))
sdf= sdf.where((sdf.PULocationID.isin(["264", "265"]) == False) | (sdf.PULocationID.isin(["264", "265"]) == False) ) \
.withColumn("tpep_pickup_datetime", to_timestamp(sdf.tpep_pickup_datetime, 'yyyy-MM-dd HH:mm:ss') ) \
.withColumn("tpep_dropoff_datetime", to_timestamp(sdf.tpep_dropoff_datetime, 'yyyy-MM-dd HH:mm:ss') ) \
.withColumn("trip_distance", col("trip_distance").cast("int")) \
.withColumn("duration(m)", round((col("tpep_dropoff_datetime").cast("long")  - col("tpep_pickup_datetime").cast("long"))/60)) \
.withColumn('Key', concat(col('PULocationID'),lit('-'), col('DOLocationID'))) \
.withColumn("date", date_trunc("day", col("tpep_pickup_datetime"))) \
.drop("store_and_fwd_flag", "tpep_dropoff_datetime")
sdf.show(5)

+--------+--------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+-----------+-------+-------------------+
|VendorID|tpep_pickup_datetime|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|duration(m)|    Key|               date|
+--------+--------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+-----------+-------+-------------------+
|       1| 2018-01-01 00:44:55|              1|            2|         1|         239|         140|           2|         14|  0.5|    0.5|         0|           0|                  0.3|        15.3|       18.0|239-140|2018-01-01 00:00:00|
|       1| 2018-01-01 00:20:22|              1|     

In [20]:
taxi_zones =spark.read.format("csv").option("header", "true").load(os.path.join(data_dir, "OSRM", "OSRM_plus.csv") ) \
    .withColumnRenamed("expected_total_distance", "expected_total_distance(miles)")\
    .withColumnRenamed("expected_total_duration", "expected_total_duration(s)")\
    .withColumn("expected_total_distance(miles)", round("expected_total_distance(miles)", 1)) \
    .drop("geometry", "expected_main_road", "_c0","Expected_AVG_speed") 
taxi_zones.show(5)

+------------------------------+--------------------------+---+
|expected_total_distance(miles)|expected_total_duration(s)|Key|
+------------------------------+--------------------------+---+
|                          28.8|                    2806.0|1-2|
|                          38.0|                    3823.1|1-3|
|                          20.9|                    2488.2|1-4|
|                           7.8|                    1251.2|1-5|
|                           8.2|                    1115.9|1-6|
+------------------------------+--------------------------+---+
only showing top 5 rows



In [19]:
weather = spark.read.format("csv").option("header", "true").load(os.path.join(data_dir, "Weather", "Weather processed.csv") ) \
.withColumn("date", to_timestamp("date", 'yyyy-MM-dd') ) 
weather.show(5)


+---+-------+-------+-------+-------------+----+---+-------------+-------+---------+-------------------+
|_c0|tempMax|tempMin|tempAvg|tempDeparture| hdd|cdd|precipitation|newSnow|snowDepth|               date|
+---+-------+-------+-------+-------------+----+---+-------------+-------+---------+-------------------+
|  0|   19.0|    6.8|   12.9|        -21.9|51.8|0.0|          0.0|    0.0|      0.0|2018-01-01 00:00:00|
|  1|   26.7|   12.8|   19.8|        -14.9|44.8|0.0|          0.0|    0.0|      0.0|2018-01-02 00:00:00|
|  2|   29.3|   12.5|   20.9|        -13.5|44.0|0.0|          0.0|    0.0|      0.0|2018-01-03 00:00:00|
|  3|   29.0|   19.0|   24.0|        -10.2|40.8|0.0|          0.6|    8.3|      1.0|2018-01-04 00:00:00|
|  4|   19.0|    9.2|   14.1|        -20.0|50.8|0.0|          0.1|    0.0|      7.4|2018-01-05 00:00:00|
+---+-------+-------+-------+-------------+----+---+-------------+-------+---------+-------------------+
only showing top 5 rows



In [24]:
print(f"Number of row before joined {sdf.count()}")

Number of row before joined 55387165


In [26]:
trip_taxi = sdf.join(taxi_zones, sdf.Key == taxi_zones.Key, "inner") 
merge_dataset = trip_taxi.join(weather, trip_taxi.date == weather.date, "inner") 
merge_dataset.show(5)

+--------+--------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+-----------+------+-------------------+------------------------------+--------------------------+------+---+-------+-------+-------+-------------+----+----+-------------+-------+---------+-------------------+
|VendorID|tpep_pickup_datetime|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|duration(m)|   Key|               date|expected_total_distance(miles)|expected_total_duration(s)|   Key|_c0|tempMax|tempMin|tempAvg|tempDeparture| hdd| cdd|precipitation|newSnow|snowDepth|               date|
+--------+--------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+-------

In [48]:
merge_dataset = merge_dataset.drop("date", "Key", "_c0")

In [28]:
print(f"Number of row after joined {merge_dataset.count()}")

Number of row after joined 55142680


In [32]:
for col in merge_dataset.dtypes:
    print(f"Column {col[0]} is of type {col[1]}")

Column VendorID is of type string
Column tpep_pickup_datetime is of type timestamp
Column passenger_count is of type string
Column trip_distance is of type int
Column RatecodeID is of type string
Column PULocationID is of type string
Column DOLocationID is of type string
Column payment_type is of type string
Column fare_amount is of type string
Column extra is of type string
Column mta_tax is of type string
Column tip_amount is of type string
Column tolls_amount is of type string
Column improvement_surcharge is of type string
Column total_amount is of type string
Column duration(m) is of type double
Column Key is of type string
Column date is of type timestamp
Column expected_total_distance(miles) is of type double
Column expected_total_duration(s) is of type string
Column Key is of type string
Column _c0 is of type string
Column tempMax is of type string
Column tempMin is of type string
Column tempAvg is of type string
Column tempDeparture is of type string
Column hdd is of type strin

# Split train and test set

## USe 2018 data as trainset

In [49]:
data2018 = merge_dataset.filter(merge_dataset.tpep_pickup_datetime > "2018-01-01").filter(merge_dataset.tpep_pickup_datetime < "2019-01-01")
data2018.show(5)


+--------+--------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+-----------+------------------------------+--------------------------+-------+-------+-------+-------------+----+----+-------------+-------+---------+
|VendorID|tpep_pickup_datetime|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|duration(m)|expected_total_distance(miles)|expected_total_duration(s)|tempMax|tempMin|tempAvg|tempDeparture| hdd| cdd|precipitation|newSnow|snowDepth|
+--------+--------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+-----------+------------------------------+--------------------------+-------+-------+-------+-------------+--

In [50]:
print(f"Size of training set: {(data2018.count(), len(data2018.columns))}")

Size of training set: (30206863, 27)


In [53]:
data2018.repartition(1).write.csv(os.path.join(root_folder, "Data", "Merge", "train.csv"), header=True)

## Use 2019 data as testset

In [54]:
data2019 = merge_dataset.filter(merge_dataset.tpep_pickup_datetime > "2018-12-31").filter(merge_dataset.tpep_pickup_datetime < "2020-01-01")
data2019.show(5)

+--------+--------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+-----------+------------------------------+--------------------------+-------+-------+-------+-------------+----+---+-------------+-------+---------+
|VendorID|tpep_pickup_datetime|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|duration(m)|expected_total_distance(miles)|expected_total_duration(s)|tempMax|tempMin|tempAvg|tempDeparture| hdd|cdd|precipitation|newSnow|snowDepth|
+--------+--------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+-----------+------------------------------+--------------------------+-------+-------+-------+-------------+----

In [41]:
print(f"Size of test set: {(data2019.count(), len(data2019.columns))}")


Size of test set: (24997342, 32)


In [56]:
data2019.repartition(1).write.csv(os.path.join(root_folder, "Data", "Merge", "test.csv"), header=True)

# Clean up folder

In [None]:
for file in os.listdir(os.path.join(root_folder, "Data", "Merge", "test.csv")):
    if file.endswith(".csv"):
        os.rename(os.path.join(root_folder, "Data", "Merge", "test.csv", file),os.path.join( root_folder, "Data", "Merge", "test.csv"))
        shutil.rmtree(os.path.join(root_folder, "Data", "Merge", "test.csv"))
for file in os.listdir(os.path.join(root_folder, "Data", "Merge", "train.csv")):
    if file.endswith(".csv"):
        os.rename(os.path.join(root_folder, "Data", "Merge", "train.csv", file),os.path.join( root_folder, "Data", "Merge", "train.csv"))
        shutil.rmtree(os.path.join(root_folder, "Data", "Merge", "train.csv"))