In [3]:
from pyspark.sql import SparkSession
spark = SparkSession \
.builder \
.appName("plops") \
.getOrCreate()

In [4]:
from collections import OrderedDict
config_schema = OrderedDict()
config_schema = [
    ('OccupancyDateTime', 'STRING'),
    ('PaidOccupancy', 'INT'),
    ('BlockfaceName', 'STRING'),
    ('SideOfStreet', 'STRING'),
    ('SourceElementKey', 'INT'),
    ('ParkingTimeLimitCategory', 'STRING'),
    ('ParkingSpaceCount', 'INT'),
    ('PaidParkingArea', 'STRING'),
    ('PaidParkingSubArea', 'STRING'),
    ('PaidParkingRate', 'STRING'),
    ('ParkingCategory', 'STRING'),
    ('Location', 'STRING'),
    ('emptycol1','STRING'),
    ('emptycol2','STRING'),
    ('emptycol3','STRING'),
    ('emptycol4','STRING'),
    ('emptycol5','STRING')
]

In [5]:
schema = ", ".join(["{} {}".format(col, type) for col, type in config_schema])
schema

'OccupancyDateTime STRING, PaidOccupancy INT, BlockfaceName STRING, SideOfStreet STRING, SourceElementKey INT, ParkingTimeLimitCategory STRING, ParkingSpaceCount INT, PaidParkingArea STRING, PaidParkingSubArea STRING, PaidParkingRate STRING, ParkingCategory STRING, Location STRING, emptycol1 STRING, emptycol2 STRING, emptycol3 STRING, emptycol4 STRING, emptycol5 STRING'

In [6]:
df = None

In [7]:
df = spark.read.csv(
    "s3a://project.datasets/Last_48_hours.csv", header=True, mode="DROPMALFORMED", schema=schema
    #"s3a://project.datasets/last_48h.csv.gz", header=True, mode="DROPMALFORMED", schema=schema
    #"s3a://project.datasets/2019-Paid-Parking-Occupancy.csv.gz", header=True, mode="DROPMALFORMED", schema=schema
)

In [8]:
df.schema

StructType(List(StructField(OccupancyDateTime,StringType,true),StructField(PaidOccupancy,IntegerType,true),StructField(BlockfaceName,StringType,true),StructField(SideOfStreet,StringType,true),StructField(SourceElementKey,IntegerType,true),StructField(ParkingTimeLimitCategory,StringType,true),StructField(ParkingSpaceCount,IntegerType,true),StructField(PaidParkingArea,StringType,true),StructField(PaidParkingSubArea,StringType,true),StructField(PaidParkingRate,StringType,true),StructField(ParkingCategory,StringType,true),StructField(Location,StringType,true),StructField(emptycol1,StringType,true),StructField(emptycol2,StringType,true),StructField(emptycol3,StringType,true),StructField(emptycol4,StringType,true),StructField(emptycol5,StringType,true)))

In [9]:
df.show(1)

+--------------------+-------------+--------------------+------------+----------------+------------------------+-----------------+---------------+------------------+---------------+---------------+--------------------+---------+---------+---------+---------+---------+
|   OccupancyDateTime|PaidOccupancy|       BlockfaceName|SideOfStreet|SourceElementKey|ParkingTimeLimitCategory|ParkingSpaceCount|PaidParkingArea|PaidParkingSubArea|PaidParkingRate|ParkingCategory|            Location|emptycol1|emptycol2|emptycol3|emptycol4|emptycol5|
+--------------------+-------------+--------------------+------------+----------------+------------------------+-----------------+---------------+------------------+---------------+---------------+--------------------+---------+---------+---------+---------+---------+
|03/05/2019 10:14:...|            2|TERRY AVE BETWEEN...|          NE|           35730|                     240|                5| Denny Triangle|             North|           null|   Paid Park

In [10]:
df = df.drop("BlockfaceName", 
             "SideOfStreet", 
             "ParkingTimeLimitCategory", 
             "ParkingSpaceCount", 
             "PaidParkingArea", 
             "PaidParkingSubArea",
             "PaidParkingRate",
             "ParkingCategory",
             "emptycol1",
             "emptycol2",
             "emptycol3",
             "emptycol4",
             "emptycol5"
            )

In [11]:
df.printSchema()

root
 |-- OccupancyDateTime: string (nullable = true)
 |-- PaidOccupancy: integer (nullable = true)
 |-- SourceElementKey: integer (nullable = true)
 |-- Location: string (nullable = true)



In [12]:
from pyspark.sql import functions as F

In [13]:
df = df.withColumn("OccupancyDateTime", F.to_timestamp(df.OccupancyDateTime, format="mm/dd/yyyy hh:mm:ss a"))

In [14]:
df.printSchema()

root
 |-- OccupancyDateTime: timestamp (nullable = true)
 |-- PaidOccupancy: integer (nullable = true)
 |-- SourceElementKey: integer (nullable = true)
 |-- Location: string (nullable = true)



In [15]:
df.show(1)

+-------------------+-------------+----------------+--------------------+
|  OccupancyDateTime|PaidOccupancy|SourceElementKey|            Location|
+-------------------+-------------+----------------+--------------------+
|2019-01-05 10:14:00|            2|           35730|POINT (47.6159364...|
+-------------------+-------------+----------------+--------------------+
only showing top 1 row



In [16]:
df = df.withColumn('DayOfWeek', F.dayofweek(df.OccupancyDateTime))
df = df.withColumn('Hour', F.hour(df.OccupancyDateTime))

In [17]:
df.show(10)

+-------------------+-------------+----------------+--------------------+---------+----+
|  OccupancyDateTime|PaidOccupancy|SourceElementKey|            Location|DayOfWeek|Hour|
+-------------------+-------------+----------------+--------------------+---------+----+
|2019-01-05 10:14:00|            2|           35730|POINT (47.6159364...|        7|  10|
|2019-01-05 18:39:00|           11|           34214|POINT (47.6058076...|        7|  18|
|2019-01-05 09:06:00|            4|           59246|POINT (47.6187126...|        7|   9|
|2019-01-05 15:34:00|            3|            1278|POINT (47.6214913...|        7|  15|
|2019-01-04 14:23:00|            1|            7250|POINT (47.6062128...|        6|  14|
|2019-01-05 12:59:00|            4|           76201|POINT (47.6226588...|        7|  12|
|2019-01-05 09:45:00|            5|            7265|POINT (47.6182771...|        7|   9|
|2019-01-04 15:54:00|            2|           80545|POINT (47.6232851...|        6|  15|
|2019-01-05 08:21:00|

In [18]:
#Create New DataFrame
# from pyspark.sql.types import (StructType,
#                                TimestampType,
#                                IntegerType,
#                                DoubleType)
# field = [
#     StructField('DateTime', TimestampType(), True),
#     StructField('StationID', StringType(), True),
#     StructField('AveOpenSpots', IntegerType(), True),
#     StructField('AveOpenRate', DoubleType(), True),
#     StructField('GroupByPeriod', StringType(), True),
#     StructField('Location', StringType(), True)
# ]
# new_schema = StructType(field)
# df_ave = sqlContext.createDataFrame(sc.emptyRDD(), new_schema)
# df_ave.show()

In [19]:
df.printSchema()

root
 |-- OccupancyDateTime: timestamp (nullable = true)
 |-- PaidOccupancy: integer (nullable = true)
 |-- SourceElementKey: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Hour: integer (nullable = true)



In [20]:
import os
from pyspark.sql import DataFrameWriter
my_writer = DataFrameWriter(df)

In [21]:
url_connect = "jdbc:postgresql://ec2-52-39-242-144.us-west-2.compute.amazonaws.com:5432/occupancy"
table = "hist_occupancy"
mode = "overwrite"
properties = {"user":"spark_user", 
              "password":os.environ['POSTGRES_PASS'],
              "driver": "org.postgresql.Driver"
             }

In [22]:
my_writer.jdbc(url_connect, table, mode, properties)