In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every later cell (getOrCreate reuses an active one if present).
spark = (
    SparkSession.builder
    .appName("plops_edit_dim")
    .getOrCreate()
)

In [2]:
from collections import OrderedDict
# Column layout for the Blockface CSV as (column name, Spark SQL type) pairs.
# A plain list already preserves order, so no OrderedDict placeholder is needed.
config_schema = [
    ('objectid', 'INT'),
    ('station_id', 'INT'),
    ('segkey', 'INT'),
    ('unitid', 'INT'),
    ('unitid2', 'INT'),
    ('station_address', 'STRING'),
    ('side', 'STRING'),
    ('block_id', 'STRING'),
    ('block_nbr', 'INT'),
    ('csm', 'STRING'),
    ('parking_category', 'STRING'),
    ('load', 'INT'),
    ('zone', 'INT'),
    ('total_zones', 'INT'),
    ('wkd_rate1', 'DOUBLE'),
    ('wkd_start1', 'STRING'),
    ('wkd_end1', 'STRING'),
    ('wkd_rate2', 'DOUBLE'),
    ('wkd_start2', 'STRING'),
    ('wkd_end2', 'STRING'),
    ('wkd_rate3', 'DOUBLE'),
    ('wkd_start3', 'STRING'),
    ('wkd_end3', 'STRING'),
    ('sat_rate1', 'DOUBLE'),
    ('sat_start1', 'STRING'),
    ('sat_end1', 'STRING'),
    ('sat_rate2', 'DOUBLE'),
    ('sat_start2', 'STRING'),
    ('sat_end2', 'STRING'),
    ('sat_rate3', 'DOUBLE'),
    ('sat_start3', 'STRING'),
    ('sat_end3', 'STRING'),
    ('rpz_zone', 'STRING'),
    ('rpz_area', 'DOUBLE'),
    ('paidarea', 'STRING'),
    ('parking_time_limit', 'DOUBLE'),
    ('subarea', 'STRING'),
    ('start_time_wkd', 'STRING'),
    ('end_time_wkd', 'STRING'),
    ('start_time_sat', 'STRING'),
    ('end_time_sat', 'STRING'),
    ('primarydistrictcd', 'STRING'),
    ('secondarydistrictcd', 'STRING'),
    ('overrideyn', 'STRING'),
    ('overridecomment', 'INT'),
    ('shape_length', 'DOUBLE'),
]
# Render the pairs as one "name TYPE, name TYPE, ..." string, the form passed as schema= to the CSV reader.
# The loop variables avoid the name `type`, which would shadow the builtin.
schema = ", ".join(
    "{} {}".format(col_name, col_type) for col_name, col_type in config_schema
)
schema

'objectid INT, station_id INT, segkey INT, unitid INT, unitid2 INT, station_address STRING, side STRING, block_id STRING, block_nbr INT, csm STRING, parking_category STRING, load INT, zone INT, total_zones INT, wkd_rate1 DOUBLE, wkd_start1 STRING, wkd_end1 STRING, wkd_rate2 DOUBLE, wkd_start2 STRING, wkd_end2 STRING, wkd_rate3 DOUBLE, wkd_start3 STRING, wkd_end3 STRING, sat_rate1 DOUBLE, sat_start1 STRING, sat_end1 STRING, sat_rate2 DOUBLE, sat_start2 STRING, sat_end2 STRING, sat_rate3 DOUBLE, sat_start3 STRING, sat_end3 STRING, rpz_zone STRING, rpz_area DOUBLE, paidarea STRING, parking_time_limit DOUBLE, subarea STRING, start_time_wkd STRING, end_time_wkd STRING, start_time_sat STRING, end_time_sat STRING, primarydistrictcd STRING, secondarydistrictcd STRING, overrideyn STRING, overridecomment INT, shape_length DOUBLE'

In [3]:
# Load the gzipped Blockface CSV with the explicit schema built above.
# mode="FAILFAST" asks Spark to abort on a malformed record instead of skipping it.
dim_df = spark.read.csv(
    path="s3a://project.datasets/Blockface.csv.gz",
    schema=schema,
    header=True,
    mode="FAILFAST",
)

In [4]:
# Blockface columns that are not carried into the station dimension.
unused_blockface_columns = [
    'objectid',
    'segkey',
    'unitid',
    'unitid2',
    'block_id',
    'csm',
    'load',
    'zone',
    'total_zones',
    'rpz_zone',
    'rpz_area',
    'paidarea',
    'start_time_wkd',
    'end_time_wkd',
    'start_time_sat',
    'end_time_sat',
    'primarydistrictcd',
    'secondarydistrictcd',
    'overrideyn',
    'overridecomment',
    'shape_length',
]
dim_df = dim_df.drop(*unused_blockface_columns)

In [5]:
# Preview a single row to check which columns remain after the drop.
dim_df.show(1)

+----------+--------------------+----+---------+----------------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+------------------+-------+
|station_id|     station_address|side|block_nbr|parking_category|wkd_rate1|wkd_start1|wkd_end1|wkd_rate2|wkd_start2|wkd_end2|wkd_rate3|wkd_start3|wkd_end3|sat_rate1|sat_start1|sat_end1|sat_rate2|sat_start2|sat_end2|sat_rate3|sat_start3|sat_end3|parking_time_limit|subarea|
+----------+--------------------+----+---------+----------------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+------------------+-------+
|    124699|I5 EXPRESS NE 103...|   N|    10200|            None|      0.0|         0|       0|      0.0|         0|       0|      0.0|         0|       0|      0.0|         0|     

In [6]:
# get occupancy data

In [7]:
from collections import OrderedDict
# Column layout for the occupancy CSV as (column name, Spark SQL type) pairs.
# `timestamp` is read as a string here and parsed into a real timestamp in a later cell.
# A plain list already preserves order, so no OrderedDict placeholder is needed.
config_schema = [
    ('timestamp', 'string'),
    ('occupancy', 'INT'),
    ('blockface_name', 'STRING'),
    ('side_of_street', 'STRING'),
    ('station_id', 'INT'),
    ('time_limit_category', 'STRING'),
    ('space_count', 'INT'),
    ('area', 'STRING'),
    ('subarea', 'STRING'),
    ('rate', 'STRING'),
    ('category', 'STRING'),
    ('location', 'STRING'),
    ('emptycol1', 'STRING'),
    ('emptycol2', 'STRING'),
    ('emptycol3', 'STRING'),
    ('emptycol4', 'STRING'),
    ('emptycol5', 'STRING'),
]
# Render the pairs as one "name TYPE, name TYPE, ..." string, the form passed as schema= to the CSV reader.
# The loop variables avoid the name `type`, which would shadow the builtin.
schema = ", ".join(
    "{} {}".format(col_name, col_type) for col_name, col_type in config_schema
)
schema

'timestamp string, occupancy INT, blockface_name STRING, side_of_street STRING, station_id INT, time_limit_category STRING, space_count INT, area STRING, subarea STRING, rate STRING, category STRING, location STRING, emptycol1 STRING, emptycol2 STRING, emptycol3 STRING, emptycol4 STRING, emptycol5 STRING'

In [9]:
# Load the occupancy extract with the explicit schema built above.
# Alternative input left disabled: "s3a://project.datasets/2019-Paid-Parking-Occupancy.csv.gz".
# mode="DROPMALFORMED" asks Spark to skip malformed records instead of failing the read.
oc_df = spark.read.csv(
    path="s3a://project.datasets/Last_48_hours.csv",
    schema=schema,
    header=True,
    mode="DROPMALFORMED",
)

In [10]:
# Occupancy columns that are not needed downstream; only timestamp, occupancy,
# station_id, space_count and location are kept.
unused_occupancy_columns = [
    "blockface_name",
    "side_of_street",
    "time_limit_category",
    "area",
    "subarea",
    "rate",
    "category",
    "emptycol1",
    "emptycol2",
    "emptycol3",
    "emptycol4",
    "emptycol5",
]
oc_df = oc_df.drop(*unused_occupancy_columns)

In [11]:
from pyspark.sql import functions as F
# Parse the string `timestamp` column into a real timestamp.
# Pattern letters are case-sensitive: "MM" is month-of-year, "mm" is minute-of-hour.
# The date part therefore has to be "MM/dd/yyyy"; the earlier "mm/dd/yyyy" read the
# month digits as minutes, so the month was never parsed.
# (A plain F.col("timestamp").cast("timestamp") was noted as an alternative.)
oc_df = oc_df.withColumn(
    "timestamp",
    F.to_timestamp(oc_df.timestamp, format="MM/dd/yyyy hh:mm:ss a"),
)

In [12]:
# Confirm that `timestamp` is now a timestamp column and only the kept columns remain.
oc_df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- occupancy: integer (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- space_count: integer (nullable = true)
 |-- location: string (nullable = true)



In [13]:
# Register oc_df as the SQL view "occupancy".
# createOrReplaceTempView is safe to re-run: it rebinds the view to the current oc_df.
# The previous bare `except:` kept a stale view on re-run and hid every other error.
oc_df.createOrReplaceTempView("occupancy")

In [14]:
# For each station keep only the row(s) carrying that station's most recent timestamp.
# The adjacent literals concatenate into a single SQL statement.
oc_df = spark.sql(
    "select t1.station_id, t1.space_count, t1.location "
    "from occupancy t1 "
    "where t1.timestamp = "
    "(select max(t2.timestamp) from occupancy t2 where t1.station_id = t2.station_id)"
)

In [15]:
# Attach the latest occupancy attributes to each Blockface station.
# Joining on the column *name* (rather than an equality expression) yields a single
# station_id column, so no follow-up drop of the duplicate key is required.
dim_df = oc_df.join(dim_df, on="station_id", how="inner")

In [16]:
# Inspect the joined schema: occupancy columns first, then the Blockface columns.
dim_df.printSchema()

root
 |-- station_id: integer (nullable = true)
 |-- space_count: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- station_address: string (nullable = true)
 |-- side: string (nullable = true)
 |-- block_nbr: integer (nullable = true)
 |-- parking_category: string (nullable = true)
 |-- wkd_rate1: double (nullable = true)
 |-- wkd_start1: string (nullable = true)
 |-- wkd_end1: string (nullable = true)
 |-- wkd_rate2: double (nullable = true)
 |-- wkd_start2: string (nullable = true)
 |-- wkd_end2: string (nullable = true)
 |-- wkd_rate3: double (nullable = true)
 |-- wkd_start3: string (nullable = true)
 |-- wkd_end3: string (nullable = true)
 |-- sat_rate1: double (nullable = true)
 |-- sat_start1: string (nullable = true)
 |-- sat_end1: string (nullable = true)
 |-- sat_rate2: double (nullable = true)
 |-- sat_start2: string (nullable = true)
 |-- sat_end2: string (nullable = true)
 |-- sat_rate3: double (nullable = true)
 |-- sat_start3: string (nullable = tru

In [17]:
# Add constant effective_date_start / effective_date_end columns to every row.
# "MM" is month-of-year; the earlier "mm" is minute-of-hour and only produced the
# intended dates because both literals happen to be in January.
dim_df = (
    dim_df
    .withColumn('effective_date_start', F.to_date(F.lit("01/01/2010"), format="MM/dd/yyyy"))
    .withColumn('effective_date_end', F.to_date(F.lit("01/01/3000"), format="MM/dd/yyyy"))
)

In [18]:
# Inspect the schema again after adding the effective-date columns.
dim_df.printSchema()

root
 |-- station_id: integer (nullable = true)
 |-- space_count: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- station_address: string (nullable = true)
 |-- side: string (nullable = true)
 |-- block_nbr: integer (nullable = true)
 |-- parking_category: string (nullable = true)
 |-- wkd_rate1: double (nullable = true)
 |-- wkd_start1: string (nullable = true)
 |-- wkd_end1: string (nullable = true)
 |-- wkd_rate2: double (nullable = true)
 |-- wkd_start2: string (nullable = true)
 |-- wkd_end2: string (nullable = true)
 |-- wkd_rate3: double (nullable = true)
 |-- wkd_start3: string (nullable = true)
 |-- wkd_end3: string (nullable = true)
 |-- sat_rate1: double (nullable = true)
 |-- sat_start1: string (nullable = true)
 |-- sat_end1: string (nullable = true)
 |-- sat_rate2: double (nullable = true)
 |-- sat_start2: string (nullable = true)
 |-- sat_end2: string (nullable = true)
 |-- sat_rate3: double (nullable = true)
 |-- sat_start3: string (nullable = tru

In [19]:
import os
from pyspark.sql import DataFrameWriter
# DataFrame.write hands back the DataFrameWriter bound to dim_df.
my_writer = dim_df.write

In [20]:
# Where the station dimension is written: Postgres host, database and table.
hostname = 'ec2-52-39-242-144.us-west-2.compute.amazonaws.com'
database_name = 'occupancy'
table = "dim_stations"

# Save mode handed to the JDBC writer.
mode = "overwrite"

# JDBC connection properties; the password is read from the POSTGRES_PASS
# environment variable (raises KeyError if it is not set).
properties = dict(
    user="spark_user",
    password=os.environ['POSTGRES_PASS'],
    driver="org.postgresql.Driver",
)

url_connect = "jdbc:postgresql://{hostname}:5432/{db}".format(
    db=database_name,
    hostname=hostname,
)

In [21]:
# Write dim_df to the configured Postgres table over JDBC using the save mode set above.
my_writer.jdbc(
    url=url_connect,
    table=table,
    mode=mode,
    properties=properties,
)