In [1]:
from pyspark.sql import SparkSession
# Obtain the notebook's SparkSession (reuses an existing session if one is
# already running), registered under the application name "plops".
spark = (
    SparkSession.builder
    .appName("plops")
    .getOrCreate()
)

In [2]:
from collections import OrderedDict
# Ordered (column name, Spark SQL type) pairs for the occupancy CSV.
# A plain list of tuples is used so that the column order is preserved when
# the pairs are rendered into a DDL schema string.
# NOTE(review): emptycol1..emptycol5 are presumably placeholders for trailing
# unnamed columns in the file - confirm against the raw CSV header.
config_schema = [
    ('timestamp', 'STRING'),
    ('occupancy', 'INT'),
    ('blockface_name', 'STRING'),
    ('side_of_street', 'STRING'),
    ('station_id', 'INT'),
    ('time_limit_category', 'STRING'),
    ('space_count', 'INT'),
    ('area', 'STRING'),
    ('subarea', 'STRING'),
    ('rate', 'STRING'),
    ('category', 'STRING'),
    ('location', 'STRING'),
    ('emptycol1', 'STRING'),
    ('emptycol2', 'STRING'),
    ('emptycol3', 'STRING'),
    ('emptycol4', 'STRING'),
    ('emptycol5', 'STRING'),
]


In [3]:
# Render the (name, type) pairs as a comma-separated "name TYPE" string,
# the DDL form that is passed as `schema=` to spark.read.csv.
schema = ", ".join("{} {}".format(name, dtype) for name, dtype in config_schema)
schema

'timestamp STRING, occupancy INT, blockface_name STRING, side_of_street STRING, station_id INT, time_limit_category STRING, space_count INT, area STRING, subarea STRING, rate STRING, category STRING, location STRING, emptycol1 STRING, emptycol2 STRING, emptycol3 STRING, emptycol4 STRING, emptycol5 STRING'

In [4]:
# Load the occupancy CSV from S3 using the explicit schema built above.
# mode="DROPMALFORMED" makes Spark silently discard rows it cannot parse.
# Alternative inputs that were used with the same options:
#   "s3a://project.datasets/Last_48_hours.csv"
#   "s3a://project.datasets/2019-Paid-Parking-Occupancy.csv.gz"
df = spark.read.csv(
    path="s3a://project.datasets/last_48h.csv.gz",
    schema=schema,
    header=True,
    mode="DROPMALFORMED",
)

In [5]:
# Display the StructType that Spark derived from the DDL schema string.
df.schema

StructType(List(StructField(timestamp,StringType,true),StructField(occupancy,IntegerType,true),StructField(blockface_name,StringType,true),StructField(side_of_street,StringType,true),StructField(station_id,IntegerType,true),StructField(time_limit_category,StringType,true),StructField(space_count,IntegerType,true),StructField(area,StringType,true),StructField(subarea,StringType,true),StructField(rate,StringType,true),StructField(category,StringType,true),StructField(location,StringType,true),StructField(emptycol1,StringType,true),StructField(emptycol2,StringType,true),StructField(emptycol3,StringType,true),StructField(emptycol4,StringType,true),StructField(emptycol5,StringType,true)))

In [6]:
# Keep only the four columns used downstream; the other CSV columns are dropped.
df = df.select(["timestamp", "occupancy", "station_id", "location"])

In [7]:
# Print df's column names and types after the column selection.
df.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- occupancy: integer (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- location: string (nullable = true)



In [8]:
from pyspark.sql import functions as F

In [9]:
# Convert the string `timestamp` column into a real timestamp.
# Pattern letters are case-sensitive: "MM" is month-of-year while "mm" is
# minute-of-hour, so the date part must be "MM/dd/yyyy". "hh" together with
# "a" parses a 12-hour clock with an AM/PM marker.
# NOTE(review): assumes source values look like "04/14/2019 04:09:00 PM" -
# confirm against the raw file.
df = df.withColumn("timestamp", F.to_timestamp(df.timestamp, format="MM/dd/yyyy hh:mm:ss a"))

In [10]:
# Print the schema again to check that `timestamp` is now a timestamp column.
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- occupancy: integer (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- location: string (nullable = true)



In [11]:
# Derive two integer columns from the parsed timestamp:
#   day_of_week - F.dayofweek of the timestamp
#   hour        - F.hour of the timestamp
event_time = F.col("timestamp")
df = (
    df
    .withColumn('day_of_week', F.dayofweek(event_time))
    .withColumn('hour', F.hour(event_time))
)

In [12]:
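# Create new DataFrame (disabled draft).
# NOTE: if re-enabled, the import list below must also bring in StructField and
# StringType, and `sqlContext` / `sc` are not defined in the cells shown here.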
# from pyspark.sql.types import (StructType,
#                                TimestampType,
#                                IntegerType,
#                                DoubleType)
# field = [
#     StructField('DateTime', TimestampType(), True),
#     StructField('StationID', StringType(), True),
#     StructField('AveOpenSpots', IntegerType(), True),
#     StructField('AveOpenRate', DoubleType(), True),
#     StructField('GroupByPeriod', StringType(), True),
#     StructField('Location', StringType(), True)
# ]
# new_schema = StructType(field)
# df_ave = sqlContext.createDataFrame(sc.emptyRDD(), new_schema)
# df_ave.show()

In [13]:
# Print the schema to check that `day_of_week` and `hour` were added.
df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- occupancy: integer (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- hour: integer (nullable = true)



In [14]:
import os
from pyspark.sql import DataFrameWriter
# Writer bound to the current `df`; `df.write` is the public accessor that
# returns a DataFrameWriter for the DataFrame.
my_writer = df.write

In [15]:
# JDBC connection settings for the Postgres target.
database_name = 'occupancy'
hostname = 'ec2-52-39-242-144.us-west-2.compute.amazonaws.com'
url_connect = "jdbc:postgresql://" + hostname + ":5432/" + database_name

# Destination table and save mode used by the JDBC write.
table = "hist_occupancy"
#table = "spark_output_occupancy"
mode = "overwrite"

# The password is read from the environment so it is never stored in the
# notebook; a missing POSTGRES_PASS raises KeyError here.
properties = dict(
    user="spark_user",
    password=os.environ['POSTGRES_PASS'],
    driver="org.postgresql.Driver",
)

In [16]:
# Write the DataFrame bound to `my_writer` to Postgres over JDBC, using the
# current values of the globals `table` and `mode`.
# NOTE(review): `table` is reassigned to "dim_stations" in a later cell, so
# running this cell out of order would target that table instead.
my_writer.jdbc(url=url_connect, table=table, mode=mode, properties=properties)

In [64]:
from collections import OrderedDict
# Ordered (column name, Spark SQL type) pairs for the transactions CSV.
# NOTE(review): this rebinds the globals `config_schema` and `schema` that the
# occupancy read cell also uses; re-running that cell afterwards would apply
# this transactions schema to the occupancy file.
config_schema = [
    ('data_id', 'INT'),
    ('meter_id', 'INT'),
    ('transaction_id', 'INT'),
    ('transaction_timestamp', 'STRING'),
    ('amount_usd', 'INT'),
    ('usernumber', 'STRING'),
    ('payment_mean', 'STRING'),
    ('paid_duration', 'INT'),
    ('station_id', 'INT'),
    ('year', 'INT'),
    ('month', 'INT'),
    ('vendor', 'STRING'),
]
# Render the pairs as a comma-separated "name TYPE" DDL string for spark.read.csv.
# The loop names avoid shadowing the builtin `type`.
schema = ", ".join("{} {}".format(name, dtype) for name, dtype in config_schema)
schema

'data_id INT, meter_id INT, transaction_id INT, transaction_timestamp STRING, amount_usd INT, usernumber STRING, payment_mean STRING, paid_duration INT, station_id INT, year INT, month INT, vendor STRING'

In [65]:
# Load one transactions file from S3 with the explicit transactions schema,
# reading in PERMISSIVE mode.
tr_df = spark.read.csv(
    path="s3a://project.datasets/transactions/01182019.csv.gz",
    schema=schema,
    header=True,
    mode="PERMISSIVE",
)
# Check the applied schema and look at a single row.
tr_df.printSchema()
tr_df.show(1)

root
 |-- data_id: integer (nullable = true)
 |-- meter_id: integer (nullable = true)
 |-- transaction_id: integer (nullable = true)
 |-- transaction_timestamp: string (nullable = true)
 |-- amount_usd: integer (nullable = true)
 |-- usernumber: string (nullable = true)
 |-- payment_mean: string (nullable = true)
 |-- paid_duration: integer (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- vendor: string (nullable = true)

+--------+--------+--------------+---------------------+----------+----------+------------+-------------+----------+----+-----+------+
| data_id|meter_id|transaction_id|transaction_timestamp|amount_usd|usernumber|payment_mean|paid_duration|station_id|year|month|vendor|
+--------+--------+--------------+---------------------+----------+----------+------------+-------------+----------+----+-----+------+
|26060471|19337010|     652611123|  01/12/2019 08:35:42|        19|      NULL

In [67]:
# Keep the four columns needed downstream and parse the transaction time,
# which is a 24-hour "MM/dd/yyyy HH:mm:ss" string, into a timestamp.
tr_df = (
    tr_df
    .select('transaction_timestamp', 'station_id', 'paid_duration', 'amount_usd')
    .withColumn(
        "transaction_timestamp",
        F.to_timestamp(F.col("transaction_timestamp"), format="MM/dd/yyyy HH:mm:ss"),
    )
)
tr_df.printSchema()

root
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- paid_duration: integer (nullable = true)
 |-- amount_usd: integer (nullable = true)



In [35]:
# JDBC connection settings for reading from the Postgres database.
database_name = 'occupancy'
hostname = 'ec2-52-39-242-144.us-west-2.compute.amazonaws.com'
url_connect = "jdbc:postgresql://" + hostname + ":5432/" + database_name

# The password comes from the environment; a missing POSTGRES_PASS raises KeyError.
properties = dict(
    user="spark_user",
    password=os.environ['POSTGRES_PASS'],
    driver="org.postgresql.Driver",
)

In [41]:
# Read the station dimension table from Postgres over JDBC.
# NOTE(review): this rebinds the global `table`, which the JDBC write cell
# also uses together with mode "overwrite" - verify the cell order before
# re-running that write.
table = "dim_stations"
dim_df = spark.read.jdbc(url_connect, table, properties=properties)

In [42]:
# Print the column names and types of the station dimension table.
dim_df.printSchema()

root
 |-- station_id: integer (nullable = true)
 |-- space_count: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- station_address: string (nullable = true)
 |-- side: string (nullable = true)
 |-- block_nbr: integer (nullable = true)
 |-- parking_category: string (nullable = true)
 |-- wkd_rate1: double (nullable = true)
 |-- wkd_start1: string (nullable = true)
 |-- wkd_end1: string (nullable = true)
 |-- wkd_rate2: double (nullable = true)
 |-- wkd_start2: string (nullable = true)
 |-- wkd_end2: string (nullable = true)
 |-- wkd_rate3: double (nullable = true)
 |-- wkd_start3: string (nullable = true)
 |-- wkd_end3: string (nullable = true)
 |-- sat_rate1: double (nullable = true)
 |-- sat_start1: string (nullable = true)
 |-- sat_end1: string (nullable = true)
 |-- sat_rate2: double (nullable = true)
 |-- sat_start2: string (nullable = true)
 |-- sat_end2: string (nullable = true)
 |-- sat_rate3: double (nullable = true)
 |-- sat_start3: string (nullable = tru

In [43]:
# Show one row of the station dimension table.
dim_df.show(1)

+----------+-----------+--------------------+--------------------+----+---------+----------------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+------------------+-------+--------------------+------------------+--------------------+
|station_id|space_count|            location|     station_address|side|block_nbr|parking_category|wkd_rate1|wkd_start1|wkd_end1|wkd_rate2|wkd_start2|wkd_end2|wkd_rate3|wkd_start3|wkd_end3|sat_rate1|sat_start1|sat_end1|sat_rate2|sat_start2|sat_end2|sat_rate3|sat_start3|sat_end3|parking_time_limit|subarea|effective_date_start|effective_date_end|       location_geom|
+----------+-----------+--------------------+--------------------+----+---------+----------------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+----------+--------+---------+--------

In [68]:
# Show one row of the transactions DataFrame.
tr_df.show(1)

+---------------------+----------+-------------+----------+
|transaction_timestamp|station_id|paid_duration|amount_usd|
+---------------------+----------+-------------+----------+
|  2019-01-12 08:35:42|     10873|        20780|        19|
+---------------------+----------+-------------+----------+
only showing top 1 row



In [46]:
# Register tr_df as the temp view "streaming_occupancy".
# createOrReplaceTempView is idempotent, so re-running the cell rebinds the
# view to the current tr_df instead of raising because the view exists.
# Any other failure now propagates rather than being swallowed.
tr_df.createOrReplaceTempView("streaming_occupancy")

In [79]:
# Reduce the station dimension to its key column, then left-join the
# transactions onto it. Every station is kept, and stations with no matching
# transaction get nulls in the transaction columns.
dim_df = dim_df.select('station_id')
oc_df = dim_df.join(tr_df, on=["station_id"], how="left")

In [80]:
# Print the column names and types of the joined DataFrame.
oc_df.printSchema()

root
 |-- station_id: integer (nullable = true)
 |-- transaction_timestamp: timestamp (nullable = true)
 |-- paid_duration: integer (nullable = true)
 |-- amount_usd: integer (nullable = true)



In [81]:
# Fetch the first 10 joined rows to the driver as a list of Row objects.
oc_df.take(10)

[Row(station_id=53138, transaction_timestamp=datetime.datetime(2019, 1, 18, 6, 40), paid_duration=27600, amount_usd=10),
 Row(station_id=53138, transaction_timestamp=datetime.datetime(2019, 1, 18, 8, 45, 40), paid_duration=29087, amount_usd=11),
 Row(station_id=53138, transaction_timestamp=datetime.datetime(2019, 1, 18, 9, 46), paid_duration=3600, amount_usd=1),
 Row(station_id=53138, transaction_timestamp=datetime.datetime(2019, 1, 17, 6, 0), paid_duration=25200, amount_usd=9),
 Row(station_id=53138, transaction_timestamp=datetime.datetime(2019, 1, 17, 8, 44, 24), paid_duration=14712, amount_usd=5),
 Row(station_id=53138, transaction_timestamp=datetime.datetime(2019, 1, 15, 11, 1, 27), paid_duration=4800, amount_usd=2),
 Row(station_id=53138, transaction_timestamp=datetime.datetime(2019, 1, 15, 13, 43, 42), paid_duration=2400, amount_usd=1),
 Row(station_id=53138, transaction_timestamp=datetime.datetime(2019, 1, 15, 7, 57, 21), paid_duration=3600, amount_usd=1),
 Row(station_id=53138,