In [34]:
from pyspark.sql import SparkSession
# Create (or reuse an already running) SparkSession named "plops".
spark = (
    SparkSession.builder
    .appName("plops")
    .getOrCreate()
)

In [35]:
from collections import OrderedDict
# Ordered (column name, Spark SQL type) pairs describing the CSV columns.
# A plain list of tuples already preserves order, so no OrderedDict is needed
# (the earlier `OrderedDict()` assignment was overwritten immediately).
config_schema = [
    ('OccupancyDateTime', 'STRING'),
    ('PaidOccupancy', 'INT'),
    ('BlockfaceName', 'STRING'),
    ('SideOfStreet', 'STRING'),
    ('SourceElementKey', 'INT'),
    ('ParkingTimeLimitCategory', 'STRING'),
    ('ParkingSpaceCount', 'INT'),
    ('PaidParkingArea', 'STRING'),
    ('PaidParkingSubArea', 'STRING'),
    ('PaidParkingRate', 'STRING'),
    ('ParkingCategory', 'STRING'),
    ('Location', 'STRING'),
    # NOTE(review): presumably placeholders for unnamed trailing columns in
    # the source file -- confirm against the raw CSV header.
    ('emptycol1', 'STRING'),
    ('emptycol2', 'STRING'),
    ('emptycol3', 'STRING'),
    ('emptycol4', 'STRING'),
    ('emptycol5', 'STRING'),
]

In [36]:
# Render the (name, type) pairs as a comma-separated "name TYPE" string.
# The loop variables are deliberately not called `type`, so the builtin is
# never shadowed (a list comprehension would leak it on Python 2).
schema = ", ".join(
    "{} {}".format(col_name, col_type) for col_name, col_type in config_schema
)
schema

'OccupancyDateTime STRING, PaidOccupancy INT, BlockfaceName STRING, SideOfStreet STRING, SourceElementKey INT, ParkingTimeLimitCategory STRING, ParkingSpaceCount INT, PaidParkingArea STRING, PaidParkingSubArea STRING, PaidParkingRate STRING, ParkingCategory STRING, Location STRING, emptycol1 STRING, emptycol2 STRING, emptycol3 STRING, emptycol4 STRING, emptycol5 STRING'

In [37]:
# Reset `df` before loading.
# NOTE(review): presumably this releases a DataFrame left over from an earlier
# run of the notebook -- confirm it is still needed, since the next cell
# reassigns `df` unconditionally.
df = None

In [38]:
# Read the CSV from S3 using the explicit schema string. The first line is
# treated as a header, and the DROPMALFORMED mode discards records that cannot
# be parsed instead of failing the read.
# Alternative inputs kept from the original cell:
#   s3a://project.datasets/last_48h.csv.gz
#   s3a://project.datasets/2019-Paid-Parking-Occupancy.csv.gz
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .option("mode", "DROPMALFORMED")
    .csv("s3a://project.datasets/Last_48_hours.csv")
)

In [39]:
# Display the StructType of the loaded DataFrame to check the applied schema.
df.schema

StructType(List(StructField(OccupancyDateTime,StringType,true),StructField(PaidOccupancy,IntegerType,true),StructField(BlockfaceName,StringType,true),StructField(SideOfStreet,StringType,true),StructField(SourceElementKey,IntegerType,true),StructField(ParkingTimeLimitCategory,StringType,true),StructField(ParkingSpaceCount,IntegerType,true),StructField(PaidParkingArea,StringType,true),StructField(PaidParkingSubArea,StringType,true),StructField(PaidParkingRate,StringType,true),StructField(ParkingCategory,StringType,true),StructField(Location,StringType,true),StructField(emptycol1,StringType,true),StructField(emptycol2,StringType,true),StructField(emptycol3,StringType,true),StructField(emptycol4,StringType,true),StructField(emptycol5,StringType,true)))

In [40]:
# Print only the first row as a quick sanity check of the parsed values.
df.show(1)

+--------------------+-------------+--------------------+------------+----------------+------------------------+-----------------+---------------+------------------+---------------+---------------+--------------------+---------+---------+---------+---------+---------+
|   OccupancyDateTime|PaidOccupancy|       BlockfaceName|SideOfStreet|SourceElementKey|ParkingTimeLimitCategory|ParkingSpaceCount|PaidParkingArea|PaidParkingSubArea|PaidParkingRate|ParkingCategory|            Location|emptycol1|emptycol2|emptycol3|emptycol4|emptycol5|
+--------------------+-------------+--------------------+------------+----------------+------------------------+-----------------+---------------+------------------+---------------+---------------+--------------------+---------+---------+---------+---------+---------+
|03/05/2019 10:14:...|            2|TERRY AVE BETWEEN...|          NE|           35730|                     240|                5| Denny Triangle|             North|           null|   Paid Park

In [41]:
# Discard every column except OccupancyDateTime, PaidOccupancy,
# SourceElementKey and Location.
df = df.drop(*[
    # block-face description
    "BlockfaceName", "SideOfStreet",
    # parking limits and capacity
    "ParkingTimeLimitCategory", "ParkingSpaceCount",
    # paid-parking area, rate and category
    "PaidParkingArea", "PaidParkingSubArea", "PaidParkingRate", "ParkingCategory",
    # placeholder columns
    "emptycol1", "emptycol2", "emptycol3", "emptycol4", "emptycol5",
])

In [42]:
# Print the schema tree to confirm which columns remain after the drop.
df.printSchema()

root
 |-- OccupancyDateTime: string (nullable = true)
 |-- PaidOccupancy: integer (nullable = true)
 |-- SourceElementKey: integer (nullable = true)
 |-- Location: string (nullable = true)



In [43]:
from pyspark.sql import functions as F

In [44]:
# Convert the OccupancyDateTime strings (e.g. "03/05/2019 10:14:...") into
# timestamps, replacing the column in place.
# In the date pattern, "MM" is month-of-year and "mm" is minute-of-hour. The
# leading field must be "MM"; with "mm" the month is never parsed and the
# resulting timestamps are wrong.
df = df.withColumn(
    "OccupancyDateTime",
    F.to_timestamp(df.OccupancyDateTime, format="MM/dd/yyyy hh:mm:ss a"),
)

In [45]:
# Print the schema tree to verify OccupancyDateTime is now a timestamp column.
df.printSchema()

root
 |-- OccupancyDateTime: timestamp (nullable = true)
 |-- PaidOccupancy: integer (nullable = true)
 |-- SourceElementKey: integer (nullable = true)
 |-- Location: string (nullable = true)

