# Schemas

## Set up the environment

In [3]:
import pyspark
from pyspark.sql import SparkSession
# Obtain the Spark entry point: getOrCreate() returns the session that is
# already active in this kernel, or starts a new one if there is none.
spark = (
    SparkSession.builder
    .getOrCreate()
)
spark  # last expression of the cell, so the notebook displays the session

## Downloading and preprocessing Chicago's Reported Crime Data

In [None]:
# Optional shell commands: fetch the CSV export from the City of Chicago data
# portal and list the working directory. Both are commented out; remove the
# leading '#' to run them.
#!wget https://data.cityofchicago.org/api/views/ijzp-q8t2/rows.csv?accessType=DOWNLOAD
#!ls -l

In [32]:
from pyspark.sql.functions import to_timestamp,col,lit
# Location of the CSV file loaded by the cells below (relative to the notebook).
# NOTE(review): the file is named "police-stations.csv", yet this section and
# the printed schema describe reported-crime records - confirm it is the
# intended dataset.
path = "../datasets/sparkbyexamples/police-stations.csv"

In [33]:
# Load the raw CSV, treating its first line as the column names.
rc = spark.read.option("header", True).csv(path)

In [34]:
# Inspect the column names and types of the freshly loaded frame; the read
# above did not request schema inference, and the output lists every column
# as a string.
rc.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: string (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)



In [36]:
from pyspark.sql.functions import to_timestamp,col,lit

# Re-read the CSV and replace the textual 'Date' column with a parsed
# timestamp (month/day/year, 12-hour clock with an AM/PM marker).
rc = (
    spark.read
    .csv(path, header=True)
    .withColumn(
        'Date',
        to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'),
    )
)
# Show (column, type) pairs to confirm that 'Date' is now a timestamp.
rc.dtypes

[('ID', 'string'),
 ('Case Number', 'string'),
 ('Date', 'timestamp'),
 ('Block', 'string'),
 ('IUCR', 'string'),
 ('Primary Type', 'string'),
 ('Description', 'string'),
 ('Location Description', 'string'),
 ('Arrest', 'string'),
 ('Domestic', 'string'),
 ('Beat', 'string'),
 ('District', 'string'),
 ('Ward', 'string'),
 ('Community Area', 'string'),
 ('FBI Code', 'string'),
 ('X Coordinate', 'string'),
 ('Y Coordinate', 'string'),
 ('Year', 'string'),
 ('Updated On', 'string'),
 ('Latitude', 'string'),
 ('Longitude', 'string'),
 ('Location', 'string')]

In [15]:
# Peek at distinct parsed timestamps (20 rows, the default for show()).
rc.select(col("Date")).distinct().show(n=20)

+-------------------+
|               Date|
+-------------------+
|2022-04-05 12:00:00|
|2022-12-11 20:21:00|
|2022-12-30 23:00:00|
|2022-12-18 11:00:00|
|2022-12-21 17:25:00|
|2022-12-31 01:01:00|
|2022-02-13 11:31:00|
|2022-04-07 00:00:00|
|2022-11-23 02:00:00|
|2022-11-30 15:00:00|
|2022-12-18 05:30:00|
|2022-12-21 01:00:00|
|2022-12-21 16:00:00|
|2022-12-01 08:00:00|
|2022-05-11 03:00:00|
|2022-02-02 09:00:00|
|2022-09-02 02:00:00|
|2022-10-27 14:37:00|
|2022-11-27 19:00:00|
|2022-07-09 02:30:00|
+-------------------+
only showing top 20 rows



In [23]:
# Keep only the rows whose timestamp falls on or after 1 December 2022.
rc2 = rc.where(col('Date') >= lit('2022-12-01'))

In [28]:
from pyspark.sql.functions import to_date

In [29]:
# Truncate the timestamp in 'Date' to a calendar date (drops the time of day).
# 'Date' is already a timestamp at this point, so no parsing pattern is needed:
# with the format omitted, to_date simply casts the column to DateType. The
# previous 'MM/dd/yyyy' argument described a string layout the column no longer
# has and only obscured what the call does.
rc3 = rc2.withColumn('Date', to_date(col('Date')))

In [31]:
# List the distinct calendar days in ascending order; 31 rows is enough to
# show a full month.
rc3.select('Date').distinct().orderBy('Date').show(31)

+----------+
|      Date|
+----------+
|2022-12-01|
|2022-12-02|
|2022-12-03|
|2022-12-04|
|2022-12-05|
|2022-12-06|
|2022-12-07|
|2022-12-08|
|2022-12-09|
|2022-12-10|
|2022-12-11|
|2022-12-12|
|2022-12-13|
|2022-12-14|
|2022-12-15|
|2022-12-16|
|2022-12-17|
|2022-12-18|
|2022-12-19|
|2022-12-20|
|2022-12-21|
|2022-12-22|
|2022-12-23|
|2022-12-24|
|2022-12-25|
|2022-12-26|
|2022-12-27|
|2022-12-28|
|2022-12-29|
|2022-12-30|
|2022-12-31|
+----------+



## Schemas