-- Notepad to myself --

# Working with Dates

In [1]:
from pyspark.sql import SparkSession

# Attach to the active Spark session if one exists; otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
from pyspark.sql.functions import to_timestamp, col

# Read the 2021 crimes CSV (header row, inferred column types) and replace
# 'Date' with a timestamp parsed from the US-style 12-hour pattern with an
# AM/PM marker.
df = (
    spark.read
    .csv('data/Crimes-2021.csv', header=True, inferSchema=True)
    .withColumn('Date', to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
)
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: integer (nullable = true)
 |-- Y Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



## Simple Date Formats

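Simple Date Formats (Java `SimpleDateFormat`, the pattern syntax used by Spark 2.x) -> https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html

For Spark 3.0 and later, check the Spark datetime pattern reference instead -> https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html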

In [3]:
from pyspark.sql.functions import to_date, to_timestamp, lit

In [18]:
# 'Date' is already a timestamp at this point (see the schema printed above),
# so this cell only shows it being reduced to a calendar date.
# NOTE(review): the format argument presumably has no effect on a non-string
# input -- confirm, and consider dropping it on the next re-run.
date_only = to_date(col('Date'), 'yyyy-MM-dd HH:mm:ss')
df.select('Date', date_only).show(3, truncate=False)

+-------------------+----------------------------------+
|Date               |to_date(Date, yyyy-MM-dd HH:mm:ss)|
+-------------------+----------------------------------+
|2021-04-16 20:45:00|2021-04-16                        |
|2021-10-21 11:00:00|2021-10-21                        |
|2021-12-11 20:00:00|2021-12-11                        |
+-------------------+----------------------------------+
only showing top 3 rows



### "2019-12-25 13:30:00"

In [4]:
# One-row frame holding the sample value as a plain string in a single
# 'Christmas' column.
date = spark.createDataFrame(
    data=[('2019-12-25 13:30:00',)],
    schema=['Christmas'],
)
date.show()

+-------------------+
|          Christmas|
+-------------------+
|2019-12-25 13:30:00|
+-------------------+



In [5]:
# Inspect the schema before parsing; the recorded output shows 'Christmas' as a string.
date.printSchema()

root
 |-- Christmas: string (nullable = true)



In [6]:
# Parse the same string column twice with one pattern: once as a date and
# once as a timestamp.
# NOTE: this rebinds `date` to a frame without the 'Christmas' column, so
# re-running this cell on its own fails; re-run the cell that creates `date` first.
iso_pattern = 'yyyy-MM-dd HH:mm:ss'
date = date.select(
    to_date(col('Christmas'), iso_pattern),
    to_timestamp(col('Christmas'), iso_pattern),
)
date.show()

+---------------------------------------+--------------------------------------------+
|to_date(Christmas, yyyy-MM-dd HH:mm:ss)|to_timestamp(Christmas, yyyy-MM-dd HH:mm:ss)|
+---------------------------------------+--------------------------------------------+
|                             2019-12-25|                         2019-12-25 13:30:00|
+---------------------------------------+--------------------------------------------+



In [7]:
# Re-check the schema after parsing; the recorded output lists a date column and a timestamp column.
date.printSchema()

root
 |-- to_date(Christmas, yyyy-MM-dd HH:mm:ss): date (nullable = true)
 |-- to_timestamp(Christmas, yyyy-MM-dd HH:mm:ss): timestamp (nullable = true)



### "25/Dec/2019 13:30:00"

In [8]:
# One-row frame with the sample value written as day/abbreviated-month/year
# plus a 24-hour time, stored as a plain string.
date2 = spark.createDataFrame(
    data=[('25/Dec/2019 13:30:00',)],
    schema=['Christmas'],
)
date2.show()

+--------------------+
|           Christmas|
+--------------------+
|25/Dec/2019 13:30:00|
+--------------------+



In [9]:
# Inspect the schema before parsing; the recorded output shows 'Christmas' as a string.
date2.printSchema()

root
 |-- Christmas: string (nullable = true)



In [10]:
# 'MMM' matches the abbreviated month name ('Dec'); the same pattern is used
# for both the date and the timestamp conversion.
# NOTE: this rebinds `date2` to a frame without the 'Christmas' column, so
# re-running this cell on its own fails; re-run the cell that creates `date2` first.
month_abbr_pattern = 'dd/MMM/yyyy HH:mm:ss'
date2 = date2.select(
    to_date(col('Christmas'), month_abbr_pattern),
    to_timestamp(col('Christmas'), month_abbr_pattern),
)
date2.show()

+----------------------------------------+---------------------------------------------+
|to_date(Christmas, dd/MMM/yyyy HH:mm:ss)|to_timestamp(Christmas, dd/MMM/yyyy HH:mm:ss)|
+----------------------------------------+---------------------------------------------+
|                              2019-12-25|                          2019-12-25 13:30:00|
+----------------------------------------+---------------------------------------------+



In [11]:
# Re-check the schema after parsing; the recorded output lists a date column and a timestamp column.
date2.printSchema()

root
 |-- to_date(Christmas, dd/MMM/yyyy HH:mm:ss): date (nullable = true)
 |-- to_timestamp(Christmas, dd/MMM/yyyy HH:mm:ss): timestamp (nullable = true)



### "12/25/2019 01:30:00 PM"

In [12]:
# One-row frame with the sample value written month/day/year with a 12-hour
# clock and an AM/PM marker, stored as a plain string.
date3 = spark.createDataFrame(
    data=[('12/25/2019 01:30:00 PM',)],
    schema=['Christmas'],
)
# truncate=False keeps the full 22-character value visible in the output.
date3.show(truncate=False)

+----------------------+
|Christmas             |
+----------------------+
|12/25/2019 01:30:00 PM|
+----------------------+



In [13]:
# Inspect the schema before parsing; the recorded output shows 'Christmas' as a string.
date3.printSchema()

root
 |-- Christmas: string (nullable = true)



In [14]:
# Lowercase 'hh' is the 12-hour clock and 'a' is the AM/PM marker, so
# '01:30:00 PM' is read as 13:30:00 (see the recorded output).
# NOTE: this rebinds `date3` to a frame without the 'Christmas' column, so
# re-running this cell on its own fails; re-run the cell that creates `date3` first.
us_12h_pattern = 'MM/dd/yyyy hh:mm:ss a'
date3 = date3.select(
    to_date(col('Christmas'), us_12h_pattern),
    to_timestamp(col('Christmas'), us_12h_pattern),
)
date3.show()

+-----------------------------------------+----------------------------------------------+
|to_date(Christmas, MM/dd/yyyy hh:mm:ss a)|to_timestamp(Christmas, MM/dd/yyyy hh:mm:ss a)|
+-----------------------------------------+----------------------------------------------+
|                               2019-12-25|                           2019-12-25 13:30:00|
+-----------------------------------------+----------------------------------------------+



In [15]:
# Re-check the schema after parsing; the recorded output lists a date column and a timestamp column.
date3.printSchema()

root
 |-- to_date(Christmas, MM/dd/yyyy hh:mm:ss a): date (nullable = true)
 |-- to_timestamp(Christmas, MM/dd/yyyy hh:mm:ss a): timestamp (nullable = true)

