-- Notepad to myself --

# Functions

In [17]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [18]:
from pyspark.sql.functions import to_timestamp, col
# Load the crimes CSV (first row is the header, column types are inferred),
# then overwrite Date with a timestamp parsed from the
# month/day/year + 12-hour clock + AM/PM pattern.
df = (
    spark.read
    .csv('data/Crimes-2021.csv', header=True, inferSchema=True)
    .withColumn(
        'Date',
        to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'),
    )
)
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: integer (nullable = true)
 |-- Y Coordinate: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Location: string (nullable = true)



### All the functions available to us

In [19]:
from pyspark.sql import functions
# dir() also returns dunder attributes, private helpers and non-callable
# module attributes; keep only public callables so the listing matches the
# heading. NOTE(review): classes and helpers that the module itself imports
# are callable too, so a few non-function names may still appear.
public_callables = [
    name
    for name in dir(functions)
    if not name.startswith('_') and callable(getattr(functions, name))
]
print(public_callables)



### String Functions

#### Display the Primary Type column in lowercase, in uppercase, with the first letter of each word capitalized, and truncated to its first 4 characters

In [20]:
from pyspark.sql.functions import lower, upper, initcap, substring

In [21]:
# Print the docstring for substring; per the help text, `pos` is 1-based.
help(substring)

Help on function substring in module pyspark.sql.functions:

substring(str, pos, len)
    Substring starts at `pos` and is of length `len` when str is String type or
    returns the slice of byte array that starts at `pos` in byte and is of length `len`
    when str is Binary type.
    
    .. versionadded:: 1.5.0
    
    Notes
    -----
    The position is not zero based, but 1 based index.
    
    Examples
    --------
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
    [Row(s='ab')]



In [22]:
# Build the column expression once and apply each string function to it.
primary_type = col('Primary Type')
df.select(
    lower(primary_type),
    upper(primary_type),
    initcap(primary_type),
    substring(primary_type, 1, 4),  # positions are 1-based: characters 1..4
).show(5, truncate=False)

+--------------------------+--------------------------+--------------------------+-----------------------------+
|lower(Primary Type)       |upper(Primary Type)       |initcap(Primary Type)     |substring(Primary Type, 1, 4)|
+--------------------------+--------------------------+--------------------------+-----------------------------+
|theft                     |THEFT                     |Theft                     |THEF                         |
|other offense             |OTHER OFFENSE             |Other Offense             |OTHE                         |
|offense involving children|OFFENSE INVOLVING CHILDREN|Offense Involving Children|OFFE                         |
|theft                     |THEFT                     |Theft                     |THEF                         |
|battery                   |BATTERY                   |Battery                   |BATT                         |
+--------------------------+--------------------------+--------------------------+-----------------------------+

#### Pad a column to a fixed width (e.g. left-pad District with zeros)

In [23]:
from pyspark.sql.functions import lpad

In [24]:
# Print the docstring for lpad (left-pads a string column to width `len` with `pad`).
help(lpad)

Help on function lpad in module pyspark.sql.functions:

lpad(col, len, pad)
    Left-pad the string column to width `len` with `pad`.
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(lpad(df.s, 6, '#').alias('s')).collect()
    [Row(s='##abcd')]



In [25]:
# Keep the original District next to a copy left-padded with '0' to width 3.
# The padded column is left unaliased, so Spark names it "lpad(District, 3, 0)".
padded_district = lpad(col('District'), 3, '0')
df_lpad = df.select('District', padded_district)

# Show only the distinct (District, padded) pairs, capped at 10 rows.
distinct_districts = df_lpad.distinct()
distinct_districts.show(10)

+--------+--------------------+
|District|lpad(District, 3, 0)|
+--------+--------------------+
|      22|                 022|
|      15|                 015|
|       5|                 005|
|      31|                 031|
|      10|                 010|
|       2|                 002|
|       4|                 004|
|       7|                 007|
|       3|                 003|
|       1|                 001|
+--------+--------------------+
only showing top 10 rows



In [26]:
# Inspect the column types: District stays an integer, the padded column is a string.
df_lpad.printSchema()

root
 |-- District: integer (nullable = true)
 |-- lpad(District, 3, 0): string (nullable = true)



Cast the padded string column back to an integer

In [27]:
# Casting the zero-padded string to an integer turns it back into a number,
# so the leading zeros are not kept in District_formatted.
padded_as_int = col("lpad(District, 3, 0)").cast("int")
df_lpad = df_lpad.withColumn("District_formatted", padded_as_int)
df_lpad.printSchema()

root
 |-- District: integer (nullable = true)
 |-- lpad(District, 3, 0): string (nullable = true)
 |-- District_formatted: integer (nullable = true)



### Numeric Functions

#### Show the oldest date and the most recent date

In [28]:
# NOTE: this import rebinds `min` and `max` in the notebook namespace, so any
# later cell that calls them gets the Spark column functions, not Python's built-ins.
from pyspark.sql.functions import min, max

In [29]:
# Earliest and latest values of the Date column, shown without truncation.
oldest_date = min(col('Date'))
newest_date = max(col('Date'))
df.select(oldest_date, newest_date).show(truncate=False)

+-------------------+-------------------+
|min(Date)          |max(Date)          |
+-------------------+-------------------+
|2021-01-01 00:00:00|2021-12-31 23:59:00|
+-------------------+-------------------+



### Date Functions

#### What is 3 days earlier than the oldest date and 3 days later than the most recent date?

In [30]:
from pyspark.sql.functions import date_add, date_sub, to_date

In [31]:
# Print the docstring for date_add, which takes a start date and a number of days.
help(date_add)

Help on function date_add in module pyspark.sql.functions:

date_add(start, days)
    Returns the date that is `days` days after `start`
    
    .. versionadded:: 1.5.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_add(df.dt, 1).alias('next_date')).collect()
    [Row(next_date=datetime.date(2015, 4, 9))]



In [32]:
# Print the docstring for to_date, including its optional `format` argument.
help(to_date)

Help on function to_date in module pyspark.sql.functions:

to_date(col, format=None)
    Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.DateType`
    using the optionally specified format. Specify formats according to `datetime pattern`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format
    is omitted. Equivalent to ``col.cast("date")``.
    
    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
    
    .. versionadded:: 2.2.0
    
    Examples
    --------
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t).alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]
    
    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]



In [33]:
# Name the two aggregate expressions so each bound is written only once.
first_seen = min(col('Date'))
last_seen = max(col('Date'))

# For each bound, show it as a plain date next to the date shifted by 3 days:
# 3 days before the oldest, 3 days after the most recent.
df.select(
    to_date(first_seen),
    date_sub(first_seen, 3),
    to_date(last_seen),
    date_add(last_seen, 3),
).show(truncate=False)

+------------------+----------------------+------------------+----------------------+
|to_date(min(Date))|date_sub(min(Date), 3)|to_date(max(Date))|date_add(max(Date), 3)|
+------------------+----------------------+------------------+----------------------+
|2021-01-01        |2020-12-29            |2021-12-31        |2022-01-03            |
+------------------+----------------------+------------------+----------------------+

