Read a CSV

In [0]:
fireCsvPath = "/FileStore/tables/sf_fire_calls.csv"

fireDF = (spark
           .read
           .option("header", True)
           .option("inferSchema", True)
           .csv(fireCsvPath)
          )

In [0]:
fireDF.show(5, False)

+----------+------+--------------+----------------+----------+----------+--------------------+----------------------+---------------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+---------------------+-------------------------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|CallType        |CallDate  |WatchDate |CallFinalDisposition|AvailableDtTm         |Address                    |City|Zipcode|Battalion|StationArea|Box |OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|Neighborhood         |Location                             |RowID        |Delay    |
+----------+------+--------------+----------------+----------+----------+--------------------+----------------------+---------------------------+----+-------+--

Access to the DataFrame's schema

In [0]:
fireDF.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

In [0]:
fireDF.schema

Out[4]: StructType(List(StructField(CallNumber,IntegerType,true),StructField(UnitID,StringType,true),StructField(IncidentNumber,IntegerType,true),StructField(CallType,StringType,true),StructField(CallDate,StringType,true),StructField(WatchDate,StringType,true),StructField(CallFinalDisposition,StringType,true),StructField(AvailableDtTm,StringType,true),StructField(Address,StringType,true),StructField(City,StringType,true),StructField(Zipcode,IntegerType,true),StructField(Battalion,StringType,true),StructField(StationArea,StringType,true),StructField(Box,StringType,true),StructField(OriginalPriority,StringType,true),StructField(Priority,StringType,true),StructField(FinalPriority,IntegerType,true),StructField(ALSUnit,BooleanType,true),StructField(CallTypeGroup,StringType,true),StructField(NumAlarms,IntegerType,true),StructField(UnitType,StringType,true),StructField(UnitSequenceInCallDispatch,IntegerType,true),StructField(FirePreventionDistrict,StringType,true),StructField(SupervisorDistri

In [0]:
fireDF.select(fireDF["CallNumber"], fireDF["Box"]).show(5)

+----------+----+
|CallNumber| Box|
+----------+----+
|  20110016|3362|
|  20110022|6495|
|  20110023|1455|
|  20110032|5626|
|  20110043|3223|
+----------+----+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import col

sortedDF = (fireDF.select(col("AvailableDtTm"))
                 .orderBy(col("AvailableDtTm").asc_nulls_last())
                 .show(10, False)
             )

+----------------------+
|AvailableDtTm         |
+----------------------+
|01/01/2001 01:28:04 AM|
|01/01/2001 01:45:54 AM|
|01/01/2001 02:05:38 AM|
|01/01/2001 02:10:47 AM|
|01/01/2001 02:33:55 PM|
|01/01/2001 02:46:15 AM|
|01/01/2001 03:08:20 AM|
|01/01/2001 04:43:42 PM|
|01/01/2001 04:53:50 PM|
|01/01/2001 05:02:40 AM|
+----------------------+
only showing top 10 rows



In [0]:
filterFireDF = (fireDF.select("IncidentNumber", "AvailableDtTm", "CallType")
                     .where(col("CallType") != "Medical Incident")
               )

filterFireDF.show(5, False)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



In [0]:
filterFire2DF = (fireDF.select("IncidentNumber", "AvailableDtTm", "CallType")
                     .where("CallType != 'Medical Incident'")
               )

filterFire2DF.show(5, False)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import countDistinct

countFireDF = (fireDF.select("CallType")
                      .filter(col("CallType").isNotNull())
                      .agg(countDistinct("CallType").alias("DistinctCallTypes"))
              )

countFireDF.show()

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



In [0]:
callTypeDF = (fireDF.select("CallType")
                   .where(col("CallType").isNotNull())
                   .distinct()
             )

callTypeDF.show(10, False)

+-----------------------------+
|CallType                     |
+-----------------------------+
|Elevator / Escalator Rescue  |
|Alarms                       |
|Odor (Strange / Unknown)     |
|Citizen Assist / Service Call|
|Vehicle Fire                 |
|Other                        |
|Outside Fire                 |
|Electrical Hazard            |
|Structure Fire               |
|Medical Incident             |
+-----------------------------+
only showing top 10 rows



In [0]:
fireDF.select("CallDate", "WatchDate", "AvailableDtTm").show(10, False)

+----------+----------+----------------------+
|CallDate  |WatchDate |AvailableDtTm         |
+----------+----------+----------------------+
|01/11/2002|01/10/2002|01/11/2002 01:51:44 AM|
|01/11/2002|01/10/2002|01/11/2002 03:01:18 AM|
|01/11/2002|01/10/2002|01/11/2002 02:39:50 AM|
|01/11/2002|01/10/2002|01/11/2002 04:16:46 AM|
|01/11/2002|01/10/2002|01/11/2002 06:01:58 AM|
|01/11/2002|01/11/2002|01/11/2002 08:03:26 AM|
|01/11/2002|01/11/2002|01/11/2002 09:46:44 AM|
|01/11/2002|01/11/2002|01/11/2002 09:58:53 AM|
|01/11/2002|01/11/2002|01/11/2002 12:06:57 PM|
|01/11/2002|01/11/2002|01/11/2002 01:08:40 PM|
+----------+----------+----------------------+
only showing top 10 rows



In [0]:
from pyspark.sql.functions import to_timestamp

timestampFireDF = (fireDF.withColumn("IncidentDate", to_timestamp("CallDate", "MM/dd/yyyy"))
                         .drop("CallDate")
                         .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
                         .drop("WatchDate")
                         .withColumn("AvailableDtTS", to_timestamp("AvailableDtTm", "MM/dd/yyyy hh:mm:ss a"))
                         .drop("AvailableDtTm")
                  )
timestampFireDF.show(10, False)

+----------+------+--------------+----------------+--------------------+---------------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+------------------------------+-------------------------------------+-------------+---------+-------------------+-------------------+-------------------+
|CallNumber|UnitID|IncidentNumber|CallType        |CallFinalDisposition|Address                    |City|Zipcode|Battalion|StationArea|Box |OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|Neighborhood                  |Location                             |RowID        |Delay    |IncidentDate       |OnWatchDate        |AvailableDtTS      |
+----------+------+--------------+----------------+--------------------+---------------------------+----+-------

In [0]:
# Spark Timestamp consists of value in the format “yyyy-MM-dd HH:mm:ss.SSSS” and date format would be ” yyyy-MM-dd”, Use to_date() function to truncate time from Timestamp or to convert the timestamp to date on Spark DataFrame column.

# The input to this function should be timestamp column or string in TimestampType format and it returns just date in DateType column.

from pyspark.sql.functions import to_date

dateFireDF = (fireDF.withColumn("IncidentDate", to_date("CallDate", "MM/dd/yyyy"))
                    .withColumn("OnWatchDate", to_date(col("WatchDate"), "MM/dd/yyyy"))
                    .withColumn("AvailableDtTS", to_date("AvailableDtTm", "MM/dd/yyyy hh:mm:ss a"))
                    .select("CallDate", "IncidentDate", "WatchDate", "OnWatchDate", "AvailableDtTm", "AvailableDtTS")
              )
                                
dateFireDF.show(10, False)

+----------+------------+----------+-----------+----------------------+-------------+
|CallDate  |IncidentDate|WatchDate |OnWatchDate|AvailableDtTm         |AvailableDtTS|
+----------+------------+----------+-----------+----------------------+-------------+
|01/11/2002|2002-01-11  |01/10/2002|2002-01-10 |01/11/2002 01:51:44 AM|2002-01-11   |
|01/11/2002|2002-01-11  |01/10/2002|2002-01-10 |01/11/2002 03:01:18 AM|2002-01-11   |
|01/11/2002|2002-01-11  |01/10/2002|2002-01-10 |01/11/2002 02:39:50 AM|2002-01-11   |
|01/11/2002|2002-01-11  |01/10/2002|2002-01-10 |01/11/2002 04:16:46 AM|2002-01-11   |
|01/11/2002|2002-01-11  |01/10/2002|2002-01-10 |01/11/2002 06:01:58 AM|2002-01-11   |
|01/11/2002|2002-01-11  |01/11/2002|2002-01-11 |01/11/2002 08:03:26 AM|2002-01-11   |
|01/11/2002|2002-01-11  |01/11/2002|2002-01-11 |01/11/2002 09:46:44 AM|2002-01-11   |
|01/11/2002|2002-01-11  |01/11/2002|2002-01-11 |01/11/2002 09:58:53 AM|2002-01-11   |
|01/11/2002|2002-01-11  |01/11/2002|2002-01-11 |01/11/

In [0]:
from pyspark.sql.functions import year, month, dayofweek

yearDF = timestampFireDF.select("IncidentDate", year("IncidentDate"), month("IncidentDate"), dayofweek("IncidentDate"))

yearDF.show(10, False)

+-------------------+------------------+-------------------+-----------------------+
|IncidentDate       |year(IncidentDate)|month(IncidentDate)|dayofweek(IncidentDate)|
+-------------------+------------------+-------------------+-----------------------+
|2002-01-11 00:00:00|2002              |1                  |6                      |
|2002-01-11 00:00:00|2002              |1                  |6                      |
|2002-01-11 00:00:00|2002              |1                  |6                      |
|2002-01-11 00:00:00|2002              |1                  |6                      |
|2002-01-11 00:00:00|2002              |1                  |6                      |
|2002-01-11 00:00:00|2002              |1                  |6                      |
|2002-01-11 00:00:00|2002              |1                  |6                      |
|2002-01-11 00:00:00|2002              |1                  |6                      |
|2002-01-11 00:00:00|2002              |1                  |6    

In [0]:
year2DF = dateFireDF.select("IncidentDate", year("IncidentDate"), month("IncidentDate"), dayofweek("IncidentDate"))

year2DF.show(10, False)

+------------+------------------+-------------------+-----------------------+
|IncidentDate|year(IncidentDate)|month(IncidentDate)|dayofweek(IncidentDate)|
+------------+------------------+-------------------+-----------------------+
|2002-01-11  |2002              |1                  |6                      |
|2002-01-11  |2002              |1                  |6                      |
|2002-01-11  |2002              |1                  |6                      |
|2002-01-11  |2002              |1                  |6                      |
|2002-01-11  |2002              |1                  |6                      |
|2002-01-11  |2002              |1                  |6                      |
|2002-01-11  |2002              |1                  |6                      |
|2002-01-11  |2002              |1                  |6                      |
|2002-01-11  |2002              |1                  |6                      |
|2002-01-11  |2002              |1                  |6          

In [0]:
from pyspark.sql.functions import date_add

plus2DF = (timestampFireDF.withColumn("plus_two_days", date_add(col("IncidentDate"), 2))
                         .select("IncidentDate", "plus_two_days")
          )

plus2DF.show(5, False)

+-------------------+-------------+
|IncidentDate       |plus_two_days|
+-------------------+-------------+
|2002-01-11 00:00:00|2002-01-13   |
|2002-01-11 00:00:00|2002-01-13   |
|2002-01-11 00:00:00|2002-01-13   |
|2002-01-11 00:00:00|2002-01-13   |
|2002-01-11 00:00:00|2002-01-13   |
+-------------------+-------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import upper

upperDF = (fireDF.withColumn("UpperCallType", upper("CallType"))
                 .select("CallType", "UpperCallType")
          )

upperDF.show(10, False)

+----------------+----------------+
|CallType        |UpperCallType   |
+----------------+----------------+
|Structure Fire  |STRUCTURE FIRE  |
|Medical Incident|MEDICAL INCIDENT|
|Medical Incident|MEDICAL INCIDENT|
|Vehicle Fire    |VEHICLE FIRE    |
|Alarms          |ALARMS          |
|Structure Fire  |STRUCTURE FIRE  |
|Alarms          |ALARMS          |
|Alarms          |ALARMS          |
|Medical Incident|MEDICAL INCIDENT|
|Medical Incident|MEDICAL INCIDENT|
+----------------+----------------+
only showing top 10 rows



In [0]:
from pyspark.sql.functions import lower

fireDF.select(col("CallType"),
lower("CallType"),
upper(lower(col("CallType")))).show(2)

+----------------+----------------+----------------------+
|        CallType| lower(CallType)|upper(lower(CallType))|
+----------------+----------------+----------------------+
|  Structure Fire|  structure fire|        STRUCTURE FIRE|
|Medical Incident|medical incident|      MEDICAL INCIDENT|
+----------------+----------------+----------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import desc

fireDF2018 = (timestampFireDF.filter(year("IncidentDate") == 2002)
                             .groupBy("Neighborhood")
                             .count()
                             .orderBy(col("count").desc())
             )

fireDF2018.show(10)

+--------------------+-----+
|        Neighborhood|count|
+--------------------+-----+
|          Tenderloin|  962|
|             Mission|  784|
|     South of Market|  712|
|Financial Distric...|  568|
|Bayview Hunters P...|  495|
|     Sunset/Parkside|  383|
|    Western Addition|  297|
|            Nob Hill|  236|
|      Outer Richmond|  215|
|  West of Twin Peaks|  204|
+--------------------+-----+
only showing top 10 rows



In [0]:
from pyspark.sql.functions import concat

concatFireDF = fireDF.select(concat(fireDF.Address, fireDF.Zipcode).alias("FullAddress"), "City")

concatFireDF.show(5, False)

+--------------------------------+----+
|FullAddress                     |City|
+--------------------------------+----+
|2000 Block of CALIFORNIA ST94109|SF  |
|0 Block of SILVERVIEW DR94124   |SF  |
|MARKET ST/MCALLISTER ST94102    |SF  |
|APPLETON AV/MISSION ST94110     |SF  |
|1400 Block of SUTTER ST94109    |SF  |
+--------------------------------+----+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import concat_ws

concatWsFireDF = fireDF.select(concat_ws(" ", fireDF.Address, fireDF.Zipcode).alias("FullAddress"), "City")

concatWsFireDF.show(5, False)

+---------------------------------+----+
|FullAddress                      |City|
+---------------------------------+----+
|2000 Block of CALIFORNIA ST 94109|SF  |
|0 Block of SILVERVIEW DR 94124   |SF  |
|MARKET ST/MCALLISTER ST 94102    |SF  |
|APPLETON AV/MISSION ST 94110     |SF  |
|1400 Block of SUTTER ST 94109    |SF  |
+---------------------------------+----+
only showing top 5 rows



In [0]:
concat2WsFireDF = fireDF.select(concat_ws(" ", "Address", "Zipcode").alias("FullAddress"), "City")

concat2WsFireDF.show(1, False)

+---------------------------------+----+
|FullAddress                      |City|
+---------------------------------+----+
|2000 Block of CALIFORNIA ST 94109|SF  |
+---------------------------------+----+
only showing top 1 row



In [0]:
concat3WsFireDF = fireDF.select(concat_ws(" ", col("Address"), col("Zipcode")).alias("FullAddress"), "City")

concat3WsFireDF.show(1, False)

+---------------------------------+----+
|FullAddress                      |City|
+---------------------------------+----+
|2000 Block of CALIFORNIA ST 94109|SF  |
+---------------------------------+----+
only showing top 1 row



In [0]:
delayFireDF = fireDF.withColumn("DoubleDelay", col("Delay") * 2) \
                    .select("IncidentNumber", "CallType", "Delay", "DoubleDelay")

delayFireDF.show(5)

+--------------+----------------+---------+-----------+
|IncidentNumber|        CallType|    Delay|DoubleDelay|
+--------------+----------------+---------+-----------+
|       2003235|  Structure Fire|     2.95|        5.9|
|       2003241|Medical Incident|      4.7|        9.4|
|       2003242|Medical Incident|2.4333334|  4.8666668|
|       2003250|    Vehicle Fire|      1.5|        3.0|
|       2003259|          Alarms|3.4833333|  6.9666666|
+--------------+----------------+---------+-----------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import expr

delayFire2DF = fireDF.withColumn("DoubleDelay", expr("Delay * 2")) \
                    .select("IncidentNumber", "CallType", "Delay", "DoubleDelay")

delayFire2DF.show(5)

+--------------+----------------+---------+-----------+
|IncidentNumber|        CallType|    Delay|DoubleDelay|
+--------------+----------------+---------+-----------+
|       2003235|  Structure Fire|     2.95|        5.9|
|       2003241|Medical Incident|      4.7|        9.4|
|       2003242|Medical Incident|2.4333334|  4.8666668|
|       2003250|    Vehicle Fire|      1.5|        3.0|
|       2003259|          Alarms|3.4833333|  6.9666666|
+--------------+----------------+---------+-----------+
only showing top 5 rows



In [0]:
delayFire2DF.first().CallType

Out[26]: 'Structure Fire'

In [0]:
delayFire2DF.select(delayFire2DF.IncidentNumber).show(5)

+--------------+
|IncidentNumber|
+--------------+
|       2003235|
|       2003241|
|       2003242|
|       2003250|
|       2003259|
+--------------+
only showing top 5 rows



In [0]:
delayDF = fireDF.select("IncidentNumber", "CallType", "Delay", expr("Delay / 2 as HalfDelay"))

delayDF.show(5)

+--------------+----------------+---------+----------+
|IncidentNumber|        CallType|    Delay| HalfDelay|
+--------------+----------------+---------+----------+
|       2003235|  Structure Fire|     2.95|     1.475|
|       2003241|Medical Incident|      4.7|      2.35|
|       2003242|Medical Incident|2.4333334| 1.2166667|
|       2003250|    Vehicle Fire|      1.5|      0.75|
|       2003259|          Alarms|3.4833333|1.74166665|
+--------------+----------------+---------+----------+
only showing top 5 rows



In [0]:
delay2DF = fireDF.select("IncidentNumber", "CallType", "Delay", (col("Delay") / 2).alias("HalfDelay"))

delay2DF.show(5)

+--------------+----------------+---------+----------+
|IncidentNumber|        CallType|    Delay| HalfDelay|
+--------------+----------------+---------+----------+
|       2003235|  Structure Fire|     2.95|     1.475|
|       2003241|Medical Incident|      4.7|      2.35|
|       2003242|Medical Incident|2.4333334| 1.2166667|
|       2003250|    Vehicle Fire|      1.5|      0.75|
|       2003259|          Alarms|3.4833333|1.74166665|
+--------------+----------------+---------+----------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import coalesce

coalesceDF = fireDF.select("IncidentNumber", "CallType", "CallDate", coalesce(col("CallTypeGroup"), col("CallType")).alias("CallTypeGroup"))

coalesceDF.show(5)

+--------------+----------------+----------+----------------+
|IncidentNumber|        CallType|  CallDate|   CallTypeGroup|
+--------------+----------------+----------+----------------+
|       2003235|  Structure Fire|01/11/2002|  Structure Fire|
|       2003241|Medical Incident|01/11/2002|Medical Incident|
|       2003242|Medical Incident|01/11/2002|Medical Incident|
|       2003250|    Vehicle Fire|01/11/2002|    Vehicle Fire|
|       2003259|          Alarms|01/11/2002|          Alarms|
+--------------+----------------+----------+----------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import lit

coalesce2DF = fireDF.select("IncidentNumber", "CallType", "CallDate", coalesce(col("CallTypeGroup"), lit("test")).alias("CallTypeGroup"))

coalesce2DF.show(5)

+--------------+----------------+----------+-------------+
|IncidentNumber|        CallType|  CallDate|CallTypeGroup|
+--------------+----------------+----------+-------------+
|       2003235|  Structure Fire|01/11/2002|         test|
|       2003241|Medical Incident|01/11/2002|         test|
|       2003242|Medical Incident|01/11/2002|         test|
|       2003250|    Vehicle Fire|01/11/2002|         test|
|       2003259|          Alarms|01/11/2002|         test|
+--------------+----------------+----------+-------------+
only showing top 5 rows



In [0]:
coalesce3DF = fireDF.withColumn("CallGroup", coalesce("CallTypeGroup", lit(1))) \
                    .select("IncidentNumber", "CallType", "CallDate", "CallTypeGroup", "CallGroup") \
                    .show(5)

+--------------+----------------+----------+-------------+---------+
|IncidentNumber|        CallType|  CallDate|CallTypeGroup|CallGroup|
+--------------+----------------+----------+-------------+---------+
|       2003235|  Structure Fire|01/11/2002|         null|        1|
|       2003241|Medical Incident|01/11/2002|         null|        1|
|       2003242|Medical Incident|01/11/2002|         null|        1|
|       2003250|    Vehicle Fire|01/11/2002|         null|        1|
|       2003259|          Alarms|01/11/2002|         null|        1|
+--------------+----------------+----------+-------------+---------+
only showing top 5 rows



In [0]:
coalesce4DF = fireDF.withColumn("CallGroup", expr("coalesce(CallTypeGroup, 1)")) \
                    .select("IncidentNumber", "CallType", "CallDate", "CallTypeGroup", "CallGroup") \
                    .show(5)

+--------------+----------------+----------+-------------+---------+
|IncidentNumber|        CallType|  CallDate|CallTypeGroup|CallGroup|
+--------------+----------------+----------+-------------+---------+
|       2003235|  Structure Fire|01/11/2002|         null|        1|
|       2003241|Medical Incident|01/11/2002|         null|        1|
|       2003242|Medical Incident|01/11/2002|         null|        1|
|       2003250|    Vehicle Fire|01/11/2002|         null|        1|
|       2003259|          Alarms|01/11/2002|         null|        1|
+--------------+----------------+----------+-------------+---------+
only showing top 5 rows



In [0]:
ifnullDF = fireDF.withColumn("CallGroup", expr("ifnull(CallTypeGroup, 'test')")) \
                    .select("IncidentNumber", "CallType", "CallDate", "CallTypeGroup", "CallGroup") \
                    .show(5)

# ifnull() is a SparkSQL function and it is not available as a DataFrame function so it does not work as a DataFrame function

# you do not need lit() in Spark SQL

+--------------+----------------+----------+-------------+---------+
|IncidentNumber|        CallType|  CallDate|CallTypeGroup|CallGroup|
+--------------+----------------+----------+-------------+---------+
|       2003235|  Structure Fire|01/11/2002|         null|     test|
|       2003241|Medical Incident|01/11/2002|         null|     test|
|       2003242|Medical Incident|01/11/2002|         null|     test|
|       2003250|    Vehicle Fire|01/11/2002|         null|     test|
|       2003259|          Alarms|01/11/2002|         null|     test|
+--------------+----------------+----------+-------------+---------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import isnull

isnullDF = fireDF.withColumn("isNullCallTypeGroup", isnull("CallTypeGroup")) \
                    .select("IncidentNumber", "CallType", "CallDate", "CallTypeGroup", "isNullCallTypeGroup") \
                    .show(5)


+--------------+----------------+----------+-------------+-------------------+
|IncidentNumber|        CallType|  CallDate|CallTypeGroup|isNullCallTypeGroup|
+--------------+----------------+----------+-------------+-------------------+
|       2003235|  Structure Fire|01/11/2002|         null|               true|
|       2003241|Medical Incident|01/11/2002|         null|               true|
|       2003242|Medical Incident|01/11/2002|         null|               true|
|       2003250|    Vehicle Fire|01/11/2002|         null|               true|
|       2003259|          Alarms|01/11/2002|         null|               true|
+--------------+----------------+----------+-------------+-------------------+
only showing top 5 rows



In [0]:
display(fireDF)

CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,CallFinalDisposition,AvailableDtTm,Address,City,Zipcode,Battalion,StationArea,Box,OriginalPriority,Priority,FinalPriority,ALSUnit,CallTypeGroup,NumAlarms,UnitType,UnitSequenceInCallDispatch,FirePreventionDistrict,SupervisorDistrict,Neighborhood,Location,RowID,Delay
20110016,T13,2003235,Structure Fire,01/11/2002,01/10/2002,Other,01/11/2002 01:51:44 AM,2000 Block of CALIFORNIA ST,SF,94109.0,B04,38,3362,3,3,3,False,,1,TRUCK,2,4.0,5.0,Pacific Heights,"(37.7895840679362, -122.428071912459)",020110016-T13,2.95
20110022,M17,2003241,Medical Incident,01/11/2002,01/10/2002,Other,01/11/2002 03:01:18 AM,0 Block of SILVERVIEW DR,SF,94124.0,B10,42,6495,3,3,3,True,,1,MEDIC,1,10.0,10.0,Bayview Hunters Point,"(37.7337623673897, -122.396113802632)",020110022-M17,4.7
20110023,M41,2003242,Medical Incident,01/11/2002,01/10/2002,Other,01/11/2002 02:39:50 AM,MARKET ST/MCALLISTER ST,SF,94102.0,B03,1,1455,3,3,3,True,,1,MEDIC,2,3.0,6.0,Tenderloin,"(37.7811772186856, -122.411699931232)",020110023-M41,2.4333334
20110032,E11,2003250,Vehicle Fire,01/11/2002,01/10/2002,Other,01/11/2002 04:16:46 AM,APPLETON AV/MISSION ST,SF,94110.0,B06,32,5626,3,3,3,False,,1,ENGINE,1,6.0,9.0,Bernal Heights,"(37.7388432849018, -122.423948785199)",020110032-E11,1.5
20110043,B04,2003259,Alarms,01/11/2002,01/10/2002,Other,01/11/2002 06:01:58 AM,1400 Block of SUTTER ST,SF,94109.0,B04,3,3223,3,3,3,False,,1,CHIEF,2,4.0,2.0,Western Addition,"(37.7872890372638, -122.424236212664)",020110043-B04,3.4833333
20110072,T08,2003279,Structure Fire,01/11/2002,01/11/2002,Other,01/11/2002 08:03:26 AM,BEALE ST/FOLSOM ST,SF,94105.0,B03,35,2122,3,3,3,False,,1,TRUCK,2,3.0,6.0,Financial District/South Beach,"(37.7886866619654, -122.392722833778)",020110072-T08,1.75
20110125,E33,2003301,Alarms,01/11/2002,01/11/2002,Other,01/11/2002 09:46:44 AM,0 Block of FARALLONES ST,SF,94112.0,B09,33,8324,3,3,3,False,,1,ENGINE,2,9.0,11.0,Oceanview/Merced/Ingleside,"(37.7140353531157, -122.454117149916)",020110125-E33,2.7166667
20110130,E36,2003304,Alarms,01/11/2002,01/11/2002,Other,01/11/2002 09:58:53 AM,600 Block of POLK ST,SF,94102.0,B02,3,3114,3,3,3,False,,1,ENGINE,1,2.0,6.0,Tenderloin,"(37.7826266328595, -122.41915582123)",020110130-E36,1.7833333
20110197,E05,2003343,Medical Incident,01/11/2002,01/11/2002,Other,01/11/2002 12:06:57 PM,1500 Block of WEBSTER ST,SF,94115.0,B04,5,3513,3,3,3,False,,1,ENGINE,1,4.0,5.0,Japantown,"(37.784958590666, -122.431435274503)",020110197-E05,1.5166667
20110215,E06,2003348,Medical Incident,01/11/2002,01/11/2002,Other,01/11/2002 01:08:40 PM,DIAMOND ST/MARKET ST,SF,94114.0,B05,6,5415,3,3,3,False,,1,ENGINE,1,5.0,8.0,Castro/Upper Market,"(37.7618954753708, -122.437298717721)",020110215-E06,2.7666667


In [0]:
fireDF.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

In [0]:
castDF = fireDF.select("CallNumber", "UnitID", col("IncidentNumber").cast("String"))
               
castDF.show(5)

+----------+------+--------------+
|CallNumber|UnitID|IncidentNumber|
+----------+------+--------------+
|  20110016|   T13|       2003235|
|  20110022|   M17|       2003241|
|  20110023|   M41|       2003242|
|  20110032|   E11|       2003250|
|  20110043|   B04|       2003259|
+----------+------+--------------+
only showing top 5 rows



In [0]:
castDF.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: string (nullable = true)



In [0]:
cast2DF = fireDF.select("CallNumber", "UnitID", expr("STRING(IncidentNumber)"))
               
cast2DF.show(5)

+----------+------+--------------+
|CallNumber|UnitID|IncidentNumber|
+----------+------+--------------+
|  20110016|   T13|       2003235|
|  20110022|   M17|       2003241|
|  20110023|   M41|       2003242|
|  20110032|   E11|       2003250|
|  20110043|   B04|       2003259|
+----------+------+--------------+
only showing top 5 rows



In [0]:
cast2DF.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: string (nullable = true)



In [0]:
exprDF = fireDF.selectExpr("Delay", "CASE WHEN Delay <2 THEN 'minor delay' ELSE 'longer delay' END AS DelayType ")

exprDF.show(5)

+---------+------------+
|    Delay|   DelayType|
+---------+------------+
|     2.95|longer delay|
|      4.7|longer delay|
|2.4333334|longer delay|
|      1.5| minor delay|
|3.4833333|longer delay|
+---------+------------+
only showing top 5 rows



In [0]:
exprDF.limit(10).where("Delay > 3").show()

+---------+------------+
|    Delay|   DelayType|
+---------+------------+
|      4.7|longer delay|
|3.4833333|longer delay|
+---------+------------+



In [0]:
exprDF.where("Delay > 3").limit(10).show()

+---------+------------+
|    Delay|   DelayType|
+---------+------------+
|      4.7|longer delay|
|3.4833333|longer delay|
|     4.95|longer delay|
|     5.35|longer delay|
|4.4666667|longer delay|
|3.8833334|longer delay|
|      3.3|longer delay|
|      3.1|longer delay|
|3.3333333|longer delay|
|3.9666667|longer delay|
+---------+------------+



In [0]:
sampleDF = fireDF.sample(0.2, 5)

sampleDF.show(5)

+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+------------

In [0]:
sample3DF = fireDF.sample(0.2, 5)

sample3DF.show(5)

+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+------------

In [0]:
sample4DF = fireDF.sample(0.2, 11)

sample4DF.show(5)

+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+------------

In [0]:
sample2DF = fireDF.sample(True, 0.2)

sample2DF.show(5)

+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|      UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+

Spark can be used for the following purposes:
- Batch Data processing of large volumes using Spark Dataframe
- Real-Time Stream processing using Spark Structured Streaming
- Adhoc Data Analysis using Spark SQL
- Graph Processing using Spark GraphX
- Machine Learning using Spark MLLib

In [0]:
from pyspark.sql.types import IntegerType

mylist = [1002, 3001, 4002, 2003, 2002, 3004, 1003, 4006]

df = spark.createDataFrame(mylist, IntegerType()).toDF("value")

df.withColumn("key", col("value") % 1000) \
    .show()

+-----+---+
|value|key|
+-----+---+
| 1002|  2|
| 3001|  1|
| 4002|  2|
| 2003|  3|
| 2002|  2|
| 3004|  4|
| 1003|  3|
| 4006|  6|
+-----+---+



In [0]:
from pyspark.sql.functions import expr

mylist = [1002, 3001, 4002, 2003, 2002, 3004, 1003, 4006]

df = spark.createDataFrame(mylist, IntegerType()).toDF("value")

df.withColumn("key", col("value") % 1000) \
    .groupBy("Key") \
    .agg(expr("count(key) as count"), expr("sum(key) as sum")) \
    .orderBy(col("key").desc()) \
    .show()

+---+-----+---+
|Key|count|sum|
+---+-----+---+
|  6|    1|  6|
|  4|    1|  4|
|  3|    2|  6|
|  2|    3|  6|
|  1|    1|  1|
+---+-----+---+



In [0]:
myList1 = [1,2,3,4,5]

dfNumeros = spark.createDataFrame(myList1, IntegerType()).toDF("num")

In [0]:
myList2 = [1,3,5,7,9]

dfNumeros2 = spark.createDataFrame(myList2, IntegerType()).toDF("value")

In [0]:
dfNumeros3 = dfNumeros.union(dfNumeros2)

dfNumeros3.show()

+---+
|num|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  1|
|  3|
|  5|
|  7|
|  9|
+---+



In [0]:
dfNumeros4 = dfNumeros.union(dfNumeros2).dropDuplicates()

dfNumeros4.show()

+---+
|num|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  7|
|  9|
+---+



In [0]:
df.head()

Out[55]: Row(value=1002)

In [0]:
df.first()

Out[56]: Row(value=1002)

In [0]:
df.describe()

Out[57]: DataFrame[summary: string, value: string]

In [0]:
df.summary().show()

+-------+------------------+
|summary|             value|
+-------+------------------+
|  count|                 8|
|   mean|          2502.875|
| stddev|1195.7673492663314|
|    min|              1002|
|    25%|              1003|
|    50%|              2003|
|    75%|              3004|
|    max|              4006|
+-------+------------------+



In [0]:
myList3 = [1,3,5,7,9,1,5,7,11,13]

numbersDF = spark.createDataFrame(myList3, IntegerType()).toDF("numbers")

numbersDF.show()

+-------+
|numbers|
+-------+
|      1|
|      3|
|      5|
|      7|
|      9|
|      1|
|      5|
|      7|
|     11|
|     13|
+-------+



In [0]:
newNumbersDF = numbersDF.dropDuplicates(["numbers"])

newNumbersDF.show()

+-------+
|numbers|
+-------+
|      1|
|      3|
|      5|
|      9|
|      7|
|     13|
|     11|
+-------+



In [0]:
from pyspark.sql.functions import sum, avg, min, max

delaysDF = fireDF.select(sum("NumAlarms"), avg("Delay"), min("Delay"), max("Delay"))

delaysDF.show()

+--------------+------------------+-----------+----------+
|sum(NumAlarms)|        avg(Delay)| min(Delay)|max(Delay)|
+--------------+------------------+-----------+----------+
|        176170|3.8923641541750134|0.016666668|   1844.55|
+--------------+------------------+-----------+----------+



In [0]:
delays2DF = fireDF.agg(sum("NumAlarms").alias("TotalNumAlarms"), avg("Delay").alias("avgDelay"), min("Delay").alias("minDelay"), max("Delay").alias("maxDelay")).select("TotalNumAlarms", "avgDelay", "minDelay", "maxDelay")

delays2DF.show()

+--------------+------------------+-----------+--------+
|TotalNumAlarms|          avgDelay|   minDelay|maxDelay|
+--------------+------------------+-----------+--------+
|        176170|3.8923641541750134|0.016666668| 1844.55|
+--------------+------------------+-----------+--------+



In [0]:
delays3DF = fireDF.agg(expr("sum(NumAlarms) as TotalNumAlarms"))

delays3DF.show()

+--------------+
|TotalNumAlarms|
+--------------+
|        176170|
+--------------+



In [0]:
new_fire_df = fireDF.withColumnRenamed("Delay", "ResponseDelayedinMins")
(new_fire_df
 .select("ResponseDelayedinMins")
 .where(expr("ResponseDelayedinMins > 5"))
 .show(5, False))

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

windowSpec = Window\
    .partitionBy("Neighborhood")\
    .orderBy(desc("NumAlarms"))\

from pyspark.sql.functions import dense_rank

rankDF = fireDF.withColumn("DenseRank", dense_rank().over(windowSpec))\
               .select("Neighborhood", "numAlarms", "DenseRank", "CallType")

rankDF.show(10)

+--------------------+---------+---------+--------------+
|        Neighborhood|numAlarms|DenseRank|      CallType|
+--------------------+---------+---------+--------------+
|Bayview Hunters P...|        4|        1|Structure Fire|
|Bayview Hunters P...|        4|        1|Structure Fire|
|Bayview Hunters P...|        4|        1|Structure Fire|
|Bayview Hunters P...|        4|        1|Structure Fire|
|Bayview Hunters P...|        4|        1|Structure Fire|
|Bayview Hunters P...|        4|        1|Structure Fire|
|Bayview Hunters P...|        4|        1|Structure Fire|
|Bayview Hunters P...|        4|        1|Structure Fire|
|Bayview Hunters P...|        3|        2|Structure Fire|
|Bayview Hunters P...|        3|        2|Structure Fire|
+--------------------+---------+---------+--------------+
only showing top 10 rows



In [0]:
windowSpec2 = Window\
    .partitionBy("CallType")\
    .orderBy("NumAlarms")\

from pyspark.sql.functions import rank

rank2DF = fireDF.withColumn("Rank", rank().over(windowSpec2))\
               .select("Neighborhood", "numAlarms", "Rank", "CallType")

rank2DF.show(10)

+--------------------+---------+----+------------------+
|        Neighborhood|numAlarms|Rank|          CallType|
+--------------------+---------+----+------------------+
|Financial Distric...|        1|   1|    Administrative|
|             Mission|        1|   1|    Administrative|
|     South of Market|        1|   1|    Administrative|
|                None|        1|   1|Aircraft Emergency|
|                None|        1|   1|Aircraft Emergency|
|                None|        1|   1|Aircraft Emergency|
|                None|        1|   1|Aircraft Emergency|
|                None|        1|   1|Aircraft Emergency|
|                None|        1|   1|Aircraft Emergency|
|                None|        1|   1|Aircraft Emergency|
+--------------------+---------+----+------------------+
only showing top 10 rows



In [0]:
dateFire = fireDF.select(col("CallNumber"), col("IncidentNumber"), to_date(col("CallDate"), "dd/MM/yyyy").alias("NewCallDate"))

dateFire.show(5)

+----------+--------------+-----------+
|CallNumber|IncidentNumber|NewCallDate|
+----------+--------------+-----------+
|  20110016|       2003235| 2002-11-01|
|  20110022|       2003241| 2002-11-01|
|  20110023|       2003242| 2002-11-01|
|  20110032|       2003250| 2002-11-01|
|  20110043|       2003259| 2002-11-01|
+----------+--------------+-----------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import date_add

plus2DF = dateFire.withColumn("plus_two_days", date_add("NewCallDate", 2))

plus2DF.show(5, False)

+----------+--------------+-----------+-------------+
|CallNumber|IncidentNumber|NewCallDate|plus_two_days|
+----------+--------------+-----------+-------------+
|20110016  |2003235       |2002-11-01 |2002-11-03   |
|20110022  |2003241       |2002-11-01 |2002-11-03   |
|20110023  |2003242       |2002-11-01 |2002-11-03   |
|20110032  |2003250       |2002-11-01 |2002-11-03   |
|20110043  |2003259       |2002-11-01 |2002-11-03   |
+----------+--------------+-----------+-------------+
only showing top 5 rows



In [0]:
dateFire.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- NewCallDate: date (nullable = true)



In [0]:
tsFire = fireDF.withColumn("AvailableTimestamp", (to_timestamp("AvailableDtTm", "dd/MM/yyyy HH:mm:ss aa")))\
               .select("AvailableTimestamp", "IncidentNumber")

tsFire.show(5, False)

+-------------------+--------------+
|AvailableTimestamp |IncidentNumber|
+-------------------+--------------+
|2002-11-01 01:51:44|2003235       |
|2002-11-01 03:01:18|2003241       |
|2002-11-01 02:39:50|2003242       |
|2002-11-01 04:16:46|2003250       |
|2002-11-01 06:01:58|2003259       |
+-------------------+--------------+
only showing top 5 rows



In [0]:
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [0]:
tsFire2 = fireDF.select("IncidentNumber", to_timestamp("AvailableDtTm", "dd/MM/yyyy HH:mm:ss aa").alias("AvailableTimestamp"))

tsFire2.show(5, False)

+--------------+-------------------+
|IncidentNumber|AvailableTimestamp |
+--------------+-------------------+
|2003235       |2002-11-01 01:51:44|
|2003241       |2002-11-01 03:01:18|
|2003242       |2002-11-01 02:39:50|
|2003250       |2002-11-01 04:16:46|
|2003259       |2002-11-01 06:01:58|
+--------------+-------------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import year, month, dayofweek, minute, second

daysDF = (tsFire
              .withColumn("year", year(col("AvailableTimestamp")))
              .withColumn("month", month(col("AvailableTimestamp")))
              .withColumn("dayofweek", dayofweek(col("AvailableTimestamp")))
              .withColumn("minute", minute(col("AvailableTimestamp")))
              .withColumn("second", second(col("AvailableTimestamp")))
             )
daysDF.show(5, False)

+-------------------+--------------+----+-----+---------+------+------+
|AvailableTimestamp |IncidentNumber|year|month|dayofweek|minute|second|
+-------------------+--------------+----+-----+---------+------+------+
|2002-11-01 01:51:44|2003235       |2002|11   |6        |51    |44    |
|2002-11-01 03:01:18|2003241       |2002|11   |6        |1     |18    |
|2002-11-01 02:39:50|2003242       |2002|11   |6        |39    |50    |
|2002-11-01 04:16:46|2003250       |2002|11   |6        |16    |46    |
|2002-11-01 06:01:58|2003259       |2002|11   |6        |1     |58    |
+-------------------+--------------+----+-----+---------+------+------+
only showing top 5 rows



In [0]:
concDF = fireDF.withColumn("Call", concat_ws(" ", "CallNumber", "CallType", "CallDate"))\
               .select("IncidentNumber", "Call")

concDF.show(5, False)

+--------------+------------------------------------+
|IncidentNumber|Call                                |
+--------------+------------------------------------+
|2003235       |20110016 Structure Fire 01/11/2002  |
|2003241       |20110022 Medical Incident 01/11/2002|
|2003242       |20110023 Medical Incident 01/11/2002|
|2003250       |20110032 Vehicle Fire 01/11/2002    |
|2003259       |20110043 Alarms 01/11/2002          |
+--------------+------------------------------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import split

splitDF = concDF.withColumn("splitted", split("Call", " "))

splitDF.show(5, False)

+--------------+------------------------------------+-----------------------------------------+
|IncidentNumber|Call                                |splitted                                 |
+--------------+------------------------------------+-----------------------------------------+
|2003235       |20110016 Structure Fire 01/11/2002  |[20110016, Structure, Fire, 01/11/2002]  |
|2003241       |20110022 Medical Incident 01/11/2002|[20110022, Medical, Incident, 01/11/2002]|
|2003242       |20110023 Medical Incident 01/11/2002|[20110023, Medical, Incident, 01/11/2002]|
|2003250       |20110032 Vehicle Fire 01/11/2002    |[20110032, Vehicle, Fire, 01/11/2002]    |
|2003259       |20110043 Alarms 01/11/2002          |[20110043, Alarms, 01/11/2002]           |
+--------------+------------------------------------+-----------------------------------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import explode

expDF = splitDF.withColumn("exploded", explode(col("splitted")))

expDF.show(5, False)

+--------------+------------------------------------+-----------------------------------------+----------+
|IncidentNumber|Call                                |splitted                                 |exploded  |
+--------------+------------------------------------+-----------------------------------------+----------+
|2003235       |20110016 Structure Fire 01/11/2002  |[20110016, Structure, Fire, 01/11/2002]  |20110016  |
|2003235       |20110016 Structure Fire 01/11/2002  |[20110016, Structure, Fire, 01/11/2002]  |Structure |
|2003235       |20110016 Structure Fire 01/11/2002  |[20110016, Structure, Fire, 01/11/2002]  |Fire      |
|2003235       |20110016 Structure Fire 01/11/2002  |[20110016, Structure, Fire, 01/11/2002]  |01/11/2002|
|2003241       |20110022 Medical Incident 01/11/2002|[20110022, Medical, Incident, 01/11/2002]|20110022  |
+--------------+------------------------------------+-----------------------------------------+----------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import element_at

elementDF = splitDF.withColumn("type", element_at(col("splitted"), 2))

elementDF.show(5, False)

+--------------+------------------------------------+-----------------------------------------+---------+
|IncidentNumber|Call                                |splitted                                 |type     |
+--------------+------------------------------------+-----------------------------------------+---------+
|2003235       |20110016 Structure Fire 01/11/2002  |[20110016, Structure, Fire, 01/11/2002]  |Structure|
|2003241       |20110022 Medical Incident 01/11/2002|[20110022, Medical, Incident, 01/11/2002]|Medical  |
|2003242       |20110023 Medical Incident 01/11/2002|[20110023, Medical, Incident, 01/11/2002]|Medical  |
|2003250       |20110032 Vehicle Fire 01/11/2002    |[20110032, Vehicle, Fire, 01/11/2002]    |Vehicle  |
|2003259       |20110043 Alarms 01/11/2002          |[20110043, Alarms, 01/11/2002]           |Alarms   |
+--------------+------------------------------------+-----------------------------------------+---------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import collect_set

optionsDF = (fireDF
             .groupBy("CallDate")
             .agg(collect_set("CallType").alias("TypesOfCall"))
             .select("CallDate", "TypesOfCall")
            )

optionsDF.show(5, False)

+----------+------------------------------------------------------------------------------------------------------------------------+
|CallDate  |TypesOfCall                                                                                                             |
+----------+------------------------------------------------------------------------------------------------------------------------+
|01/01/2001|[Other, Medical Incident, Alarms, Structure Fire]                                                                       |
|01/01/2002|[Medical Incident, Alarms, Structure Fire]                                                                              |
|01/01/2003|[Outside Fire, Fuel Spill, Vehicle Fire, Other, Citizen Assist / Service Call, Medical Incident, Alarms, Structure Fire]|
|01/01/2004|[Elevator / Escalator Rescue, Other, Traffic Collision, Medical Incident, Alarms, Electrical Hazard, Structure Fire]    |
|01/01/2005|[Medical Incident, Alarms, Structure Fire]        

In [0]:
from pyspark.sql.functions import collect_list

options2DF = (fireDF
             .groupBy("CallDate")
             .agg(collect_list("CallType").alias("TypesOfCall"))
             .select("CallDate", "TypesOfCall")
            )

options2DF.show(5, False)

+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CallDate  |TypesOfCall                                                                                                                                                                                                                                                                                                                         

In [0]:
upDF = fireDF.select(upper("CallType"))

upDF.show(5)

+----------------+
| upper(CallType)|
+----------------+
|  STRUCTURE FIRE|
|MEDICAL INCIDENT|
|MEDICAL INCIDENT|
|    VEHICLE FIRE|
|          ALARMS|
+----------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import regexp_replace

regex = fireDF.withColumn('newaddress', regexp_replace('Address', 'ST', 'STREET')) \
              .select("IncidentNumber", "Address", "newaddress")

regex.show(7, False)

+--------------+---------------------------+-----------------------------------+
|IncidentNumber|Address                    |newaddress                         |
+--------------+---------------------------+-----------------------------------+
|2003235       |2000 Block of CALIFORNIA ST|2000 Block of CALIFORNIA STREET    |
|2003241       |0 Block of SILVERVIEW DR   |0 Block of SILVERVIEW DR           |
|2003242       |MARKET ST/MCALLISTER ST    |MARKET STREET/MCALLISTREETER STREET|
|2003250       |APPLETON AV/MISSION ST     |APPLETON AV/MISSION STREET         |
|2003259       |1400 Block of SUTTER ST    |1400 Block of SUTTER STREET        |
|2003279       |BEALE ST/FOLSOM ST         |BEALE STREET/FOLSOM STREET         |
|2003301       |0 Block of FARALLONES ST   |0 Block of FARALLONES STREET       |
+--------------+---------------------------+-----------------------------------+
only showing top 7 rows



In [0]:
from pyspark.sql.functions import translate

translate = fireDF.withColumn('newaddress', translate('Address', 'ST', 'st')) \
              .select("IncidentNumber", "Address", "newaddress")

translate.show(7, False)

+--------------+---------------------------+---------------------------+
|IncidentNumber|Address                    |newaddress                 |
+--------------+---------------------------+---------------------------+
|2003235       |2000 Block of CALIFORNIA ST|2000 Block of CALIFORNIA st|
|2003241       |0 Block of SILVERVIEW DR   |0 Block of sILVERVIEW DR   |
|2003242       |MARKET ST/MCALLISTER ST    |MARKEt st/MCALLIstER st    |
|2003250       |APPLETON AV/MISSION ST     |APPLEtON AV/MIssION st     |
|2003259       |1400 Block of SUTTER ST    |1400 Block of sUttER st    |
|2003279       |BEALE ST/FOLSOM ST         |BEALE st/FOLsOM st         |
|2003301       |0 Block of FARALLONES ST   |0 Block of FARALLONEs st   |
+--------------+---------------------------+---------------------------+
only showing top 7 rows



In [0]:
def firstLetterFunction(word):
    return word[0]

firstLetterFunction("truck")

Out[162]: 't'

In [0]:
myUDF = udf(firstLetterFunction) #Register UDF

In [0]:
udfDF = fireDF.select("CallType", myUDF("UnitType").alias("UnitTypeInitial"))

udfDF.show(5)

+----------------+---------------+
|        CallType|UnitTypeInitial|
+----------------+---------------+
|  Structure Fire|              T|
|Medical Incident|              M|
|Medical Incident|              M|
|    Vehicle Fire|              E|
|          Alarms|              C|
+----------------+---------------+
only showing top 5 rows



In [0]:
mySqlUDF = spark.udf.register("sql_udf", firstLetterFunction) #Register for sql

In [0]:
fireDF.createOrReplaceTempView("fire")

In [0]:
%sql

SELECT sql_udf(UnitType) AS firstLetter FROM fire LIMIT 5

firstLetter
T
M
M
E
C


In [0]:
udf2DF = fireDF.select("CallType", mySqlUDF("UnitType").alias("UnitTypeInitial"))

udf2DF.show(5)

+----------------+---------------+
|        CallType|UnitTypeInitial|
+----------------+---------------+
|  Structure Fire|              T|
|Medical Incident|              M|
|Medical Incident|              M|
|    Vehicle Fire|              E|
|          Alarms|              C|
+----------------+---------------+
only showing top 5 rows



In [0]:
@udf("string")
def decoratorUDF(word: str) -> str:
    return word[0]

In [0]:
udf3DF = fireDF.select("CallType", decoratorUDF("UnitType").alias("UnitTypeInitial"))

udf3DF.show(5)

+----------------+---------------+
|        CallType|UnitTypeInitial|
+----------------+---------------+
|  Structure Fire|              T|
|Medical Incident|              M|
|Medical Incident|              M|
|    Vehicle Fire|              E|
|          Alarms|              C|
+----------------+---------------+
only showing top 5 rows



In [0]:
import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf("string")
def pandaUDF(word: pd.Series) -> pd.Series:
    return word.str[0]

In [0]:
udf4DF = fireDF.select("CallType", pandaUDF("UnitType").alias("UnitTypeInitial"))

udf4DF.show(5)

+----------------+---------------+
|        CallType|UnitTypeInitial|
+----------------+---------------+
|  Structure Fire|              T|
|Medical Incident|              M|
|Medical Incident|              M|
|    Vehicle Fire|              E|
|          Alarms|              C|
+----------------+---------------+
only showing top 5 rows

