In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *

In [2]:
# Reuse the active session/context when one already exists so this cell can be
# re-executed: a bare `pyspark.SparkContext()` raises ValueError ("Cannot run
# multiple SparkContexts at once") the second time it is run in the same kernel.
ss = SparkSession.builder.getOrCreate()
# Expose the session's underlying SparkContext under the name the notebook uses.
sc = ss.sparkContext

In [50]:
# Explicit schema for the fire-calls CSV: one (column name, Spark type) pair per
# column, in file order. Every column is declared nullable.
fire_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('CallNumber', IntegerType()),
        ('UnitID', StringType()),
        ('IncidentNumber', IntegerType()),
        ('CallType', StringType()),
        ('CallDate', DateType()),
        ('WatchDate', DateType()),
        ('CallFinalDisposition', StringType()),
        ('AvailableDtTm', TimestampType()),
        ('Address', StringType()),
        ('City', StringType()),
        ('Zipcode', IntegerType()),
        ('Battalion', StringType()),
        ('StationArea', StringType()),
        ('Box', StringType()),
        ('OriginalPriority', StringType()),
        ('Priority', StringType()),
        ('FinalPriority', IntegerType()),
        ('ALSUnit', BooleanType()),
        ('CallTypeGroup', StringType()),
        ('NumAlarms', IntegerType()),
        ('UnitType', StringType()),
        ('UnitSequenceInCallDispatch', IntegerType()),
        ('FirePreventionDistrict', StringType()),
        ('SupervisorDistrict', StringType()),
        ('Neighborhood', StringType()),
        ('Location', StringType()),
        ('RowID', StringType()),
        ('Delay', FloatType()),
    )
])

In [54]:
# Read the fire-calls CSV using `fire_schema` and cache the resulting DataFrame.
# Date/timestamp pattern reference:
# https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
fil = './LearningSparkV2-master/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv'
fire_df = (
    ss.read
    .format('csv')
    .schema(fire_schema)
    .options(header=True, dateFormat='MM/dd/yyyy', timestampFormat='MM/dd/yyyy hh:mm:ss a')
    .load(fil)
    .cache()
)

# Peek at the first rows to confirm the file parsed.
fire_df.show(5)

+----------+------+--------------+----------------+----------+----------+--------------------+-------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+---------+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|      AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|    Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+-------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-

In [8]:
# Project three columns, then keep only rows whose CallType is not
# "Medical Incident", and display the first five.
(
    fire_df
    .select("IncidentNumber", "AvailableDtTm", "CallType")
    .filter("CallType != 'Medical Incident'")
    .show(5)
)

+--------------+--------------------+--------------+
|IncidentNumber|       AvailableDtTm|      CallType|
+--------------+--------------------+--------------+
|       2003235|01/11/2002 01:51:...|Structure Fire|
|       2003250|01/11/2002 04:16:...|  Vehicle Fire|
|       2003259|01/11/2002 06:01:...|        Alarms|
|       2003279|01/11/2002 08:03:...|Structure Fire|
|       2003301|01/11/2002 09:46:...|        Alarms|
+--------------+--------------------+--------------+
only showing top 5 rows



In [15]:
# Spark analogue of pd.DataFrame.value_counts() for CallType.
# First: how many distinct (non-null) call types are there?
(
    fire_df
    .select('CallType')
    .filter('CallType IS NOT NULL')
    .agg(countDistinct('CallType').alias('Call Types'))
    .show()
)
# Then: rows per call type, most frequent first.
cnts = (
    fire_df
    .select('CallType')
    .groupBy('CallType')
    .count()
    .orderBy(col('count').desc())
)
cnts.show(10)

+----------+
|Call Types|
+----------+
|        30|
+----------+

+--------------------+------+
|            CallType| count|
+--------------------+------+
|    Medical Incident|113794|
|      Structure Fire| 23319|
|              Alarms| 19406|
|   Traffic Collision|  7013|
|Citizen Assist / ...|  2524|
|               Other|  2166|
|        Outside Fire|  2094|
|        Vehicle Fire|   854|
|Gas Leak (Natural...|   764|
|        Water Rescue|   755|
+--------------------+------+
only showing top 10 rows



In [62]:
# Number of calls per calendar month (1-12), in month order.
(
    fire_df
    .select('CallDate')
    .groupBy(month('CallDate'))
    .count()
    .sort(month('CallDate'))
    .show()
)
# Distinct years present in CallDate, oldest first.
(
    fire_df
    .select(year('CallDate'))
    .distinct()
    .sort(year('CallDate'))
    .show()
)

+---------------+-----+
|month(CallDate)|count|
+---------------+-----+
|              1|14586|
|              2|13402|
|              3|14582|
|              4|14140|
|              5|15099|
|              6|14553|
|              7|14762|
|              8|15126|
|              9|14991|
|             10|15410|
|             11|13863|
|             12|14782|
+---------------+-----+

+--------------+
|year(CallDate)|
+--------------+
|          2000|
|          2001|
|          2002|
|          2003|
|          2004|
|          2005|
|          2006|
|          2007|
|          2008|
|          2009|
|          2010|
|          2011|
|          2012|
|          2013|
|          2014|
|          2015|
|          2016|
|          2017|
|          2018|
+--------------+



In [75]:
# Count calls per (month, call type); within each month the most frequent
# call type comes first.
fire_counts = (
    fire_df
    .select('CallDate', 'CallType')
    .withColumn('CallMonth', month('CallDate'))
    .groupBy(['CallMonth', 'CallType'])
    .count()
    .orderBy(['CallMonth', 'count'], ascending=[True, False])
)
fire_counts.show(5)

# Note: Spark treats the path below as a *directory* and writes a part-*.csv
# file inside it, so despite the ".csv" suffix this is not a single file.
fil = './LearningSparkV2-master/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls_countmonthtype.csv'
# coalesce(1) produces a single part file. mode('overwrite') makes the cell
# re-runnable: the default save mode errors out if the target already exists.
fire_counts.coalesce(1).write.options(header=True).mode('overwrite').format('csv').save(fil)

+---------+-----------------+-----+
|CallMonth|         CallType|count|
+---------+-----------------+-----+
|        1| Medical Incident| 9469|
|        1|   Structure Fire| 1916|
|        1|           Alarms| 1653|
|        1|Traffic Collision|  517|
|        1|     Outside Fire|  238|
+---------+-----------------+-----+
only showing top 5 rows



In [79]:
# Get the most common call type for each month.
# `first('CallType')` would only be correct if the sort order of `fire_counts`
# survived the groupBy shuffle, which Spark does not guarantee. Instead pick
# the row with the largest count explicitly: structs compare field by field,
# so max(struct(count, CallType)) is the (count, CallType) pair with the
# highest count (ties resolve to the greatest CallType string).
# NOTE: `max` here is pyspark.sql.functions.max (brought in by the wildcard
# import at the top of the notebook), not the Python builtin.
fire_counts.groupBy('CallMonth').agg(
    max(struct('count', 'CallType')).getField('CallType').alias('MostCommon')
).orderBy('CallMonth').show()

+---------+----------------+
|CallMonth|      MostCommon|
+---------+----------------+
|        1|Medical Incident|
|        2|Medical Incident|
|        3|Medical Incident|
|        4|Medical Incident|
|        5|Medical Incident|
|        6|Medical Incident|
|        7|Medical Incident|
|        8|Medical Incident|
|        9|Medical Incident|
|       10|Medical Incident|
|       11|Medical Incident|
|       12|Medical Incident|
+---------+----------------+



In [99]:
# Print the extended query plan for the month/type aggregation.
fire_counts.explain(extended=True)

== Parsed Logical Plan ==
'Sort ['CallMonth ASC NULLS FIRST, 'count DESC NULLS LAST], true
+- Aggregate [CallMonth#34389, CallType#21930], [CallMonth#34389, CallType#21930, count(1) AS count#34397L]
   +- Project [CallDate#21931, CallType#21930, month(CallDate#21931) AS CallMonth#34389]
      +- Project [CallDate#21931, CallType#21930]
         +- Relation[CallNumber#21927,UnitID#21928,IncidentNumber#21929,CallType#21930,CallDate#21931,WatchDate#21932,CallFinalDisposition#21933,AvailableDtTm#21934,Address#21935,City#21936,Zipcode#21937,Battalion#21938,StationArea#21939,Box#21940,OriginalPriority#21941,Priority#21942,FinalPriority#21943,ALSUnit#21944,CallTypeGroup#21945,NumAlarms#21946,UnitType#21947,UnitSequenceInCallDispatch#21948,FirePreventionDistrict#21949,SupervisorDistrict#21950,... 4 more fields] csv

== Analyzed Logical Plan ==
CallMonth: int, CallType: string, count: bigint
Sort [CallMonth#34389 ASC NULLS FIRST, count#34397L DESC NULLS LAST], true
+- Aggregate [CallMonth#34389

In [100]:
# Stop the SparkContext (`sc`) created near the top of the notebook now that
# the analysis is finished.
sc.stop()