# Fire Hazard Incidents Analytics Pipeline

## Importing Required Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## Creating Spark Session and using all available cores

In [2]:
# Build (or reuse) the notebook's SparkSession, running Spark locally with as
# many worker threads as there are cores.
# The master URL has to be given to .master(); .config() expects a key/value
# pair, so .config("local[*]") never set the master at all.
spark = (
    SparkSession.builder
    .appName("532: Project")
    .master("local[*]")
    .getOrCreate()
)

## Loading civilian casualty, firefighter casualty and basic module data

In [6]:
# All three extracts are caret-delimited text files whose first row holds the
# column names, so the reader options are declared once and reused.
csv_options = {"delimiter": "^", "header": True}

civilian_casualities_df = spark.read.options(**csv_options).csv("civiliancasualty.txt")
firefighter_casualities_df = spark.read.options(**csv_options).csv("ffcasualty.txt")
basic_module_df = spark.read.options(**csv_options).csv("basicincident.txt")

In [9]:
# Keep only the incidents that report both a property loss and a contents loss.
# NOTE(review): this rebinds basic_module_df, so every later cell that reads it
# also sees only these rows -- confirm that is intended.
basic_module_df = basic_module_df.dropna(subset=["PROP_LOSS", "CONT_LOSS"])

# Total dollar loss per incident: property loss plus contents loss, summed over
# the rows that share an INCIDENT_KEY.
total_loss = (
    basic_module_df
    .groupBy("INCIDENT_KEY")
    .agg(F.sum(F.col("PROP_LOSS") + F.col("CONT_LOSS")).alias("TOTAL_LOSS"))
)

In [11]:
# Reduce each casualty table to the incident key and its severity code.
# The civilian table names that column SEV; the firefighter table, SEVERITY.
civilian_severity = civilian_casualities_df.select(["INCIDENT_KEY", "SEV"])
ff_severity = firefighter_casualities_df.select(["INCIDENT_KEY", "SEVERITY"])

In [12]:
# Attach each casualty's severity code to the total dollar loss of its incident.
# Inner joins: an incident appears only when it has both a loss total and at
# least one casualty record of the given kind.
civilian_dollar_loss = total_loss.join(
    civilian_severity,
    on=total_loss["INCIDENT_KEY"] == civilian_severity["INCIDENT_KEY"],
    how="inner",
)
firefighter_dollar_loss = total_loss.join(
    ff_severity,
    on=total_loss["INCIDENT_KEY"] == ff_severity["INCIDENT_KEY"],
    how="inner",
)

In [13]:
# Average total dollar loss for each civilian casualty severity code.
(
    civilian_dollar_loss
    .groupBy("SEV")
    .agg(F.avg(F.col("TOTAL_LOSS")).alias("AVG_LOSS"))
    .show()
)

+---+------------------+
|SEV|          AVG_LOSS|
+---+------------------+
|  3|  79483.0781527531|
|  5|183130.81066268063|
|  U|         49871.625|
|  1| 72752.35869242199|
|  4| 64030.51603498542|
|  2| 72601.71149966375|
+---+------------------+



In [14]:
# Average total dollar loss for each firefighter casualty severity code.
(
    firefighter_dollar_loss
    .groupBy("SEVERITY")
    .agg(F.avg(F.col("TOTAL_LOSS")).alias("AVG_LOSS"))
    .show()
)

+--------+------------------+
|SEVERITY|          AVG_LOSS|
+--------+------------------+
|       7|       5669414.375|
|       3|202613.09714889125|
|       5| 99906.39583333333|
|       U|103815.78947368421|
|       6|3003931.7647058824|
|       1|155121.58772949106|
|       4|200074.46902654867|
|       2|189474.27731092437|
+--------+------------------+



In [15]:
# Parse the ALARM, INC_CONT and ARRIVAL text fields (MMddyyyyHHmm layout) into
# timestamp columns, added next to the original fields.
stamp_format = "MMddyyyyHHmm"
basic_time_module_df = (
    basic_module_df
    .withColumn("ALARM_TIME", F.to_timestamp("ALARM", stamp_format))
    .withColumn("CONT_TIME", F.to_timestamp("INC_CONT", stamp_format))
    .withColumn("ARV_TIME", F.to_timestamp("ARRIVAL", stamp_format))
)

In [16]:
# RESPONSE_TIME = CONT_TIME minus ALARM_TIME. Casting a timestamp to long gives
# epoch seconds, so the difference is expressed in seconds.
response_time = basic_time_module_df.withColumn(
    "RESPONSE_TIME",
    F.col("CONT_TIME").cast("long") - F.col("ALARM_TIME").cast("long"),
)

# Pair every civilian casualty record with the timing of its incident
# (inner join on the incident key).
civilian_casualty_response_time = response_time.join(
    civilian_casualities_df,
    response_time["INCIDENT_KEY"] == civilian_casualities_df["INCIDENT_KEY"],
)

In [17]:
# Mean RESPONSE_TIME (seconds) per civilian severity code. The aggregation
# already yields exactly the columns SEV and AVG_RSP_TIME, in that order.
(
    civilian_casualty_response_time
    .groupBy("SEV")
    .agg(F.avg("RESPONSE_TIME").alias("AVG_RSP_TIME"))
    .show()
)

+---+------------------+
|SEV|      AVG_RSP_TIME|
+---+------------------+
|  3|3327.0697674418607|
|  5| 57596.78048780488|
|  U| 4558.762886597938|
|  1| 3235.604938271605|
|  4| 3184.255319148936|
|  2|3451.0526315789475|
+---+------------------+



In [18]:
# FF_RESPONSE_TIME = CONT_TIME minus ARV_TIME, in seconds (timestamps cast to
# long are epoch seconds).
ff_response_time = basic_time_module_df.withColumn(
    "FF_RESPONSE_TIME",
    F.col("CONT_TIME").cast("long") - F.col("ARV_TIME").cast("long"),
)

# Pair every firefighter casualty record with the timing of its incident
# (inner join on the incident key).
ff_severity_response_time = ff_response_time.join(
    firefighter_casualities_df,
    ff_response_time["INCIDENT_KEY"] == firefighter_casualities_df["INCIDENT_KEY"],
)

In [19]:
# Mean FF_RESPONSE_TIME (seconds) per firefighter severity code. The aggregation
# already yields exactly the columns SEVERITY and AVG_RSP_TIME, in that order.
(
    ff_severity_response_time
    .groupBy("SEVERITY")
    .agg(F.avg("FF_RESPONSE_TIME").alias("AVG_RSP_TIME"))
    .show()
)

+--------+------------------+
|SEVERITY|      AVG_RSP_TIME|
+--------+------------------+
|       7|         1662820.0|
|       3|15369.195979899498|
|       5|            5880.0|
|       U|           74915.0|
|       6|          290688.0|
|       1|14545.226781857451|
|       4| 74654.48780487805|
|       2| 6510.491803278688|
+--------+------------------+



In [20]:
# Load the fire-incident module: caret-delimited text with a header row.
fire_incidents_df = (
    spark.read
    .options(delimiter="^", header=True)
    .csv("fireincident.txt")
)

In [21]:
# Print the column names and types of the fire-incident table to see which
# fields are available for the analysis below.
fire_incidents_df.printSchema()

root
 |-- INCIDENT_KEY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- FDID: string (nullable = true)
 |-- INC_DATE: string (nullable = true)
 |-- INC_NO: string (nullable = true)
 |-- EXP_NO: string (nullable = true)
 |-- VERSION: string (nullable = true)
 |-- NUM_UNIT: string (nullable = true)
 |-- NOT_RES: string (nullable = true)
 |-- BLDG_INVOL: string (nullable = true)
 |-- ACRES_BURN: string (nullable = true)
 |-- LESS_1ACRE: string (nullable = true)
 |-- ON_SITE_M1: string (nullable = true)
 |-- MAT_STOR1: string (nullable = true)
 |-- ON_SITE_M2: string (nullable = true)
 |-- MAT_STOR2: string (nullable = true)
 |-- ON_SITE_M3: string (nullable = true)
 |-- MAT_STOR3: string (nullable = true)
 |-- AREA_ORIG: string (nullable = true)
 |-- HEAT_SOURC: string (nullable = true)
 |-- FIRST_IGN: string (nullable = true)
 |-- CONF_ORIG: string (nullable = true)
 |-- TYPE_MAT: string (nullable = true)
 |-- CAUSE_IGN: string (nullable = true)
 |-- FACT_IGN_1: string

In [22]:
# Ignition-cause code per incident, discarding rows where the code is missing.
ignition_cause = (
    fire_incidents_df
    .select("INCIDENT_KEY", "CAUSE_IGN")
    .dropna(subset=["CAUSE_IGN"])
)

# List the distinct cause codes that occur in the data.
ignition_cause.select("CAUSE_IGN").distinct().show()

+---------+
|CAUSE_IGN|
+---------+
|        3|
|        0|
|        5|
|        U|
|        1|
|        4|
|        2|
+---------+



In [23]:
# Attach the ignition-cause code to each incident's total dollar loss
# (inner join on the incident key).
ignition_cause_vs_loss = total_loss.join(
    ignition_cause,
    on=total_loss["INCIDENT_KEY"] == ignition_cause["INCIDENT_KEY"],
    how="inner",
)

In [24]:
# Average total dollar loss for each ignition-cause code.
(
    ignition_cause_vs_loss
    .groupBy("CAUSE_IGN")
    .agg(F.avg(F.col("TOTAL_LOSS")).alias("AVG_LOSS"))
    .show()
)

+---------+------------------+
|CAUSE_IGN|          AVG_LOSS|
+---------+------------------+
|        3|15242.249057862919|
|        0| 35729.22572246811|
|        5| 51502.80952183813|
|        U|13574.973333618687|
|        1|  9201.64097173145|
|        4| 43640.38177635527|
|        2|12956.953294596438|
+---------+------------------+



## Month vs. major states vs. types of fire

In [25]:
# NOTE(review): this import rebinds `sum` to the Spark column function, hiding
# Python's built-in sum for the rest of the notebook.
from pyspark.sql.functions import sum, col, desc

incident_state = basic_module_df.select("INCIDENT_KEY", "STATE", "INC_DATE")

# Rank the states by number of incidents (show() prints the first 20 rows).
incident_state.groupBy("STATE").count().orderBy(desc("count")).show()

# Month number of each incident, parsed from the MMddyyyy text in INC_DATE.
incident_state = incident_state.withColumn(
    "INC_MONTH",
    F.month(F.to_date(col("INC_DATE"), "MMddyyyy")),
)


def _monthly_incident_counts(state_code):
    """Return the number of incidents per INC_MONTH for a single state."""
    return (
        incident_state
        .filter(col("STATE") == state_code)
        .groupBy("INC_MONTH")
        .count()
    )


# Monthly counts for the eleven states that lead the ranking printed above.
incidents_ca = _monthly_incident_counts("CA")
incidents_tx = _monthly_incident_counts("TX")
incidents_pa = _monthly_incident_counts("PA")
incidents_fl = _monthly_incident_counts("FL")
incidents_oh = _monthly_incident_counts("OH")
incidents_ny = _monthly_incident_counts("NY")
incidents_il = _monthly_incident_counts("IL")
incidents_ga = _monthly_incident_counts("GA")
incidents_nc = _monthly_incident_counts("NC")
incidents_mi = _monthly_incident_counts("MI")
incidents_ma = _monthly_incident_counts("MA")

# To inspect a result, call .show() on it, e.g. incidents_ca.show()

+-----+------+
|STATE| count|
+-----+------+
|   CA|144036|
|   TX| 94231|
|   PA| 92031|
|   FL| 69808|
|   OH| 69475|
|   NY| 62552|
|   IL| 55041|
|   GA| 51846|
|   NC| 51489|
|   MI| 50708|
|   MA| 50192|
|   IN| 37987|
|   VA| 36658|
|   SC| 35597|
|   MN| 34495|
|   TN| 34084|
|   WA| 33666|
|   NJ| 29644|
|   OK| 26207|
|   CO| 25135|
+-----+------+
only showing top 20 rows



In [26]:
# Incident-type code per incident, followed by a look at the distinct codes
# (show() prints the first 20).
incident_type = basic_module_df.select(["INCIDENT_KEY", "INC_TYPE"])
(
    incident_type
    .select("INC_TYPE")
    .distinct()
    .show()
)

+--------+
|INC_TYPE|
+--------+
|     451|
|     462|
|     442|
|     155|
|     132|
|     154|
|     422|
|     138|
|     424|
|     112|
|     113|
|     443|
|     133|
|     162|
|     160|
|     423|
|     171|
|     441|
|     410|
|     110|
+--------+
only showing top 20 rows



In [27]:
# Number of incidents recorded under each incident type. Only the counts are
# kept: the final select drops the INC_TYPE code itself.
incident_type_count = (
    incident_type
    .groupBy("INC_TYPE")
    .count()
    .select("count")
)
incident_type_count.show()

# Cell result: the number of rows, i.e. one per INC_TYPE group.
incident_type_count.count()

+-----+
|count|
+-----+
| 5204|
|13129|
|10176|
|  486|
|11483|
|29130|
| 4091|
| 7267|
|40593|
| 9961|
|77983|
| 1428|
|  369|
| 7580|
|10694|
|  577|
| 1942|
|10260|
| 3354|
|  178|
+-----+
only showing top 20 rows



72