# Fire Hazard Incidents Analytics Pipeline

## Importing Required Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## Creating Spark Session and using all available cores

In [2]:
spark = SparkSession \
    .builder \
    .appName("532: Project") \
    .config("local[*]") \
    .getOrCreate()

## Loading civilian casuality, firefighter casuality and basic module data

In [3]:
civilian_casualities_df = spark.read.options(delimiter="^", header=True).csv("civiliancasualty.txt")
firefighter_casualities_df = spark.read.options(delimiter="^", header=True).csv("ffcasualty.txt")
basic_module_df = spark.read.options(delimiter="^", header=True).csv("basicincident.txt")

In [4]:
basic_module_df.printSchema()

root
 |-- INCIDENT_KEY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- FDID: string (nullable = true)
 |-- INC_DATE: string (nullable = true)
 |-- INC_NO: string (nullable = true)
 |-- EXP_NO: string (nullable = true)
 |-- VERSION: string (nullable = true)
 |-- DEPT_STA: string (nullable = true)
 |-- INC_TYPE: string (nullable = true)
 |-- ADD_WILD: string (nullable = true)
 |-- AID: string (nullable = true)
 |-- ALARM: string (nullable = true)
 |-- ARRIVAL: string (nullable = true)
 |-- INC_CONT: string (nullable = true)
 |-- LU_CLEAR: string (nullable = true)
 |-- SHIFT: string (nullable = true)
 |-- ALARMS: string (nullable = true)
 |-- DISTRICT: string (nullable = true)
 |-- ACT_TAK1: string (nullable = true)
 |-- ACT_TAK2: string (nullable = true)
 |-- ACT_TAK3: string (nullable = true)
 |-- APP_MOD: string (nullable = true)
 |-- SUP_APP: string (nullable = true)
 |-- EMS_APP: string (nullable = true)
 |-- OTH_APP: string (nullable = true)
 |-- SUP_PER: string 

In [5]:
total_loss = basic_module_df.groupBy("INCIDENT_KEY").agg(F.sum(basic_module_df.PROP_LOSS + basic_module_df.CONT_LOSS).alias("TOTAL_LOSS")).na.drop()

In [6]:
civilian_severity = civilian_casualities_df.select("INCIDENT_KEY","SEV")
ff_severity = firefighter_casualities_df.select("INCIDENT_KEY","SEVERITY")

In [7]:
civilian_dollar_loss = total_loss.join(civilian_severity, total_loss.INCIDENT_KEY == civilian_severity.INCIDENT_KEY)
firefighter_dollar_loss = total_loss.join(ff_severity, total_loss.INCIDENT_KEY == ff_severity.INCIDENT_KEY)

In [8]:
civilian_dollar_loss.groupBy("SEV").agg(F.avg("TOTAL_LOSS").alias("AVG_LOSS")).show()

+---+------------------+
|SEV|          AVG_LOSS|
+---+------------------+
|  3|  79483.0781527531|
|  5|183130.81066268063|
|  U|         49871.625|
|  1| 72752.35869242199|
|  4| 64030.51603498542|
|  2| 72601.71149966375|
+---+------------------+



In [9]:
firefighter_dollar_loss.groupBy("SEVERITY").agg(F.avg("TOTAL_LOSS").alias("AVG_LOSS")).show()

+--------+------------------+
|SEVERITY|          AVG_LOSS|
+--------+------------------+
|       7|       5669414.375|
|       3|202613.09714889125|
|       5| 99906.39583333333|
|       U|103815.78947368421|
|       6|3003931.7647058824|
|       1|155121.58772949106|
|       4|200074.46902654867|
|       2|189474.27731092437|
+--------+------------------+



In [10]:
basic_time_module_df = basic_module_df.withColumn("ALARM_TIME", F.to_timestamp("ALARM","MMddyyyyHHmm")).withColumn("CONT_TIME", F.to_timestamp("INC_CONT","MMddyyyyHHmm")).withColumn("ARV_TIME", F.to_timestamp("ARRIVAL","MMddyyyyHHmm"))

In [11]:
response_time = basic_time_module_df.withColumn("RESPONSE_TIME", basic_time_module_df.CONT_TIME.cast("long") - basic_time_module_df.ALARM_TIME.cast("long"))

civilian_casualty_response_time = response_time.join(civilian_casualities_df, response_time.INCIDENT_KEY == civilian_casualities_df.INCIDENT_KEY)

In [12]:
civilian_casualty_response_time.groupBy("SEV").agg(F.avg("RESPONSE_TIME").alias("AVG_RSP_TIME")).select("SEV","AVG_RSP_TIME").show()

+---+------------------+
|SEV|      AVG_RSP_TIME|
+---+------------------+
|  3|3311.3513513513512|
|  5|55352.130384167634|
|  U| 4580.606060606061|
|  1|3214.5323741007196|
|  4| 3255.205479452055|
|  2|3411.1973018549747|
+---+------------------+



In [13]:
ff_response_time = basic_time_module_df.withColumn("FF_RESPONSE_TIME", basic_time_module_df.CONT_TIME.cast("long") - basic_time_module_df.ARV_TIME.cast("long"))

ff_severity_response_time = ff_response_time.join(firefighter_casualities_df, ff_response_time.INCIDENT_KEY == firefighter_casualities_df.INCIDENT_KEY)

In [14]:
ff_severity_response_time.groupBy("SEVERITY").agg(F.avg("FF_RESPONSE_TIME").alias("AVG_RSP_TIME")).select("SEVERITY","AVG_RSP_TIME").show()

+--------+------------------+
|SEVERITY|      AVG_RSP_TIME|
+--------+------------------+
|       7|         1247962.5|
|       3|14811.670588235294|
|       5|            5880.0|
|       U|           74915.0|
|       6|          242410.0|
|       1| 14919.53038674033|
|       4| 70525.73394495413|
|       2| 6535.170278637771|
+--------+------------------+

