In [1]:
from IPython.display import display, HTML
# Inject a CSS rule that forces `white-space: pre` on <pre> elements, so
# preformatted cell output (e.g. wide text tables) is not line-wrapped.
display(HTML("<style>pre{white-space: pre !important;}</style>"))

In [2]:
import pandas as pd
import findspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import DateType, TimestampType, IntegerType

In [3]:
# Ask findspark to locate the Spark installation for this kernel.
# NOTE(review): pyspark is already imported in the imports cell above, so this
# call may be redundant (or should run before that import) — confirm.
findspark.init()

In [4]:
# Build (or reuse) a single-threaded local Spark session.
# - The driver advertises itself as host.docker.internal and binds to 0.0.0.0.
# - Dynamic allocation is disabled (the original set this option twice; the
#   redundant second call has been removed).
# - Network timeout is 600s and the executor heartbeat interval 120s.
# - Executor/driver resources are pinned to 1 core and 512m each.
spark = (
    SparkSession.builder
    .appName("Fire Incident")
    .master("local[1]")
    .config("spark.driver.host", "host.docker.internal")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "120s")
    .config("spark.executor.instances", "1")
    .config("spark.cores.max", "1")
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "512m")
    .config("spark.driver.memory", "512m")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/16 12:01:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Read the raw incident export; the first CSV row supplies the column names,
# and no schema is given, so every column is loaded as a string.
raw_csv_path = '../data/raw/Fire_Incidents_20240516.csv'
df = spark.read.csv(raw_csv_path, header=True)

# Print the default preview without truncating long cell values.
df.show(truncate=False)

24/07/16 12:01:53 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---------------+---------------+---------+--------------------------+-------------+-----------+----------------------+----------------------+----------------------+-------------+-------+---------+------------+----+-----------------+---------------------+---------+-------------+-----------+---------------+-------------------+-----------------------+-----------------------+---------------+-------------+-------------------+-----------------+----------------+----------------------------------------------------+----------+-------------------------------------------+----------------------+------------------+----------------------------+----------------------------------------------------+-------------------+--------------+-----------------------+-------------------------+-----------+------------------+--------------------------------------+--------------+----------------+--------------------+-----------+---------------+------------------------------------+-----------------------------------

## Inconsistent labels

In [43]:
# Remove a "-<suffix>" tail from the zipcode: a hyphen followed by word
# characters that end in at least three digits (presumably the ZIP+4
# extension — confirm against the data).
# The pattern is a raw string: '\-' and '\w' are invalid escape sequences in a
# plain Python literal and trigger a DeprecationWarning/SyntaxWarning.
df = df.withColumn('zipcode', regexp_replace('zipcode', r'\-\w+[0-9]{3,}', ''))

In [44]:
# Keep only the rows whose "Primary Situation" label starts with "1"
# (presumably the fire-incident code series — confirm). Rows where the
# column is null do not satisfy the predicate and are dropped as well.
starts_with_one = col("Primary Situation").startswith("1")
df = df.where(starts_with_one)

In [45]:
# Strip every '*' and '-' from the label, then trim the surrounding spaces,
# in a single column expression.
situation_col = 'Primary Situation'
df = df.withColumn(
    situation_col,
    trim(regexp_replace(situation_col, '[*-]', '')),
)

In [46]:
# Variant spellings of the "Mutual Aid" labels -> the coded label to keep.
mutual_aid_mapping = dict([
    ('Mutual aid given', '3 Mutual aid given'),
    ('Mutual aid received', '1 Mutual aid received'),
    ('Other aid given', '5 Other aid given'),
    ('Automatic or contract aid received', '2 Automatic aid received'),
    ('Automatic aid given', '4 Automatic aid given'),
    ('3 Mutual aid given', '3 Mutual aid given'),
    ('None', 'N None'),
])

# Exact-match replacement, restricted to the one column.
mutual_aid_col = 'Mutual Aid'
df = df.replace(mutual_aid_mapping, subset=[mutual_aid_col])

In [47]:
# Remove every '-' character from the "Ignition Factor Primary" labels.
ignition_primary_col = 'Ignition Factor Primary'
df = df.withColumn(
    ignition_primary_col,
    regexp_replace(ignition_primary_col, '-', ''),
)

In [48]:
# Truncated / variant "Ignition Factor Secondary" labels -> the full label.
#
# The original dict also contained the reverse entry
#   '18 Improper container or storage procedure' -> '18 Improper container or storage'
# next to the forward one. DataFrame.replace applies the whole dict in a
# single pass, so the two labels were swapped instead of unified. Only the
# forward (truncated -> full) entry is kept, matching the rest of the mapping.
ignition_factor_secondary_mapping = {
    '12 Heat source too close to combustibles.': '12 Heat source too close to combustibles.',
    '30 Electrical failure, malfunction, othe': '30 Electrical failure, malfunction, other',
    '11 Abandoned or discarded materials or p': '11 Abandoned or discarded materials or products',
    '32 Short circuit arc from mechanical dam': '32 Short-circuit arc from mechanical damage',
    '20 Mechanical failure, malfunction, othe': '20 Mechanical failure, malfunction, other',
    '18 Improper container or storage': '18 Improper container or storage procedure',
    '00 Factors contributing to ignition, other': '00 Other factor contributed to ignition',
    '33 Short cir. arc, defect/worn insulatio': '33 Short-circuit arc from defective, worn insulation',
    '52 Accidentally turned on, not turned of': '52 Accidentally turned on, not turned off',
    '57 Equipment used for not intended purpo': '57 Equipment not used for purpose intended',
    '13 Cuttin/welding too close to combustib': '13 Cutting/welding too close to combustibles',
    '73 Outside/open fire, debris/waste dispo': '73 Outside/open fire for debris or waste disposal',
    '74 Outside/open fire for warming or cook': '74 Outside/open fire for warming or cooking',
}
# Every '-' is removed from the column first, so the keys above are written
# without hyphens; the replacement then matches whole cell values exactly.
df = df.withColumn('Ignition Factor Secondary', regexp_replace('Ignition Factor Secondary', '-', ''))
df = df.replace(ignition_factor_secondary_mapping, subset = ['Ignition Factor Secondary'])


In [49]:
# Truncated / variant "Heat Source" labels -> the label to keep.
heat_source_mapping = {
    '11 Spark/ember/flame from operating equi': '11 Spark, ember, or flame from operating equipment',
    '12 Radiated/conducted heat operating equ': '12 Radiated or conducted heat from operating equipment',
    '13 Arcing': '13 Electrical arcing',
    '60 Heat; other open flame/smoking materi': '60 Heat from other open flame or smoking materials, other',
    '63 Heat from undetermined smoking materi': '63 Heat from undetermined smoking material',
    '65 Cigarette lighter': '65 Lighter: cigarette, cigar',
    '67 Warning or road flare; fusee': '67 Warning or road flare; fuse',
    '68 Backfire from internal combustion eng': '68 Backfire from internal combustion engine',
    '72 Chemical reaction': '72 Spontaneous combustion, chemical reaction',
    '97 Multiple heat sources including multi': '97 Multiple heat sources including multiple ignitions'
}

# First drop every '-' from the column, then apply the exact-match mapping.
heat_source_col = 'Heat Source'
df = (
    df.withColumn(heat_source_col, regexp_replace(heat_source_col, '-', ''))
      .replace(heat_source_mapping, subset=[heat_source_col])
)

In [50]:
# Variant "Item First Ignited" labels -> the label to keep.
# NOTE(review): two targets ('62 ...gas-in/from...' and '63 ...gas-in/from...')
# contain a '-', while every '-' is stripped from the column before the
# mapping is applied; existing rows with those labels may therefore not end up
# identical to the targets — verify against the data.
item_first_ignited_mapping = {
    '96 Rubbish, trash, waste': '96 Rubbish, trash, or waste',
    '62 Flammable liquid/gas in/from engine or burner': '62 Flam. liq/gas-in/from engine or burne',
    '21 Upholstered sofa, chair, vehicle seats': '21 Upholstered sofa, chair, vehicle seat',
    '59 Rolled, wound material (paper and fabrics)': '59 Rolled, wound material (paper, fabric',
    '73 Heavy vegetation not crop, including trees': '73 Heavy vegetation no crops, inc. tre',
    '76 Cooking materials, including edible materials': '76 Cooking materials, inc. Edible materi',
    '00 Item first ignited, other': '00 Item First Ignited, Other',
    '14 Floor covering or rug/carpet/mat, surface': '14 Floor covering or rug/carpet/mat',
    '36 Curtain, blind, drapery, tapestry': '36 Curtains, blinds, drapery, tapestry',
    '11 Exterior roof covering, surface, finish': '11 Exterior roof covering or finish',
    '64 Flammable liquid/gas in container or pipe': '64 Flam liq/gas in container or pipe',
    '72 Light vegetation not crop, including grass': '72 Light vegetation no crops, inc. gra',
    '37 Goods not made up, including fabrics and yard goods': '37 Raw Goods, incl. fabrics and yarn',
    '66 Pipe, duct, conduit, hose': '66 Pipe, duct, conduit or hose',
    '61 Atomized liquid, vaporized liquid, aerosol.': '61 Atomized liq., vaporized liq.,aersol',
    '95 Film, residue, including paint & resi': '95 Film, residue, including paint and resin',
    '63 Flammable liquid/gas in/from final container': '63 Flam Liq/gas-in/from final container',
    '94 Dust, fiber, lint, including sawdust and excelsior': '94 Dust/fiber/lint. inc. sawdust, excels',
    '15 Interior wall covering excluding drapes, etc.': '15 Int. Wall cover  exclude drapes, etc.',
    '47 Tarpaulin, tent': '47 Tarpaulin or tent',
    '71 Agricultural crop, including fruits and vegetables': '71 Crop, incl. fruits and vegitables',
    '82 Transformer, including transformer fluids': '82 Transformer, including transformer fl',
    '18 Thermal, acoustical insulation within wall, partition or floor/ceiling space': '18 Insulation within structural area',
    '40 Adornment, recreational material, signs, other': '40 Adornment, recreational mat., signs,',
    '43 Sign, including outdoor signs such as billboards': '43 Sign, inc. outdoor sign/billboards',
    '74 Animal living or dead': '74 Animal, living or dead',
    '77 Feathers or fur, not on bird or anima': '77 Feathers or fur, not on bird or animal',
    '58 Palletized material, material stored on pallets.': '58 Palletized material',
    '54 Cord, rope, twine': '54 Cord, rope, twine, yarn'}

# Strip every '-' from the column, then apply the exact-match mapping.
item_first_ignited_col = 'Item First Ignited'
df = (
    df.withColumn(item_first_ignited_col, regexp_replace(item_first_ignited_col, '-', ''))
      .replace(item_first_ignited_mapping, subset=[item_first_ignited_col])
)

In [51]:
# Delete every 'Â' and '§' character from the labels (presumably mojibake left
# over from a mis-decoded section sign — confirm against the raw file).
human_factors_col = 'Human Factors Associated with Ignition'
df = df.withColumn(
    human_factors_col,
    regexp_replace(human_factors_col, '[Â§]', ''),
)

In [52]:
# Variant "Structure Type" labels -> the label to keep.
structure_type_mapping = {'4 Air-supported structure': '4 Air supported structure',
                          '7 Underground structure work area': '7 Underground structure work areas'}

# Every '-' is stripped from the column before the mapping is applied.
df = df.withColumn('Structure Type', regexp_replace('Structure Type', '-', ''))

# The keys must go through the same hyphen stripping as the column: a key that
# still contains '-' (e.g. '4 Air-supported structure') can never equal a cell
# value once the hyphens are gone, so that entry was silently a no-op.
structure_type_lookup = {
    label.replace('-', ''): target
    for label, target in structure_type_mapping.items()
}
df = df.replace(structure_type_lookup, subset = ['Structure Type'])

In [53]:
# Single variant "Structure Status" label -> the label to keep.
structure_status_mapping = {
    '0 Building status, other': '0 Other',
}

# Strip every '-' from the column, then apply the exact-match mapping.
structure_status_col = 'Structure Status'
df = (
    df.withColumn(structure_status_col, regexp_replace(structure_status_col, '-', ''))
      .replace(structure_status_mapping, subset=[structure_status_col])
)

In [54]:
# Variant "Fire Spread" labels -> the label to keep.
fire_spread_mapping = {
    '00 Item first ignited, other': '00 Item First Ignited, Other',
    '11 Exterior roof covering, surface, finish': '11 Exterior roof covering or finish',
    '66 Pipe, duct, conduit, hose': '66 Pipe, duct, conduit or hose',
    '15 Interior wall covering excluding drapes, etc.': '15 Int. Wall cover  exclude drapes, etc.',
    '76 Cooking materials, including edible materials': '76 Cooking materials, inc. Edible materia',
    '94 Dust/fiber/lint. inc. sawdust, excelsi': '94 Dust/fiber/lint. inc. sawdust, excelsi',
    '96 Rubbish, trash, or waste': '96 Rubbish, trash, waste',
    '61 Atomized liq., vaporized liq.,aersol': '61 Atomized liquid, vaporized liquid, aerosol.'
}
# The original cell also bound the same dict object to the name `mapping`;
# the alias is kept in case another cell relies on it.
# NOTE(review): `mapping` looks like a leftover — remove it if nothing uses it.
mapping = fire_spread_mapping

# Strip every '-' from the column, then apply the exact-match mapping.
fire_spread_col = 'Fire Spread'
df = (
    df.withColumn(fire_spread_col, regexp_replace(fire_spread_col, '-', ''))
      .replace(fire_spread_mapping, subset=[fire_spread_col])
)

In [55]:
# Map the textual yes/no variants of "No Flame Spread" onto '1' / '0'
# (still strings at this point).
negative_labels = ('NO', 'N', 'False')
positive_labels = ('YES', 'Y', 'True')
no_flame_spread_mapping = {
    **dict.fromkeys(negative_labels, '0'),
    **dict.fromkeys(positive_labels, '1'),
}
df = df.replace(no_flame_spread_mapping, subset=['No Flame Spread'])

In [56]:
# Single variant "Detectors Present" label -> the label to keep.
detectors_present_mapping = {'N None present': 'N Not present'}

# Strip every '-' from the column, then apply the exact-match mapping.
detectors_present_col = 'Detectors Present'
df = (
    df.withColumn(detectors_present_col, regexp_replace(detectors_present_col, '-', ''))
      .replace(detectors_present_mapping, subset=[detectors_present_col])
)

In [57]:
# Single variant "Detector Type" label -> the label to keep.
detector_type_mapping = {
    '3 Combination smoke and heat in a single unit': '3 Combination smoke & heat in single unit',
}

# Strip every '-' from the column, then apply the exact-match mapping.
detector_type_col = 'Detector Type'
df = (
    df.withColumn(detector_type_col, regexp_replace(detector_type_col, '-', ''))
      .replace(detector_type_mapping, subset=[detector_type_col])
)

In [58]:
# Remove every '-' character from the "Detector Operation" labels.
detector_operation_col = 'Detector Operation'
df = df.withColumn(
    detector_operation_col,
    regexp_replace(detector_operation_col, '-', ''),
)

In [59]:
# Variant "Detector Effectiveness" labels -> the label to keep.
detectors_effectiveness_mapping = {'2 Alerted occupants-occ. failed to resond' : '2 Detector alerted occupants, occupants failed to respond',
                                   '4 Failed to alert occupants' : '4 Detector failed to alert occupants'}

# Every '-' is stripped from the column before the mapping is applied.
df = df.withColumn('Detector Effectiveness', regexp_replace('Detector Effectiveness', '-', ''))

# Apply the same hyphen stripping to the keys: the first key contains a '-',
# so after the column was stripped it could never match any cell value and
# that entry was silently a no-op.
detectors_effectiveness_lookup = {
    label.replace('-', ''): target
    for label, target in detectors_effectiveness_mapping.items()
}
df = df.replace(detectors_effectiveness_lookup, subset = ['Detector Effectiveness'])

In [60]:
# "Detector Failure Reason" labels written with a ' -' separator after the
# code -> the label to keep. Unlike the neighbouring cells, no hyphen
# stripping is done on this column; the keys match the labels as they are.
detector_failure_reason_mapping = dict([
    ('0 -Detector failure reason, other', '0 Detector failure reason, other'),
    ('6 -Battery discharged or dead', '6 Battery discharged or dead'),
    ('5 -Battery missing or disconnected', '5 Battery missing or disconnected'),
    ('1 -Power fail/shutoff or disconnected dete', '1 Power failure, hardwired det. shut off, disconnect'),
    ('3 -Defective', '3 Defective'),
    ('4 -Lack of maintenance, inc. not cleaning', '4 Lack of maintenance, includes not cleaning'),
    ('2 -Improper installation or placement', '2 Improper installation or placement of detector'),
])
detector_failure_reason_col = 'Detector Failure Reason'
df = df.replace(detector_failure_reason_mapping, subset=[detector_failure_reason_col])

In [61]:
# Remove every '-' character from the "Automatic Extinguishing System Present" labels.
aes_present_col = 'Automatic Extinguishing System Present'
df = df.withColumn(
    aes_present_col,
    regexp_replace(aes_present_col, '-', ''),
)

In [62]:
# Variant "Automatic Extinguishing Sytem Type" labels -> the label to keep.
# (The misspelt column name is the dataset's own and is kept verbatim.)
#
# Fix: the halogen target was missing its leading '6 ' code, which would have
# created yet another distinct label instead of unifying the variants.
# NOTE(review): the wet-pipe target still contains a '-', whereas labels that
# are already in the column have had their hyphens stripped — confirm which
# spelling should be canonical.
aes_type_mapping = {
    '1 Wet-pipe sprinkler' : '1 Wet-pipe sprinkler system',
    '6 Halogen-type system': '6 Halogen type system'
}

# Every '-' is stripped from the column before the mapping is applied.
df = df.withColumn('Automatic Extinguishing Sytem Type', regexp_replace('Automatic Extinguishing Sytem Type', '-', ''))

# Apply the same hyphen stripping to the keys: both keys contain a '-', so
# after the column was stripped neither could ever match a cell value and the
# whole mapping was silently a no-op.
aes_type_lookup = {
    label.replace('-', ''): target
    for label, target in aes_type_mapping.items()
}
df = df.replace(aes_type_lookup, subset = ['Automatic Extinguishing Sytem Type'])

In [63]:
# Remove every '-' character from the "Automatic Extinguishing Sytem Perfomance"
# labels (column name spelt as in the dataset).
aes_performance_col = 'Automatic Extinguishing Sytem Perfomance'
df = df.withColumn(
    aes_performance_col,
    regexp_replace(aes_performance_col, '-', ''),
)

In [64]:
# Un-coded or truncated "Automatic Extinguishing Sytem Failure Reason" labels
# -> the coded label to keep.
aes_failure_reason_mapping = dict([
    ('Reason system not effective, other', '0 Reason system not effective, other'),
    ('System shut off', '1 System shut off'),
    ('Not enough agent discharged to control the fire', '2 Not enough agent discharged to control the fire'),
    ('Agent discharged, but did not reach the fire', '3 Agent discharged, did not reach the fire'),
    ('3 Agent discharged, did not reach the fir', '3 Agent discharged, did not reach the fire'),
    ('Inappropriate system for the type of fire', '4 Inappropriate system for the type of fire'),
    ('Fire not in area protected by the system', '5 Fire not in area protected by the system'),
])

# Strip every '-' from the column, then apply the exact-match mapping.
aes_failure_reason_col = 'Automatic Extinguishing Sytem Failure Reason'
df = (
    df.withColumn(aes_failure_reason_col, regexp_replace(aes_failure_reason_col, '-', ''))
      .replace(aes_failure_reason_mapping, subset=[aes_failure_reason_col])
)

## Converting Data Types

In [None]:
# List (column name, type) pairs before any type conversion is applied.
df.dtypes

In [88]:
# Convert the date / timestamp strings into DateType / TimestampType columns.
#
# to_date / to_timestamp parse a string straight into the target type, which
# replaces the original unix_timestamp -> from_unixtime -> cast round trip
# that was copy-pasted once per column.

# Date-only column, formatted like 2024/05/16.
df = df.withColumn("Incident Date", to_date(col("Incident Date"), 'yyyy/MM/dd'))

# Timestamp columns share one 12-hour format with an AM/PM marker.
timestamp_format = 'yyyy/MM/dd hh:mm:ss a'
timestamp_columns = ["Alarm DtTm", "Arrival DtTm", "Close DtTm", "data_as_of", "data_loaded_at"]
for timestamp_column in timestamp_columns:
    df = df.withColumn(timestamp_column, to_timestamp(col(timestamp_column), timestamp_format))

In [None]:
# Columns that hold counts or amounts and should be integers.
numerical_columns = [
    'Exposure Number',
    'Estimated Property Loss',
    'Estimated Contents Loss',
    'Fire Fatalities',
    'Fire Injuries',
    'Civilian Fatalities',
    'Civilian Injuries',
    'Number of Alarms',
    'Floor of Fire Origin',
    'Number of floors with minimum damage',
    'Number of floors with significant damage',
    'Number of floors with heavy damage',
    'Number of floors with extreme damage',
    'Number of Sprinkler Heads Operating',
    'Suppression Units',
    'Suppression Personnel',
    'EMS Units',
    'EMS Personnel',
    'Other Units',
    'Other Personnel',
]

# Everything else is carried over unchanged.
categorical_columns = [name for name in df.columns if name not in numerical_columns]

# Re-select: untouched columns first, then the numeric ones cast to integer.
# Note that this moves the numeric columns to the end of the frame.
integer_casts = [col(name).cast(IntegerType()) for name in numerical_columns]
df = df.select(categorical_columns + integer_casts)

In [None]:
# List (column name, type) pairs again to check the conversions above.
df.dtypes

## Missing Data

In [68]:
# Share of missing values per column (missing rows / total rows).
#
# isnan() only accepts floating-point input (strings are cast implicitly).
# Calling it on the date / timestamp columns produced by the conversion cell
# fails with a data-type-mismatch error, so the NaN test is applied only to
# column types that can actually hold NaN; other columns are checked for NULL.
nan_capable_types = ('float', 'double', 'string')

def _is_missing(column_name, column_type):
    """Return the boolean Column that flags a missing value in `column_name`."""
    is_null = col(column_name).isNull()
    if column_type in nan_capable_types:
        return isnan(column_name) | is_null
    return is_null

missing_df = df.select([
    (count(when(_is_missing(c, t), c)) / count(lit(1))).alias(c)
    for c, t in df.dtypes
])
missing_df.show(truncate=False)



+---------------+---------------+---+--------------------+-------------+-----------+----------+------------+-------------------+--------------------+--------------------+---------+--------------------+-------------------+-----------------+---------------------+---------+-------------+-----------+---------------+-------------------+-----------------------+-----------------------+---------------+-------------+-------------------+-----------------+----------------+-----------------+----------+--------------------+----------------------+------------------+--------------------------+---------------------+-------------------+------------------+-----------------------+-------------------------+------------------+------------------+--------------------------------------+------------------+------------------+--------------------+------------------+------------------+------------------------------------+----------------------------------------+----------------------------------+-----------------

                                                                                

## Duplicate Data

In [69]:
# Remove rows that are exact duplicates across all columns.
df = df.dropDuplicates()

## Feature Engineering

In [74]:
# List (column name, type) pairs before deriving new features.
# NOTE(review): the saved output below still reports 'Incident Date',
# 'Alarm DtTm' and 'Exposure Number' as 'string', i.e. it was produced before
# the conversion cells ran (execution counts are out of order). Re-run the
# notebook top to bottom to refresh it.
df.dtypes

[('Incident Number', 'string'),
 ('Exposure Number', 'string'),
 ('ID', 'string'),
 ('Address', 'string'),
 ('Incident Date', 'string'),
 ('Call Number', 'string'),
 ('Alarm DtTm', 'string'),
 ('Arrival DtTm', 'string'),
 ('Close DtTm', 'string'),
 ('City', 'string'),
 ('zipcode', 'string'),
 ('Battalion', 'string'),
 ('Station Area', 'string'),
 ('Box', 'string'),
 ('Suppression Units', 'string'),
 ('Suppression Personnel', 'string'),
 ('EMS Units', 'string'),
 ('EMS Personnel', 'string'),
 ('Other Units', 'string'),
 ('Other Personnel', 'string'),
 ('First Unit On Scene', 'string'),
 ('Estimated Property Loss', 'string'),
 ('Estimated Contents Loss', 'string'),
 ('Fire Fatalities', 'string'),
 ('Fire Injuries', 'string'),
 ('Civilian Fatalities', 'string'),
 ('Civilian Injuries', 'string'),
 ('Number of Alarms', 'string'),
 ('Primary Situation', 'string'),
 ('Mutual Aid', 'string'),
 ('Action Taken Primary', 'string'),
 ('Action Taken Secondary', 'string'),
 ('Action Taken Other', 's

In [71]:
# Split the `point` text into two float columns.
#
# A capture group returns the number without its bracket, so no second pass is
# needed to strip '(' / ')'. The patterns are raw strings: '\(' and '\d' are
# invalid escape sequences in a plain Python literal. Rows without a match
# yield '' from regexp_extract, which casts to NULL.
#
# NOTE(review): the first number after '(' is stored as Latitude and the last
# number before ')' as Longitude. If `point` is WKT ("POINT (lon lat)") the two
# are swapped — confirm against the data.
df = df.withColumn('Latitude', regexp_extract('point', r'\((-?\d+\.\d+)', 1).cast("float"))
df = df.withColumn('Longitude', regexp_extract('point', r'(-?\d+\.\d+)\)', 1).cast("float"))

# Drop the original point column now that both coordinates are extracted.
df = df.drop('point')

In [72]:
# Epoch seconds of the three milestones of an incident.
alarm_seconds = unix_timestamp(col("Alarm DtTm"))
arrival_seconds = unix_timestamp(col("Arrival DtTm"))
close_seconds = unix_timestamp(col("Close DtTm"))

# Durations in minutes: alarm -> arrival and arrival -> close.
# ('Supression Time' is spelt as in the original column name.)
df = (
    df.withColumn('Response Time', (arrival_seconds - alarm_seconds) / 60)
      .withColumn('Supression Time', (close_seconds - arrival_seconds) / 60)
)

In [73]:
# Preview the result with show()'s defaults (long cell values are truncated).
df.show()



+---------------+---------------+---------+--------------------+-------------+-----------+-------------------+-------------------+-------------------+-------------+-------+---------+------------+----+-----------------+---------------------+---------+-------------+-----------+---------------+-------------------+-----------------------+-----------------------+---------------+-------------+-------------------+-----------------+----------------+--------------------+----------+--------------------+----------------------+------------------+--------------------------+--------------------+--------------------+--------------------+-----------------------+-------------------------+--------------------+--------------------+--------------------------------------+-------------------+----------------+--------------------+-----------+---------------+------------------------------------+----------------------------------------+----------------------------------+------------------------------------+---

                                                                                