# Fire Incidents ETL

In [10]:
from sqlalchemy import create_engine

from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
import pandas as pd
import numpy as np
import re 

# Connect to PostgreSQL database in Docker container
# engine = create_engine('postgresql://user:password@localhost:5432/fire-incident-db')


In [3]:
date_columns = ['Incident Date', 'Alarm DtTm', 'Arrival DtTm', 'Close DtTm', 'data_as_of', 'data_loaded_at']

In [17]:
df = pd.read_csv('../data/raw/Fire_Incidents_20240516.csv', parse_dates=date_columns)

  df = pd.read_csv('../data/raw/Fire_Incidents_20240516.csv', parse_dates=date_columns)


In [90]:
for col in df.columns:
    type_counts = df[col].map(type).value_counts()
    if len(type_counts) > 1:
        print(f"Column '{col}' type counts:\n{type_counts}\n")

Column 'Address' type counts:
Address
<class 'str'>      664968
<class 'float'>       315
Name: count, dtype: int64

Column 'Alarm DtTm' type counts:
Alarm DtTm
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    665282
<class 'pandas._libs.tslibs.nattype.NaTType'>              1
Name: count, dtype: int64

Column 'Arrival DtTm' type counts:
Arrival DtTm
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    665214
<class 'pandas._libs.tslibs.nattype.NaTType'>             69
Name: count, dtype: int64

Column 'Close DtTm' type counts:
Close DtTm
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    665268
<class 'pandas._libs.tslibs.nattype.NaTType'>             15
Name: count, dtype: int64

Column 'City' type counts:
City
<class 'str'>      663410
<class 'float'>      1873
Name: count, dtype: int64

Column 'zipcode' type counts:
zipcode
<class 'float'>    270340
<class 'int'>      237568
<class 'str'>      157375
Name: count, dtype: int64

Column 'Station Area' type counts:
Station Are

## Mixed Data Type Processing

In [91]:
def rows_with_specific_type(dataframe, column, dtype):
    return dataframe[dataframe[column].map(type) == dtype]

In [92]:
string_rows = rows_with_specific_type(df, 'Action Taken Primary', str)
print("Rows with float in column 'Action Taken Primary':\n", string_rows['Action Taken Primary'])

Rows with float in column 'Action Taken Primary':
 0         11 Extinguishment by fire service personnel
1                                      86 Investigate
2                                      86 Investigate
3                 32 Provide basic life support (BLS)
4                        63 Restore fire alarm system
                             ...                     
665278                                 86 Investigate
665279                                 86 Investigate
665280                         00 Action taken, other
665281                                 86 Investigate
665282                                 86 Investigate
Name: Action Taken Primary, Length: 665179, dtype: object


In [5]:
df['zipcode'].unique()

array(['94124', '94109', '94107', '94112', '94123', '94115', '94111',
       '94102', '94103', '94132', '94116', '94133', '94117', '94121',
       '94134', '94104', '94110', '94114', '94122', '94108', '94130',
       '94131', '94129', '94105', '94118', '94111-0000', '94108-0000',
       '94127', '94104-0000', '94005', '94105-0000', '94128', '94601',
       '94133-0000', '94103-0000', '94107-0000', '95011', '94501', 94124,
       94108, 94103, 94116, 94107, 94102, 94109, 94132, 94118, 94115,
       94117, 94114, 94133, 94129, 94121, 94131, 94110, 94122, 94112,
       94123, 94134, 94127, 94105, 94111, 94104, 94130, 94401, 94128,
       94005, 94931, 94158, 94014, nan, '94014', '94122-9412', '94143',
       94143, 94101, 94965, '11111-1111', '94066', 94521, '94158',
       '94106', '94101', '94111-1111', '96102', '94115-0000',
       '94125-0006', '94901', '94110-0000', '94510', '94116-0000',
       '94902', '95102', '94137', '95103', '94113', '94011', '94121-0000',
       '94164', '9461

In [94]:
df['zipcode'] = df['zipcode'].str.replace(re.compile('\-\w+[0-9]{3,}'), '',regex=True)

In [95]:
df['Station Area'].unique()

array([42.0, 3.0, 37.0, 43.0, 16.0, 38.0, 13.0, 1.0, 17.0, 19.0, 40.0,
       2.0, 5.0, 10.0, 14.0, 41.0, 25.0, 23.0, 7.0, 11.0, 24.0, 18.0, 6.0,
       15.0, 28.0, 48.0, 36.0, 8.0, 32.0, 44.0, 34.0, 9.0, 29.0, 12.0,
       51.0, 20.0, 35.0, 31.0, 33.0, 26.0, 4.0, 21.0, 22.0, 39.0, 47.0,
       nan, '20', '38', '35', '14', '17', '11', '03', '13', '09', '23',
       '06', '07', '01', '10', '33', '40', '37', '36', '16', '41', '25',
       '18', '32', '42', '28', '43', '12', '29', '44', '08', '21', '15',
       '26', '05', '22', '19', '31', '48', '34', '39', '02', '24', '53',
       'A3', 'A1', 'A2', 'OT', '51', 'A4', 'H1', '99', '04', '47', 53,
       94.0], dtype=object)

In [96]:
df['Station Area'] = df['Station Area'].astype(str)
df['Station Area'] = df['Station Area'].str.replace('.0', '')

In [97]:
df['Box'] = df['Box'].astype(str)

# Remove trailing zeros from float values
df['Box'] = df['Box'].apply(lambda x: str(float(x)).rstrip('0').rstrip('.') if '.' in x else x)

In [98]:
df['Primary Situation'].unique()

array(['118 Trash or rubbish fire, contained',
       '735 Alarm system sounded due to malfunction',
       '440 Electrical  wiring/equipment problem, other',
       '322 Motor vehicle accident with injuries',
       '730 System malfunction, other',
       '740 Unintentional transmission of alarm, other',
       '571 Cover assignment, standby, moveup',
       '700 False alarm or false call, other',
       '745 Alarm system activation, no fire - unintentional',
       '324 Motor vehicle accident with no injuries.',
       '554 Assist invalid', '500 Service Call, other',
       '113 Cooking fire, confined to container', '356 High-angle rescue',
       '151 Outside rubbish, trash or waste fire',
       '733 Smoke detector activation due to malfunction',
       '251 Excessive heat, scorch burns with no ignition',
       '311 Medical assist, assist EMS crew', '100 Fire, other',
       '550 Public service assistance, other',
       '150 Outside rubbish fire, other',
       '555 Defective ele

In [99]:
df['Primary Situation'] = df['Primary Situation'].str.replace('-', '') \
                         .str.replace('*', '').str.replace(r'\s+', ' ')

In [9]:
df['Mutual Aid'].unique()

array(['N None', '5 Other aid given', '4 Automatic aid given',
       '2 Automatic aid received', '1 Mutual aid received', nan,
       'Mutual aid given', 'Mutual aid received', 'Other aid given',
       'Automatic or contract aid received', 'Automatic aid given',
       '3 Mutual aid given'], dtype=object)

In [6]:
mutual_aid_mapping = {
    'Mutual aid given': '3 Mutual aid given',
    'Mutual aid received': '1 Mutual aid received',
    'Other aid given': '5 Other aid given',
    'Automatic or contract aid received': '2 Automatic aid received',
    'Automatic aid given': '4 Automatic aid given',
    '3 Mutual aid given': '3 Mutual aid given'}
df['Mutual Aid'] = df['Mutual Aid'].map(mutual_aid_mapping)


In [102]:
df['Ignition Factor Primary'].unique()

array([nan, 'NN None', '60 Natural condition, other',
       '74 Outside/open fire for warming or cooking', 'UU Undetermined',
       '34 Unspecified short-circuit arc', '19 Playing with heat source',
       '10 Misuse of material or product, other',
       '11 Abandoned or discarded materials or products',
       '16 Flammable liquid used to kindle fire', '23 Leak or break',
       '25 Worn out', '30 Electrical failure, malfunction, other',
       '41 Design deficiency',
       '00 Factors contributing to ignition, other',
       '13 Cutting, welding too close to combustible',
       '35 Arc from faulty contact, broken conductor',
       '20 Mechanical failure, malfunction, other',
       '53 Equipment unattended',
       '12 Heat source too close to combustibles.',
       '36 Arc, spark from operating equipment',
       '58 Equipment not operated properly', '72 Rekindle',
       '18 Improper container or storage procedure',
       '70 Fire spread or control, other',
       '51 Collis

In [103]:
df['Ignition Factor Primary'] = df['Ignition Factor Primary'].str.replace(r' -', '')

109
77


In [20]:
df['Ignition Factor Secondary'].unique()

array([nan, '73 Outside/open fire for debris or waste disposal',
       '32 Short-circuit arc from mechanical damage',
       '12 Heat source too close to combustibles.',
       '00 Other factor contributed to ignition', '55 Failure to clean',
       '11 Abandoned or discarded materials or products',
       '60 Natural condition, other', '50 Operational deficiency, other',
       '10 Misuse of material or product, other',
       '70 Fire spread or control, other',
       '20 Mechanical failure, malfunction, other',
       '34 Unspecified short-circuit arc',
       '30 Electrical failure, malfunction, other',
       '53 Equipment unattended',
       '12 Heat source too close to combustibles', '25 Worn out',
       '18 Improper container or storage procedure',
       '14 Flammable liquid or gas spilled',
       '33 Short-circuit arc from defective, worn insulation',
       '61 High wind', '52 Accidentally turned on, not turned off',
       '21 Automatic control failure', '54 Equipment ov

In [19]:
ignition_factor_secondary_mapping = {
    '12 Heat source too close to combustibles.': '12 Heat source too close to combustibles.',
    '30 Electrical failure, malfunction, othe': '30 Electrical failure, malfunction, other',
    '11 Abandoned or discarded materials or p': '11 Abandoned or discarded materials or products',
    '32 Short circuit arc from mechanical dam': '32 Short-circuit arc from mechanical damage',
    '20 Mechanical failure, malfunction, othe': '20 Mechanical failure, malfunction, other',
    '18 Improper container or storage': '18 Improper container or storage procedure',
    '00 Factors contributing to ignition, other': '00 Other factor contributed to ignition',
    '33 Short cir. arc, defect/worn insulatio': '33 Short-circuit arc from defective, worn insulation',
    '52 Accidentally turned on, not turned of': '52 Accidentally turned on, not turned off',
    '57 Equipment used for not intended purpo': '57 Equipment not used for purpose intended',
    '13 Cuttin/welding too close to combustib': '13 Cutting/welding too close to combustibles',
    '73 Outside/open fire, debris/waste dispo': '73 Outside/open fire for debris or waste disposal',
    '74 Outside/open fire for warming or cook': '74 Outside/open fire for warming or cooking',
    '18 Improper container or storage procedure': '18 Improper container or storage',
}
df['Ignition Factor Secondary'] = df['Ignition Factor Secondary'].str.replace(r'- ', '')
df['Ignition Factor Secondary'] = df['Ignition Factor Secondary'].replace(ignition_factor_secondary_mapping)


In [106]:
df['Heat Source'].unique()

array([nan, '40 Hot or smoldering object, other',
       '12 Radiated or conducted heat from operating equipment',
       '83 Flying brand, ember, spark',
       '60 Heat from other open flame or smoking materials, other',
       'UU Undetermined', '13 Electrical arcing', '00 Heat source: other',
       '65 Lighter: cigarette, cigar',
       '63 Heat from undetermined smoking material',
       '69 Flame/torch used for lighting',
       '11 Spark, ember, or flame from operating equipment',
       '43 Hot ember or ash', '41 Heat, spark from friction',
       '10 Heat from powered equipment, other',
       '81 Heat from direct flame, convection currents', '61 Cigarette',
       '72 Spontaneous combustion, chemical reaction', '66 Candle',
       '71 Sunlight', '54 Fireworks', '64 Match',
       '42 Molten, hot material', '82 Radiated heat from another fire',
       '84 Conducted heat from another fire',
       '80 Heat spread from another fire, other',
       '70 Chemical, natural heat sou

In [107]:
heat_source_mapping = {
    '11 Spark/ember/flame from operating equi': '11 Spark, ember, or flame from operating equipment',
    '12 Radiated/conducted heat operating equ': '12 Radiated or conducted heat from operating equipment',
    '13 Arcing': '13 Electrical arcing',
    '60 Heat; other open flame/smoking materi': '60 Heat from other open flame or smoking materials, other',
    '63 Heat from undetermined smoking materi': '63 Heat from undetermined smoking material',
    '65 Cigarette lighter': '65 Lighter: cigarette, cigar',
    '67 Warning or road flare; fusee': '67 Warning or road flare; fuse',
    '68 Backfire from internal combustion eng': '68 Backfire from internal combustion engine',
    '72 Chemical reaction': '72 Spontaneous combustion, chemical reaction',
    '97 Multiple heat sources including multi': '97 Multiple heat sources including multiple ignitions'
}
df['Heat Source'] = df['Heat Source'].str.replace(r' -', '')
df['Heat Source'] = df['Heat Source'].replace(heat_source_mapping)

In [108]:
df['Item First Ignited'].unique()

array([nan, 'UU Undetermined', '66 Pipe, duct, conduit, hose',
       '96 Rubbish, trash, waste', '00 Item first ignited, other',
       '76 Cooking materials, including edible materials',
       '81 Electrical wire, cable insulation',
       '92 Magazine, newspaper, writing paper', '31 Mattress, pillow',
       '38 Luggage', '20 Furniture, utensils, other',
       '17 Structural member or framing',
       '21 Upholstered sofa, chair, vehicle seats',
       '30 Soft goods, wearing apparel, other',
       '61 Atomized liquid, vaporized liquid, aerosol.',
       '11 Exterior roof covering, surface, finish',
       '72 Light vegetation - not crop, including grass',
       '70 Organic materials, other', '13 Exterior trim, including doors',
       '12 Exterior sidewall covering, surface, finish',
       '34 Wearing apparel not on a person',
       '14 Floor covering or rug/carpet/mat, surface',
       '33 Linen; other than bedding', '26 Household utensils',
       '64 Flammable liquid/gas i

In [109]:
item_first_ignited_mapping = {
    '96 Rubbish, trash, waste': '96 Rubbish, trash, or waste',
    '62 Flammable liquid/gas in/from engine or burner': '62 Flam. liq/gas-in/from engine or burne',
    '21 Upholstered sofa, chair, vehicle seats': '21 Upholstered sofa, chair, vehicle seat',
    '59 Rolled, wound material (paper and fabrics)': '59 Rolled, wound material (paper, fabric',
    '73 Heavy vegetation not crop, including trees': '73 Heavy vegetation no crops, inc. tre',
    '76 Cooking materials, including edible materials': '76 Cooking materials, inc. Edible materi',
    '00 Item first ignited, other': '00 Item First Ignited, Other',
    '14 Floor covering or rug/carpet/mat, surface': '14 Floor covering or rug/carpet/mat',
    '36 Curtain, blind, drapery, tapestry': '36 Curtains, blinds, drapery, tapestry',
    '11 Exterior roof covering, surface, finish': '11 Exterior roof covering or finish',
    '64 Flammable liquid/gas in container or pipe': '64 Flam liq/gas in container or pipe',
    '72 Light vegetation not crop, including grass': '72 Light vegetation no crops, inc. gra',
    '37 Goods not made up, including fabrics and yard goods': '37 Raw Goods, incl. fabrics and yarn',
    '66 Pipe, duct, conduit, hose': '66 Pipe, duct, conduit or hose',
    '61 Atomized liquid, vaporized liquid, aerosol.': '61 Atomized liq., vaporized liq.,aersol',
    '95 Film, residue, including paint & resi': '95 Film, residue, including paint and resin',
    '63 Flammable liquid/gas in/from final container': '63 Flam Liq/gas-in/from final container',
    '94 Dust, fiber, lint, including sawdust and excelsior': '94 Dust/fiber/lint. inc. sawdust, excels',
    '15 Interior wall covering excluding drapes, etc.': '15 Int. Wall cover  exclude drapes, etc.',
    '47 Tarpaulin, tent': '47 Tarpaulin or tent',
    '71 Agricultural crop, including fruits and vegetables': '71 Crop, incl. fruits and vegitables',
    '82 Transformer, including transformer fluids': '82 Transformer, including transformer fl',
    '18 Thermal, acoustical insulation within wall, partition or floor/ceiling space': '18 Insulation within structural area',
    '40 Adornment, recreational material, signs, other': '40 Adornment, recreational mat., signs,',
    '43 Sign, including outdoor signs such as billboards': '43 Sign, inc. outdoor sign/billboards',
    '74 Animal living or dead': '74 Animal, living or dead',
    '77 Feathers or fur, not on bird or anima': '77 Feathers or fur, not on bird or animal',
    '58 Palletized material, material stored on pallets.': '58 Palletized material',
    '54 Cord, rope, twine': '54 Cord, rope, twine, yarn'}
df['Item First Ignited'] = df['Item First Ignited'].str.replace(r' -', '')
df['Item First Ignited'] = df['Item First Ignited'].replace(item_first_ignited_mapping)

In [23]:
df['Human Factors Associated with Ignition'].unique()

array([nan, '3 Unattended or unsupervised person', 'N None',
       '2 Possibly impaired by alcohol or drugs',
       '243 Possibly impaired by alcohol or drugsPossibly mentally disabledUnattended or unsupervised person',
       '12 AsleepPossibly impaired by alcohol or drugs',
       '124 AsleepPossibly impaired by alcohol or drugsPossibly mentally disabled',
       '1 Asleep', '6 Multiple persons involved',
       '4 Possibly mentally disabled',
       '624 Multiple persons involvedPossibly impaired by alcohol or drugsPossibly mentally disabled',
       '62 Multiple persons involvedPossibly impaired by alcohol or drugs',
       '7 Age was a factor',
       '24 Possibly impaired by alcohol or drugsPossibly mentally disabled',
       '43 Possibly mentally disabledUnattended or unsupervised person',
       '63 Multiple persons involvedUnattended or unsupervised person',
       '42 Possibly mentally disabledPossibly impaired by alcohol or drugs',
       '5 Physically disabled',
       '2

In [22]:
df['Human Factors Associated with Ignition'] = df['Human Factors Associated with Ignition'].str.replace('Â', '')
df['Human Factors Associated with Ignition'] = df['Human Factors Associated with Ignition'].str.replace('§', '')


In [112]:
df['Structure Type'].unique()

array([nan, '1 Enclosed building', '0 Structure type, other', '5 Tent',
       '3 Open structure', '8 Connective structure',
       '2 Fixed portable or mobile structure', '6 Open platform',
       '7 Underground structure work area', '4 Air-supported structure',
       '1 -Enclosed building', '3 -Open structure',
       '2 -Fixed portable or mobile structure',
       '0 -Structure type, other', '6 -Open platform',
       '7 -Underground structure work areas', '5 -Tent',
       '8 -Connective structure', '4 -Air supported structure'],
      dtype=object)

In [113]:
structure_type_mapping = {'4 Air-supported structure': '4 Air supported structure',
                          '7 Underground structure work area': '7 Underground structure work areas'}

df['Structure Type'] = df['Structure Type'].str.replace(r' -', ' ')
df['Structure Type'] = df['Structure Type'].replace(structure_type_mapping)

In [114]:
df['Structure Status'].unique()

array([nan, '6 Vacant and unsecured', '2 In normal use',
       '3 Idle, not routinely used', '1 Under construction',
       '4 Under major renovation', 'U Undetermined',
       '5 Vacant and secured', '0 Building status, other',
       '7 Being demolished', '2 -In normal use',
       '3 -Idle, not routinely used', '4 -Under major renovation',
       '5 -Vacant and secured', '6 -Vacant and unsecured',
       '1 -Under construction', '0 -Other', 'U -Undetermined',
       '7 -Being demolished'], dtype=object)

In [115]:
structure_status_mapping = {'0 Building status, other': '0 Other'}
df['Structure Status'] = df['Structure Status'].str.replace(r' -', ' ')
df['Structure Status'] = df['Structure Status'].replace(structure_status_mapping)

In [116]:
df['Fire Spread'].unique()

array([nan, '17 Structural member or framing', 'UU Undetermined',
       '00 Item first ignited, other', '70 Organic materials, other',
       '11 Exterior roof covering, surface, finish',
       '76 Cooking materials, including edible materials',
       '15 Interior wall covering excluding drapes, etc.',
       '66 Pipe, duct, conduit, hose', '20 -Furniture, utensils, other',
       '21 -Upholstered sofa, chair, vehicle seats',
       '17 -Structural member or framing',
       '00 -Item First Ignited, Other',
       '10 -Structural component or finish, other', 'UU -Undetermined',
       '30 -Soft goods, wearing apparel, other',
       '14 -Floor covering or rug/carpet/mat',
       '88 -Pyrotechnics, explosives',
       '11 -Exterior roof covering or finish',
       '65 -Flammable liquid/gas - uncontained',
       '23 -Cabinetry (including built-in)',
       '33 -Linen; other than bedding', '50 -Storage supplies, other',
       '12 -Exterior wall covering or finish',
       '36 -Curtai

In [117]:
fire_spread_mapping = mapping = {
    '00 Item first ignited, other': '00 Item First Ignited, Other',
    '11 Exterior roof covering, surface, finish': '11 Exterior roof covering or finish',
    '66 Pipe, duct, conduit, hose': '66 Pipe, duct, conduit or hose',
    '15 Interior wall covering excluding drapes, etc.': '15 Int. Wall cover  exclude drapes, etc.',
    '76 Cooking materials, including edible materials': '76 Cooking materials, inc. Edible materia',
    '94 Dust/fiber/lint. inc. sawdust, excelsi': '94 Dust/fiber/lint. inc. sawdust, excelsi',
    '96 Rubbish, trash, or waste': '96 Rubbish, trash, waste',
    '61 Atomized liq., vaporized liq.,aersol': '61 Atomized liquid, vaporized liquid, aerosol.'
}
df['Fire Spread'] = df['Fire Spread'].str.replace(r'\s*-\s*', ' ', regex=True)
df['Fire Spread'] = df['Fire Spread'].replace(fire_spread_mapping)

In [118]:
df['No Flame Spread'].unique()

array([nan, 'NO', 'YES', '5', '1', '2', '4', '3', 'Y', 1.0, 2.0, 3.0, 5.0,
       4.0, 'N', 'False', 'True'], dtype=object)

In [119]:
#No Flame Spread mapping
no_flame_spread_mapping = {
    'NO': 0, 'N': 0, 'False': 0, 'YES': 1, 'Y': 1, 'True': 1,
    '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, 1.0: 1, 2.0: 2, 3.0: 3, 4.0: 4, 5.0: 5,
    np.nan: np.nan}

# Convert the list to a pandas Series
df['No Flame Spread'] = df['No Flame Spread'].map(no_flame_spread_mapping)


In [120]:
df['Detectors Present'].unique()


array([nan, 'N None present', '1 Present', 'U Undetermined',
       'N -Not present', '1 -Present', 'U -Undetermined'], dtype=object)

In [121]:
detectors_present_mapping = {
    'N None present': 'N Not present'
}
df['Detectors Present'] = df['Detectors Present'].str.replace(r' -', ' ')
df['Detectors Present'] = df['Detectors Present'].replace(detectors_present_mapping)

In [122]:
df['Detector Type'].unique()


array([nan, '1 Smoke', 'U Undetermined', '5 More than one type present',
       '4 Sprinkler, water flow detection', '2 Heat',
       '3 Combination smoke and heat in a single unit', '1 -Smoke',
       '4 -Sprinkler, water flow detection', 'U -Undetermined', '2 -Heat',
       '5 -More than one type present',
       '3 -Combination smoke & heat in single unit',
       '0 -Detector type, other', '0 Detector type, other'], dtype=object)

In [123]:
detectors_type_mapping = {'3 Combination smoke and heat in a single unit':'3 Combination smoke & heat in single unit'}
df['Detector Type'] = df['Detector Type'].str.replace(r' -', ' ')
df['Detector Type'] = df['Detector Type'].replace(detectors_type_mapping)


In [124]:
df['Detector Operation'].unique()


array([nan, 'U Undetermined', '2 Detector operated',
       '1 Fire too small to activate detector',
       '3 Detector failed to operate', '3 -Detector failed to operate',
       '2 -Detector operated', 'U -Undetermined',
       '1 -Fire too small to activate detector'], dtype=object)

In [125]:
df['Detector Operation'] = df['Detector Operation'].str.replace(r' -', ' ')

In [126]:
df['Detector Effectiveness'].unique()


array([nan, '1 Detector alerted occupants, occupants responded',
       'U Undetermined', '3 There were no occupants',
       '4 Detector failed to alert occupants',
       '2 Detector alerted occupants, occupants failed to respond',
       '1 -Alerted occupants, occupants responded',
       '2 -Alerted occupants-occ. failed to resond',
       '3 -There were no occupants', 'U -Undetermined',
       '4 -Failed to alert occupants'], dtype=object)

In [127]:
detectors_effectiveness_mapping = {'2 Alerted occupants-occ. failed to resond' : 'Detector alerted occupants, occupants failed to respond',
                                   '4 Failed to alert occupants' : '4 Detector failed to alert occupants'}
df['Detector Effectiveness'] = df['Detector Effectiveness'].str.replace(r' -', ' ')
df['Detector Effectiveness'] = df['Detector Effectiveness'].replace(detectors_effectiveness_mapping)


In [128]:
df['Detector Failure Reason'].unique()

array([nan, 'U Undetermined',
       '1 Power failure, hardwired det. shut off, disconnect',
       '6 Battery discharged or dead',
       '2 Improper installation or placement of detector',
       '0 Detector failure reason, other',
       '5 Battery missing or disconnected', 'U -Undetermined',
       '0 -Detector failure reason, other',
       '6 -Battery discharged or dead',
       '5 -Battery missing or disconnected',
       '1 -Power fail/shutoff or disconnected dete', '3 -Defective',
       '4 -Lack of maintenance, inc. not cleaning',
       '2 -Improper installation or placement', '3 Defective detector',
       '4 Lack of maintenance, includes not cleaning'], dtype=object)

In [129]:
detector_failure_reason_mapping = {
    'U Undertermined': np.nan,
    'U -Undetermined': np.nan,
    '0 -Detector failure reason, other': '0 Detector failure reason, other',
    '6 -Battery discharged or dead': '6 Battery discharged or dead',
    '5 -Battery missing or disconnected': '5 Battery missing or disconnected',
    '1 -Power fail/shutoff or disconnected dete': '1 Power failure, hardwired det. shut off, disconnect',
    '3 -Defective': '3 Defective',
    '4 -Lack of maintenance, inc. not cleaning': '4 Lack of maintenance, includes not cleaning',
    '2 -Improper installation or placement': '2 Improper installation or placement of detector'
}

df['Detector Failure Reason'] = df['Detector Failure Reason'].replace(detector_failure_reason_mapping)

In [130]:
df['Automatic Extinguishing System Present'].unique()


array([nan, 'N None Present', '1 Present', 'U Undetermined',
       '2 Partial system present', 'N -None Present', '1 -Present',
       '2 -Partial system present', 'U -Undetermined'], dtype=object)

In [131]:
df['Automatic Extinguishing System Present'] = df['Automatic Extinguishing System Present'].str.replace(r' -', ' ')

In [132]:
df['Automatic Extinguishing Sytem Type'].unique()


array([nan, '1 Wet-pipe sprinkler system', 'U Undetermined',
       '2 Dry-pipe sprinkler system', '4 Dry chemical system',
       '0 Special hazard system, other', '6 Halogen-type system',
       '3 Other sprinkler system', '1 -Wet-pipe sprinkler',
       '0 -Special hazard system, other', 'U -Undetermined',
       '4 -Dry chemical system', '5 Foam system',
       '6 -Halogen type system', '5 -Foam system',
       '7 Carbon dioxide system'], dtype=object)

In [133]:
aes_type_mapping = {
    '1 Wet-pipe sprinkler' : '1 Wet-pipe sprinkler system',
    '6 Halogen-type system': 'Halogen type system'}
df['Automatic Extinguishing Sytem Type'] = df['Automatic Extinguishing Sytem Type'].str.replace(r' -', ' ')
df['Automatic Extinguishing Sytem Type'] = df['Automatic Extinguishing Sytem Type'].replace(aes_type_mapping)

In [134]:
df['Automatic Extinguishing Sytem Perfomance'].unique()

array([nan, '3 Fire too small to activate system',
       '1 System operated and was effective', '4 System did not operate',
       'U Undetermined', '2 System operated and was not effective',
       '1 -System operated and was effective',
       '3 -Fire too small to activate system',
       '4 -System did not operate',
       '2 -System operated and was not effective', 'U -Undetermined',
       '0 Operation of AES, other', '0 -Operation of AES, other'],
      dtype=object)

In [135]:
aes_performance_mapping = {
    '3 -Fire too small to activate system': '3 Fire too small to activate system',
    '1 -System operated and was effective': '1 System operated and was effective',
    '4 -System did not operate': '4 System did not operate',
    'U -Undetermined': np.nan,
    'U Undetermined': np.nan,
    '2 -System operated and was not effective': '2 System operated and was not effective',
    '0 -Operation of AES, other': '0 Operation of AES, other'
}


df['Automatic Extinguishing Sytem Perfomance'] = df['Automatic Extinguishing Sytem Perfomance'] \
                                                .replace(aes_performance_mapping)

In [136]:
df['Automatic Extinguishing Sytem Failure Reason'].unique()

array([nan, 'Fire not in area protected by the system', 'System shut off',
       'Reason system not effective, other',
       'Not enough agent discharged to control the fire',
       '5 -Fire not in area protected by the syste', 'U -Undetermined',
       '0 -Reason system not effective, other',
       '7 -Lack of maintenance, including corrosio', 'Undetermined',
       'Inappropriate system for the type of fire',
       '3 -Agent discharged, did not reach the fir',
       'Agent discharged, but did not reach the fire',
       '1 -System shut off'], dtype=object)

In [137]:
aes_failure_reason_mapping = {
    '5 -Fire not in area protected by the syste': 'Fire not in area protected by the system',
    'U -Undetermined': np.nan,
    'Undetermined': np.nan,
    'U Undetermined': np.nan,
    '0 -Reason system not effective, other': 'Reason system not effective, other',
    '7 -Lack of maintenance, including corrosio': 'Lack of maintenance, including corrosion',
    '3 -Agent discharged, did not reach the fir': 'Agent discharged, did not reach the fire',
    '1 -System shut off': 'System shut off'
}

df['Automatic Extinguishing Sytem Failure Reason'] = df['Automatic Extinguishing Sytem Failure Reason'].map(aes_failure_reason_mapping)

In [138]:
df['neighborhood_district'].unique()


array(['Portola', 'Nob Hill', 'Potrero Hill', 'Excelsior', 'Marina',
       'Pacific Heights', 'Chinatown', 'Tenderloin', 'South of Market',
       'Bayview Hunters Point', 'Lakeshore', 'West of Twin Peaks',
       'Hayes Valley', 'Lincoln Park', 'Financial District/South Beach',
       'Sunset/Parkside', 'Russian Hill', 'Castro/Upper Market',
       'Noe Valley', 'Mission', 'Treasure Island', 'McLaren Park',
       'Visitacion Valley', 'Outer Richmond', 'Haight Ashbury',
       'North Beach', 'Presidio', 'Twin Peaks', 'Inner Sunset',
       'Seacliff', 'Mission Bay', 'Golden Gate Park', nan,
       'Lone Mountain/USF', 'Western Addition', 'Japantown', 'Glen Park',
       'Bernal Heights', 'Oceanview/Merced/Ingleside', 'Presidio Heights',
       'Inner Richmond', 'Outer Mission'], dtype=object)

## Duplicated data

In [139]:
df.drop_duplicates(inplace=True)