# Dataframe Cleaning

In [63]:
import numpy as np
import pandas as pd
import seaborn as sns
import os

## 1.   Loading the Dataframe and detecting the missing values such as: (0, "N/a", "na", np.nan) ->

In [64]:
##
    #! df = pd.read_csv(url, low_memory=False, na_values = missing_values)
    #! Define dtype for columns or use low_memory=False

url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/64a26694-01dc-4ec3-aa87-ad8509604f50/resource/1e824947-d73b-4f48-9bac-7f7f3731a6b9/download/Fire%20Incidents%20Data.csv"
missing_values = ["N/a", "na", np.nan]
dtype_dict = {
    '_id': int,
    'Area_of_Origin': str,
    'Building_Status': str,
    'Business_Impact': str,
    'Civilian_Casualties': float,
    'Count_of_Persons_Rescued': float,
    'Estimated_Dollar_Loss': float,
    'Estimated_Number_Of_Persons_Displaced': float,
    'Exposures': float,
    'Ext_agent_app_or_defer_time': str,
    'Extent_Of_Fire': str,
    'Final_Incident_Type': str,
    'Fire_Alarm_System_Impact_on_Evacuation': str,
    'Fire_Alarm_System_Operation': str,
    'Fire_Alarm_System_Presence': str,
    'Fire_Under_Control_Time': str,
    'Ignition_Source': str,
    'Incident_Number': str,
    'Incident_Station_Area': str,
    'Incident_Ward': float,
    'Initial_CAD_Event_Type': str,
    'Intersection': str,
    'Last_TFS_Unit_Clear_Time': str,
    'Latitude': float,
    'Level_Of_Origin': str,
    'Longitude': float,
    'Material_First_Ignited': str,
    'Method_Of_Fire_Control': str,
    'Number_of_responding_apparatus': float,
    'Number_of_responding_personnel': float,
    'Possible_Cause': str,
    'Property_Use': str,
    'Smoke_Alarm_at_Fire_Origin': str,
    'Smoke_Alarm_at_Fire_Origin_Alarm_Failure': str,
    'Smoke_Alarm_at_Fire_Origin_Alarm_Type': str,
    'Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation': str,
    'Smoke_Spread': str,
    'Sprinkler_System_Operation': str,
    'Sprinkler_System_Presence': str,
    'Status_of_Fire_On_Arrival': str,
    'TFS_Alarm_Time': str,
    'TFS_Arrival_Time': str,
    'TFS_Firefighter_Casualties': float
}
#
df = pd.read_csv(url, dtype=dtype_dict, na_values = missing_values)
# df = pd.read_csv(url, dtype=dtype_dict)


    #? Show missing values
# df.isnull().sum()

    #? Or in boolean format:
# df.isnull().any()

    #? Heatmap of missing values
# sns.heatmap(df.isnull(), yticklabels=False)
# sns.heatmap(df.isnull(), yticklabels=False, annot=True)

    #? 29425 rows × 43 columns with NaN
# df

## 2.   Handling the missing values ->

In [65]:
    #? Remove only the rows that are all NaN (still 29425 rows × 43 columns, no row has only NaN)
df_dropped = df
df_dropped = df_dropped.dropna(how="all")


    #? Swap all NaN values to '0'
df_dropped = df_dropped.fillna(0)
# df_dropped.isnull().any()

    #? Or swap only one row's NaN's to '0'
# df_dropped["Area_of_Origin"] = df_dropped["Area_of_Origin"].fillna(0)
# df_dropped.isnull().any()

# df_dropped

    #? This this allows calculations to be made on that dataset

## 3.   Handling Duplicates ->

In [90]:
    #? Check for Duplicates
df_dropped.duplicated()

    #? Drop all the duplicates (none on this case still 29425 rows × 43 columns)
df_dropped.drop_duplicates(keep="first", inplace=True)
df_dropped

Unnamed: 0,_id,Area_of_Origin,Building_Status,Business_Impact,Civilian_Casualties,Count_of_Persons_Rescued,Estimated_Dollar_Loss,Estimated_Number_Of_Persons_Displaced,Exposures,Ext_agent_app_or_defer_time,...,Smoke_Alarm_at_Fire_Origin_Alarm_Failure,Smoke_Alarm_at_Fire_Origin_Alarm_Type,Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation,Smoke_Spread,Sprinkler_System_Operation,Sprinkler_System_Presence,Status_of_Fire_On_Arrival,TFS_Alarm_Time,TFS_Arrival_Time,TFS_Firefighter_Casualties
0,2946141,81 - Engine Area,0,0,0,0,15000.0,0.0,0.0,2018-02-24 21:12:00,...,0,0,0,0,0,0,"7 - Fully involved (total structure, vehicle, ...",2018-02-24 21:04:29,2018-02-24 21:10:11,0.0
1,2946142,"75 - Trash, rubbish area (outside)",0,0,0,0,50.0,0.0,0.0,2018-02-24 21:29:42,...,0,0,0,0,0,0,2 - Fire with no evidence from street,2018-02-24 21:24:43,2018-02-24 21:29:31,0.0
2,2946143,0,0,0,0,0,0.0,0.0,0.0,NaT,...,0,0,0,0,0,0,0,2018-02-25 13:29:59,2018-02-25 13:36:49,0.0
3,2946144,"75 - Trash, rubbish area (outside)",01 - Normal (no change),1 - No business interruption,0,0,0.0,0.0,0.0,2018-02-25 14:19:25,...,98 - Not applicable: Alarm operated OR presenc...,9 - Type undetermined,"8 - Not applicable: No alarm, no persons present",99 - Undetermined,8 - Not applicable - no sprinkler system present,9 - Undetermined,3 - Fire with smoke showing only - including v...,2018-02-25 14:13:39,2018-02-25 14:18:07,0.0
4,2946145,0,0,0,0,0,0.0,0.0,0.0,NaT,...,0,0,0,0,0,0,0,2018-02-25 18:20:43,2018-02-25 18:26:19,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29420,2975561,42 - Garage,01 - Normal (no change),1 - No business interruption,0,0,0.0,0.0,0.0,2022-11-26 06:20:00,...,98 - Not applicable: Alarm operated OR presenc...,4 - Interconnected,3 - No one (at risk) evacuated as a result of ...,3 - Spread to entire room of origin,3 - Did not activate: fire too small to trigge...,1 - Full sprinkler system present,2 - Fire with no evidence from street,2022-11-26 06:06:11,2022-11-26 06:12:09,0.0
29421,2975562,81 - Engine Area,0,0,0,0,5000.0,0.0,0.0,2022-11-26 06:33:56,...,0,0,0,0,0,0,3 - Fire with smoke showing only - including v...,2022-11-26 06:27:58,2022-11-26 06:32:01,0.0
29422,2975563,"44 - Trash, Rubbish Storage (inc garbage chute...",01 - Normal (no change),8 - Not applicable (not a business),0,0,2000.0,0.0,0.0,2022-11-26 08:32:05,...,98 - Not applicable: Alarm operated OR presenc...,4 - Interconnected,"8 - Not applicable: No alarm, no persons present","4 - Spread beyond room of origin, same floor",2 - Did not activate: remote from fire,1 - Full sprinkler system present,2 - Fire with no evidence from street,2022-11-26 08:22:16,2022-11-26 08:26:04,0.0
29423,2975564,0,0,0,0,0,0.0,0.0,0.0,NaT,...,0,0,0,0,0,0,0,2022-11-26 09:13:13,2022-11-26 09:14:09,0.0


## 4.   Handling Dates ->

In [89]:
date_columns = ['TFS_Alarm_Time', 'Fire_Under_Control_Time', 'Last_TFS_Unit_Clear_Time', 'TFS_Arrival_Time', 'Ext_agent_app_or_defer_time']
for col in date_columns:
    df_dropped[col] = pd.to_datetime(df_dropped[col], errors='coerce')
df_dropped.info()
# df_dropped

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29425 entries, 0 to 29424
Data columns (total 43 columns):
 #   Column                                                         Non-Null Count  Dtype         
---  ------                                                         --------------  -----         
 0   _id                                                            29425 non-null  int64         
 1   Area_of_Origin                                                 29425 non-null  object        
 2   Building_Status                                                29425 non-null  object        
 3   Business_Impact                                                29425 non-null  object        
 4   Civilian_Casualties                                            29425 non-null  int64         
 5   Count_of_Persons_Rescued                                       29425 non-null  int64         
 6   Estimated_Dollar_Loss                                          29425 non-null  float64       


## 5.   Changing types

In [None]:
change_cols = ["TFS_Firefighter_Casualties", "Civilian_Casualties", "Count_of_Persons_Rescued", "Number_of_responding_personnel", "Number_of_responding_apparatus", ]
df_dropped['Civilian_Casualties'] = df_dropped['Civilian_Casualties'].astype(int)
df_dropped['Count_of_Persons_Rescued'] = df_dropped['Count_of_Persons_Rescued'].astype(int)

df_dropped.dtypes

In [None]:
# Save the cleaned dataset to a CSV file
cleaned_file_path = 'cleaned_fire_incidents.csv'
df_dropped.to_csv(cleaned_file_path, index=False)

print(f"Cleaned dataset has been saved to {cleaned_file_path}")