In [20]:
import pandas as pd
import numpy as np
import polars as pl
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import gzip
import shutil
import pathlib
import os
import sqlalchemy
import sqlite3
import spacy
import re
import tqdm
from tqdm.notebook import tqdm, trange
import ipywidgets as widgets
from ipywidgets import IntProgress, HTML, VBox
from IPython.display import display
import time
import sklearn
from sklearn.preprocessing import LabelEncoder
import missingno as msno

In [41]:
# Location of the Charlottesville historical fire incident CSV.
# Setting the FIRE_DATA_CSV environment variable overrides the default, so the
# notebook can run on machines that do not share the original F: drive layout.
# An unset or empty variable falls back to the original path.
file_path = (
    os.environ.get("FIRE_DATA_CSV")
    or r"F:\Data Science\Datasets\Historical_Fire_Data_Charlottesville.csv"
)

In [71]:
# Read the raw incident export into a DataFrame.
hfdf = pd.read_csv(filepath_or_buffer=file_path)

# Print the column listing with dtypes and non-null counts.
hfdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2246 entries, 0 to 2245
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   RecordID                      2246 non-null   int64  
 1   Incident_ID                   2246 non-null   int64  
 2   AddressRaw                    2246 non-null   object 
 3   AddressStandardized           2246 non-null   object 
 4   AlarmDateTimeAmericaNY        2246 non-null   object 
 5   CauseOfIgnition               2246 non-null   object 
 6   FireAreaOfOrigin              2246 non-null   object 
 7   FireType                      2246 non-null   object 
 8   HeatSource                    2246 non-null   object 
 9   IncidentDurationMinutes       2246 non-null   float64
 10  ItemFirstIgnited              2246 non-null   object 
 11  Latitude                      2246 non-null   float64
 12  Longitude                     2246 non-null   float64
 13  Pro

In [72]:
# Preview the first rows of the freshly loaded frame.
hfdf.head()

Unnamed: 0,RecordID,Incident_ID,AddressRaw,AddressStandardized,AlarmDateTimeAmericaNY,CauseOfIgnition,FireAreaOfOrigin,FireType,HeatSource,IncidentDurationMinutes,...,PropertyUse,StreetName,StreetNumber,StreetPostDirection,StreetPredirection,StreetSuffix,UnitFirstArrive,UnitFirstResponseTimeMinutes,Year,ZipCode
0,1,2000930,2400 FONTAINE AVE,2400 FONTAINE AVE,2002/04/14 16:38:57+00,Undetermined,,Natural vegetation fire,,17.299999,...,Undetermined,FONTAINE,2400.0,,,AVE,E6,3.5,2002,22901
1,2,2003381,905 FOREST ST,905 FOREST ST,2002/11/07 20:21:02+00,Intentional,Bedroom - < 5 persons; included are jail or pr...,Structure Fire,Cigarette lighter,258.700012,...,1 or 2 family dwelling,FOREST,905.0,,,ST,C2,3.2,2002,22901
2,3,2003463,219 5TH ST SW,219 5TH ST SW,2002/11/14 12:46:32+00,,,Mobile property (vehicle) fire,,9.4,...,,5TH,219.0,SW,,ST,BC2,0.05,2002,22903
3,4,2003769,,,2002/12/11 14:30:15+00,,,Structure Fire,,24.9,...,1 or 2 family dwelling,,,,,,BC2,1.6,2002,22902
4,5,2003827,822 HARDY DR,822 HARDY DR,2002/12/17 07:38:34+00,Intentional,"Cooking area, kitchen",Structure Fire,"Heat from powered equipment, other",98.800003,...,Multifamily dwelling,HARDY,822.0,,,DR,E7,5.93,2002,22901


In [73]:
# Count how often each CauseOfIgnition value occurs, most frequent first.
hfdf['CauseOfIgnition'].value_counts()

CauseOfIgnition
                                                                      665
Unintentional                                                         630
Failure of equipment or heat source                                   224
Undetermined                                                          182
Cause under investigation                                             137
Smoking                                                               127
Cause undetermined after investigation                                 87
Intentional                                                            59
Open/outdoor fire                                                      25
Other cause                                                            18
Equipment                                                              17
Incendiary                                                             17
Cause, other                                                           13
Natural source        

In [74]:
# Keep only incidents whose recorded cause is something other than 'Unintentional'.
hfdf = hfdf.loc[hfdf['CauseOfIgnition'].ne('Unintentional')]

In [75]:
# Discard every row that has a missing value in any column.
# NOTE(review): the value_counts output above lists a blank CauseOfIgnition
# label (665 rows) even though info() reports the column as fully non-null, so
# those blank entries are not treated as missing here - confirm that is intended.
hfdf = hfdf.dropna(axis=0, how='any')

In [76]:
# Expand CauseOfIgnition into one integer indicator column per category,
# with each column name prefixed by "Cause".
one_hot_encoded = pd.get_dummies(
    data=hfdf['CauseOfIgnition'],
    prefix='Cause',
    dtype=int,
)

In [77]:
# Append the cause indicator columns to the incident frame.
# Indicator columns left behind by an earlier execution of this cell are
# dropped first (errors='ignore' makes that a no-op on the first run), so
# re-running the cell does not accumulate duplicate Cause_* columns.
hfdf = pd.concat(
    [hfdf.drop(columns=one_hot_encoded.columns, errors='ignore'), one_hot_encoded],
    axis=1,
)

In [78]:
# Disabled leftover. NOTE(review): as written the statement has no call
# parentheses, so enabling it would bind the dropna method itself to hfdf
# rather than a cleaned frame. Rows with nulls are already removed by the
# dropna(how='any') cell earlier in the notebook.
#hfdf = hfdf.dropna

In [79]:
# Re-inspect the column listing after filtering and adding the Cause_* indicators.
hfdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1615 entries, 0 to 2238
Data columns (total 42 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   RecordID                                                                  1615 non-null   int64  
 1   Incident_ID                                                               1615 non-null   int64  
 2   AddressRaw                                                                1615 non-null   object 
 3   AddressStandardized                                                       1615 non-null   object 
 4   AlarmDateTimeAmericaNY                                                    1615 non-null   object 
 5   CauseOfIgnition                                                           1615 non-null   object 
 6   FireAreaOfOrigin                                                     

In [82]:
# Preview the first ten rows, including the new indicator columns.
hfdf.head(10)

Unnamed: 0,RecordID,Incident_ID,AddressRaw,AddressStandardized,AlarmDateTimeAmericaNY,CauseOfIgnition,FireAreaOfOrigin,FireType,HeatSource,IncidentDurationMinutes,...,Cause_Equipment,Cause_Failure of equipment or heat source,Cause_Incendiary,Cause_Intentional,Cause_Misuse of fire,Cause_Natural source,Cause_Open/outdoor fire,Cause_Other cause,Cause_Smoking,Cause_Undetermined
0,1,2000930,2400 FONTAINE AVE,2400 FONTAINE AVE,2002/04/14 16:38:57+00,Undetermined,,Natural vegetation fire,,17.299999,...,0,0,0,0,0,0,0,0,0,1
1,2,2003381,905 FOREST ST,905 FOREST ST,2002/11/07 20:21:02+00,Intentional,Bedroom - < 5 persons; included are jail or pr...,Structure Fire,Cigarette lighter,258.700012,...,0,0,0,1,0,0,0,0,0,0
2,3,2003463,219 5TH ST SW,219 5TH ST SW,2002/11/14 12:46:32+00,,,Mobile property (vehicle) fire,,9.4,...,0,0,0,0,0,0,0,0,0,0
3,4,2003769,,,2002/12/11 14:30:15+00,,,Structure Fire,,24.9,...,0,0,0,0,0,0,0,0,0,0
4,5,2003827,822 HARDY DR,822 HARDY DR,2002/12/17 07:38:34+00,Intentional,"Cooking area, kitchen",Structure Fire,"Heat from powered equipment, other",98.800003,...,0,0,0,1,0,0,0,0,0,0
5,6,2003891,900 HILLCREST RD,900 HILLCREST RD,2002/12/22 16:23:05+00,Intentional,,Structure Fire,,51.599998,...,0,0,0,1,0,0,0,0,0,0
6,7,2000079,,,2003/01/08 17:29:18+00,Other cause,,Natural vegetation fire,Arcing,48.400002,...,0,0,0,0,0,0,0,1,0,0
7,8,2030084,20 BONNYCASTLE DR,20 BONNYCASTLE DR,2003/01/09 15:23:46+00,"Cause, other",Office,Structure Fire,Arcing,28.4,...,0,0,0,0,0,0,0,0,0,0
8,9,2030087,1526 TRAILRIDGE RD,1526 TRAILRIDGE RD,2003/01/09 20:41:44+00,Cause under investigation,Bedroom - < 5 persons; included are jail or pr...,Structure Fire,Cigarette,78.699997,...,0,0,0,0,0,0,0,0,0,0
9,10,2030117,100 RIDGE ST,100 RIDGE ST,2003/01/12 07:23:38+00,Intentional,"Chute/container - trash, rubbish, waste",Structure Fire,Cigarette,118.699997,...,0,0,0,1,0,0,0,0,0,0


In [130]:
# Count how often each FireAreaOfOrigin value occurs, most frequent first.
hfdf['FireAreaOfOrigin'].value_counts()

FireAreaOfOrigin
                                                      1082
Engine area, running gear, wheel area                  134
Undetermined                                            41
Bedroom - < 5 persons; included are jail or prison      37
Cooking area, kitchen                                   30
                                                      ... 
Courtyard, patio, porch, terrace                         1
Machinery room or area; elevator machinery room          1
Art gallery, exhibit hall, library                       1
Awning                                                   1
Open area, outside; included are farmland, field         1
Name: count, Length: 63, dtype: int64

In [131]:
# Count how often each HeatSource value occurs, most frequent first.
hfdf['HeatSource'].value_counts()

HeatSource
                                                          869
Undetermined                                              308
Heat from powered equipment, other                         77
Arcing                                                     59
Cigarette                                                  54
Radiated, conducted heat from operating equipment          33
Hot ember or ash                                           32
Hot or smoldering object, other                            24
Spark, ember, or flame from operating equipment            23
Heat source: other                                         17
Match                                                      15
Electrical arcing                                          14
Heat from undetermined smoking material                    13
Fireworks                                                  10
Heat, spark from friction                                   9
Heat from other open flame or smoking materials            

In [132]:
# Count how often each FireType value occurs, most frequent first.
hfdf['FireType'].value_counts()

FireType
Structure Fire                                       455
Natural vegetation fire                              438
Mobile property (vehicle) fire                       244
Outside rubbish fire                                 126
Special outside fire                                  59
Building fire                                         48
Cooking fire, confined to container                   47
Brush or brush-and-grass mixture fire                 40
Fire, other                                           32
Outside rubbish, trash or waste fire                  27
Passenger vehicle fire                                13
Natural vegetation fire, other                        11
Cultivated vegetation, crop fire                      11
Dumpster or other outside trash receptacle fire       11
Chimney or flue fire, confined to chimney or flue      7
Fire in mobile property used as a fixed structure      7
Outside rubbish fire, other                            7
Trash or rubbish fire,

In [133]:
# One-hot encode FireType into integer indicator columns prefixed with "FireType".
one_hot_encoded_2 = pd.get_dummies(hfdf['FireType'], prefix='FireType', dtype=int)

# Append the indicators to the incident frame. Columns left over from an
# earlier execution of this cell are dropped first (a no-op on the first run),
# so re-running the cell does not duplicate the FireType_* columns.
hfdf = pd.concat(
    [hfdf.drop(columns=one_hot_encoded_2.columns, errors='ignore'), one_hot_encoded_2],
    axis=1,
)

In [134]:
# Disabled exploratory plot:
#hfdf['StreetName'].value_counts().plot(kind='bar')

# Identifier, address, timestamp, coordinate and raw categorical columns that
# are removed before modelling; the remaining columns form hfdf2.
dropped_columns = [
    'RecordID',
    'Incident_ID',
    'AddressRaw',
    'AddressStandardized',
    'AlarmDateTimeAmericaNY',
    'Latitude',
    'Longitude',
    'StreetName',
    'StreetNumber',
    'StreetPostDirection',
    'StreetPredirection',
    'StreetSuffix',
    'UnitFirstArrive',
    'CauseOfIgnition',
    'FireAreaOfOrigin',
    'FireType',
    'HeatSource',
    'ItemFirstIgnited',
    'PropertyUse',
]
hfdf2 = hfdf.drop(columns=dropped_columns)

In [135]:
# Preview the first rows of the reduced modelling frame.
hfdf2.head()

Unnamed: 0,IncidentDurationMinutes,PropertyLossDollars,UnitFirstResponseTimeMinutes,Year,ZipCode,Cause_,Cause_Act of nature,Cause_Albemarle County in charge of investigation,Cause_Cause under investigation,Cause_Cause undetermined after investigation,...,FireType_Outside equipment fire,FireType_Outside rubbish fire,"FireType_Outside rubbish fire, other","FireType_Outside rubbish, trash or waste fire",FireType_Passenger vehicle fire,FireType_Road freight or transport vehicle fire,FireType_Special outside fire,"FireType_Special outside fire, other",FireType_Structure Fire,"FireType_Trash or rubbish fire, contained"
0,17.299999,0,3.5,2002,22901,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,258.700012,40000,3.2,2002,22901,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,9.4,0,0.05,2002,22903,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,24.9,0,1.6,2002,22902,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,98.800003,15000,5.93,2002,22901,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [138]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# The label is the incident zip code; every other column of hfdf2 is a feature.
target_col = 'ZipCode'
y = hfdf2[target_col]
X = hfdf2.drop(columns=[target_col])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with library-default hyperparameters and a fixed seed.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Score the held-out rows by exact-match accuracy.
predictions = classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

f"Accuracy Score: {accuracy}"

'Accuracy Score: 0.32507739938080493'

In [140]:
# Pair each fitted feature name with its importance score. Both Series are
# unnamed, so the resulting frame has column 0 (name) and column 1 (importance).
feature_names = pd.Series(classifier.feature_names_in_)
feature_weights = pd.Series(classifier.feature_importances_)
weights_df = pd.concat([feature_names, feature_weights], axis=1)

In [141]:
# Rank features from most to least important (column 1 holds the importance score).
weights_df.sort_values(by=1,ascending=False)


Unnamed: 0,0,1
2,UnitFirstResponseTimeMinutes,0.284974
0,IncidentDurationMinutes,0.277937
3,Year,0.15364
1,PropertyLossDollars,0.084007
4,Cause_,0.014869
46,FireType_Structure Fire,0.014664
13,Cause_Failure of equipment or heat source,0.012933
34,FireType_Mobile property (vehicle) fire,0.011678
7,Cause_Cause under investigation,0.009852
36,FireType_Natural vegetation fire,0.009333
