In [1]:
# dependencies
from pathlib import Path

import pandas as pd

# import the fire data csv
fireFile = "./Data/CA_Fires.csv"

# source column name -> short name used in the rest of the notebook
# (inspect fireData.columns after loading to see everything the file offers)
columnNames = {
    "incident_name": "Name",
    "incident_county": "County",
    "incident_latitude": "Lat",
    "incident_longitude": "Long",
    "incident_acres_burned": "Acres Burned",
    "incident_id": "ID",
    "incident_dateonly_created": "Date Started",
    "incident_dateonly_extinguished": "Date Extinguished",
}

# read the file and store in a data frame
fireData = pd.read_csv(fireFile)

# remove extraneous columns and rename the ones that are kept
fireData = fireData[list(columnNames)].rename(columns=columnNames)

# make sure there are no incident repeats, keeping the first row per ID.
# drop_duplicates returns a new frame, so the result has to be assigned;
# calling it without assignment leaves the duplicates in place.
# NOTE(review): rows with a missing ID compare as equal to each other and
# would collapse to a single row — confirm the source has no blank IDs.
fireData = fireData.drop_duplicates(subset=["ID"])

# the ID has served its purpose, so keep everything except it
fireData = fireData[["Name","County","Lat","Long","Acres Burned",
                     "Date Started","Date Extinguished"]]

In [2]:
# create a column that contains the duration of each fire
# first convert the date columns to datetime
dateStarted = pd.to_datetime(fireData["Date Started"])
dateExtinguished = pd.to_datetime(fireData["Date Extinguished"])

# subtract the two dates and take the whole-day component directly.
# .dt.days yields NaN where either date is missing, so there is no need to
# round-trip through strings (which depends on how pandas prints timedeltas).
# astype(float) keeps the column float even when no value is missing.
fireData["Duration (Days)"] = (dateExtinguished - dateStarted).dt.days.astype(float)

# create a column that holds the year of each start date
fireData["Year"] = dateStarted.dt.year

# remove the date started and date extinguished columns
fireData = fireData[["Name","Year","County","Lat","Long","Acres Burned","Duration (Days)"]]

In [5]:
# keep only fires whose start year falls in 2013 through 2020 (both ends
# included), then renumber the rows from zero and discard the old index
startYear = fireData["Year"]
fireData = fireData[(startYear >= 2013) & (startYear <= 2020)].reset_index(drop=True)

In [9]:
# build one frame per measure; each keeps the identifying columns plus its
# own measure, and drops every row that has a NaN in any of those columns
fireDamage = fireData.loc[
    :, ["Name","Year","County","Lat","Long","Acres Burned"]
].dropna()

fireDuration = fireData.loc[
    :, ["Name","Year","County","Lat","Long","Duration (Days)"]
].dropna()

In [18]:
# groupby year and county and sum for each variable.
# The DataFrame conversion is assigned back: a bare pd.DataFrame(...) in the
# middle of a cell is neither stored nor displayed, so it had no effect.
fireDamageCounty = pd.DataFrame(
    fireDamage.groupby(["Year","County"])["Acres Burned"].sum()
)

fireDurationCounty = pd.DataFrame(
    fireDuration.groupby(["Year","County"])["Duration (Days)"].sum()
)

# last expression of the cell: shows the per-county duration totals
fireDurationCounty

Unnamed: 0_level_0,Unnamed: 1_level_0,Duration (Days)
Year,County,Unnamed: 2_level_1
2013,Alameda,0.0
2013,Amador,1.0
2013,Butte,11.0
2013,Calaveras,3.0
2013,Contra Costa,6.0
...,...,...
2020,Trinity,33.0
2020,Tulare,11.0
2020,Tuolumne,26.0
2020,Ventura,10.0


In [19]:
# export as csvs
# create the output folder first so the notebook runs top to bottom on a
# fresh checkout; to_csv does not create missing directories itself
cleanDir = Path("./Clean Data")
cleanDir.mkdir(parents=True, exist_ok=True)

# the index is written as the first column of each file (pandas default)
fireData.to_csv(cleanDir / "FireData_Clean.csv")
fireDamage.to_csv(cleanDir / "FireDamage_Clean.csv")
fireDamageCounty.to_csv(cleanDir / "FireDamageCounty_Clean.csv")
fireDuration.to_csv(cleanDir / "FireDuration_Clean.csv")
fireDurationCounty.to_csv(cleanDir / "FireDurationCounty_Clean.csv")