In [144]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd

dataB = gpd.read_file('./fire21_2.gdb/')
dataB.info() #see what data we're working with

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 21688 entries, 0 to 21687
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   YEAR_           21620 non-null  object  
 1   STATE           21688 non-null  object  
 2   AGENCY          21683 non-null  object  
 3   UNIT_ID         21671 non-null  object  
 4   FIRE_NAME       21674 non-null  object  
 5   INC_NUM         21040 non-null  object  
 6   ALARM_DATE      16327 non-null  object  
 7   CONT_DATE       9022 non-null   object  
 8   CAUSE           21664 non-null  float64 
 9   COMMENTS        19764 non-null  object  
 10  REPORT_AC       9138 non-null   float64 
 11  GIS_ACRES       21688 non-null  float64 
 12  C_METHOD        9484 non-null   float64 
 13  OBJECTIVE       21492 non-null  float64 
 14  FIRE_NUM        17741 non-null  object  
 15  Shape_Length    21688 non-null  float64 
 16  Shape_Area      21688 non-null  float64 
 17  COMP

Convert geometry data to coordinates

In [145]:
dataB = dataB.set_crs(epsg = 3310)
dataB = dataB.to_crs(epsg = 4326)
dataB.geometry.bounds

Unnamed: 0,minx,miny,maxx,maxy
0,-121.355040,38.886106,-121.340169,38.890478
1,-121.387747,38.815755,-121.351426,38.837968
2,-121.338750,38.841884,-121.331711,38.845649
3,-121.275792,38.960930,-121.270471,38.963578
4,-121.302496,39.485433,-121.298675,39.487215
...,...,...,...,...
21683,-123.009418,40.846974,-123.003509,40.850995
21684,-119.569151,37.516216,-119.545889,37.529345
21685,-122.147823,40.401240,-122.136553,40.410468
21686,-123.056564,40.845766,-123.051091,40.850576


Remove fires not in California and before 1950

In [146]:
dataB.set_index('STATE' , inplace = True) #run entire cell only once after reading from above cell

print('entries:', len(dataB))
dataB.drop(index = 'NV' , axis = 0, inplace = True) 
dataB.drop(index = 'OR' , axis = 0, inplace = True) 
dataB.drop(index = 'AZ' , axis = 0 , inplace = True)
print('entries:', len(dataB))

# dataB = dataB[dataB['STATE'] != 'AZ'] alternate method of cleaning

dataB.reset_index('STATE' , inplace = True)


dataB = dataB.loc[dataB["YEAR_"] > "1949"] #loc function allows for sorting without reindexing
print('entries:', len(dataB))

entries: 21688
entries: 21593
entries: 16369


remove entries with missing year, missing fire name and less than 10 acres

In [147]:
print(len(dataB))
print(dataB['YEAR_'].isnull().sum()) #isna() is bool but python is 'ducklike' and can add with .sum(), this dataset has no missing year entries
dataB = dataB[dataB['YEAR_'].notna()] #reassigns dataframe to the same dataframe with column year with no missing year entries
missing_fire = dataB.loc[dataB['FIRE_NAME'].isnull()] #locates indexes where there is no fire name
dataB.drop(missing_fire.index , axis = 0 , inplace = True ) #drops indexes with missing names
print(len(dataB))
dataB = dataB.loc[dataB['GIS_ACRES'] > 10] #reassigns dataframe to the same dataframe w/ acres > 10
print(len(dataB))

16369
0
16358
14177


Sort Chronologically

In [148]:
dataB.set_index('YEAR_' , inplace = True)
dataB.sort_values(by = 'YEAR_' , ascending = True , inplace = True)
dataB.reset_index('YEAR_' , inplace = True)

dataB['datetime'] = pd.to_datetime(dataB['ALARM_DATE'] , errors = 'coerce') #create new datetime column
dataB.set_index('datetime' , inplace = True) #set new column to index
dataB.sort_values(by='datetime', ascending = True, inplace = True) #sort values
dataB.reset_index('datetime' , inplace = True) #reset index to numbered list
dataB.drop('datetime' , axis = 1 , inplace = True) #delete the datetime column

print(len(dataB) , dataB.index)
dataB.tail


14177 RangeIndex(start=0, stop=14177, step=1)


<bound method NDFrame.tail of       YEAR_ STATE AGENCY UNIT_ID       FIRE_NAME   INC_NUM  \
0      1950    CA    USF     TNF       COLESMILL  00000000   
1      1950    CA    CDF     LNU  WALTER TINDELL  00000000   
2      1950    CA    CDF     BEU       ECHENIQUE  00000000   
3      1950    CA    CCO     VNC    DEVILS GULCH  00000000   
4      1950    CA    CDF     BDU           GOCKE  00000000   
...     ...   ...    ...     ...             ...       ...   
14172  2018    CA    CDF     RRU     SKYLINE LRA  00090869   
14173  2019    CA    NPS     MNP            STAR  00013598   
14174  2019    CA    USF     HTF        TAMARACK  00030272   
14175  2019    CA    USF     TNF          BORDER  00030785   
14176  2019    CA    CDF     FKU            YORK  00005720   

                      ALARM_DATE                  CONT_DATE  CAUSE  \
0      1950-05-11T00:00:00+00:00  1950-05-29T00:00:00+00:00    5.0   
1      1950-05-29T00:00:00+00:00                       None   14.0   
2      1950-05-

We now want to drop the majority of columns in order to focus on location and time data.

In [149]:
columns_to_drop = ['STATE' , 'AGENCY' , 'UNIT_ID' , 'INC_NUM' , 'CONT_DATE' , 'REPORT_AC' , 'C_METHOD' , 'OBJECTIVE' , 'FIRE_NUM' , 'Shape_Length' , 'COMPLEX_NAME' , 'COMPLEX_INCNUM' , 'COMMENTS' , 'Shape_Area']
dataB.drop(columns_to_drop , axis = 1 , inplace = True)
dataB.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 14177 entries, 0 to 14176
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   YEAR_       14177 non-null  object  
 1   FIRE_NAME   14177 non-null  object  
 2   ALARM_DATE  12649 non-null  object  
 3   CAUSE       14163 non-null  float64 
 4   GIS_ACRES   14177 non-null  float64 
 5   geometry    14177 non-null  geometry
dtypes: float64(2), geometry(1), object(3)
memory usage: 664.7+ KB


Drop entries without a fire name

Export cleaned data to a .geojson

In [150]:
dataB.to_file('Clean_brush.geojson', driver = 'GeoJSON' , encoding = 'utf-8')