In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd

dataB = gpd.read_file('./fire21_2.gdb/')
dataB.columns

Index(['YEAR_', 'STATE', 'AGENCY', 'UNIT_ID', 'FIRE_NAME', 'INC_NUM',
       'ALARM_DATE', 'CONT_DATE', 'CAUSE', 'COMMENTS', 'REPORT_AC',
       'GIS_ACRES', 'C_METHOD', 'OBJECTIVE', 'FIRE_NUM', 'Shape_Length',
       'Shape_Area', 'COMPLEX_NAME', 'COMPLEX_INCNUM', 'geometry'],
      dtype='object')

In [21]:
dataB = dataB.set_crs(epsg = 3310)
dataB = dataB.to_crs(epsg = 4326)
dataB.geometry.bounds

Unnamed: 0,minx,miny,maxx,maxy
0,-121.355040,38.886106,-121.340169,38.890478
1,-121.387747,38.815755,-121.351426,38.837968
2,-121.338750,38.841884,-121.331711,38.845649
3,-121.275792,38.960930,-121.270471,38.963578
4,-121.302496,39.485433,-121.298675,39.487215
...,...,...,...,...
21683,-123.009418,40.846974,-123.003509,40.850995
21684,-119.569151,37.516216,-119.545889,37.529345
21685,-122.147823,40.401240,-122.136553,40.410468
21686,-123.056564,40.845766,-123.051091,40.850576


Remove fires not in California and before 1950

In [22]:
dataB.set_index('STATE' , inplace = True) #run entire cell only once after reading from above cell

print('entries:', len(dataB))
dataB.drop(index = 'NV' , axis = 0, inplace = True) 
dataB.drop(index = 'OR' , axis = 0, inplace = True) 
dataB.drop(index = 'AZ' , axis = 0 , inplace = True)
print('entries:', len(dataB))

# dataB = dataB[dataB['STATE'] != 'AZ'] alternate method of cleaning

dataB.reset_index('STATE' , inplace = True)


dataB = dataB.loc[dataB["YEAR_"] > "1949"] #loc function allows for sorting without reindexing
print('entries:', len(dataB))

entries: 21688
entries: 21593
entries: 16369


remove entries with missing year

In [23]:
print(len(dataB))
print(dataB['YEAR_'].isna().sum()) #isna() is bool but python is 'ducklike' and can add with .sum()
dataB = dataB[dataB['YEAR_'].notna()] #reassigns dataframe to the same datafram with column year with no missing year entries
print(len(dataB))

print(dataB[dataB['YEAR_'] == '1950']) #print all years at 1950

16369
0
16369
      STATE YEAR_ AGENCY UNIT_ID        FIRE_NAME   INC_NUM  \
1078     CA  1950    CDF     SNU      P.G.& E. #4  00000000   
1079     CA  1950    CDF     LMU    BAGGETT GULCH  00000000   
2237     CA  1950    CDF     MVU          CONEJOS  00000000   
2238     CA  1950    USF     CNF                   00000000   
2423     CA  1950    CDF     MVU  SYCAMORE CANYON  00000000   
...     ...   ...    ...     ...              ...       ...   
18617    CA  1950    NPS     KNP         GOAT MTN  00000000   
18618    CA  1950    NPS     KNP        EAGLE PEA  00000000   
18619    CA  1950    NPS     KNP        DEER COVE  00000000   
18620    CA  1950    NPS     KNP        LIGHTN CR  00000000   
18752    CA  1950    NPS     YNP                       None   

                      ALARM_DATE                  CONT_DATE  CAUSE  COMMENTS  \
1078   1950-09-03T00:00:00+00:00                       None   14.0             
1079   1950-06-02T00:00:00+00:00                       None    9.0   

Sort Chronologically

In [19]:
dataB.set_index('YEAR_' , inplace = True)
dataB.sort_values(by = 'YEAR_' , ascending = True , inplace = True)
dataB.reset_index('YEAR_' , inplace = True)

dataB['datetime'] = pd.to_datetime(dataB['ALARM_DATE'] , errors = 'coerce') #create new datetime column
dataB.set_index('datetime' , inplace = True) #set new column to index
dataB.sort_values(by='datetime', ascending = True, inplace = True) #sort values
dataB.reset_index('datetime' , inplace = True) #reset index to numbered list
dataB.drop('datetime' , axis = 1 , inplace = True) #delete the datetime column

print(len(dataB) , dataB.index)
dataB.tail

# for i in range(8):
#     print(dataB[dataB.index == i*1000].ALARM_DATE)

16369 RangeIndex(start=0, stop=16369, step=1)


<bound method NDFrame.tail of       YEAR_ STATE AGENCY UNIT_ID       FIRE_NAME   INC_NUM  \
0      1950    CA    USF     TNF       COLESMILL  00000000   
1      1950    CA    CDF     LNU  WALTER TINDELL  00000000   
2      1950    CA    CDF     BEU       ECHENIQUE  00000000   
3      1950    CA    CCO     VNC    DEVILS GULCH  00000000   
4      1950    CA    CDF     BDU           GOCKE  00000000   
...     ...   ...    ...     ...             ...       ...   
16364  2019    CA    USF     TNF          BORDER  00030785   
16365  2019    CA    USF     INF           KELTY  00001465   
16366  2019    CA    CCO     ORC    BLUE DIAMOND  19059616   
16367  2019    CA    USF     HTF        TAMARACK  00030272   
16368  2019    CA    CDF     FKU            YORK  00005720   

                      ALARM_DATE                  CONT_DATE  CAUSE  \
0      1950-05-11T00:00:00+00:00  1950-05-29T00:00:00+00:00    5.0   
1      1950-05-29T00:00:00+00:00                       None   14.0   
2      1950-05-