In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd


data = gpd.read_file("./CaliFP.geojson", header=0)
data.info() #seeing what the data looks like


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 16446 entries, 0 to 16445
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   OBJECTID        16446 non-null  int64   
 1   YEAR_           16446 non-null  object  
 2   STATE           16446 non-null  object  
 3   AGENCY          16441 non-null  object  
 4   UNIT_ID         16429 non-null  object  
 5   FIRE_NAME       16435 non-null  object  
 6   INC_NUM         16110 non-null  object  
 7   ALARM_DATE      14821 non-null  object  
 8   CONT_DATE       8942 non-null   object  
 9   CAUSE           16422 non-null  float64 
 10  COMMENTS        14525 non-null  object  
 11  REPORT_AC       8478 non-null   float64 
 12  GIS_ACRES       16446 non-null  float64 
 13  C_METHOD        9101 non-null   float64 
 14  OBJECTIVE       16270 non-null  float64 
 15  FIRE_NUM        12620 non-null  object  
 16  COMPLEX_NAME    569 non-null    object  
 17  COMP

Remove fires not in california, before 1950, and less than 10 acres

In [26]:
data.set_index('STATE' , inplace = True) #run entire cell only once after reading from above cell

print('entries:', len(data)) #print how many current entries data frame has
data.drop(index = 'NV' , axis = 0, inplace = True) 
data.drop(index = 'OR' , axis = 0, inplace = True) 
data.drop(index = 'AZ' , axis = 0 , inplace = True)
print('entries:', len(data))

# data = data[data['STATE'] != 'AZ'] alternate method of cleaning
data.reset_index('STATE' , inplace = True)

data = data.loc[data["YEAR_"] > "1949"] #loc function allows for sorting without reindexing
print('entries:', len(data))
data = data.loc[data['GIS_ACRES'] > 10] #reassigns dataframe to the same dataframe w/ acres > 10
print('entries:', len(data))

entries: 16446
entries: 16369
entries: 16369
entries: 14183


All entries have year information

sort dataframe chronologically

In [27]:
data.set_index('YEAR_' , inplace = True)
data.sort_values(by = 'YEAR_' , ascending = True , inplace = True) #sort by year first, then sort by ALARM_DATE
data.reset_index('YEAR_' , inplace = True)

data['datetime'] = pd.to_datetime(data['ALARM_DATE'] , errors = 'coerce') #create new datetime column
data.set_index('datetime' , inplace = True) #set new column to index
data.sort_values(by='datetime', ascending = True, inplace = True) #sort values
data.reset_index('datetime' , inplace = True) #reset index to numbered list
data.drop('datetime' , axis = 1 , inplace = True) #delete the datetime column

print(len(data) , data.index)
data.tail

14183 RangeIndex(start=0, stop=14183, step=1)


<bound method NDFrame.tail of       YEAR_ STATE  OBJECTID AGENCY UNIT_ID       FIRE_NAME   INC_NUM  \
0      1950    CA     29610    USF     TNF       COLESMILL  00000000   
1      1950    CA     25540    CDF     LNU  WALTER TINDELL  00000000   
2      1950    CA     30496    CDF     BEU       ECHENIQUE  00000000   
3      1950    CA     26337    CDF     BDU           GOCKE  00000000   
4      1950    CA     30381    CCO     VNC    DEVILS GULCH  00000000   
...     ...   ...       ...    ...     ...             ...       ...   
14178  2018    CA     42295    CDF     RRU     SKYLINE LRA  00090869   
14179  2019    CA     42763    NPS     MNP            STAR  00013598   
14180  2019    CA     42632    USF     HTF        TAMARACK  00030272   
14181  2019    CA     42626    USF     TNF          BORDER  00030785   
14182  2019    CA     42532    CDF     FKU            YORK  00005720   

                      ALARM_DATE                  CONT_DATE  CAUSE  ...  \
0      1950-05-11T00:00:00+00:

We now want to drop the majority of columns in order to focus on location and time data.

In [28]:
columns_to_drop = ['STATE', 'OBJECTID',  'AGENCY', 'UNIT_ID',
       'INC_NUM',  'CONT_DATE', 'COMMENTS', 'REPORT_AC',
        'C_METHOD', 'OBJECTIVE', 'FIRE_NUM', 'COMPLEX_NAME',
       'COMPLEX_INCNUM', 'SHAPE_Length', 'SHAPE_Area']
data.drop(columns_to_drop , axis = 1 , inplace = True)
data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 14183 entries, 0 to 14182
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   YEAR_       14183 non-null  object  
 1   FIRE_NAME   14177 non-null  object  
 2   ALARM_DATE  12655 non-null  object  
 3   CAUSE       14169 non-null  float64 
 4   GIS_ACRES   14183 non-null  float64 
 5   geometry    14183 non-null  geometry
dtypes: float64(2), geometry(1), object(3)
memory usage: 665.0+ KB


Export cleaned data to a .geojson

In [29]:
data.to_file('Clean_FP.geojson', driver = 'GeoJSON' , encoding = 'utf-8')