In [1]:
import boto3
import pandas as pd
import geopandas as gpd
from io import StringIO

In [2]:
s3 = boto3.client('s3')
S3_BUCKET = 'usgswildfires'
FIRE_DATA_KEY = 'clean_fpa_fod.csv'

In [3]:
YEAR = 2018

In [4]:
query = f"""SELECT FIRE_DATE, \"FIRE_CONT_DATE\", \"GENERAL_CAUSE\", \"SPECIFIC_CAUSE\",
                  \"FIRE_SIZE\", \"FIRE_SIZE_CLASS\", \"STATE\", \"geometry\"
           FROM s3object 
           WHERE \"YEAR\" = '{YEAR}'"""

#### using `OutputSerialization = {'CSV': {}}`

In [5]:
resp = s3.select_object_content(
    Bucket=S3_BUCKET,
    Key=FIRE_DATA_KEY,
    ExpressionType='SQL',
    Expression=query,
    InputSerialization = {'CSV': {"FileHeaderInfo": "Use"}},
    OutputSerialization = {'CSV': {}},
)

records = [] 
for event in resp['Payload']:
    if 'Records' in event:
        records.append(event['Records']['Payload'])
                       
# converting the byte strings to strings and then joining them together
# to form one large string
file_str = ''.join(r.decode('utf-8') for r in records)
# doing StringIO(file_str) so it looks like CSV file to pd.read_csv()
select_df = pd.read_csv(StringIO(file_str), 
                   names=['FIRE_DATE', 'FIRE_CONT_DATE', 'GENERAL_CAUSE', 'SPECIFIC_CAUSE', 'FIRE_SIZE', 
                          'FIRE_SIZE_CLASS', 'STATE', 'geometry'])
select_df.head()

Unnamed: 0,FIRE_DATE,FIRE_CONT_DATE,GENERAL_CAUSE,SPECIFIC_CAUSE,FIRE_SIZE,FIRE_SIZE_CLASS,STATE,geometry
0,2018-01-01 00:00:00+00:00,2018-01-01 00:00:00+00:00,Human,Missing data/not specified/undetermined,0.1,A,AZ,POINT (-112.3133315 33.4484799)
1,2018-01-01 00:00:00+00:00,,Missing data/not specified/undetermined,Missing data/not specified/undetermined,1.0,B,AR,POINT (-92.26694000000001 34.96528)
2,2018-01-01 00:00:00+00:00,,Missing data/not specified/undetermined,Missing data/not specified/undetermined,3.0,B,AR,POINT (-92.87313899999999 35.657854)
3,2018-01-01 00:00:00+00:00,,Human,Missing data/not specified/undetermined,2.0,B,VA,POINT (-79.10916666999999 38.18472222)
4,2018-01-01 00:00:00+00:00,2018-01-01 00:00:00+00:00,Human,Debris and open burning,0.01,A,CA,POINT (-121.793273 39.704699)


In [6]:
select_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80863 entries, 0 to 80862
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   FIRE_DATE        80863 non-null  object 
 1   FIRE_CONT_DATE   62106 non-null  object 
 2   GENERAL_CAUSE    80863 non-null  object 
 3   SPECIFIC_CAUSE   80863 non-null  object 
 4   FIRE_SIZE        80863 non-null  float64
 5   FIRE_SIZE_CLASS  80863 non-null  object 
 6   STATE            80863 non-null  object 
 7   geometry         80863 non-null  object 
dtypes: float64(1), object(7)
memory usage: 39.6 MB


#### optimize memory usage 

In [7]:
# fix DATE columns
select_df['FIRE_DATE'] = pd.to_datetime(select_df['FIRE_DATE'])
select_df['FIRE_CONT_DATE'] = pd.to_datetime(select_df['FIRE_CONT_DATE'])

# fix CATEGORICAL columns
for col in ['GENERAL_CAUSE', 'SPECIFIC_CAUSE', 'FIRE_SIZE_CLASS', 'STATE']:
    select_df[col] = select_df[col].astype('category')
    
# fix NUMERIC columns 
select_df['FIRE_SIZE'] = pd.to_numeric(select_df['FIRE_SIZE'], downcast='unsigned')
# select_df['LON'] = pd.to_numeric(select_df['LON'], downcast='float')
# select_df['LAT'] = pd.to_numeric(select_df['LAT'], downcast='float')

# fix GEOMETRY column
select_df['geometry'] = gpd.GeoSeries.from_wkt(select_df['geometry'])
# select_geo_df = gpd.GeoDataFrame(select_df, geometry='geometry')

# make DATETIME index
select_df = select_df.set_index('FIRE_DATE').sort_index()

In [8]:
select_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 80863 entries, 2018-01-01 00:00:00+00:00 to 2018-12-31 23:59:00+00:00
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   FIRE_CONT_DATE   62106 non-null  datetime64[ns, UTC]
 1   GENERAL_CAUSE    80863 non-null  category           
 2   SPECIFIC_CAUSE   80863 non-null  category           
 3   FIRE_SIZE        80863 non-null  float64            
 4   FIRE_SIZE_CLASS  80863 non-null  category           
 5   STATE            80863 non-null  category           
 6   geometry         80863 non-null  geometry           
dtypes: category(4), datetime64[ns, UTC](1), float64(1), geometry(1)
memory usage: 2.8 MB


#### Done! Now we convert the above code to a function that can be called from within `streamlit_app.py` to query data based on user input