In [1]:
import pandas as pd
import xarray as xr
import numpy as np
import scipy.interpolate
from netCDF4 import Dataset
import bottleneck
import pyarrow
import scipy.ndimage as ndimage
import geopandas

import matplotlib.pyplot as plt

# Reading in the Data

In [2]:
# Load the merged dataset (produced by a separate notebook run in the cloud)
# from a parquet file, reading it with the pyarrow engine.
data = pd.read_parquet('../../finalproj_data/ds_df.parquet', engine = 'pyarrow')

In [3]:
# Dimensions of the merged frame as (rows, columns).
data.shape

(1034376, 50)

In [4]:
# Preview the first rows of the merged frame.
data.head()

Unnamed: 0,lat,lon,firemask,MaxFRP,gaugeQualityInfo,hourlyPrecipRate,hourlyPrecipRateGC,observationTimeFlag,satelliteInfoFlag,Albedo_inst,...,Swnet_tavg,Tair_f_inst,Tveg_tavg,Wind_f_inst,BurnDate,Uncertainty,QA,FirstDay,LastDay,FWI
0,-4.98565,106.36053,3.0,,0.0,0.0,0.415,-1.0,0.0,,...,,,,,,,0.0,,,
1,-4.98565,106.405485,3.0,,0.0,0.0,0.415,-1.0,0.0,,...,,,,,,,0.0,,,
2,-4.98565,106.450441,3.0,,0.0,0.0,0.415,-1.0,0.0,,...,,,,,,,0.0,,,
3,-4.98565,106.495396,3.0,,0.0,0.0,0.415,-1.0,0.0,,...,,,,,,,0.0,,,
4,-4.98565,106.540352,3.0,,0.0,0.0,0.415,-1.0,0.0,,...,,,,,,,0.0,,,


In [5]:
# List every column name available in the merged frame.
data.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'gaugeQualityInfo',
       'hourlyPrecipRate', 'hourlyPrecipRateGC', 'observationTimeFlag',
       'satelliteInfoFlag', 'Albedo_inst', 'AvgSurfT_inst', 'CanopInt_inst',
       'ECanop_tavg', 'ESoil_tavg', 'Evap_tavg', 'LWdown_f_tavg', 'Lwnet_tavg',
       'PotEvap_tavg', 'Psurf_f_inst', 'Qair_f_inst', 'Qg_tavg', 'Qh_tavg',
       'Qle_tavg', 'Qs_acc', 'Qsb_acc', 'Qsm_acc', 'Rainf_f_tavg',
       'RootMoist_inst', 'SWE_inst', 'SWdown_f_tavg', 'SnowDepth_inst',
       'Snowf_tavg', 'SoilMoi100_200cm_inst', 'SoilMoi10_40cm_inst',
       'SoilMoi40_100cm_inst', 'SoilTMP0_10cm_inst', 'SoilMoi0_10cm_inst',
       'SoilTMP100_200cm_inst', 'SoilTMP10_40cm_inst', 'SoilTMP40_100cm_inst',
       'Swnet_tavg', 'Tair_f_inst', 'Tveg_tavg', 'Wind_f_inst', 'BurnDate',
       'Uncertainty', 'QA', 'FirstDay', 'LastDay', 'FWI'],
      dtype='object')

In [6]:
# Drop columns that were not deemed important features.
# NOTE(review): the list name suggests these columns scored below 1% on an
# importance measure computed elsewhere - confirm where the list comes from.
less_than_onepercent = [
    'gaugeQualityInfo',
    'observationTimeFlag',
    'satelliteInfoFlag',
    'ECanop_tavg',
    'Qg_tavg',
    'Qh_tavg',
    'Qs_acc',
    'Qsm_acc',
    'SWE_inst',
    'SWdown_f_tavg',
    'SnowDepth_inst',
    'Snowf_tavg',
    'SoilMoi40_100cm_inst',
    'SoilTMP10_40cm_inst',
    'Tveg_tavg',
    'Uncertainty',
]

# Rebind rather than mutate in place, and skip columns that are already gone,
# so re-running this cell does not raise a KeyError.
data = data.drop(columns=less_than_onepercent, errors='ignore')

In [35]:
# Summary statistics for FWI on `data`, i.e. before any null filling is applied
# (the fill further down is done on the separate frame `d`).
data['FWI'].describe()

count    386189.000000
mean         58.663441
std          25.820683
min           0.010949
25%          45.882980
50%          67.859268
75%          78.003860
max          96.014587
Name: FWI, dtype: float64

# Exploring the Data

In [21]:
# Cast every column to float and keep the result under a new name, `d`;
# all labelling below is done on `d`.
d = data.astype('float')

In [22]:
# Frequency of each firemask code. Nulls are not counted here; they show up as
# 0.0 in the label counts once the nulls have been filled with 0.
d['firemask'].value_counts()

3.0    360887
5.0    339477
4.0     87932
9.0        81
8.0        57
6.0        19
7.0        16
Name: firemask, dtype: int64

In [23]:
# d.info()

In [24]:
# MAJOR ASSUMPTION: Fill all nulls with 0
# Later cells depend on this fill: a firemask of 0 is labelled 'unknown',
# rows with FWI == 0 are treated as water, and the burned rule requires
# BurnDate > 0. A missing value and a genuine zero are indistinguishable
# after this step.
d.fillna(0, inplace = True)

# Create Target Categories

In [40]:
# Start with all the values in FireMask
# Start with all the values in FireMask.
# Cast to object first: the following cells write string categories into this
# column, and assigning strings into a float64 column triggers pandas'
# incompatible-dtype deprecation (a FutureWarning since pandas 2.1, slated to
# become an error).
d['label'] = d['firemask'].astype(object)

In [41]:
# Label distribution right after copying firemask; the 0.0 bucket holds the
# rows whose firemask was null before the fill.
d['label'].value_counts()

3.0    360887
5.0    339477
0.0    245907
4.0     87932
9.0        81
8.0        57
6.0        19
7.0        16
Name: label, dtype: int64

In [42]:
# Translate FireMask codes into categorical labels.
# FireMask legend:
#   3    = water, non-fire
#   4    = cloud (land or water)
#   5    = land, non-fire
#   6    = unknown (land or water)
#   7    = low confidence fire pixel - classified as land
#   8, 9 = fire (nominal and high confidence fire pixels) - classified as fire

d.loc[d['firemask'].eq(3), 'label'] = 'water'
d.loc[d['firemask'].isin([5, 7]), 'label'] = 'land'
d.loc[d['firemask'].isin([8, 9]), 'label'] = 'active_fire'

# Codes 0, 4 and 6 say nothing about land vs. water yet; park them as
# 'unknown' so a later cell can resolve them.
d.loc[d['firemask'].isin([0, 4, 6]), 'label'] = 'unknown'

In [43]:
# Mark every pixel with a recorded burn date (0 < BurnDate < 344) as burned.
# This rule does not look at the current label, so it overrides whatever
# label the firemask mapping assigned to those rows.
# NOTE(review): the original comment called day 344 "December 21", but
# day-of-year 344 is December 10 (December 9 in a leap year); December 21 is
# day 355. Confirm the intended cutoff. Given the max BurnDate noted below,
# either cutoff selects the same rows in this dataset.
d.loc[(d['BurnDate'] < 344) & (d['BurnDate'] > 0), 'label'] = 'burned'

# Note: Max value of BurnDate is day 334

In [44]:
# Using the other datasets, let's determine whether unknown pixels are land or water

# If there is a FWI value for the pixel, it is land
d.loc[(d['label']=='unknown') & (d['FWI'] > 0), 'label'] = 'land'

# If there is no (zero) FWI value, it is water.
# Because nulls were filled with 0 earlier, FWI == 0 here also covers rows
# whose FWI was missing, so any land pixel without an FWI value is labelled
# water by this rule. A negative FWI would stay 'unknown'.
d.loc[(d['label']=='unknown') & (d['FWI'] == 0), 'label'] = 'water'

In [45]:
# Check the label distribution; no 'unknown' rows should remain at this point.
d['label'].value_counts()

water          661027
land           370977
burned           2234
active_fire       138
Name: label, dtype: int64

In [46]:
# For "land" areas, create consolidated categories from the numerical FWI,
# based on the ranges appropriate for Australia.
# Only one category is split out: land pixels with FWI > 64.
# NOTE(review): the original comment said the "extreme" category was used,
# but the label written is 'fwi_veryhigh' - confirm which band the 64
# threshold corresponds to in the cited table.

d.loc[(d['FWI'] > 64) & (d['label'] == 'land'), 'label'] = 'fwi_veryhigh'

Source: <https://www.bushfirecrc.com/sites/default/files/managed/resource/dowdy_and_mills-fwi.pdf>

![fire_thresholds.png](attachment:fire_thresholds.png)

In [47]:
# Label distribution after splitting high-FWI pixels out of 'land'.
d['label'].value_counts()

water           661027
fwi_veryhigh    217468
land            153509
burned            2234
active_fire        138
Name: label, dtype: int64

# Drop columns that were used to create labels

In [65]:
# List the columns still present, to pick out the ones to drop next.
d.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'hourlyPrecipRate',
       'hourlyPrecipRateGC', 'Albedo_inst', 'AvgSurfT_inst', 'CanopInt_inst',
       'ESoil_tavg', 'Evap_tavg', 'LWdown_f_tavg', 'Lwnet_tavg',
       'PotEvap_tavg', 'Psurf_f_inst', 'Qair_f_inst', 'Qle_tavg', 'Qsb_acc',
       'Rainf_f_tavg', 'RootMoist_inst', 'SoilMoi100_200cm_inst',
       'SoilMoi10_40cm_inst', 'SoilTMP0_10cm_inst', 'SoilMoi0_10cm_inst',
       'SoilTMP100_200cm_inst', 'SoilTMP40_100cm_inst', 'Swnet_tavg',
       'Tair_f_inst', 'Wind_f_inst', 'BurnDate', 'QA', 'FirstDay', 'LastDay',
       'FWI', 'label'],
      dtype='object')

In [66]:
# Build the modelling frame by removing the columns tied to the label.
# 'firemask', 'BurnDate' and 'FWI' feed the labelling rules directly.
# NOTE(review): 'MaxFRP', 'FirstDay' and 'LastDay' are not referenced by the
# labelling cells; presumably they are dropped to avoid leaking fire/burn
# information - confirm.
data_v3 = d.drop(
    columns=['firemask', 'MaxFRP', 'BurnDate', 'FirstDay', 'LastDay', 'FWI'],
)

### Check that data is aligned properly

In [67]:
# Export the cleaned frame as a parquet file.
# NOTE(review): the original comment described this as a "gzip" file, but no
# compression argument is passed, so pandas' default codec is used (snappy
# per the pandas docs). Pass compression='gzip' if gzip is actually required.
data_v3.to_parquet('../../finalproj_data/input_data/data_v3.parquet')

In [68]:
# Final label distribution of the exported frame.
# NOTE(review): the output saved with this cell lists 'fwi_extreme' and
# 'fwi_high', which no cell in this notebook produces (the only FWI label
# written is 'fwi_veryhigh'). The saved output is from an earlier version of
# the labelling; restart the kernel and run all cells to refresh it.
data_v3['label'].value_counts()

water          661027
fwi_extreme    326049
land            35977
fwi_high         8951
burned           2234
active_fire       138
Name: label, dtype: int64