In [1]:
import pandas as pd
import xarray as xr
import numpy as np
import scipy.interpolate
from netCDF4 import Dataset
import bottleneck
import pyarrow
import scipy.ndimage as ndimage

import matplotlib.pyplot as plt

# Reading in the Data

In [2]:
# Load the merged dataset that was produced by the cloud notebook
merged_parquet_path = '../../finalproj_data/ds_df.parquet'
data = pd.read_parquet(merged_parquet_path, engine='pyarrow')

In [3]:
# Preview the first five rows of the merged frame
data.head(n=5)

Unnamed: 0,lat,lon,firemask,MaxFRP,EVI,EVI2,NVDI,NIR_reflectance,SWIR1_reflectance,SWIR2_reflectance,...,Swnet_tavg,Tair_f_inst,Tveg_tavg,Wind_f_inst,BurnDate,Uncertainty,QA,FirstDay,LastDay,FWI
0,-4.98565,106.36053,3.0,,,,,,,,...,,,,,,,0.0,,,
1,-4.98565,106.405485,3.0,,,,,,,,...,,,,,,,0.0,,,
2,-4.98565,106.450441,3.0,,,,,,,,...,,,,,,,0.0,,,
3,-4.98565,106.495396,3.0,,,,,,,,...,,,,,,,0.0,,,
4,-4.98565,106.540352,3.0,,,,,,,,...,,,,,,,0.0,,,


In [6]:
# List every column available in the merged frame
data.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'EVI', 'EVI2', 'NVDI',
       'NIR_reflectance', 'SWIR1_reflectance', 'SWIR2_reflectance',
       'SWIR3_reflectance', 'VI_Quality', 'blue_reflectance',
       'composite_day_of_the_year', 'green_reflectance', 'pixel_reliability',
       'red_reflectance', 'relative_azimuth_angle', 'sun_zenith_angle',
       'view_zenith_angle', 'gaugeQualityInfo', 'hourlyPrecipRate',
       'hourlyPrecipRateGC', 'observationTimeFlag', 'satelliteInfoFlag',
       'Albedo_inst', 'AvgSurfT_inst', 'CanopInt_inst', 'ECanop_tavg',
       'ESoil_tavg', 'Evap_tavg', 'LWdown_f_tavg', 'Lwnet_tavg',
       'PotEvap_tavg', 'Psurf_f_inst', 'Qair_f_inst', 'Qg_tavg', 'Qh_tavg',
       'Qle_tavg', 'Qs_acc', 'Qsb_acc', 'Qsm_acc', 'Rainf_f_tavg',
       'RootMoist_inst', 'SWE_inst', 'SWdown_f_tavg', 'SnowDepth_inst',
       'Snowf_tavg', 'SoilMoi100_200cm_inst', 'SoilMoi10_40cm_inst',
       'SoilMoi40_100cm_inst', 'SoilTMP0_10cm_inst', 'SoilMoi0_10cm_inst',
       'SoilTMP100

# Exploring the Data

In [7]:
# Count pixels per firemask class; null firemask values are excluded from the tally
data['firemask'].value_counts(dropna=True)

3.0    360887
5.0    339477
4.0     87932
9.0        81
8.0        57
6.0        19
7.0        16
Name: firemask, dtype: int64

In [8]:
# Work on an all-float copy of the raw frame; `data` itself is left as loaded
d = data.astype(float)

In [9]:
# Show the dtype and non-null count of every column to see how sparse each one is
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1034376 entries, 0 to 1034375
Data columns (total 66 columns):
lat                          1034376 non-null float64
lon                          1034376 non-null float64
firemask                     788469 non-null float64
MaxFRP                       510 non-null float64
EVI                          221147 non-null float64
EVI2                         221147 non-null float64
NVDI                         221147 non-null float64
NIR_reflectance              221147 non-null float64
SWIR1_reflectance            221147 non-null float64
SWIR2_reflectance            221147 non-null float64
SWIR3_reflectance            221147 non-null float64
VI_Quality                   743022 non-null float64
blue_reflectance             221147 non-null float64
composite_day_of_the_year    221147 non-null float64
green_reflectance            221146 non-null float64
pixel_reliability            770573 non-null float64
red_reflectance              215981 non-

In [10]:
# MAJOR ASSUMPTION: fill all nulls with 0.
# After this step a missing value can no longer be told apart from a measured 0
# in any column (e.g. a null firemask becomes class 0 and a null FWI becomes 0),
# so every later threshold test treats "missing" as "zero".
# `d` is rebound rather than mutated with inplace=True so the cell stays chainable.
d = d.fillna(0)

# Create Target Categories

In [11]:
# Seed the target column with the numeric firemask classes; the following
# cells overwrite these numbers with category names
d['label'] = d['firemask'].copy()

In [12]:
# Class balance of the freshly seeded label column (still numeric firemask codes)
d['label'].value_counts()

3.0    360887
5.0    339477
0.0    245907
4.0     87932
9.0        81
8.0        57
6.0        19
7.0        16
Name: label, dtype: int64

In [15]:
# Translate the numeric FireMask classes into categorical labels.
# FireMask classes:
#   3 = water, non-fire
#   4 = cloud (land or water)
#   5 = land, non-fire
#   6 = unknown (land or water)
#   7, 8, 9 = fire (low, nominal and high confidence)
# Only classes 3, 5 and 7-9 are relabelled here; every other value
# (4, 6 and the zero-filled nulls) keeps its numeric code in `label`.
# Every mask is built from `d`, so this cell does not depend on the raw
# `data` frame or on the two frames sharing a row index.
firemask = d['firemask']

d.loc[firemask == 3, 'label'] = 'water'
d.loc[firemask == 5, 'label'] = 'land'
d.loc[firemask.isin([7, 8, 9]), 'label'] = 'active_fire'

In [16]:
# Relabel pixels from `LastDay` (day of year of the last burn) relative to the
# two days, 344-345, that this notebook treats as the active-fire window.
# NOTE(review): an earlier comment called day 344 "December 21"; day 344 is
# December 10 in a non-leap year (December 9 in a leap year) - confirm the date.
WINDOW_FIRST_DAY = 344
WINDOW_LAST_DAY = 345
last_burn_day = d['LastDay']

# Burn ended before the window: mark as burned.
# The test must be `< WINDOW_FIRST_DAY`; with `>` every matching row is
# overwritten by the two statements below and 'burned' is never assigned.
# `> 0` skips rows without a burn record, because nulls were zero-filled.
d.loc[(last_burn_day > 0) & (last_burn_day < WINDOW_FIRST_DAY), 'label'] = 'burned'

# Burn ended on day 344 or 345: mark as active fire
d.loc[(last_burn_day >= WINDOW_FIRST_DAY) & (last_burn_day <= WINDOW_LAST_DAY), 'label'] = 'active_fire'

# Burn ended after day 345 (day 346 or later): mark as land.
# The burn columns are dropped before export, so these later burns
# will not distort the time dimension of the dataset.
d.loc[last_burn_day > WINDOW_LAST_DAY, 'label'] = 'land'

In [18]:
# Class balance after the firemask and LastDay relabelling steps
d['label'].value_counts()

land           362741
water          356951
0.0            242245
4.0             65030
5.0              4902
active_fire      2505
6.0                 2
Name: label, dtype: int64

In [22]:
# Split the generic "land" label into FWI classes.
# Each entry is (label, inclusive lower bound, exclusive upper bound);
# None means the class is open-ended on that side.
fwi_classes = [
    ('fwi_low', None, 5),
    ('fwi_moderate', 5, 8),
    ('fwi_high', 8, 16),
    ('fwi_veryhigh', 16, 29),
    ('fwi_extreme', 29, None),
]

for class_name, lower_bound, upper_bound in fwi_classes:
    # Only rows that are still labelled "land" are eligible for an FWI class
    in_class = d['label'] == 'land'
    if lower_bound is not None:
        in_class = in_class & (d['FWI'] >= lower_bound)
    if upper_bound is not None:
        in_class = in_class & (d['FWI'] < upper_bound)
    d.loc[in_class, 'label'] = class_name

In [27]:
# For rows whose label is still a numeric firemask code (5 = land, 4 = cloud,
# 6 = unknown), replace the code with that row's FWI value.
# NOTE(review): this writes raw FWI numbers into the otherwise categorical
# `label` column and does not touch zero-filled (0.0) labels, while the
# value_counts output further down lists only category names - confirm the
# intended cell order and whether these rows should be binned into FWI classes.
for firemask_code in (5, 4, 6):
    has_code = d['label'] == firemask_code
    d.loc[has_code, 'label'] = d.loc[has_code, 'FWI']

In [29]:

# Cloud pixels (firemask 4) that carry a positive BurnDate are labelled as burned
cloud_with_burn_date = d['BurnDate'].gt(0) & d['firemask'].eq(4)
d.loc[cloud_with_burn_date, 'label'] = 'burned'

In [30]:
# Class balance after all labelling rules have been applied
d['label'].value_counts()

fwi_low         582175
water           383783
fwi_extreme      30481
fwi_high         14398
fwi_veryhigh     13382
fwi_moderate      7733
burned            2231
active_fire        193
Name: label, dtype: int64

In [32]:
# Sanity check: list any rows that never received a label
# (still 0 from the zero-fill, or null); an empty frame means none are left
is_unlabeled = d['label'].eq(0) | d['label'].isna()
d[is_unlabeled]

Unnamed: 0,lat,lon,firemask,MaxFRP,EVI,EVI2,NVDI,NIR_reflectance,SWIR1_reflectance,SWIR2_reflectance,...,Tair_f_inst,Tveg_tavg,Wind_f_inst,BurnDate,Uncertainty,QA,FirstDay,LastDay,FWI,label


In [247]:
# # Create categorical values for burned pixels, only for areas that are not active fire
# # burned
# # 0.0 = unburned
# # -2.0 = water

# data.loc[(data['burned'] != 0.) & (data['burned'] != -2.) & (data['label'] != 'active_fire'), 'label'] = 'burned'

In [251]:
# # for missing data from viirs - i.e. cloud and unknown pixels, fill in missing information from categories in burned dataset

# # for cloud pixels...
# # where 'unburned', code as land
# data.loc[(data['label'] == 4) & (data['burned'] == 0), 'label'] = 'land'
# # where 'water', code as water
# data.loc[(data['label'] == 4) & (data['burned'] == -2.), 'label'] = 'water'                              
                                 

# # for unknown pixels...
# # where unburned, code as land
# data.loc[(data['label'] == 6) & (data['burned'] == 0), 'label'] = 'land'
# # where 'water', code as water
# data.loc[(data['label'] == 6) & (data['burned'] == -2.), 'label'] = 'water'    

# Drop columns that were used to create labels

In [33]:
# List the columns again to pick out the ones that were used to build `label`
d.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'EVI', 'EVI2', 'NVDI',
       'NIR_reflectance', 'SWIR1_reflectance', 'SWIR2_reflectance',
       'SWIR3_reflectance', 'VI_Quality', 'blue_reflectance',
       'composite_day_of_the_year', 'green_reflectance', 'pixel_reliability',
       'red_reflectance', 'relative_azimuth_angle', 'sun_zenith_angle',
       'view_zenith_angle', 'gaugeQualityInfo', 'hourlyPrecipRate',
       'hourlyPrecipRateGC', 'observationTimeFlag', 'satelliteInfoFlag',
       'Albedo_inst', 'AvgSurfT_inst', 'CanopInt_inst', 'ECanop_tavg',
       'ESoil_tavg', 'Evap_tavg', 'LWdown_f_tavg', 'Lwnet_tavg',
       'PotEvap_tavg', 'Psurf_f_inst', 'Qair_f_inst', 'Qg_tavg', 'Qh_tavg',
       'Qle_tavg', 'Qs_acc', 'Qsb_acc', 'Qsm_acc', 'Rainf_f_tavg',
       'RootMoist_inst', 'SWE_inst', 'SWdown_f_tavg', 'SnowDepth_inst',
       'Snowf_tavg', 'SoilMoi100_200cm_inst', 'SoilMoi10_40cm_inst',
       'SoilMoi40_100cm_inst', 'SoilTMP0_10cm_inst', 'SoilMoi0_10cm_inst',
       'SoilTMP100

In [34]:
# Build the dataset for the first model run: drop the columns the labels were
# derived from, so they are not available to the model as features
label_source_columns = ['firemask', 'BurnDate', 'FirstDay', 'LastDay', 'FWI']
newdata_v1 = d.drop(columns=label_source_columns)

In [35]:
# Export the cleaned data as a parquet file.
# The file is written with snappy compression (the pandas default), not gzip.
newdata_v1.to_parquet(
    '../../finalproj_data/input_data/newdata_v1.parquet',
    compression='snappy',
)

In [36]:
# Confirm the class balance of the exported dataset
newdata_v1['label'].value_counts()

fwi_low         582175
water           383783
fwi_extreme      30481
fwi_high         14398
fwi_veryhigh     13382
fwi_moderate      7733
burned            2231
active_fire        193
Name: label, dtype: int64