### This notebook creates a dataset with consolidated categories

In [23]:
import pandas as pd
import xarray as xr
import numpy as np
import scipy.interpolate
from netCDF4 import Dataset
import bottleneck
import pyarrow
import scipy.ndimage as ndimage

import matplotlib.pyplot as plt

# Reading in the Data

In [24]:
# Load the merged dataset that was produced by the notebook run in the cloud.
data = pd.read_parquet(
    '../../finalproj_data/ds_df.parquet',
    engine='pyarrow',
)

In [25]:
# Preview the first rows of the merged frame.
data.head()

Unnamed: 0,lat,lon,firemask,MaxFRP,EVI,EVI2,NVDI,NIR_reflectance,SWIR1_reflectance,SWIR2_reflectance,...,Swnet_tavg,Tair_f_inst,Tveg_tavg,Wind_f_inst,BurnDate,Uncertainty,QA,FirstDay,LastDay,FWI
0,-4.98565,106.36053,3.0,,,,,,,,...,,,,,,,0.0,,,
1,-4.98565,106.405485,3.0,,,,,,,,...,,,,,,,0.0,,,
2,-4.98565,106.450441,3.0,,,,,,,,...,,,,,,,0.0,,,
3,-4.98565,106.495396,3.0,,,,,,,,...,,,,,,,0.0,,,
4,-4.98565,106.540352,3.0,,,,,,,,...,,,,,,,0.0,,,


In [4]:
# List every column available in the merged frame.
data.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'EVI', 'EVI2', 'NVDI',
       'NIR_reflectance', 'SWIR1_reflectance', 'SWIR2_reflectance',
       'SWIR3_reflectance', 'VI_Quality', 'blue_reflectance',
       'composite_day_of_the_year', 'green_reflectance', 'pixel_reliability',
       'red_reflectance', 'relative_azimuth_angle', 'sun_zenith_angle',
       'view_zenith_angle', 'gaugeQualityInfo', 'hourlyPrecipRate',
       'hourlyPrecipRateGC', 'observationTimeFlag', 'satelliteInfoFlag',
       'Albedo_inst', 'AvgSurfT_inst', 'CanopInt_inst', 'ECanop_tavg',
       'ESoil_tavg', 'Evap_tavg', 'LWdown_f_tavg', 'Lwnet_tavg',
       'PotEvap_tavg', 'Psurf_f_inst', 'Qair_f_inst', 'Qg_tavg', 'Qh_tavg',
       'Qle_tavg', 'Qs_acc', 'Qsb_acc', 'Qsm_acc', 'Rainf_f_tavg',
       'RootMoist_inst', 'SWE_inst', 'SWdown_f_tavg', 'SnowDepth_inst',
       'Snowf_tavg', 'SoilMoi100_200cm_inst', 'SoilMoi10_40cm_inst',
       'SoilMoi40_100cm_inst', 'SoilTMP0_10cm_inst', 'SoilMoi0_10cm_inst',
       'SoilTMP100

# Exploring the Data

In [26]:
# Count pixels per raw firemask code. value_counts() leaves out NaN by default,
# so pixels without a firemask value do not appear in this tally.
data['firemask'].value_counts()

3.0    383783
5.0    364563
4.0    171658
9.0        97
8.0        78
6.0        76
7.0        18
Name: firemask, dtype: int64

In [6]:
# Working copy with every column cast to float64; `data` itself is not modified.
d = data.astype(float)

In [7]:
# d.info()

In [8]:
# Assumption: every missing value is treated as zero.
# Consequences to keep in mind when reading the labelling steps:
#   * a missing firemask becomes code 0 (it shows up as 0.0 in the label counts);
#   * missing FWI / LastDay / ESoil_tavg become 0, so they land on the
#     "low" / "not positive" side of the threshold tests applied to them.
# Reassign rather than mutate with `inplace=True`: the result is the same, but the
# cell reads as a plain transformation of `d`.
d = d.fillna(0)

# Create Target Categories

In [9]:
# Seed the label column with the raw firemask codes.
# The column is created as object dtype because the raw numeric codes are
# overwritten with string category names afterwards; assigning strings into a
# float64 column relies on silent upcasting, which recent pandas versions
# deprecate. Comparisons such as `d['label'] == 5` behave the same on object dtype.
d['label'] = d['firemask'].astype(object)

In [10]:
# Tally the seeded labels; code 0.0 now appears because nulls were zero-filled.
d['label'].value_counts()

3.0    383783
5.0    364563
4.0    171658
0.0    114103
9.0        97
8.0        78
6.0        76
7.0        18
Name: label, dtype: int64

In [11]:
# Create categorical labels for active-fire and water pixels from the firemask codes.
# FireMask codes:
#   3 = water, non-fire
#   4 = cloud (land or water)
#   5 = land, non-fire
#   6 = unknown (land or water)
#   7, 8, 9 = fire (low, nominal and high confidence)
#
# Every test reads from `d` (the zero-filled working frame). The original mixed in
# one comparison against the raw `data` frame, which only worked because both
# frames share the same index.
d.loc[d['firemask'].isin([7, 8, 9]), 'label'] = 'active_fire'
d.loc[d['firemask'] == 3, 'label'] = 'water'

In [12]:
# Land (5) and cloud (4) pixels with a positive LastDay value are labelled 'burned'.
# Nulls were zero-filled earlier, so `> 0` skips pixels that had no LastDay value.
positive_last_day = d['LastDay'] > 0
land_or_cloud = d['firemask'].isin([4, 5])
d.loc[positive_last_day & land_or_cloud, 'label'] = 'burned'

In [13]:
# Turn the numeric FWI into categories, only for pixels that still carry a raw
# land (5) or cloud (4) code, i.e. not already labelled active_fire, burned or water.
#   FWI < 5        -> 'land'          (land pixels only)
#   5 <= FWI < 8   -> 'fwi_moderate'  (land and cloud pixels)
#   FWI >= 8       -> 'fwi_high'      (land and cloud pixels)
# Missing FWI was zero-filled earlier, so it counts as FWI < 5 here.
#
# NOTE(review): the original repeated the `FWI < 5` / `label == 5` statement twice;
# the second copy was a no-op and has been removed. If it was meant to target cloud
# pixels (label == 4) like the moderate/high rules, low-FWI cloud pixels would become
# 'land' here instead of being resolved by the cloud-cover step - confirm intent.
fwi = d['FWI']
is_land = d['label'] == 5
is_land_or_cloud = is_land | (d['label'] == 4)

# The three FWI ranges are disjoint, so masks computed up front stay valid
# while labels are being overwritten.
d.loc[(fwi < 5) & is_land, 'label'] = 'land'
d.loc[(fwi >= 5) & (fwi < 8) & is_land_or_cloud, 'label'] = 'fwi_moderate'
d.loc[(fwi >= 8) & is_land_or_cloud, 'label'] = 'fwi_high'

In [14]:
# Catch-all: any pixel still carrying the raw land code (5) becomes 'land'.
still_raw_land = d['label'] == 5
d.loc[still_raw_land, 'label'] = 'land'

In [15]:
# Cloud-cover assumption: a cloud pixel (4) with a positive ESoil_tavg value is
# treated as land; every other remaining cloud pixel is treated as water.
is_cloud = d['label'] == 4
positive_esoil = d['ESoil_tavg'] > 0
d.loc[is_cloud & positive_esoil, 'label'] = 'land'
d.loc[is_cloud & ~positive_esoil, 'label'] = 'water'

# Assumption: unknown pixels (6) and zero-filled pixels (0) are water.
unknown_or_zero = (d['label'] == 6) | (d['label'] == 0)
d.loc[unknown_or_zero, 'label'] = 'water'

In [16]:
# # Create categorical values for burned pixels, only for areas that are not active fire
# # burned
# # 0.0 = unburned
# # -2.0 = water

# data.loc[(data['burned'] != 0.) & (data['burned'] != -2.) & (data['label'] != 'active_fire'), 'label'] = 'burned'

In [17]:
# # for missing data from viirs - i.e. cloud and unknown pixels, fill in missing information from categories in burned dataset

# # for cloud pixels...
# # where 'unburned', code as land
# data.loc[(data['label'] == 4) & (data['burned'] == 0), 'label'] = 'land'
# # where 'water', code as water
# data.loc[(data['label'] == 4) & (data['burned'] == -2.), 'label'] = 'water'                              
                                 

# # for unknown pixels...
# # where unburned, code as land
# data.loc[(data['label'] == 6) & (data['burned'] == 0), 'label'] = 'land'
# # where 'water', code as water
# data.loc[(data['label'] == 6) & (data['burned'] == -2.), 'label'] = 'water'    

# Drop columns that were used to create labels

In [18]:
# List the columns of the working frame before choosing which ones to drop.
d.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'EVI', 'EVI2', 'NVDI',
       'NIR_reflectance', 'SWIR1_reflectance', 'SWIR2_reflectance',
       'SWIR3_reflectance', 'VI_Quality', 'blue_reflectance',
       'composite_day_of_the_year', 'green_reflectance', 'pixel_reliability',
       'red_reflectance', 'relative_azimuth_angle', 'sun_zenith_angle',
       'view_zenith_angle', 'gaugeQualityInfo', 'hourlyPrecipRate',
       'hourlyPrecipRateGC', 'observationTimeFlag', 'satelliteInfoFlag',
       'Albedo_inst', 'AvgSurfT_inst', 'CanopInt_inst', 'ECanop_tavg',
       'ESoil_tavg', 'Evap_tavg', 'LWdown_f_tavg', 'Lwnet_tavg',
       'PotEvap_tavg', 'Psurf_f_inst', 'Qair_f_inst', 'Qg_tavg', 'Qh_tavg',
       'Qle_tavg', 'Qs_acc', 'Qsb_acc', 'Qsm_acc', 'Rainf_f_tavg',
       'RootMoist_inst', 'SWE_inst', 'SWdown_f_tavg', 'SnowDepth_inst',
       'Snowf_tavg', 'SoilMoi100_200cm_inst', 'SoilMoi10_40cm_inst',
       'SoilMoi40_100cm_inst', 'SoilTMP0_10cm_inst', 'SoilMoi0_10cm_inst',
       'SoilTMP100

In [19]:
# Smaller dataset for the first model run: remove the raw fire-mask, burn-date
# and FWI columns, keeping everything else (including the new `label`).
columns_to_drop = ['firemask', 'BurnDate', 'FirstDay', 'LastDay', 'FWI']
newdata_v1 = d.drop(columns=columns_to_drop)

In [20]:
# Class balance of the consolidated labels before export.
newdata_v1['label'].value_counts()

water           649240
burned          369925
fwi_high         11823
land              2463
fwi_moderate       732
active_fire        193
Name: label, dtype: int64

In [21]:
# Export the labelled dataset as a parquet file.
# No `compression=` argument is passed, so pandas' default codec is used
# (snappy per the pandas docs) - the file is not gzip-compressed.
# NOTE(review): the frame is named `newdata_v1` but is written to
# `newdata_v2.parquet` - confirm which version name is intended.
newdata_v1.to_parquet('../../finalproj_data/input_data/newdata_v2.parquet')

In [22]:
# Re-check the class balance after export (same frame, so the same counts are expected).
newdata_v1['label'].value_counts()

water           649240
burned          369925
fwi_high         11823
land              2463
fwi_moderate       732
active_fire        193
Name: label, dtype: int64