In [234]:
import pandas as pd
import xarray as xr
import numpy as np
import scipy.interpolate
from netCDF4 import Dataset
import bottleneck
import pyarrow
import scipy.ndimage as ndimage

import matplotlib.pyplot as plt

# Reading in the Data

In [235]:
# Load the merged dataset exported by the cloud notebook (parquet, read with the pyarrow engine)
data = pd.read_parquet(
    '../../finalproj_data/ds_df.parquet',
    engine='pyarrow',
)

In [236]:
# Preview the first rows of the merged frame to sanity-check the load
data.head()

Unnamed: 0,lat,lon,firemask,MaxFRP,EVI,EVI2,NVDI,NIR_reflectance,SWIR1_reflectance,SWIR2_reflectance,...,SoilTMP40_100cm_inst,Swnet_tavg,Tair_f_inst,Tveg_tavg,Wind_f_inst,BurnDate,Uncertainty,QA,FirstDay,LastDay
0,-48.868351,84.172142,4.0,,,,,,,,...,,,,,,,,0.0,,
1,-48.868351,84.262077,4.0,,,,,,,,...,,,,,,,,0.0,,
2,-48.868351,84.352011,4.0,,,,,,,,...,,,,,,,,0.0,,
3,-48.868351,84.441945,4.0,,,,,,,,...,,,,,,,,0.0,,
4,-48.868351,84.53188,4.0,,,,,,,,...,,,,,,,,0.0,,


In [237]:
# List every column name available in the merged frame
data.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'EVI', 'EVI2', 'NVDI',
       'NIR_reflectance', 'SWIR1_reflectance', 'SWIR2_reflectance',
       'SWIR3_reflectance', 'VI_Quality', 'blue_reflectance',
       'composite_day_of_the_year', 'green_reflectance', 'pixel_reliability',
       'red_reflectance', 'relative_azimuth_angle', 'sun_zenith_angle',
       'view_zenith_angle', 'gaugeQualityInfo', 'hourlyPrecipRate',
       'hourlyPrecipRateGC', 'observationTimeFlag', 'satelliteInfoFlag',
       'Albedo_inst', 'AvgSurfT_inst', 'CanopInt_inst', 'ECanop_tavg',
       'ESoil_tavg', 'Evap_tavg', 'LWdown_f_tavg', 'Lwnet_tavg',
       'PotEvap_tavg', 'Psurf_f_inst', 'Qair_f_inst', 'Qg_tavg', 'Qh_tavg',
       'Qle_tavg', 'Qs_acc', 'Qsb_acc', 'Qsm_acc', 'Rainf_f_tavg',
       'RootMoist_inst', 'SWE_inst', 'SWdown_f_tavg', 'SnowDepth_inst',
       'Snowf_tavg', 'SoilMoi100_200cm_inst', 'SoilMoi10_40cm_inst',
       'SoilMoi40_100cm_inst', 'SoilTMP0_10cm_inst', 'SoilMoi0_10cm_inst',
       'SoilTMP100

# Exploring the Data

In [238]:
# Frequency of each FireMask code. value_counts() skips NaN by default,
# so rows without a firemask value are not represented in this tally.
data['firemask'].value_counts()

3.0    165659
5.0     97860
4.0     63462
9.0        19
8.0        19
6.0         3
7.0         2
Name: firemask, dtype: int64

In [239]:
# Work on a float64 copy of every column; the raw `data` frame is left untouched
d = data.astype(np.float64)

In [240]:
# Summarise dtype and non-null count per column to see how sparse each variable is
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527896 entries, 0 to 527895
Data columns (total 65 columns):
lat                          527896 non-null float64
lon                          527896 non-null float64
firemask                     327024 non-null float64
MaxFRP                       314 non-null float64
EVI                          73595 non-null float64
EVI2                         73595 non-null float64
NVDI                         73595 non-null float64
NIR_reflectance              73595 non-null float64
SWIR1_reflectance            73595 non-null float64
SWIR2_reflectance            73595 non-null float64
SWIR3_reflectance            73595 non-null float64
VI_Quality                   320796 non-null float64
blue_reflectance             73595 non-null float64
composite_day_of_the_year    73595 non-null float64
green_reflectance            73595 non-null float64
pixel_reliability            335422 non-null float64
red_reflectance              68794 non-null float64
re

In [241]:
# MAJOR ASSUMPTION: every missing value is replaced with 0.
# Later cells rely on this: a label of 0 is mapped to 'unknown', and the `> 0`
# tests on BurnDate / FirstDay / LastDay evaluate to False for the filled-in zeros.
d = d.fillna(0)

# Create Target Categories

In [242]:
# Start with all the values in FireMask.
# The column is created with object dtype because the following cells overwrite the
# numeric codes with string category names; writing strings into a float64 column
# relies on silent upcasting, which is deprecated in pandas >= 2.1.
d['label'] = d['firemask'].astype(object)

In [243]:
# Code frequencies after the null fill; 0.0 now counts the rows that had no firemask value
d['label'].value_counts()

0.0    200872
3.0    165659
5.0     97860
4.0     63462
8.0        19
9.0        19
6.0         3
7.0         2
Name: label, dtype: int64

In [244]:
# Translate the numeric FireMask codes into categorical label names.
# FireMask codes:
# 3 = water, non-fire
# 4 = cloud (land or water)
# 5 = land, non-fire
# 6 = unknown (land or water)
# 7, 8, 9 = fire (low, nominal and high confidence)

# All three fire confidence levels collapse into a single class. The mask is built
# from `d` only, so it always shares the index of the frame being written to.
d.loc[d['firemask'].isin([7, 8, 9]), 'label'] = 'active_fire'
# The remaining codes are matched on `label` (still numeric for non-fire rows at this point).
d.loc[d['label'] == 5, 'label'] = 'land'
d.loc[d['label'] == 3, 'label'] = 'water'
d.loc[d['label'] == 4, 'label'] = 'cloud'
d.loc[d['label'] == 6, 'label'] = 'unknown'

In [245]:
# Refine the labels using the BurnDate / FirstDay / LastDay columns.
# Statement order matters: each later assignment overwrites earlier ones on overlapping rows.
# NOTE(review): 335 looks like a day-of-year cutoff — confirm its meaning.
has_first_day = d['FirstDay'] > 0
late_last_day = d['LastDay'] > 335
has_burn_date = d['BurnDate'] > 0
cloud_or_land = d['firemask'].isin([4, 5])

d.loc[has_first_day, 'label'] = 'land'
d.loc[late_last_day & cloud_or_land, 'label'] = 'active_fire'
d.loc[has_burn_date & cloud_or_land, 'label'] = 'burned'
# Cloud (4) and unknown (6) pixels with a FirstDay value finish as 'land'; for cloud
# pixels this overrides any 'active_fire' / 'burned' label assigned just above.
d.loc[has_first_day & d['firemask'].isin([4, 6]), 'label'] = 'land'


In [269]:
# Rows still carrying the numeric 0 (no firemask value before the null fill) become 'unknown'
still_zero = d['label'] == 0
d.loc[still_zero, 'label'] = 'unknown'

In [271]:
# Class balance of the finished label column
d['label'].value_counts()

unknown        198433
water          154610
active_fire     68942
land            53739
cloud           51887
burned            285
Name: label, dtype: int64

In [247]:
# # Create categorical values for burned pixels, only for areas that are not active fire
# # burned
# # 0.0 = unburned
# # -2.0 = water

# data.loc[(data['burned'] != 0.) & (data['burned'] != -2.) & (data['label'] != 'active_fire'), 'label'] = 'burned'

In [272]:
# Check if there are any nulls or zeros left in label column
# (an empty result means every row received a category name)

d.loc[d['label'].isna() | (d['label'] == 0)]

Unnamed: 0,lat,lon,firemask,MaxFRP,EVI,EVI2,NVDI,NIR_reflectance,SWIR1_reflectance,SWIR2_reflectance,...,Swnet_tavg,Tair_f_inst,Tveg_tavg,Wind_f_inst,BurnDate,Uncertainty,QA,FirstDay,LastDay,label


In [249]:
# Legacy code for the old dataset follows (kept commented out, for reference only)

In [250]:
# # Create categorical values from FWI numerical, only for areas that are not active fire
# data.loc[(data['_FWI'] < 5) & (data['label'] != 'active_fire'), 'label'] = 'fwi_low'
# data.loc[(data['_FWI'] >= 5) & (data['_FWI'] < 8) & (data['label'] != 'active_fire'), 'label'] = 'fwi_moderate'
# data.loc[(data['_FWI'] >= 8) & (data['_FWI'] < 16) & (data['label'] != 'active_fire'), 'label'] = 'fwi_high'
# data.loc[(data['_FWI'] >= 16) & (data['_FWI'] < 29) & (data['label'] != 'active_fire'), 'label'] = 'fwi_veryhigh'
# data.loc[(data['_FWI'] >= 29) & (data['label'] != 'active_fire'), 'label'] = 'fwi_extreme'

In [251]:
# # for missing data from viirs - i.e. cloud and unknown pixels, fill in missing information from categories in burned dataset

# # for cloud pixels...
# # where 'unburned', code as land
# data.loc[(data['label'] == 4) & (data['burned'] == 0), 'label'] = 'land'
# # where 'water', code as water
# data.loc[(data['label'] == 4) & (data['burned'] == -2.), 'label'] = 'water'                              
                                 

# # for unknown pixels...
# # where unburned, code as land
# data.loc[(data['label'] == 6) & (data['burned'] == 0), 'label'] = 'land'
# # where 'water', code as water
# data.loc[(data['label'] == 6) & (data['burned'] == -2.), 'label'] = 'water'    

# Drop columns that were used to create labels

In [252]:
# List the columns of the labelled frame before choosing which ones to drop
d.columns

Index(['lat', 'lon', 'firemask', 'MaxFRP', 'EVI', 'EVI2', 'NVDI',
       'NIR_reflectance', 'SWIR1_reflectance', 'SWIR2_reflectance',
       'SWIR3_reflectance', 'VI_Quality', 'blue_reflectance',
       'composite_day_of_the_year', 'green_reflectance', 'pixel_reliability',
       'red_reflectance', 'relative_azimuth_angle', 'sun_zenith_angle',
       'view_zenith_angle', 'gaugeQualityInfo', 'hourlyPrecipRate',
       'hourlyPrecipRateGC', 'observationTimeFlag', 'satelliteInfoFlag',
       'Albedo_inst', 'AvgSurfT_inst', 'CanopInt_inst', 'ECanop_tavg',
       'ESoil_tavg', 'Evap_tavg', 'LWdown_f_tavg', 'Lwnet_tavg',
       'PotEvap_tavg', 'Psurf_f_inst', 'Qair_f_inst', 'Qg_tavg', 'Qh_tavg',
       'Qle_tavg', 'Qs_acc', 'Qsb_acc', 'Qsm_acc', 'Rainf_f_tavg',
       'RootMoist_inst', 'SWE_inst', 'SWdown_f_tavg', 'SnowDepth_inst',
       'Snowf_tavg', 'SoilMoi100_200cm_inst', 'SoilMoi10_40cm_inst',
       'SoilMoi40_100cm_inst', 'SoilTMP0_10cm_inst', 'SoilMoi0_10cm_inst',
       'SoilTMP100

In [273]:
# Create smaller dataset for first model run by removing the columns the labels were built from
label_source_columns = ['firemask', 'BurnDate', 'FirstDay', 'LastDay']
newdata_v1 = d.drop(columns=label_source_columns)

In [274]:
# Export the cleaned data as a parquet file.
# NOTE(review): no `compression` argument is passed, so pandas' default codec is used;
# pass compression='gzip' explicitly if a gzip-compressed file is actually intended.
newdata_v1.to_parquet('../../finalproj_data/input_data/newdata_v1.parquet')

In [275]:
# Confirm the exported frame carries the same label distribution as `d`
newdata_v1['label'].value_counts()

unknown        198433
water          154610
active_fire     68942
land            53739
cloud           51887
burned            285
Name: label, dtype: int64