In [1]:
import pandas as pd
import xarray as xr
import numpy as np
import scipy.interpolate
from netCDF4 import Dataset
import bottleneck
import pyarrow
import scipy.ndimage as ndimage

import matplotlib.pyplot as plt
%run ../pyfiles/data_cleaning.py

# Reading in the Data

In [2]:
!ls -lh final_parquet.parquet.gzip

-rw-r--r-- 1 jupyter jupyter 228M Jan 17 14:44 final_parquet.parquet.gzip


In [55]:
# Import Merged Data from Notebook in Cloud
data = pd.read_parquet('final_parquet.parquet.gzip', engine = 'pyarrow')

In [56]:
data.reset_index(inplace = True)

In [39]:
data.head()

Unnamed: 0,lat,lon,FireMask,MaxFRP,burned,et_500m,_1_km_16_days_EVI,_1_km_16_days_EVI2,_1_km_16_days_NDVI,_1_km_16_days_NIR_reflectance,...,_1_km_16_days_red_reflectance,_1_km_16_days_VI_Quality,_1_km_16_days_pixel_reliability,_DC,_DMC,_FFMC,_ISI,_BUI,_FWI,_DSR
0,4.3625,96.329167,5.0,,0.0,3276.5,0.611,0.6071,0.846,0.3885,...,0.0324,2257.0,6.0,6.153957,1.339365,43.675632,0.137625,1.721346,0.042499,0.000183
1,4.3625,96.3375,5.0,,0.0,3276.5,0.5452,0.5392,0.8948,0.3081,...,0.0171,2185.0,2.0,6.153957,1.339365,43.675632,0.137625,1.721346,0.042499,0.000183
2,4.3625,96.345833,5.0,,0.0,3276.5,0.4867,0.4878,0.6774,0.3734,...,0.0718,2257.0,6.0,6.153957,1.339365,43.675632,0.137625,1.721346,0.042499,0.000183
3,4.3625,96.354167,5.0,,0.0,33.700001,0.6779,0.6552,0.9191,0.3916,...,0.0165,35297.0,7.0,6.153957,1.339365,43.675632,0.137625,1.721346,0.042499,0.000183
4,4.3625,96.3625,5.0,,0.0,27.0,0.6779,0.6552,0.9191,0.3916,...,0.0165,35297.0,7.0,6.153957,1.339365,43.675632,0.137625,1.721346,0.042499,0.000183


# Exploring the Data

In [21]:
# data['burned'].value_counts()

-2.0      32127631
 0.0      12789227
 312.0        4550
 323.0        2967
 325.0        2881
 321.0        2856
 314.0        2793
 316.0        2761
 313.0        2719
 311.0        2459
 324.0        2307
 309.0        2280
 320.0        2227
 317.0        2141
 328.0        2135
 318.0        2121
 329.0        2092
 330.0        2067
 322.0        2029
 319.0        2021
 333.0        1973
 315.0        1901
 327.0        1843
 334.0        1787
 332.0        1669
 310.0        1635
 331.0        1628
 326.0        1577
 307.0        1557
 308.0        1361
 306.0        1292
 305.0        1047
Name: burned, dtype: int64

In [16]:
# data['FireMask'].value_counts()

3.0    29361362
5.0    11297991
4.0     4485529
8.0         757
9.0         516
7.0          80
6.0          80
Name: FireMask, dtype: int64

In [51]:
# data.isna().sum()

lat                                               0
lon                                               0
FireMask                                    7584133
MaxFRP                                     52729095
burned                                      7748914
et_500m                                     7586147
_1_km_16_days_EVI                          39586885
_1_km_16_days_EVI2                         39586885
_1_km_16_days_NDVI                         39586820
_1_km_16_days_NIR_reflectance              39586820
_1_km_16_days_SWIR1_reflectance            39586838
_1_km_16_days_SWIR2_reflectance            39586971
_1_km_16_days_SWIR3_reflectance            39586978
_1_km_16_days_blue_reflectance             39620597
_1_km_16_days_composite_day_of_the_year    39586820
_1_km_16_days_green_reflectance            39595518
_1_km_16_days_red_reflectance              39586820
_1_km_16_days_VI_Quality                    7584133
_1_km_16_days_pixel_reliability            39586820
_DC         

# Create Target Categories

In [57]:
# Start with all the values in FireMask
data['label'] = data['FireMask']

In [58]:
data['label'].value_counts()

3.0    29361362
5.0    11297991
4.0     4485529
8.0         757
9.0         516
7.0          80
6.0          80
Name: label, dtype: int64

In [59]:
# Create categorical values for active fire pixels
# FireMask
# 3 = water, non-fire
# 4 = cloud (land or water)
# 5 = land, non-fire
# 6 = unknown (land or water)
# 7, 8, 9 = fire (low, nominal and high confidence)

data.loc[(data['FireMask'] == 9) | (data['FireMask'] == 8) | (data['FireMask'] == 7), 'label'] = 'active_fire'

In [60]:
# Create categorical values from FWI numerical, only for areas that are not active fire
data.loc[(data['_FWI'] < 5) & (data['label'] != 'active_fire'), 'label'] = 'fwi_low'
data.loc[(data['_FWI'] >= 5) & (data['_FWI'] < 8) & (data['label'] != 'active_fire'), 'label'] = 'fwi_moderate'
data.loc[(data['_FWI'] >= 8) & (data['_FWI'] < 16) & (data['label'] != 'active_fire'), 'label'] = 'fwi_high'
data.loc[(data['_FWI'] >= 16) & (data['_FWI'] < 29) & (data['label'] != 'active_fire'), 'label'] = 'fwi_veryhigh'
data.loc[(data['_FWI'] >= 29) & (data['label'] != 'active_fire'), 'label'] = 'fwi_extreme'

In [61]:
# Create categorical values for burned pixels, only for areas that are not active fire
# burned
# 0.0 = unburned
# -2.0 = water

data.loc[(data['burned'] != 0.) & (data['burned'] != -2.) & (data['label'] != 'active_fire'), 'label'] = 'burned'

In [62]:
data['label'].value_counts()

3.0             28197889
fwi_extreme      9308511
burned           7813581
fwi_low          3031671
4.0              2826233
fwi_veryhigh      610905
fwi_high          410397
5.0               318870
fwi_moderate      211015
active_fire         1353
6.0                   23
Name: label, dtype: int64

In [66]:
# For the "land" label, there are nan values from fire weather index. So code as land. 
data.loc[(data['label'] == 5.), 'label'] = 'land'

In [67]:
# Change viirs FM 3 to water 
data.loc[data['label'] == 3, 'label'] = 'water'

In [69]:
# for missing data from viirs - i.e. cloud and unknown pixels, fill in missing information from categories in burned dataset

# for cloud pixels...
# where 'unburned', code as land
data.loc[(data['label'] == 4) & (data['burned'] == 0), 'label'] = 'land'
# where 'water', code as water
data.loc[(data['label'] == 4) & (data['burned'] == -2.), 'label'] = 'water'                              
                                 

# for unknown pixels...
# where unburned, code as land
data.loc[(data['label'] == 6) & (data['burned'] == 0), 'label'] = 'land'
# where 'water', code as water
data.loc[(data['label'] == 6) & (data['burned'] == -2.), 'label'] = 'water'    

In [70]:
# Check if all string categories
data['label'].value_counts()

water           30991646
fwi_extreme      9308511
burned           7813581
fwi_low          3031671
fwi_veryhigh      610905
fwi_high          410397
land              351369
fwi_moderate      211015
active_fire         1353
Name: label, dtype: int64

In [71]:
# Check if there are any nulls or zeros left in label column

data.loc[(data['label']==0) | (data['label'].isna()),:]

Unnamed: 0,lat,lon,FireMask,MaxFRP,burned,et_500m,_1_km_16_days_EVI,_1_km_16_days_EVI2,_1_km_16_days_NDVI,_1_km_16_days_NIR_reflectance,...,_1_km_16_days_VI_Quality,_1_km_16_days_pixel_reliability,_DC,_DMC,_FFMC,_ISI,_BUI,_FWI,_DSR,label


# Drop columns that were used to create labels

In [72]:
data.columns

Index(['lat', 'lon', 'FireMask', 'MaxFRP', 'burned', 'et_500m',
       '_1_km_16_days_EVI', '_1_km_16_days_EVI2', '_1_km_16_days_NDVI',
       '_1_km_16_days_NIR_reflectance', '_1_km_16_days_SWIR1_reflectance',
       '_1_km_16_days_SWIR2_reflectance', '_1_km_16_days_SWIR3_reflectance',
       '_1_km_16_days_blue_reflectance',
       '_1_km_16_days_composite_day_of_the_year',
       '_1_km_16_days_green_reflectance', '_1_km_16_days_red_reflectance',
       '_1_km_16_days_VI_Quality', '_1_km_16_days_pixel_reliability', '_DC',
       '_DMC', '_FFMC', '_ISI', '_BUI', '_FWI', '_DSR', 'label'],
      dtype='object')

In [73]:
# Create smaller dataset for first model run
dataset_v1 = data.drop(['FireMask', '_FWI', 'burned', '_1_km_16_days_composite_day_of_the_year', '_1_km_16_days_pixel_reliability', '_1_km_16_days_VI_Quality'], axis = 1)

# Address Missing Values

In [76]:
# Address missing values
dataset_v1.isna().sum()

lat                                       0
lon                                       0
MaxFRP                                    0
et_500m                             7586147
_1_km_16_days_EVI                  39586885
_1_km_16_days_EVI2                 39586885
_1_km_16_days_NDVI                 39586820
_1_km_16_days_NIR_reflectance      39586820
_1_km_16_days_SWIR1_reflectance    39586838
_1_km_16_days_SWIR2_reflectance    39586971
_1_km_16_days_SWIR3_reflectance    39586978
_1_km_16_days_blue_reflectance     39620597
_1_km_16_days_green_reflectance    39595518
_1_km_16_days_red_reflectance      39586820
_DC                                38933888
_DMC                               38933888
_FFMC                              38933888
_ISI                               38933888
_BUI                               38933888
_DSR                               38933888
label                                     0
dtype: int64

In [75]:
# Missing Data Decisions for dataset_v1

# set all missing MaxFRP values as 0
dataset_v1.loc[(small_dataset['MaxFRP'].isna()), 'MaxFRP'] = 0

# i imagine some of the other data is not available because it is over water...in which case set to zero?
# how to address other missing values? 

In [78]:
# Export data as clean parquet gzip file
dataset_v1.to_parquet('dataset_v1.parquet.gzip', compression = 'gzip')