In [1]:
import pandas as pd
import xarray as xr
import numpy as np
import scipy.interpolate
from netCDF4 import Dataset
import bottleneck
import pyarrow
import scipy.ndimage as ndimage

import matplotlib.pyplot as plt
%run ../pyfiles/data_cleaning.py

# Reading in the Data

In [3]:
!ls -lh final_parquet.parquet

-rw-r--r-- 1 jupyter jupyter 860K Jan 19 15:17 final_parquet.parquet


In [4]:
# Import Merged Data from Notebook in Cloud
data = pd.read_parquet('final_parquet.parquet', engine = 'pyarrow')

In [5]:
data.reset_index(inplace = True)

In [6]:
data.head()

Unnamed: 0,lat,lon,_DC,_DMC,_FFMC,_ISI,_BUI,_FWI,_DSR,burned,...,_1_km_16_days_NDVI,_1_km_16_days_NIR_reflectance,_1_km_16_days_SWIR1_reflectance,_1_km_16_days_SWIR2_reflectance,_1_km_16_days_SWIR3_reflectance,_1_km_16_days_blue_reflectance,_1_km_16_days_green_reflectance,_1_km_16_days_red_reflectance,FireMask,MaxFRP
0,-48.0,96.5625,6.153957,1.339365,43.675632,0.137625,1.721346,0.042499,0.000183,0.0,...,0.846,0.3885,0.349,0.1735,0.0732,0.0168,0.0518,0.0324,5.0,
1,-48.0,96.875,5.446888,1.396776,35.734371,0.048084,1.690183,0.016409,9e-05,,...,0.1914,0.558,0.4408,0.3231,0.2145,0.2367,0.2513,0.3787,5.0,
2,-48.0,97.1875,6.151443,1.368903,41.818214,0.13151,1.743675,0.04266,0.000217,,...,0.1894,0.2967,0.267,0.1492,0.1058,0.2162,0.2197,0.2022,5.0,
3,-48.0,97.5,8.828405,1.300164,40.428169,0.149596,1.817403,0.050666,0.000281,0.0,...,0.9564,0.1889,0.1801,0.0703,0.0312,0.0043,0.0146,0.0042,5.0,
4,-48.0,97.8125,20.771587,4.425531,55.349815,0.535304,5.688671,0.398845,0.015286,0.0,...,0.3158,0.6072,0.3881,0.2295,0.1327,0.1168,0.1412,0.3157,5.0,


# Exploring the Data

In [21]:
# data['burned'].value_counts()

-2.0      32127631
 0.0      12789227
 312.0        4550
 323.0        2967
 325.0        2881
 321.0        2856
 314.0        2793
 316.0        2761
 313.0        2719
 311.0        2459
 324.0        2307
 309.0        2280
 320.0        2227
 317.0        2141
 328.0        2135
 318.0        2121
 329.0        2092
 330.0        2067
 322.0        2029
 319.0        2021
 333.0        1973
 315.0        1901
 327.0        1843
 334.0        1787
 332.0        1669
 310.0        1635
 331.0        1628
 326.0        1577
 307.0        1557
 308.0        1361
 306.0        1292
 305.0        1047
Name: burned, dtype: int64

In [16]:
# data['FireMask'].value_counts()

3.0    29361362
5.0    11297991
4.0     4485529
8.0         757
9.0         516
7.0          80
6.0          80
Name: FireMask, dtype: int64

In [51]:
# data.isna().sum()

lat                                               0
lon                                               0
FireMask                                    7584133
MaxFRP                                     52729095
burned                                      7748914
et_500m                                     7586147
_1_km_16_days_EVI                          39586885
_1_km_16_days_EVI2                         39586885
_1_km_16_days_NDVI                         39586820
_1_km_16_days_NIR_reflectance              39586820
_1_km_16_days_SWIR1_reflectance            39586838
_1_km_16_days_SWIR2_reflectance            39586971
_1_km_16_days_SWIR3_reflectance            39586978
_1_km_16_days_blue_reflectance             39620597
_1_km_16_days_composite_day_of_the_year    39586820
_1_km_16_days_green_reflectance            39595518
_1_km_16_days_red_reflectance              39586820
_1_km_16_days_VI_Quality                    7584133
_1_km_16_days_pixel_reliability            39586820
_DC         

# Create Target Categories

In [7]:
# Start with all the values in FireMask
data['label'] = data['FireMask']

In [8]:
data['label'].value_counts()

3.0    25920
5.0     9958
4.0     3971
Name: label, dtype: int64

In [9]:
# Create categorical values for active fire pixels
# FireMask
# 3 = water, non-fire
# 4 = cloud (land or water)
# 5 = land, non-fire
# 6 = unknown (land or water)
# 7, 8, 9 = fire (low, nominal and high confidence)

data.loc[(data['FireMask'] == 9) | (data['FireMask'] == 8) | (data['FireMask'] == 7), 'label'] = 'active_fire'

In [10]:
# Create categorical values from FWI numerical, only for areas that are not active fire
data.loc[(data['_FWI'] < 5) & (data['label'] != 'active_fire'), 'label'] = 'fwi_low'
data.loc[(data['_FWI'] >= 5) & (data['_FWI'] < 8) & (data['label'] != 'active_fire'), 'label'] = 'fwi_moderate'
data.loc[(data['_FWI'] >= 8) & (data['_FWI'] < 16) & (data['label'] != 'active_fire'), 'label'] = 'fwi_high'
data.loc[(data['_FWI'] >= 16) & (data['_FWI'] < 29) & (data['label'] != 'active_fire'), 'label'] = 'fwi_veryhigh'
data.loc[(data['_FWI'] >= 29) & (data['label'] != 'active_fire'), 'label'] = 'fwi_extreme'

In [11]:
# Create categorical values for burned pixels, only for areas that are not active fire
# burned
# 0.0 = unburned
# -2.0 = water

data.loc[(data['burned'] != 0.) & (data['burned'] != -2.) & (data['label'] != 'active_fire'), 'label'] = 'burned'

In [12]:
data['label'].value_counts()

3.0             24923
fwi_extreme      8164
burned           6974
fwi_low          2665
4.0              2544
fwi_veryhigh      537
fwi_high          361
5.0               269
fwi_moderate      183
Name: label, dtype: int64

In [13]:
# For the "land" label, there are nan values from fire weather index. So code as land. 
data.loc[(data['label'] == 5.), 'label'] = 'land'

In [14]:
# Change viirs FM 3 to water 
data.loc[data['label'] == 3, 'label'] = 'water'

In [15]:
# for missing data from viirs - i.e. cloud and unknown pixels, fill in missing information from categories in burned dataset

# for cloud pixels...
# where 'unburned', code as land
data.loc[(data['label'] == 4) & (data['burned'] == 0), 'label'] = 'land'
# where 'water', code as water
data.loc[(data['label'] == 4) & (data['burned'] == -2.), 'label'] = 'water'                              
                                 

# for unknown pixels...
# where unburned, code as land
data.loc[(data['label'] == 6) & (data['burned'] == 0), 'label'] = 'land'
# where 'water', code as water
data.loc[(data['label'] == 6) & (data['burned'] == -2.), 'label'] = 'water'    

In [16]:
# Check if all string categories
data['label'].value_counts()

water           27454
fwi_extreme      8164
burned           6974
fwi_low          2665
fwi_veryhigh      537
fwi_high          361
land              282
fwi_moderate      183
Name: label, dtype: int64

In [17]:
# Check if there are any nulls or zeros left in label column

data.loc[(data['label']==0) | (data['label'].isna()),:]

Unnamed: 0,lat,lon,_DC,_DMC,_FFMC,_ISI,_BUI,_FWI,_DSR,burned,...,_1_km_16_days_NIR_reflectance,_1_km_16_days_SWIR1_reflectance,_1_km_16_days_SWIR2_reflectance,_1_km_16_days_SWIR3_reflectance,_1_km_16_days_blue_reflectance,_1_km_16_days_green_reflectance,_1_km_16_days_red_reflectance,FireMask,MaxFRP,label


# Drop columns that were used to create labels

In [18]:
data.columns

Index(['lat', 'lon', '_DC', '_DMC', '_FFMC', '_ISI', '_BUI', '_FWI', '_DSR',
       'burned', 'et_500m', '_1_km_16_days_EVI', '_1_km_16_days_EVI2',
       '_1_km_16_days_NDVI', '_1_km_16_days_NIR_reflectance',
       '_1_km_16_days_SWIR1_reflectance', '_1_km_16_days_SWIR2_reflectance',
       '_1_km_16_days_SWIR3_reflectance', '_1_km_16_days_blue_reflectance',
       '_1_km_16_days_green_reflectance', '_1_km_16_days_red_reflectance',
       'FireMask', 'MaxFRP', 'label'],
      dtype='object')

In [20]:
# Create smaller dataset for first model run
dataset_v1 = data.drop(['FireMask', '_FWI', 'burned'], axis = 1)

# Optimize Dataset for Memory

In [32]:
dataset_v1 = pd.read_parquet('dataset_v1.parquet.gzip')

In [21]:
# dataset_v1.info(memory_usage = 'deep')
# memory usage: 7.6 GB
# Thus, try to convert MaxFRP to float32
dataset_v1['MaxFRP'] = dataset_v1['MaxFRP'].astype('float32')
# Now memory usage: 7.4 GB

# Not sure what else to do. Changing decimal places actually adds memory.

In [22]:
dataset_v1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46620 entries, 0 to 46619
Data columns (total 21 columns):
lat                                46620 non-null float64
lon                                46620 non-null float64
_DC                                12105 non-null float32
_DMC                               12105 non-null float32
_FFMC                              12105 non-null float32
_ISI                               12105 non-null float32
_BUI                               12105 non-null float32
_DSR                               12105 non-null float32
et_500m                            39847 non-null float32
_1_km_16_days_EVI                  11524 non-null float32
_1_km_16_days_EVI2                 11524 non-null float32
_1_km_16_days_NDVI                 11524 non-null float32
_1_km_16_days_NIR_reflectance      11524 non-null float32
_1_km_16_days_SWIR1_reflectance    11524 non-null float32
_1_km_16_days_SWIR2_reflectance    11524 non-null float32
_1_km_16_days_SWIR3_r

# Address Missing Values

In [23]:
# For first run, replace all nulls with zeros. 
dataset_v1.fillna(0, inplace = True)

In [24]:
# Address missing values
dataset_v1.isna().sum()

lat                                0
lon                                0
_DC                                0
_DMC                               0
_FFMC                              0
_ISI                               0
_BUI                               0
_DSR                               0
et_500m                            0
_1_km_16_days_EVI                  0
_1_km_16_days_EVI2                 0
_1_km_16_days_NDVI                 0
_1_km_16_days_NIR_reflectance      0
_1_km_16_days_SWIR1_reflectance    0
_1_km_16_days_SWIR2_reflectance    0
_1_km_16_days_SWIR3_reflectance    0
_1_km_16_days_blue_reflectance     0
_1_km_16_days_green_reflectance    0
_1_km_16_days_red_reflectance      0
MaxFRP                             0
label                              0
dtype: int64

In [26]:
# Export data as clean parquet gzip file
dataset_v1.to_parquet('dataset_v2.parquet')