In [10]:
import pandas as pd
import pickle
import xarray as xr
import numpy as np
import boto3
import geopandas
from shapely.geometry import Point
import pyarrow

In [11]:
# Open the MCD64A1 NetCDF file as an xarray Dataset.
burned = xr.open_mfdataset(
    '../../finalproj_data/time_slice/MCD64A1.006_500m_aid0001.nc',
    combine='by_coords',
)
# Select the single 2019-11-01 time step. A slice is used instead of a scalar
# label so that `time` is still a dimension of the result.
burnt = burned.sel(time=slice('2019-11-01', '2019-11-01'))

In [12]:
# burnt['crs']
# Attributes:
#     grid_mapping_name:            latitude_longitude
#     _CoordinateAxisTypes:         GeoX GeoY
#     epsg_code:                    4326
#     horizontal_datum_name:        WGS84
#     semi_major_axis:              6378137
#     inverse_flattening:           298.257223563
#     longitude_of_prime_meridian:  0.0

In [13]:
# Discard the variables that are not carried into the table, including the
# `crs` grid-mapping variable.
burnt = burnt.drop_vars(
    ['Burn_Date_Uncertainty', 'First_Day', 'Last_Day', 'crs']
)

In [14]:
# Remove the `time` dimension from the Dataset.
# NOTE(review): this only works if exactly one time step was selected — confirm.
burnt = burnt.squeeze(dim='time')

In [15]:
# Display the Dataset after dropping variables and squeezing `time`.
burnt

In [16]:
# Flatten the Dataset into a pandas DataFrame, then move the index produced by
# to_dataframe() into ordinary columns. The reset is done in place so the
# (very large) frame is not rebound to a second object.
burnt = burnt.to_dataframe()
burnt.reset_index(drop=False, inplace=True)

In [17]:
# Remove the `time` column from the table, in place.
burnt.drop(columns='time', inplace=True)

In [18]:
# Display the flattened table after the `time` column has been dropped.
burnt

Unnamed: 0,lat,lon,Burn_Date,QA
0,4.360417,96.327083,0.0,3.0
1,4.360417,96.331250,0.0,3.0
2,4.360417,96.335417,0.0,3.0
3,4.360417,96.339583,0.0,3.0
4,4.360417,96.343750,0.0,3.0
...,...,...,...,...
210892456,-48.231250,165.918750,-2.0,0.0
210892457,-48.231250,165.922917,-2.0,0.0
210892458,-48.231250,165.927083,-2.0,0.0
210892459,-48.231250,165.931250,-2.0,0.0


In [19]:
# Write the flattened `burnt` table to a Parquet file.
# NOTE(review): presumably written via the pyarrow engine imported at the top of the notebook — confirm.
burnt.to_parquet('../../finalproj_data/parquet/burnt.parquet')

Unique values:
    array([  0.,  nan,  -2., 305., 307., 322., 327., 320., 323., 326., 328.,
       324., 321., 319., 329., 313., 316., 306., 308., 309., 317., 312.,
       311., 333., 334., 332., 331., 310., 325., 318., 314., 315., 330.]) <br>
14.6% of the 210,892,461 data points are null values. <br>
0 is unburned, -2 is water, and all other values are days.

### Save as Parquet: other data

In [20]:
# MOD16A2 (500 m) -- (lat: 12623, lon: 16707, time: 1), 1.687374337 GB
ONE = xr.open_mfdataset(
    '../../finalproj_data/time_slice/MOD16A2.006_500m_aid0001.nc',
    combine='by_coords',
)

# VNP13A2 (1 km) -- (lat: 6312, lon: 8354, time: 1), 2.953022425
# The file is sliced along `time` so that only the 2019-12-11 step is kept and
# the dataset has a single entry in the time dimension.
TWO = xr.open_mfdataset(
    '../../finalproj_data/time_slice/VNP13A2.001_1km_aid0001.nc',
    combine='by_coords',
).sel(time=slice('2019-12-11', '2019-12-11'))

# VNP14A1 (1 km) -- (lat: 6312, lon: 8354, time: 1), 1.054726297 GB
THREE = xr.open_mfdataset(
    '../../finalproj_data/time_slice/VNP14A1.001_1km_aid0001.nc',
    combine='by_coords',
)

In [21]:
# GFWD - FWI: dataset labels used to identify "high risk of fire" based on
# FWI calculations.
# Naming convention: Datasets are written in caps, DataArrays in lowercase.

# Open the .nc file with xarray, combining by coordinates.
FWI = xr.open_mfdataset(
    "../../finalproj_data/satellitedata/GFWD/FWI.GEOS-5.Monthly.Default.201912.nc",
    combine='by_coords',
)

### Clean this Data

In [22]:
# <xarray.DataArray 'crs' ()>
# array(-127, dtype=int8)
# Coordinates:
#     time     object 2019-12-11 00:00:00
# Attributes:
#     grid_mapping_name:            latitude_longitude
#     _CoordinateAxisTypes:         GeoX GeoY
#     epsg_code:                    4326
#     horizontal_datum_name:        WGS84
#     semi_major_axis:              6378137
#     inverse_flattening:           298.257223563
#     longitude_of_prime_meridian:  0.0
# Drop the `crs` grid-mapping variable, then remove the `time` dimension.
ONE = ONE.drop_vars(['crs']).squeeze(dim='time')

In [23]:
# Flatten the Dataset into a pandas DataFrame and move the index produced by
# to_dataframe() into ordinary columns (in place).
ONE = ONE.to_dataframe()
ONE.reset_index(drop=False, inplace=True)

In [24]:
# Remove the `time` column from the table, in place.
ONE.drop(columns='time', inplace=True)

In [25]:
# Write the flattened MOD16A2 table (`ONE`) to a Parquet file.
ONE.to_parquet('../../finalproj_data/parquet/modis.parquet')

In [26]:
# TWO
# <xarray.DataArray 'crs' ()>
# array(-127, dtype=int8)
# Attributes:
#     grid_mapping_name:            latitude_longitude
#     _CoordinateAxisTypes:         GeoX GeoY
#     epsg_code:                    4326
#     horizontal_datum_name:        WGS84
#     semi_major_axis:              6378137
#     inverse_flattening:           298.257223563
#     longitude_of_prime_meridian:  0.0

In [27]:
# Drop the `crs` grid-mapping variable, then remove the `time` dimension.
TWO = TWO.drop_vars(['crs']).squeeze(dim='time')

In [28]:
# Flatten the Dataset into a pandas DataFrame and move the index produced by
# to_dataframe() into ordinary columns (in place).
TWO = TWO.to_dataframe()
TWO.reset_index(drop=False, inplace=True)

In [29]:
# Remove the `time` column from the table, in place.
TWO.drop(columns='time', inplace=True)

In [30]:
# Write the flattened VNP13A2 table (`TWO`) to a Parquet file.
TWO.to_parquet('../../finalproj_data/parquet/vnp13.parquet')

In [31]:
# THREE
# <xarray.DataArray 'crs' ()>
# array(-127, dtype=int8)
# Attributes:
#     grid_mapping_name:            latitude_longitude
#     _CoordinateAxisTypes:         GeoX GeoY
#     epsg_code:                    4326
#     horizontal_datum_name:        WGS84
#     semi_major_axis:              6378137
#     inverse_flattening:           298.257223563
#     longitude_of_prime_meridian:  0.0

In [32]:
# Drop the `crs` grid-mapping variable, then remove the `time` dimension.
THREE = THREE.drop_vars(['crs']).squeeze(dim='time')

In [33]:
# Flatten the Dataset into a pandas DataFrame and move the index produced by
# to_dataframe() into ordinary columns (in place).
THREE = THREE.to_dataframe()
THREE.reset_index(drop=False, inplace=True)

In [34]:
# Remove the `time` and `sample` columns from the table, in place.
THREE.drop(columns=['time', 'sample'], inplace=True)

In [35]:
# Write the flattened VNP14A1 table (`THREE`) to a Parquet file.
THREE.to_parquet('../../finalproj_data/parquet/vnp14.parquet')

In [None]:
# Check on a plot to see what it looks like
# NOTE(review): neither `plt` nor `gdf` is defined in the visible cells of this
# notebook (there is no matplotlib import and no GeoDataFrame is built) —
# confirm they are created elsewhere; otherwise this cell raises NameError on
# a fresh kernel.

# Load the 'naturalearth_lowres' dataset through geopandas' bundled-datasets helper.
# NOTE(review): `geopandas.datasets` is deprecated in newer geopandas releases —
# confirm the installed version still provides it.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

# We restrict to Australia
# NOTE(review): this filters on the `continent` column; naturalearth_lowres
# appears to list Australia under continent 'Oceania', so verify that this
# selection is not empty (filtering on `world.name` may be what was intended).
ax = world[world.continent == 'Australia'].plot(
    color='white', edgecolor='black')

# We can now plot our ``GeoDataFrame``.
# Draw `gdf` in red on the same axes as the base map.
gdf.plot(ax=ax, color='red')

plt.show()