In [3]:
import rioxarray
import pandas
from pyprojroot import here
import pandas as pd
from osgeo import gdal
import numpy as np
import pickle

In [46]:
# Start the dataset by reading the x and y coordinates (and values) from a first raster file.
# ag = gdal.Open(str(here("./data/intermediate/agriculture/ag_indicator.tif")))
# ag = gdal.Translate("ag.xyz", ag)
# ag = pd.read_csv("ag.xyz", sep = " ", header = None) #this takes forever

# see https://gis.stackexchange.com/questions/358051/convert-raster-to-csv-with-lat-lon-and-value-columns
def datasetify(raster_path, varname):
    """Convert a raster file into a long-format DataFrame of coordinates and values.

    Parameters
    ----------
    raster_path : str
        Path to the raster; passed straight to ``rioxarray.open_rasterio``.
    varname : str
        Name assigned to the raster's values; it becomes the name of the
        value column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per raster cell, with the array's remaining coordinates as
        columns (``y`` and ``x`` for a single-band raster) followed by a
        ``varname`` column. No nodata masking is requested here, so fill
        values are kept as they are stored in the file.
    """
    raster = rioxarray.open_rasterio(
        raster_path,
    )
    # squeeze() collapses length-1 dimensions but keeps them as scalar
    # coordinates; remove "band" and the CRS coordinate so neither ends up as
    # a column. drop_vars is the replacement for the deprecated DataArray.drop.
    raster = raster.squeeze().drop_vars(["spatial_ref", "band"])
    raster.name = varname
    # to_dataframe() indexes by the coordinates; reset_index() turns them
    # into ordinary columns.
    return raster.to_dataframe().reset_index()

# seed the dataset with the coordinates and values of the agriculture indicator raster
dataframe = datasetify(
    raster_path=str(here("./data/intermediate/agriculture/ag_indicator.tif")),
    varname="agriculture",
)

dataframe

Unnamed: 0,y,x,agriculture
0,42.009453,-124.409425,-3.400000e+38
1,42.009453,-124.408794,-3.400000e+38
2,42.009453,-124.408163,-3.400000e+38
3,42.009453,-124.407532,-3.400000e+38
4,42.009453,-124.406901,-3.400000e+38
...,...,...,...
244611908,32.534425,-114.134295,-3.400000e+38
244611909,32.534425,-114.133664,-3.400000e+38
244611910,32.534425,-114.133033,-3.400000e+38
244611911,32.534425,-114.132402,-3.400000e+38


In [47]:
# flatten other rasters and add them to the dataset
def add_columns(file, name):
    """Flatten a raster and store it as column ``name`` of the global ``dataframe``.

    The raster is read with GDAL and flattened in row-major (C) order, so a
    2-D array is laid out row by row and a 3-D array is laid out one full
    layer after another along its first axis.
    NOTE(review): this assumes the rows of ``dataframe`` are in that same
    order (and, for 3-D rasters, that the frame has been repeated once per
    layer) -- confirm against the cells that build ``dataframe``.

    Parameters
    ----------
    file : str or path-like
        Raster to read; converted with ``str()`` before being given to GDAL.
    name : str
        Name of the column to create or overwrite in ``dataframe``.

    Raises
    ------
    RuntimeError
        If GDAL cannot open or read the raster.
    ValueError
        If the array is not 2-D or 3-D, or if its number of values differs
        from the number of rows in ``dataframe``.
    """
    dataset = gdal.Open(str(file))
    if dataset is None:
        # Unless gdal.UseExceptions() is active, a failed open returns None
        # instead of raising; fail here with a message that names the file.
        raise RuntimeError(f"GDAL could not open raster: {file}")
    ar = dataset.ReadAsArray()
    # Dropping the last reference is how the GDAL Python bindings close a dataset.
    dataset = None
    if ar is None:
        raise RuntimeError(f"GDAL could not read raster data from: {file}")

    if ar.ndim not in (2, 3):
        raise ValueError("Unexpected number of dimensions")

    # A single row-major flatten is equivalent to flattening each layer and
    # then concatenating the layers.
    flat = ar.reshape(-1)
    if flat.shape[0] != len(dataframe):
        raise ValueError(
            f"{name}: raster has {flat.shape[0]} values but dataframe has "
            f"{len(dataframe)} rows"
        )
    dataframe[name] = flat


In [None]:
# add all time-invariant variables, one column per raster

for _raster_path, _column in (
    ("./data/intermediate/counterf/counterf_indicator.tif", "counterfactual"),
    ("./data/intermediate/topography/elevation.tif", "elevation"),
    ("./data/intermediate/topography/aspect.tif", "aspect"),
    ("./data/intermediate/topography/slope.tif", "slope"),
    ("./data/intermediate/CA_storie/CA_storie.tif", "soil"),
):
    add_columns(here(_raster_path), _column)

In [53]:
# write the time-invariant version of the grid to disk (row index omitted)
dataframe.to_csv(
    path_or_buf=here("./data/for_analysis/full_grid_time_invariant.csv"),
    index=False,
)



In [55]:
# add time varying variables (PET and ET)

# This cell replaces `dataframe` with a repeated copy of itself, so running it
# twice would silently multiply the rows again. Stop instead of doing that.
if "start_date" in dataframe.columns:
    raise RuntimeError(
        "dataframe already has a start_date column; rebuild the time-invariant "
        "grid before expanding it over start dates again"
    )

# first read in the start dates that each layer corresponds to
# NOTE: pickle.load can execute arbitrary code -- only load a start_dates.pkl
# that was produced by this project.
with open(here("./data/intermediate/start_dates.pkl"), 'rb') as f:
    start_date = pickle.load(f)

# repeat the dataframe once for each start date: np.repeat yields each date
# dataframe.shape[0] times in a row, matching the stacked copies from concat
repeated_start_date = np.repeat(start_date, dataframe.shape[0])
# ignore_index=True gives the stacked copies a fresh 0..n-1 index instead of
# repeating the original labels once per start date
dataframe = pd.concat([dataframe]*len(start_date), ignore_index=True)
dataframe["start_date"] = repeated_start_date

In [None]:
# append the two time-varying rasters (PET and ET) as columns
add_columns(
    file=here("./data/intermediate/PET/PET_rolling_avg.tif"),
    name="PET",
)

add_columns(
    file=here("./data/intermediate/ECOSTRESS/ETinst_rolling_average.tif"),
    name="ET",
)

In [None]:
# write the complete dataset to disk (row index omitted)
dataframe.to_csv(
    path_or_buf=here("./data/for_analysis/full_grid.csv"),
    index=False,
)

In [None]:
# keep only the rows whose agriculture indicator equals 1, then save them
ag = dataframe.loc[dataframe["agriculture"].eq(1)]
ag.to_csv(
    path_or_buf=here("./data/for_analysis/agriculture.csv"),
    index=False,
)

In [None]:
# keep only the rows whose counterfactual (vegetation) indicator equals 1, then save them
veg = dataframe.loc[dataframe["counterfactual"].eq(1)]
veg.to_csv(
    path_or_buf=here("./data/for_analysis/counterfactual.csv"),
    index=False,
)