In [24]:
import datetime
import os
import subprocess

import pandas as pd
from osgeo import gdal
from pyprojroot import here

In [25]:
# gather the name of every GeoTIFF (.tif) found in the raw ECOSTRESS folder
file_list = list(filter(
    lambda fname: fname.endswith('.tif'),
    os.listdir(str(here("./data/raw/ECOSTRESS/")))
))

In [26]:
# pull either the acquisition timestamp or the raster type (ET vs. uncertainty) out of a file name
def parse_filename(file_string,result='timestamp'):
    """Extract one field from an underscore-delimited ECOSTRESS file name.

    The name is split on '_' and must yield exactly seven pieces; the fifth
    piece is the raster type and the sixth is the time stamp (for example
    'doy2019028010658').

    Args:
        file_string: file name to parse.
        result: 'timestamp' (default) or 'raster_type'.

    Returns:
        A datetime.datetime for 'timestamp', the raster type string for
        'raster_type', and None for any other value of ``result``.

    Raises:
        ValueError: if the name does not split into exactly seven pieces, or
            the time stamp does not match year + day-of-year + HHMMSS.
    """
    # unpack (rather than index) so a name with the wrong number of fields still raises ValueError
    _, _, _, _, kind, stamp, _ = file_string.split('_')

    if result == 'raster_type':
        return kind
    if result == 'timestamp':
        # drop the three leading characters, then read year, day of year, hour, minute, second
        return datetime.datetime.strptime(stamp[3:], "%Y%j%H%M%S")
    return None

In [33]:
# shared helper: run one gdal_calc.py expression over a set of rasters bound to the letter A
def _run_gdal_calc(file_list, output, calc, verb):
    """Invoke gdal_calc.py with every file in ``file_list`` passed as ``-A``.

    Args:
        file_list: paths of the input rasters.
        output: path of the raster to write; must be non-empty.
        calc: expression handed to ``--calc``.
        verb: word used in the progress message ("Averaging", "Counting").

    Returns:
        None. The tool's standard output is printed; its standard error is
        printed as well when it exits with a non-zero status.

    Raises:
        ValueError: if ``output`` is empty or None.
    """
    if not output:
        raise ValueError("Must provide output filename as argument")

    file_list = list(file_list)
    if not file_list:
        # without inputs the command would end in a dangling "-A"; skip instead
        print("No input files for {output}; skipping".format(output=output))
        return None

    print("{verb} {files}".format(verb=verb, files=" -A ".join(file_list)))
    print("Output {output}".format(output=output))

    # an argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and needs no manual quoting of the expression
    command = ["gdal_calc.py", "-o", output, "-of", "gtiff",
               "--extent=intersect", "--calc=" + calc]
    for path in file_list:
        command.extend(["-A", path])

    completed = subprocess.run(command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, universal_newlines=True)
    print(completed.stdout)
    if completed.returncode != 0:
        # surface the failure reason instead of silently discarding it
        print(completed.stderr)
    return None


# function to average all the files together using GDAL
def average_files(file_list, output=None):
    """Write the per-pixel average of ``file_list`` to ``output`` with gdal_calc.py.

    Args:
        file_list: paths of the rasters to average; an empty list is skipped.
        output: path of the averaged raster (required).

    Returns:
        None.

    Raises:
        ValueError: if ``output`` is not provided.
    """
    # NOTE(review): axis=0 assumes A is a stack of several rasters; confirm how
    # gdal_calc.py shapes A when only a single file is passed.
    return _run_gdal_calc(file_list, output, "numpy.average(A,axis=0)", "Averaging")


# function to count how many files contribute a value to a pixel using GDAL
def pixel_count(file_list, output=None):
    """Write, per pixel, how many rasters in ``file_list`` hold a non-NaN value.

    Args:
        file_list: paths of the rasters to count over; an empty list is skipped.
        output: path of the count raster (required).

    Returns:
        None.

    Raises:
        ValueError: if ``output`` is not provided.
    """
    # the expression must be numpy syntax; the previous "!is.na(A)" was R syntax
    # and could not be evaluated.
    # assumes missing pixels are stored as NaN -- TODO confirm against the rasters' NoData value
    return _run_gdal_calc(file_list, output, "numpy.sum(~numpy.isnan(A),axis=0)", "Counting")

In [28]:
# build one row per raster file, annotated with its time stamp, raster type and month group
ecostress_dir = str(here("./data/raw/ECOSTRESS/"))

df = pd.DataFrame(file_list, columns=['name'])
df['timestamp'] = df['name'].apply(lambda fname: parse_filename(fname))
df['raster_type'] = df['name'].apply(lambda fname: parse_filename(fname, result='raster_type'))
df['year'] = df['timestamp'].apply(lambda stamp: stamp.year)
df['doy'] = df['timestamp'].apply(lambda stamp: stamp.timetuple().tm_yday)
df['fullname'] = df['name'].apply(lambda fname: ecostress_dir + "/" + fname)
# encode the calendar date as month * 100 + day (e.g. 15 January -> 115)
df['monthday'] = df['timestamp'].apply(lambda stamp: stamp.month * 100 + stamp.day)

# bin edges are monthday codes on the 15th of every other month; the first and
# last bins share the label 'NDJ', which is why the categories are unordered
# NOTE(review): 'SAN' labels the 15 Sep - 15 Nov bin and may be a typo for 'SON';
# it ends up in output file names, so confirm before renaming.
# NOTE(review): rows are later grouped by calendar year, so an 'NDJ' group mixes
# early January with late November/December of the same year -- confirm intended.
bins = [0, 115, 315, 515, 715, 915, 1115, 1300]
labels = ['NDJ', 'JFM', 'MAM', 'MJJ', 'JAS', 'SAN', 'NDJ']
df['monthgroup'] = pd.cut(
    df['monthday'],
    bins=bins,
    labels=labels,
    include_lowest=True,
    ordered=False,
)

In [29]:
# make sure the folder that receives the merged outputs exists
outdir = str(here("./data/intermediate/ECOSTRESS/monthgroup_averaged"))
# exist_ok=True removes the check-then-create race and makes re-running this cell a no-op
os.makedirs(outdir, exist_ok=True)

In [30]:
# distinct years, month groups and raster types present in the file table;
# a merged raster is produced for combinations of these
years, monthgroups, raster_types = (
    df[column].unique() for column in ('year', 'monthgroup', 'raster_type')
)

In [34]:
# take the average for every (year, month group, raster type) combination
for year in years:
    for monthgroup in monthgroups:
        for raster_type in raster_types:

            # select the files with the correct time and raster type
            is_match = (
                (df['year'] == year) &
                (df['monthgroup'] == monthgroup) &
                (df['raster_type'] == raster_type)
            )
            select_file_list = df.loc[is_match, 'fullname'].to_list()

            # the loops walk the full cross product, so some combinations have
            # no files at all; skip them rather than run GDAL with no inputs
            if not select_file_list:
                continue

            # define the name of the file to save to
            merged_output_filename = "avg_{year}_{monthgroup}_{raster_type}.tif".format(
                raster_type=raster_type,
                monthgroup=monthgroup,
                year=year)

            # average and save
            average_files(select_file_list, output=os.path.join(outdir, merged_output_filename))

Averaging /Users/annaboser/Documents/GitHub/ET_agriculture/data/raw/ECOSTRESS/ECO3ETPTJPL.001_EVAPOTRANSPIRATION_PT_JPL_ETinstUncertainty_doy2019028010658_aid0001.tif -A /Users/annaboser/Documents/GitHub/ET_agriculture/data/raw/ECOSTRESS/ECO3ETPTJPL.001_EVAPOTRANSPIRATION_PT_JPL_ETinstUncertainty_doy2019042184230_aid0001.tif -A /Users/annaboser/Documents/GitHub/ET_agriculture/data/raw/ECOSTRESS/ECO3ETPTJPL.001_EVAPOTRANSPIRATION_PT_JPL_ETinstUncertainty_doy2019029001615_aid0001.tif -A /Users/annaboser/Documents/GitHub/ET_agriculture/data/raw/ECOSTRESS/ECO3ETPTJPL.001_EVAPOTRANSPIRATION_PT_JPL_ETinstUncertainty_doy2019027002130_aid0001.tif -A /Users/annaboser/Documents/GitHub/ET_agriculture/data/raw/ECOSTRESS/ECO3ETPTJPL.001_EVAPOTRANSPIRATION_PT_JPL_ETinstUncertainty_doy2019028010750_aid0001.tif -A /Users/annaboser/Documents/GitHub/ET_agriculture/data/raw/ECOSTRESS/ECO3ETPTJPL.001_EVAPOTRANSPIRATION_PT_JPL_ETinstUncertainty_doy2019045010713_aid0001.tif -A /Users/annaboser/Documents/Git

In [None]:
# count the contributing pixels for every (year, month group, raster type) combination
for year in years:
    for monthgroup in monthgroups:
        for raster_type in raster_types:

            # select the files with the correct time and raster type
            is_match = (
                (df['year'] == year) &
                (df['monthgroup'] == monthgroup) &
                (df['raster_type'] == raster_type)
            )
            select_file_list = df.loc[is_match, 'fullname'].to_list()

            # the loops walk the full cross product, so some combinations have
            # no files at all; skip them rather than run GDAL with no inputs
            if not select_file_list:
                continue

            # define the name of the file to save to
            merged_output_filename = "ncount_{year}_{monthgroup}_{raster_type}.tif".format(
                raster_type=raster_type,
                monthgroup=monthgroup,
                year=year)

            # count and save
            pixel_count(select_file_list, output=os.path.join(outdir, merged_output_filename))