https://gpm.nasa.gov/sites/default/files/2023-07/IMERG_TechnicalDocumentation_final_230713.pdf (technical documentation on GPM)

In IMERG, the half-hourly precipitation measurement is based on a single microwave snapshot (or interpolated data if no microwave data are available). The precipitation rate in mm/hr is assumed to be valid for the entire half-hour period (mm/hr)

to do: 
- potentially adjust threshold for qi index
- nan logic for handling nans in morning, afternoon or evening precip and how to calculate those sums


Gaoyun APE def: 

morning (0600-1300 LST), afternoon (1400-2000 LST), and evening (2100-2400 LST) precipitation events in our analysis. Afternoon precipitation events (APEs) are identified as daily samples that meet the following two criteria: 1) daily precipitation peaks during the afternoon hours defined above; and 2) the afternoon precipitation is at least twice as large as the morning precipitation, and also greater than the evening precipitation (filter out organized precipitation).

APE def from other paper: 

If on a given day the 3°x3° domain receives less than 0.004 mm precipitation during the morning hours (defined as 0700 –1100 CST, i.e. 0.001 mm hr-1 average), greater than 0.004 mm during the “afternoon” hours (defined as 1100 – 2300 CST), and the daily precipitation peak occurs during one of the “afternoon” hours, this day is classified as APE. https://agupubs.onlinelibrary.wiley.com/action/downloadSupplement?doi=10.1029%2F2018GL078598&file=grl57813-sup-0001-2018GL078598_SI.pdf


In [2]:
import os
import pandas as pd
import glob 
import numpy as np
import matplotlib.pyplot as plt
import requests
import xarray as xr
from datetime import datetime, timedelta
import h5py
import gc
import datetime
from urllib.parse import urljoin
import pytz
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
pd.set_option('display.max_rows', 500)

In [3]:
#reading in HDF5 files
folder_path = '/data/rong3/annie/gpm/2000'

gpm_files = sorted(glob.glob(os.path.join(folder_path, '*.HDF5')))

7344


In [4]:
#parallel processed

base_date = datetime.datetime(1980, 1, 6, 0, 0, 0)

lat_start = 30.05

lat_end = 39.05

lon_start = -105.05

lon_end = -95.05

def convert_to_cst(seconds_since_base):
    
    if isinstance(seconds_since_base, (np.int32, np.int64)):
        
        seconds_since_base = int(seconds_since_base)
    
    time_delta = timedelta(seconds=seconds_since_base)
    
    # Calculate UTC time
    utc_time = base_date + time_delta
    
    # Convert UTC time to CST (subtract 6 hours for CST)
    cst_time = utc_time - timedelta(hours=6)
    
    # Remove timezone information to make it naive
    naive_cst_time = cst_time.replace(tzinfo=None)

    return naive_cst_time

def process_file(file):
    
    threshold_nans = 1510 #10% of total in lat/lon range
    
    threshold_qi = 0.4 #above 0.4 falls into "fair" and "good" based on documentation
    
    if os.path.isfile(file) and os.path.getsize(file) > 0:
        
        with h5py.File(file, 'r', locking=False) as f:
            
            # Geographical filtering
            
            lon = f['Grid/lon'][:]
            
            lat = f['Grid/lat'][:]

            lon_mask = (lon >= lon_start) & (lon <= lon_end)
            
            lat_mask = (lat >= lat_start) & (lat <= lat_end)

            lon_indices = np.where(lon_mask)[0]
            
            lat_indices = np.where(lat_mask)[0]

            # Time conversion
            time = f['Grid/time'][:][0]
            
            lst = convert_to_cst(time)

            # Precipitation filtering by geography
            precip = xr.DataArray(f['Grid/precipitation'][:][0], dims=['lon', 'lat'])
            
            psub = precip.isel(lon=lon_indices, lat=lat_indices)
            
            psub = psub.where(psub >= 0, np.nan)  # Replace all negative values with NaNs

            # Precipitation filtering with quality index
            precip_qi = xr.DataArray(f['Grid/precipitationQualityIndex'][:][0], dims=['lon', 'lat'])
            
            psub_qi = precip_qi.isel(lon=lon_indices, lat=lat_indices)
            
            qi_mask = psub_qi > threshold_qi
            
            qi_psub = psub.where(qi_mask, np.nan)

            nans = np.sum(np.isnan(qi_psub)).item()

            if nans < threshold_nans:
                
                sgp_precip_average = np.nanmean(qi_psub)
            
            else:
                
                sgp_precip_average = np.nan

            return lst, sgp_precip_average

    else:
        
        print(f"Not a file: {file}")
        
        return None, None

def read_and_filter_data_parallel(files):
    
    num_files = len(files)
    
    lst_array = []
    
    precip_means = []

    with ThreadPoolExecutor(max_workers=20) as executor: #max_workers means # of threads at once
        
        future_to_file = {executor.submit(process_file, file): file for file in files}
        
        for future in as_completed(future_to_file):
            
            lst, sgp_precip_average = future.result()
            
            if lst is not None:
                
                lst_array.append(lst)
                
                precip_means.append(sgp_precip_average)

    # Convert lists to numpy arrays
    lst_array = np.array(lst_array, dtype='datetime64[ns]')
    
    precip_means = np.array(precip_means, dtype=float)

    return lst_array, precip_means


lst_array, precip_means = read_and_filter_data_parallel(gpm_files)

df = pd.DataFrame({'LST': lst_array, 'mm_per_hr': precip_means}).sort_values(by='LST')

df = df.reset_index(drop=True)

## APE classification

In [5]:
#convert from mm/hr to total mm, mm/hr*0.5hr = mm 
df['total_mm'] = df['mm_per_hr']*0.5

#calculating total mm for morning, afternoon and evening based on gaoyun's definition 
df['hr_min'] = pd.to_datetime(df['LST']).dt.strftime('%H:%M')

df['date'] = pd.to_datetime(df['LST']).dt.date

def classify_time_of_day(time):
    
    if "06:00" <= time < "14:00": #8 hours x 2 = 16 total observations
        
        return 'morning'
    
    elif "14:00" <= time < "21:00": #7 hours x 2 = 14 total observations
        
        return 'afternoon'
    
    elif "21:00" <= time <= "23:30": #3 hours x 2 = 6 total observations
        
        return 'evening'

# # Apply the function to create a new column
df['time_of_day'] = df['hr_min'].apply(classify_time_of_day)

df = df[pd.notna(df["time_of_day"])]

In [6]:
#nans adjustment
na_counts = df.groupby(['date', 'time_of_day'])['total_mm'].apply(lambda x: x.isna().sum())

tod_means = df.groupby(['date', 'time_of_day'])['total_mm'].mean()

total_counts = df.groupby(['date', 'time_of_day'])['total_mm'].count() + na_counts

proportions_na = na_counts / total_counts

proportions_df = proportions_na.reset_index(name='proportions_na')

tod_means_df = tod_means.reset_index(name='tod_means')

In [7]:
df = df.merge(proportions_df, on=['date', 'time_of_day'], how='left')

df = df.merge(tod_means_df, on=['date', 'time_of_day'], how='left')

In [8]:
def nans_adjustment(row):
    
    if row['proportions_na'] >= 0.5:
        
        return np.nan
    
    elif row['proportions_na'] < 0.5 and not pd.isna(row['total_mm']):
        
        return row['total_mm']
    
    elif row['proportions_na'] < 0.5 and pd.isna(row['total_mm']):
        
        return row['tod_means']
    
    else:
        
        return np.nan

# Apply the function and create a new column 'total_mm_clean'
df['total_mm_clean'] = df.apply(nans_adjustment, axis=1)

In [9]:
def custom_sum(values):
    
    if pd.isna(values).any():
        
        return np.nan
    
    else:
        
        return values.sum()

In [10]:
#condition 1 -> afternoon is 2x morning and also greater than evening

sums = df.pivot_table(index='date', columns='time_of_day', values='total_mm_clean', aggfunc=custom_sum)

sums = sums.dropna(how='any')

sums['cond1'] = (sums['afternoon'] >= 2 * sums['morning']) & (sums['afternoon'] > sums['evening'])

In [11]:
#condition 2

peak = df.loc[df.groupby('date')['total_mm'].idxmax()][['date', 'hr_min']]

peak['cond2'] = peak['hr_min'].apply(classify_time_of_day)

peak = peak.reset_index(drop=True)

ape_conditions = pd.merge(sums, peak, on='date', how='inner')

ape_conditions['APE'] = (ape_conditions['cond1'] == True) & (ape_conditions['cond2'] == 'afternoon')

In [15]:
ape_conditions

Unnamed: 0,date,afternoon,evening,morning,cond1,hr_min,cond2,APE
0,2001-05-02,1.209279,1.028254,0.020153,True,22:30,evening,False
1,2001-05-03,0.952842,0.768568,0.769582,False,22:30,evening,False
2,2001-05-04,3.98819,1.891001,3.610083,False,18:30,afternoon,False
3,2001-05-07,0.163817,0.016172,0.450646,False,06:00,morning,False
4,2001-05-08,0.225474,0.056071,0.05146,True,16:30,afternoon,True
5,2001-05-11,1.538211,1.208882,0.275501,True,20:30,afternoon,True
6,2001-05-12,0.43033,0.064929,0.244665,False,19:00,afternoon,False
7,2001-05-13,0.117809,0.049994,0.008302,True,19:30,afternoon,True
8,2001-05-14,0.262484,0.019342,0.022757,True,18:30,afternoon,True
9,2001-05-15,0.006363,0.002105,0.000884,True,19:00,afternoon,True


In [17]:
#total count of APEs in set

(np.sum(ape_conditions['APE'] == True)/ape_conditions.shape[0])*100


45.45454545454545

In [14]:
ape_conditions[ape_conditions['APE']==True]

Unnamed: 0,date,afternoon,evening,morning,cond1,hr_min,cond2,APE


In [494]:
# #to do: fix this, if less than half of the values are nans for each time and time of day, replace nans with average
# #value, otherwise leave as nans and let pivot table (with fill_value=np.nan) keep as nan




# # proportions_df.set_index(['date', 'time_of_day'], inplace=True)

# def fill_na(row):
#     key = (row['date'], row['time_of_day'])
    
#     # Set the index and lookup the proportion
#     proportion = proportions_df.set_index(['date', 'time_of_day']).at[key, 'proportions_na'] if key in proportions_df.set_index(['date', 'time_of_day']).index else np.nan
    
#     if proportion > 0.5:
#         return row['total_mm']  # Leave NaNs as NaNs
#     else:
#         # Calculate the mean of total_mm for the same date and time_of_day
#         avg_value = df[(df['date'] == row['date']) & 
#                        (df['time_of_day'] == row['time_of_day'])]['total_mm'].mean()
        
#         return avg_value if pd.isna(row['total_mm']) else row['total_mm']

# # Apply the filling function
# df['total_mm_clean'] = df.apply(fill_na, axis=1)

  tod_means_nans = df.groupby(['date', 'time_of_day'])['total_mm'].agg(np.nanmean)


In [235]:
#not parallel processed 


def read_and_filter_data(files):
    
    threshold_nans = 1510 #10% of total in lat/lon range
    
    threshold_qi = 0.4 #above 0.4 falls into "fair" and "good" based on documentation, below "questionable"
    
    num_files = 140000 #for 153 (warm season days in every year) * 48 (half-hourly measurements) * 19 (total years) = 139536 files
    
    lst_array = np.empty(num_files, dtype='datetime64[ns]')
    
    precip_means = np.empty(num_files, dtype=float)
    
    count = 0
    
    for file in files:
        
        if os.path.isfile(file) and os.path.getsize(file) > 0:
            
            with h5py.File(file, 'r', locking=False) as f:
                
                #geographical filtering 
                lon = f['Grid/lon'][:]
                
                lat = f['Grid/lat'][:]

                lon_mask = (lon >= lon_start) & (lon <= lon_end)
                
                lat_mask = (lat >= lat_start) & (lat <= lat_end)

                lon_indices = np.where(lon_mask)[0]
                
                lat_indices = np.where(lat_mask)[0]
                
                #time conversion
                time = f['Grid/time'][:][0]
                
                lst = convert_to_cst(time)
                
                #precipitation filtering by geography
                precip = xr.DataArray(f['Grid/precipitation'][:][0], dims=['lon', 'lat'])
                
                psub = precip.isel(lon=lon_indices, lat=lat_indices)
                
                psub = psub.where(psub >=0, np.nan) #replace all negative values (which includes -9999.9) values with nans
                
                #precipitation filtering with quality index
                precip_qi = xr.DataArray(f['Grid/precipitationQualityIndex'][:][0], dims=['lon', 'lat'])
                
                psub_qi = precip_qi.isel(lon=lon_indices, lat=lat_indices)
                
                qi_check = np.sum(psub_qi.where(psub_qi > 0.4))
                
                qi_total = psub_qi.shape[0]*psub_qi.shape[1]
                                  
                qi_mask = psub_qi > threshold_qi

                qi_psub = psub.where(qi_mask, np.nan)
                
                nans = np.sum(np.isnan(qi_psub)).item()
                
                lst_array[count] = lst

                if nans < threshold_nans: 
                    
                    sgp_precip_average = np.nanmean(qi_psub)
                
                else: 
                    
                    sgp_precip_average = np.nan
    
                    
                precip_means[count] = sgp_precip_average
                
                count += 1
                    
        else:
            print(f"Not a file: {file}")
    
    return lst_array[:count], precip_means[:count]

lst_array, precip_means = read_and_filter_data(gpm_files)

non_par_df = pd.DataFrame({'LST': lst_array, 'mean precip': precip_means})

In [163]:
# file_path = gpm_small[0]

# def print_item(name, obj):
    
#     if isinstance(obj, h5py.Group):
        
#         print(f"Group: {name}")
    
#     elif isinstance(obj, h5py.Dataset):
        
#         print(f"Dataset: {name}")
#         # Print attributes for this dataset
        
#         print("  Attributes:")
        
#         for attr_name, attr_value in obj.attrs.items():
            
#             print(f"    {attr_name}: {attr_value}")

# with h5py.File(file_path, 'r', locking=False) as f:
    
#     f.visititems(print_item)

            

In [571]:
file = gpm_files[0]
        
if os.path.isfile(file) and os.path.getsize(file) > 0:

    with h5py.File(file, 'r', locking=False) as f:
        
        lon = f['Grid/lon'][:]
        
        lat = f['Grid/lat'][:]

lon

array([-179.95   , -179.85   , -179.75   , ...,  179.75   ,  179.84999,
        179.95   ], dtype=float32)