# import libraries

# In [1]:
import os
import h5py
import xarray as xr
import numpy as np
from datetime import datetime, timedelta
import random
import matplotlib.pyplot as plt
import pandas as pd

# Set file paths

# In [2]:
# Root directory that is searched recursively for IMERG files (".HDF5").
imerg_path = r"file_path_to_your_imerg_files"
# Root directory that is searched recursively for SEVIRI files (".hdf5").
# It has to be defined here because the file-listing step walks `seviri_path`;
# without this assignment that step stops with a NameError.
seviri_path = r"file_path_to_your_seviri_files"
# Name of the CSV the statistics table is written to. Name it after your
# selection criteria and keep the ".csv" extension.
output_file = r'your_output_name.csv'

# Define time intervals

# In [3]:
# Time settings. According to the original note these describe the time
# interval of the IPW and the preceding period taken into account for the
# estimation. NOTE(review): none of them is referenced in the visible part of
# this file, so confirm the exact meaning of each against the code using them.
time_interval_ipw = timedelta(minutes=2)   # 2 minutes
step_time_ipw = timedelta(minutes=30)      # 30 minutes
time_interval = timedelta(hours=2)         # 2 hours

# Get time of the files

# In [4]:
# Function to extract datetime from filename
def extract_datetime_seviri(filename):
    """Parse the timestamp embedded in a SEVIRI file name.

    The sixth dash-separated field of ``filename`` is taken, everything from
    its first ``'.'`` onwards is dropped, and the remainder is parsed as
    ``YYYYmmddHHMMSS``.

    Raises:
        IndexError: if ``filename`` has fewer than six dash-separated fields.
        ValueError: if the extracted text does not match ``%Y%m%d%H%M%S``.
    """
    dash_fields = filename.split('-')
    # Keep only the part in front of the first dot (e.g. fractional seconds
    # or a file extension may follow).
    stamp, _, _ = dash_fields[5].partition('.')
    return datetime.strptime(stamp, "%Y%m%d%H%M%S")

def extract_datetime_imerg(filename):
    """Parse the timestamp embedded in an IMERG file name.

    ``filename`` may be a bare file name or a path; only its final path
    component is inspected. The fifth dot-separated field of that name is
    split on ``'-'``: the first part supplies the date (``YYYYmmdd``) and
    characters 1-6 of the third part supply the time (``HHMMSS``), i.e. the
    leading marker letter of that part is skipped.

    NOTE(review): for names shaped like ``...20200101-S003000-E005959...``
    the third part is the ``E`` field, so the returned time would be the end
    of the window, not its start - confirm this is intended.

    Raises:
        IndexError: if the name does not contain the expected fields.
        ValueError: if the extracted text does not match ``%Y%m%d%H%M%S``.
    """
    # Drop the directory part first: a dot anywhere in a parent directory
    # (e.g. "./data" or "v1.0/") would otherwise shift the field index below.
    name = os.path.basename(filename)
    date_fields = name.split('.')[4].split('-')
    day = date_fields[0]
    hoday = date_fields[2][1:7]
    return datetime.strptime(day + hoday, "%Y%m%d%H%M%S")

# In [5]:
# Sorted names (without directory) of every ".HDF5" file found anywhere
# below imerg_path.
imerg_files = sorted(
    name
    for _dirpath, _subdirs, names in os.walk(imerg_path)
    for name in names
    if name.endswith(".HDF5")
)

# Sorted names (without directory) of every ".hdf5" file found anywhere
# below seviri_path.
seviri_files = sorted(
    name
    for _dirpath, _subdirs, names in os.walk(seviri_path)
    for name in names
    if name.endswith(".hdf5")
)

# Get files within a certain interval

# In [7]:
# Function to find all files within a time interval
def get_files_in_interval_imerg(start_time, end_time, imerg_files):
    """Return the IMERG files whose timestamp lies in [start_time, end_time].

    Both bounds are inclusive. The timestamp of each entry is obtained with
    ``extract_datetime_imerg``; the input order is preserved.
    """
    selected = []
    for candidate in imerg_files:
        if start_time <= extract_datetime_imerg(candidate) <= end_time:
            selected.append(candidate)
    return selected

# Function to find all files within a time interval
def get_files_in_interval_seviri(start_time, end_time, seviri_files):
    """Return the SEVIRI files whose timestamp lies in [start_time, end_time].

    Both bounds are inclusive. The timestamp of each entry is obtained with
    ``extract_datetime_seviri``; the input order is preserved.
    """
    selected = []
    for candidate in seviri_files:
        if start_time <= extract_datetime_seviri(candidate) <= end_time:
            selected.append(candidate)
    return selected

# Get weights for probability sampling

# In [11]:
# Per-file statistics. Each list receives exactly one entry for every file
# that was processed successfully, so the lists always stay the same length
# and can be combined column-wise afterwards.
p_dt = []

p_sum = []
p_num_gridcells = []
p_average = []
p_coverage = []
p_peak = []
p_num_min_max_5mm = []
p_max_num = []

# Traverse through all subdirectories and collect the full path of every
# IMERG file (".HDF5") below imerg_path
imerg_files_path = []
for root, dirs, files in os.walk(imerg_path):
    for file in files:
        if file.endswith(".HDF5"):
            imerg_files_path.append(os.path.join(root, file))

for file in imerg_files_path:

    # Open the file through a context manager so its handle is released
    # again; otherwise every processed file would stay open until the end.
    with xr.open_dataset(file, engine='netcdf4') as ds:

        try:
            precip = ds['IMERG_FR']
        except KeyError:
            # The precipitation variable is missing: skip this file entirely
            print('file is empty')
            continue

        # Mask of all grid cells with precipitation > 1 mm/h
        wet = precip > 1.

        # Sum of precipitation over all grid cells > 1 mm/h
        precip_sum = precip.where(wet).sum().item()

        # Number of grid cells with precipitation > 1 mm/h
        precip_num = wet.sum().item()

        # Average precipitation of the grid cells > 1 mm/h
        if precip_num > 0:
            precip_average = precip_sum / precip_num
        else:
            # If there is no precipitation set it to a really small value
            precip_average = 0.0001

        # Maximum precipitation
        precip_max = precip.max().item()

        # Number of grid cells that hold the maximum value. This has to be
        # an equality test: nothing can be strictly greater than the maximum,
        # so a ">" comparison would always count zero cells.
        precip_max_num = (precip == precip_max).sum().item()

        # Number of grid cells with more than 5 mm/h
        precip_num_min_max_5mm = (precip > 5.).sum().item()

        # Fraction of all grid cells with precipitation > 1 mm/h. The
        # denominator is the total cell count, which is also correct for
        # non-square grids (shape[0] * shape[0] is not).
        precip_perc = precip_num / precip.size

        # Timestamp of the file. Only the bare file name is parsed, so dots
        # in the directory part of the path cannot disturb the field lookup.
        timestamp = extract_datetime_imerg(os.path.basename(file))

    # Append the complete row only after every value has been computed
    p_dt.append(timestamp)
    p_average.append(precip_average)  # Average rainfall of grid cells > 1 mm/h
    p_peak.append(precip_max)  # Maximum rainfall value
    p_sum.append(precip_sum)  # Sum of all the rainfall bigger than 1 mm/h
    p_num_gridcells.append(precip_num)  # Number of grid cells with rainfall bigger than 1 mm/h
    p_num_min_max_5mm.append(precip_num_min_max_5mm)  # Number of grid cells with rainfall bigger than 5 mm/h
    p_coverage.append(precip_perc)  # Fraction of the area with precipitation > 1 mm/h
    p_max_num.append(precip_max_num)  # Number of grid cells that contain the maximum precipitation value

# In [12]:
# Assemble the per-file statistics into one table, one row per processed
# file; the dict order fixes the column order of the CSV.
df = pd.DataFrame({
    'time': p_dt,
    'sum': p_sum,
    'num_grid_cells': p_num_gridcells,
    'average': p_average,
    'coverage': p_coverage,
    'peak': p_peak,
    'num_5mm': p_num_min_max_5mm,
    'num_max': p_max_num,
})

# Write the table to disk without the row index
df.to_csv(output_file, index=False)