## 1. Import Python Libraries 

In [1]:
import os
import json
import h5py
import datetime

import numpy as np
import pandas as pd

## 2. Define global variables 

### 2.1. Define date range for data extractions and collations 

In [3]:
sdate = datetime.datetime(2020, 5, 1) ## DEFINE YOUR DESIRED STARTING DATE FOR DATA EXTRACTION AND COLLATION
edate = datetime.datetime(2021, 10, 1) ## DEFINE YOUR DESIRED ENDING DATE FOR DATA EXTRACTION AND COLLATION

# Every calendar day from sdate to edate (both ends inclusive), formatted as a
# zero-padded 'YYYY-MM-DD' string. strftime does the zero-padding of month and
# day, so no manual string patching is needed.
delta = edate - sdate
final_datetimes_list = [
    (sdate + datetime.timedelta(days=day_offset)).strftime('%Y-%m-%d')
    for day_offset in range(delta.days + 1)
]

### 2.2. Define moving average (MA) & multi-time steps (S) for analysis 

In [2]:
MA_day_list = [1,3,5,7] ## MOVING AVERAGE (MA) WINDOWS TO PROCESS: 1, 3, 5 AND 7 DAYS

In [4]:
# NOTE(review): starting_date / ending_date are not read by any visible cell;
# they are kept in case code outside this view relies on them.
starting_date = '2020-05-01'
ending_date = '2021-10-01'
multi_time_steps = [1,3,5,7,9] ## DEFINE 1, 3, 5, 7 and 9 days MULTI-TIME STEPS (S)
fixed_lead_time = 1

# Resulting structure:
#   {'<S>_day_multi_time_steps': {modelled_date: [input_date, ...]}}
# For each modelled date the S input dates run from (S + fixed_lead_time) days
# before it up to (1 + fixed_lead_time) days before it, oldest first. All dates
# are zero-padded 'YYYY-MM-DD' strings.
full_scenario_multi_time_dict = dict()
for multi_time_value in multi_time_steps:

    # Look-back distances in days, largest first so that the generated dates
    # come out in chronological order.
    day_offsets = [k + fixed_lead_time for k in range(multi_time_value, 0, -1)]

    modelled_date_multi_dates = dict()
    for datetime_item in final_datetimes_list:
        refdate = datetime.datetime.strptime(datetime_item, "%Y-%m-%d")
        modelled_date_multi_dates[datetime_item] = [
            (refdate - datetime.timedelta(days=day_offset)).strftime("%Y-%m-%d")
            for day_offset in day_offsets
        ]

    full_scenario_multi_time_dict[str(multi_time_value) + '_day_multi_time_steps'] = modelled_date_multi_dates

## 3. Import processed data, followed by data management 

In [7]:
DEFINE_YOUR_OWN_HDF5_LOC = ''
DEFINE_YOUR_SUMMARY_FILES_LOC = ''

h5_files_location = DEFINE_YOUR_OWN_HDF5_LOC
processed_path_location = DEFINE_YOUR_SUMMARY_FILES_LOC

# NOTE(review): `continent` is not assigned in any visible cell. It must be
# defined before this cell runs, otherwise the next lines raise NameError.


def _first_position_by_date(dates):
    """Map each date to the position of its first occurrence in *dates*.

    Gives the same position as ``dates.index(date)`` but in O(1) per lookup.
    """
    positions = dict()
    for position, date_value in enumerate(dates):
        positions.setdefault(date_value, position)
    return positions


# The two input tables do not depend on the scenario, so they are read once
# here instead of once per (multi-time step, MA window) combination.
# os.path.join is used so that an empty location means "current directory"
# rather than producing a path anchored at the filesystem root.
data_features_filename = 'Global-R_summary_Jan2020-Oct2021_20features.csv' ## DATA FILE EXTRACTED FROM SHARED FOLDER ##
final_data_features_path = os.path.join(processed_path_location, data_features_filename)
targets_filename = continent + '_G_D_targets.csv'
final_targets_path = os.path.join(processed_path_location, targets_filename)

data_features = pd.read_csv(final_data_features_path)
data_targets = pd.read_csv(final_targets_path)

DATES_DATA_FEATURES = list(data_features['Date'])
DATES_DATA_TARGETS = list(data_targets['Date'])

# Date -> row position lookups. A date missing from a table now raises
# KeyError naming that date (the previous list.index call raised ValueError).
features_position_by_date = _first_position_by_date(DATES_DATA_FEATURES)
targets_position_by_date = _first_position_by_date(DATES_DATA_TARGETS)

# Every column after the first one is treated as an input feature.
feature_columns = list(data_features.columns[1:])

for multi_time_value in multi_time_steps:
    # {modelled date: [input dates]} for this number of time steps.
    extracted_dates_dict = full_scenario_multi_time_dict[str(multi_time_value) + '_day_multi_time_steps']

    for MA_days in MA_day_list:

        G_RATE_DATA = list(data_targets['G_rate-' + str(MA_days) + ' days MA'])
        D_RATE_DATA = list(data_targets['D_rate-' + str(MA_days) + ' days MA'])

        G_target_list = list()
        D_target_list = list()
        x_features_list = list()
        ass_G_features_list = list()
        ass_D_features_list = list()
        x_features_names_dict = dict()
        for model_date, features_dates in extracted_dates_dict.items():

            # Targets: the G and D rates on the modelled date itself.
            target_index = targets_position_by_date[model_date]
            G_target_list.append(G_RATE_DATA[target_index])
            D_target_list.append(D_RATE_DATA[target_index])

            sub_x_features_names_list = list()
            sub_x_features_list = list()
            sub_G_features_list = list()
            sub_D_features_list = list()
            for feature_date in features_dates:
                # Inputs: the feature row of each look-back date ...
                features_index = features_position_by_date[feature_date]
                sub_x_features_list.append(list(data_features.iloc[features_index, 1:]))

                # ... plus the G and D rates observed on that same date.
                sel_targets_index = targets_position_by_date[feature_date]
                sub_G_features_list.append(G_RATE_DATA[sel_targets_index])
                sub_D_features_list.append(D_RATE_DATA[sel_targets_index])

                # Feature names are suffixed with the date they were taken from.
                sub_x_features_names_list.append([name + '_' + feature_date for name in feature_columns])

            x_features_list.append(sub_x_features_list)
            ass_G_features_list.append(sub_G_features_list)
            ass_D_features_list.append(sub_D_features_list)
            x_features_names_dict[model_date] = sub_x_features_names_list

        # Targets are stored as single-column arrays; the input arrays keep the
        # nesting [modelled date][time step](...).
        G_targets_array = np.array(G_target_list).reshape(-1,1)
        D_targets_array = np.array(D_target_list).reshape(-1,1)
        X_features_array = np.array(x_features_list)
        ass_G_features_array = np.array(ass_G_features_list)
        ass_D_features_array = np.array(ass_D_features_list)

        # Common path stem of the .h5 / .json output pair for this scenario.
        output_path_stem = os.path.join(
            h5_files_location,
            continent + '_' + str(multi_time_value) + '_day_multi_time_steps-' + str(MA_days) + '_days_MA')

        # The with-block closes the HDF5 file even if writing a dataset fails.
        with h5py.File(output_path_stem + '.h5', 'w') as h5_file:
            h5_file.create_dataset('X_array', data=X_features_array)
            h5_file.create_dataset('G_array', data=G_targets_array)
            h5_file.create_dataset('D_array', data=D_targets_array)
            h5_file.create_dataset('ASS_G_array', data=ass_G_features_array)
            h5_file.create_dataset('ASS_D_array', data=ass_D_features_array)

        with open(output_path_stem + '.json', 'w') as outfile:
            json.dump(x_features_names_dict, outfile)