In [78]:
import sys, getopt
import pandas as pd
import numpy as np
import os
import joblib
import math
import json
from time import time
from datetime import datetime, timedelta, time,date

In [68]:
def cgm_construction(cgm_df, bolus_df, basal_df):
    combined_df = cgm_df
    # add new zero columns to combined_df: 'normal', 'carbInput', 'insulinCarbRatio'
    combined_df['normal'] = 0
    combined_df['carbInput'] = 0
    combined_df['insulinCarbRatio'] = 0
    combined_df['duration'] = 0
    combined_df['rate'] = 0
    # go through each row of bolus_df and add the bolus data to the combined_df
    for index, row in bolus_df.iterrows():
        # print(row)
        # get the subject and date of the bolus
        time = row['time']
        # find the closest time index in the combined_df to the bolus time
        closest_time_index = np.argmin(abs(combined_df['time'] - time))
        # check if the time lag between the bolus time and the closest time is less than 5 minutes
        # if it is, add the bolus data to the combined_df
        # else, do nothing
        if abs(combined_df['time'][closest_time_index] - time) > timedelta(minutes=5):
            continue
        # print("time is: ", time, "closest time is: ", combined_df['time'][closest_time_index])
        # add the bolus data to the combined_df
        combined_df.at[closest_time_index, 'normal'] = row['normal']
        combined_df.at[closest_time_index, 'carbInput'] = row['carbInput']
        combined_df.at[closest_time_index, 'insulinCarbRatio'] = row['insulinCarbRatio']
    for index, row in basal_df.iterrows():
        # print(row)
        # get the subject and date of the basal
        time = row['time']
        # find the closest time index in the combined_df to the bolus time
        closest_time_index = np.argmin(abs(combined_df['time'] - time))
        # check if the time lag between the bolus time and the closest time is less than 5 minutes
        # if it is, add the bolus data to the combined_df
        # else, do nothing
        if abs(combined_df['time'][closest_time_index] - time) > timedelta(minutes=5):
            continue
        # print("time is: ", time, "closest time is: ", combined_df['time'][closest_time_index])
        # add the bolus data to the combined_df
        combined_df.at[closest_time_index, 'duration'] = row['duration']
        combined_df.at[closest_time_index, 'rate'] = row['rate']

    return combined_df

def seperate_cgm(combined_df):
    # seperate the combined_df into consecutive time series
    # if the time difference between two consecutive time points is greater than 11 minutes, then it is a new time series
    seperate_df_list = []
    start_index = 0
    end_index = 0
    for index, row in combined_df.iterrows():
        if index == 0:
            continue
        if abs(combined_df['time'][index] - combined_df['time'][index - 1]) > timedelta(minutes=21):
            end_index = index - 1
            if end_index - start_index == 0:
                start_index = index
                continue
            seperate_df_list.append(combined_df[start_index:end_index].reset_index(drop=True))
            start_index = index
    # go through each time series and check if the time series needs interpolation
    inter_seperate_df_list = []
    for seperate_df in seperate_df_list:
        # check the length of seperate_df
        if seperate_df.index[-1] < 24 + 6:
            continue
        # change time to unix time in minutes
        seperate_df['time'] = seperate_df['time'].apply(lambda x: x.timestamp() / 60).astype(int)
        # subtract the first time point from all time points
        seperate_df['time'] = seperate_df['time'] - seperate_df['time'].iloc[0]
        # print("seperate_df['time'] is: ", seperate_df['time'])
        # print("The length of the time series is: ", len(seperate_df))
        # print("The last index is: ", seperate_df.index[-1])
        # check if there are any missing time points
        expected_time = 5 * seperate_df.index[-1]
        # print("expected time: ", 5 * (len(seperate_df) - 1))
        if expected_time == seperate_df['time'].iloc[-1]:
            # replace nan with 0
            seperate_df = seperate_df.fillna(0)
            inter_seperate_df_list.append(seperate_df)
            # print("no interpolation needed")
            # if math.isnan(list(seperate_df['time'])[-1]):
            #     print("nan in seperate_df['time']")
            #     print("seperate_df['time'] is: ", seperate_df['time'])
            # print("expected time: ", seperate_df['time'])
            continue
        else:
            expected_time = np.array(np.arange(0, expected_time + 1, 5), dtype=int)
            # print("expected time: ", expected_time)
            inter_seperate_df = pd.DataFrame({'time': expected_time})
            # print("interpolation needed")
            inter_seperate_df['mg/dl'] = np.interp(expected_time, seperate_df['time'], seperate_df['mg/dl'])
            # use linear interpolation to fill in mg/dl
            # add nan for the missing bolus data
            inter_seperate_df['normal'] = 0
            inter_seperate_df['carbInput'] = 0
            inter_seperate_df['insulinCarbRatio'] = 0
            inter_seperate_df['duration'] = 0
            inter_seperate_df['rate'] = 0
            # add the bolus data to the interpolated dataframe according to the time stamp
            for index, row in seperate_df.iterrows():
                if int(row['time'] / 5) > inter_seperate_df.index[-1]:
                    continue
                inter_seperate_df.at[int(row['time'] / 5), 'normal'] = row['normal']
                inter_seperate_df.at[int(row['time'] / 5), 'carbInput'] = row['carbInput']
                inter_seperate_df.at[int(row['time'] / 5), 'insulinCarbRatio'] = row['insulinCarbRatio']
                inter_seperate_df.at[int(row['time'] / 5), 'duration'] = row['duration']
                inter_seperate_df.at[int(row['time'] / 5), 'rate'] = row['rate']

            # add subject to the interpolated dataframe
            inter_seperate_df['subject'] = seperate_df['subject'][0]
            # replace nan with 0
            inter_seperate_df = inter_seperate_df.fillna(0)
            # if math.isnan(list(inter_seperate_df['time'])[-1]):
            #     print("nan in inter_seperate_df['time']")
            #     print("extected time is: ", expected_time)
            #     print("inter_seperate_df['time'] is: ", inter_seperate_df['time'])
            inter_seperate_df_list.append(inter_seperate_df)

    # print("number of time series: ", len(seperate_df_list))
    return inter_seperate_df_list

In [69]:
dataset_path = './sensor-data'
cleaned_files = os.listdir(dataset_path)

# Read in CGM data
cbg_df = pd.DataFrame()
subject = []
for file in cleaned_files:
    if "._" in file:
        continue
    if ".xlsx" not in file:
        continue
    # if the xlsx does not have Basal sheet, skip it
    if "Basal" not in pd.ExcelFile(dataset_path + "/" + file).sheet_names:
        continue
    cgm_df = pd.read_excel(dataset_path + "/" + file, sheet_name='CGM')
    # convert timestamps to mins
    cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
    # delete df['date']
    cgm_df = cgm_df[['time', 'mg/dl']]
    unique_cgm_df = cgm_df.drop_duplicates(subset=['time'])
    new_cgm_df = unique_cgm_df.dropna(subset=['time'])
    new_cgm_df['subject'] = file.replace('.xlsx', '').replace('Subject', '')
    # check if data is ordered by time
    if new_cgm_df['time'][0] >= new_cgm_df['time'][1]:
        # turn the df upside down
        new_cgm_df = new_cgm_df.iloc[::-1]
    # delete df index
    new_cgm_df = new_cgm_df.reset_index(drop=True)

    bolus_df = pd.read_excel(dataset_path + "/" + file, sheet_name='Bolus')
    bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
    bolus_df = bolus_df[['time', 'normal', 'carbInput', 'insulinCarbRatio']]
    unique_bolus_df = bolus_df.drop_duplicates(subset=['time'])
    new_bolus_df = unique_bolus_df.dropna(subset=['time'])
    new_bolus_df['subject'] = file.replace('.xlsx', '').replace('Subject', '')
    # check if data is ordered by time
    if new_bolus_df['time'][0] >= new_bolus_df['time'][1]:
        # turn the df upside down
        print(new_bolus_df['time'][0], new_bolus_df['time'][1])
        new_bolus_df = new_bolus_df.iloc[::-1]
    # delete df index
    new_bolus_df = new_bolus_df.reset_index(drop=True)

    basal_df = pd.read_excel(dataset_path + "/" + file, sheet_name='Basal')
    basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)
    basal_df = basal_df[['time', 'duration', 'rate']]
    unique_basal_df = basal_df.drop_duplicates(subset=['time'])
    new_basal_df = unique_basal_df.dropna(subset=['time'])
    new_basal_df['subject'] = file.replace('.xlsx', '').replace('Subject', '')
    # check if data is ordered by time
    if new_basal_df['time'][0] >= new_basal_df['time'][1]:
        # turn the df upside down
        new_basal_df = new_basal_df.iloc[::-1]
    # delete df index
    new_basal_df = new_basal_df.reset_index(drop=True)

    # # Combine CGM and Bolus data
    combined_df = cgm_construction(new_cgm_df, new_bolus_df, new_basal_df)
    # move subject to the last column
    combined_df = combined_df[['time', 'mg/dl', 'normal', 'carbInput', 'insulinCarbRatio', 'duration', 'rate', 'subject']]

    # print(seperate_df_list)
    # seperate the combined_df into test, test sets (80%, 20%)
    train_df = combined_df.iloc[:int((combined_df.index[-1]+1) * 0.8)]
    test_df = combined_df.iloc[int((combined_df.index[-1]+1) * 0.8):]
    test_df = test_df.reset_index(drop=True)
    # seperate the dfs into consecutive time series
    train_df_list = seperate_cgm(train_df)
    test_df_list = seperate_cgm(test_df)

    # print("train_df_list: \n", train_df_list)
    # print("test_df_list: \n", test_df_list)
    for i in range(len(train_df_list)):
        joblib.dump(train_df_list[i], './uniform_data_baseline/' + 'subject' + train_df_list[i]['subject'][0] + '_train_' + str(i) + '.pkl')
    for i in range(len(test_df_list)):
        joblib.dump(test_df_list[i], './uniform_data_baseline/' + 'subject' + test_df_list[i]['subject'][0] + '_test_' + str(i) + '.pkl')
    print("done with " + file)

  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


2019-06-24 11:30:26+00:00 2019-06-24 10:31:46+00:00
done with Subject36.xlsx


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


2019-11-20 14:15:45+00:00 2019-11-20 12:03:18+00:00
done with Subject37.xlsx


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


2019-07-02 12:23:32+00:00 2019-07-02 02:57:45+00:00
done with Subject29.xlsx


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)


2019-06-25 16:56:14+00:00 2019-06-25 12:11:03+00:00


  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


done with Subject42.xlsx
2019-07-09 12:36:41+00:00 2019-07-09 03:49:12+00:00


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


done with Subject54.xlsx
2019-07-31 14:33:53+00:00 2019-07-31 11:36:28+00:00


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


done with Subject51.xlsx


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


2019-10-11 12:38:02+00:00 2019-10-11 12:37:24+00:00
done with Subject45.xlsx


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


2019-11-21 18:45:34+00:00 2019-11-20 18:30:50+00:00
done with Subject30.xlsx


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


2019-10-10 11:22:29+00:00 2019-10-10 09:20:46+00:00
done with Subject46.xlsx
2019-08-09 16:12:33+00:00 2019-08-09 13:18:18+00:00


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


done with Subject47.xlsx
2019-08-12 12:59:21+00:00 2019-08-12 12:09:22+00:00


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


done with Subject49.xlsx


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)


2019-11-22 17:10:11+00:00 2019-11-22 16:39:14+00:00


  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


done with Subject38.xlsx
2019-09-22 11:20:03+00:00 2019-09-22 09:26:16+00:00


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


done with Subject52.xlsx
2019-09-27 15:34:45+00:00 2019-09-27 14:09:19+00:00


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


done with Subject53.xlsx


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


2019-06-25 17:24:35+00:00 2019-06-25 10:30:21+00:00
done with Subject31.xlsx


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)


2019-06-17 13:20:03+00:00 2019-06-17 09:35:57+00:00


  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


done with Subject39.xlsx
2019-09-26 09:47:39+00:00 2019-09-26 00:18:33+00:00


  cgm_df['time'] = pd.to_datetime(cgm_df['date'], utc=True, infer_datetime_format=True)
  bolus_df['time'] = pd.to_datetime(bolus_df['date'], utc=True, infer_datetime_format=True)
  basal_df['time'] = pd.to_datetime(basal_df['date'], utc=True, infer_datetime_format=True)


done with Subject50.xlsx


### Build dataset for our model
data = ['cgm', 'bolus iob', 'carb absoption curv', 'insulin carb ratio', 'basal iob']

In [76]:
# load the train and test data from the uniform_data_baseline folder
files = os.listdir('./uniform_data_baseline')
train_df_list = []
test_df_list = []

for file in files:
    if "train" in file and '.pkl' in file:
        train_df_list.append(joblib.load('./uniform_data_baseline/' + file))
    if "test" in file and '.pkl' in file:
        test_df_list.append(joblib.load('./uniform_data_baseline/' + file))
print("number of train time series: ", len(train_df_list))
print("number of test time series: ", len(test_df_list))

number of train time series:  422
number of test time series:  111


In [112]:
# use bolus insulin on board to fill in the bolus data
def iobCalcExponential(insulin, t, dia, peak):
    peak = 50
    end = dia * 60
    activityContrib = 0
    minsAgo = t
    iobContrib = 0
    if minsAgo < end:
        tau = peak * (1 - peak / end) / (1 - 2 * peak / end) # time constant of exponential decay
        a = 2 * tau / end # rise time factor
        S = 1 / (1 - a + (1 + a) * math.exp(-end / tau)) # auxiliary scale factor

    activityContrib = insulin * (S / tau ** 2) * minsAgo * (1 - minsAgo / end) * math.exp(-minsAgo / tau)
    # iobContrib = insulin * (1 - S * (1 - a) * ((minsAgo ** 2 / (tau * end * (1 - a)) - minsAgo / tau - 1) * math.exp(-minsAgo / tau) + 1))
    #print('DIA:', dia, 'minsAgo:', minsAgo, 'end:', end, 'peak:', peak, 'tau:', tau, 'a:', a, 'S:', S, 'activityContrib:', activityContrib, 'iobContrib:', iobContrib)

    return activityContrib

# use carb absoption curv to fill in the carb data
def carbAbsobCurv(carbintake, t):
    if t >= 0 and t < 15:
        return carbintake * (0.05 + 1 / 3 * t)
    elif t >= 15 and t < 45:
        return carbintake * (0.05 + 5 * (45 - t) / 30)
    elif t >= 45 and t <= 240:
        return carbintake * 0.05
    else:
        return 0

In [121]:
# fill in the data
processed_train_df_list = []
processed_test_df_list = []

for train_df in train_df_list:
    train_df = np.array(train_df)[:, 1:]
    processed_train_df = np.zeros((len(train_df), 5)) # ['cgm', 'bolus iob', 'carb absoption curv', 'insulin carb ratio', 'basal iob']
    processed_train_df[:, 0] = train_df[:, 0] # fill in the cgm data
    # fill in the bolus data
    new_bolus_iob = []
    for i in range(len(train_df)):
        if train_df[i][1] != 0:
            bolus_array = np.zeros((len(train_df)))
            for j in range(i, min(i + int(240 / 5) - 1, len(train_df))):
                bolus_array[j] = iobCalcExponential(train_df[i][1], (j - i) * 5, 4, 50)
            new_bolus_iob.append(bolus_array)
    # add bolus iob into one array
    new_bolus_iob = np.array(new_bolus_iob)
    # print(new_bolus_iob.shape)
    new_bolus_iob = np.sum(new_bolus_iob, axis=0)
    # print(new_bolus_iob.shape)
    processed_train_df[:, 1] = new_bolus_iob
    # fill in the carb data
    new_carb_absob_curv = []
    for i in range(len(train_df)):
        if train_df[i][2] != 0:
            carb_array = np.zeros((len(train_df)))
            for j in range(i, min(i + int(240 / 5) - 1, len(train_df))):
                carb_array[j] = carbAbsobCurv(train_df[i][2], (j - i) * 5)
            new_carb_absob_curv.append(carb_array)
    # add carb absoption curv into one array
    new_carb_absob_curv = np.array(new_carb_absob_curv)
    # print(new_carb_absob_curv.shape)
    new_carb_absob_curv = np.sum(new_carb_absob_curv, axis=0)
    # print(new_carb_absob_curv.shape)
    processed_train_df[:, 2] = new_carb_absob_curv
    # fill in the insulin carb ratio
    # use forward imputation
    for i in range(len(train_df)):
        if train_df[i][3] != 0:
            for j in range(i, min(i + int(240 / 5) - 1, len(train_df))):
                processed_train_df[j][3] = train_df[i][3]
    # fill in the basal data
    # use forward imputation according to the duration
    for i in range(len(train_df)):
        if train_df[i][5] != 0:
            for j in range(i, min(i + int(train_df[i][4] / 5) - 1, len(train_df))):
                processed_train_df[j][4] = train_df[i][5]
    processed_train_df_list.append(processed_train_df)

for test_df in test_df_list:
    test_df = np.array(test_df)[:, 1:]
    processed_test_df = np.zeros((len(test_df), 5)) # ['cgm', 'bolus iob', 'carb absoption curv', 'insulin carb ratio', 'basal iob']
    processed_test_df[:, 0] = test_df[:, 0] # fill in the cgm data
    # fill in the bolus data
    new_bolus_iob = []
    for i in range(len(test_df)):
        if test_df[i][1] != 0:
            bolus_array = np.zeros((len(test_df)))
            for j in range(i, min(i + int(240 / 5) - 1, len(test_df))):
                bolus_array[j] = iobCalcExponential(test_df[i][1], (j - i) * 5, 4, 50)
            new_bolus_iob.append(bolus_array)
    # add bolus iob into one array
    new_bolus_iob = np.array(new_bolus_iob)
    # print(new_bolus_iob.shape)
    new_bolus_iob = np.sum(new_bolus_iob, axis=0)
    # print(new_bolus_iob.shape)
    processed_test_df[:, 1] = new_bolus_iob
    # fill in the carb data
    new_carb_absob_curv = []
    for i in range(len(test_df)):
        if test_df[i][2] != 0:
            carb_array = np.zeros((len(test_df)))
            for j in range(i, min(i + int(240 / 5) - 1, len(test_df))):
                carb_array[j] = carbAbsobCurv(test_df[i][2], (j - i) * 5)
            new_carb_absob_curv.append(carb_array)
    # add carb absoption curv into one array
    new_carb_absob_curv = np.array(new_carb_absob_curv)
    # print(new_carb_absob_curv.shape)
    new_carb_absob_curv = np.sum(new_carb_absob_curv, axis=0)
    # print(new_carb_absob_curv.shape)
    processed_test_df[:, 2] = new_carb_absob_curv
    # fill in the insulin carb ratio
    # use forward imputation
    for i in range(len(test_df)):
        if test_df[i][3] != 0:
            for j in range(i, min(i + int(240 / 5) - 1, len(test_df))):
                processed_test_df[j][3] = test_df[i][3]
    # fill in the basal data
    # use forward imputation according to the duration
    for i in range(len(test_df)):
        if test_df[i][5] != 0:
            for j in range(i, min(i + int(test_df[i][4] / 5) - 1, len(test_df))):
                processed_test_df[j][4] = test_df[i][5]
    processed_test_df_list.append(processed_test_df)

In [144]:
# split the data based on given backcast_length and forecast_length
backcast_length = 24
forecast_length = 6

final_train_df_list = []
final_test_df_list = []

for train_df in processed_train_df_list:
    for i in range(len(train_df) - backcast_length - forecast_length + 1):
        final_train_df_list.append(train_df[i:i + backcast_length + forecast_length])
for test_df in processed_test_df_list:
    for i in range(len(test_df) - backcast_length - forecast_length + 1):
        final_test_df_list.append(test_df[i:i + backcast_length + forecast_length])

final_train_df_list = np.array(final_train_df_list)
final_test_df_list = np.array(final_test_df_list)
# split train into train and val
train_df_list = final_train_df_list[:int(len(final_train_df_list) * 0.8)]
val_df_list = final_train_df_list[int(len(final_train_df_list) * 0.8):]
test_df_list = final_test_df_list

In [145]:
print("number of train time series: ", len(train_df_list))
print("number of val time series: ", len(val_df_list))
print("number of test time series: ", len(test_df_list))

number of train time series:  220574
number of val time series:  55144
number of test time series:  58411


In [146]:
# save the data in npy format
np.save('./uniform_data_30/train.npy', train_df_list)
np.save('./uniform_data_30/val.npy', val_df_list)
np.save('./uniform_data_30/test.npy', test_df_list)

In [147]:
# split the data based on given backcast_length and forecast_length
backcast_length = 48
forecast_length = 12

final_train_df_list = []
final_test_df_list = []

for train_df in processed_train_df_list:
    if len(train_df) < backcast_length + forecast_length:
        continue
    for i in range(len(train_df) - backcast_length - forecast_length + 1):
        final_train_df_list.append(train_df[i:i + backcast_length + forecast_length])
for test_df in processed_test_df_list:
    if len(test_df) < backcast_length + forecast_length:
        continue
    for i in range(len(test_df) - backcast_length - forecast_length + 1):
        final_test_df_list.append(test_df[i:i + backcast_length + forecast_length])

final_train_df_list = np.array(final_train_df_list)
final_test_df_list = np.array(final_test_df_list)
# split train into train and val
train_df_list = final_train_df_list[:int(len(final_train_df_list) * 0.8)]
val_df_list = final_train_df_list[int(len(final_train_df_list) * 0.8):]
test_df_list = final_test_df_list

In [148]:
print("number of train time series: ", len(train_df_list))
print("number of val time series: ", len(val_df_list))
print("number of test time series: ", len(test_df_list))

number of train time series:  211081
number of val time series:  52771
number of test time series:  55240


In [149]:
# save the data in npy format
np.save('./uniform_data_60/train.npy', train_df_list)
np.save('./uniform_data_60/val.npy', val_df_list)
np.save('./uniform_data_60/test.npy', test_df_list)