## Preprocess Data for seasonal VAEs

The aim of this notebook is to translate NetCDF files (.nc) of three daily climate variables (maximum temperature, precipitation, wind) into four NumPy 3D arrays, one for each season. These output arrays can then be read directly when training and evaluating the Convolutional Variational AutoEncoder (CVAE) model.

#### 0. Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import cftime
import cartopy.crs as ccrs
import csv
import pandas as pd
from datetime import datetime

#### 1. Load Preprocessed Data (NumPy Arrays and Date Tables)

In [2]:
# Each dataset is an image stack (.npy) paired with a CSV table of its dates.

# Historical period
histo_images = np.load("../input/preprocessed_3d_histo_data.npy")
histo_time = pd.read_csv("../input/dates_histo_data.csv")

# Training subset
train_images = np.load("../input/preprocessed_train_data.npy")
train_time = pd.read_csv("../input/dates_train_data.csv")

# Test subset
test_images = np.load("../input/preprocessed_test_data.npy")
test_time = pd.read_csv("../input/dates_test_data.csv")

# Projection period
proj_images = np.load("../input/preprocessed_3d_proj_data.npy")
proj_time = pd.read_csv("../input/dates_proj_data.csv")

#### 2. Split Yearly Data into Four Seasonal Datasets

In [3]:
def season_split(images: np.ndarray,
                 time: pd.DataFrame,
                 keyword: str,
                 saving_on: bool = False
                ) -> tuple[list[np.ndarray], tuple[pd.Series, ...]]:
    """Split a daily dataset into winter, spring, summer and autumn subsets.

    Every year is assumed to span exactly 365 consecutive entries (no leap
    days), starting on 1st January. Seasons are calendar quarters:
    winter = January-March, spring = April-June, summer = July-September,
    autumn = October-December. Entries beyond the last complete block of
    365 days are dropped.

    Args:
        images: Array indexed by day along its first axis.
        time: Table with one row per day, aligned positionally with
            ``images``. Only its second column is kept (presumably the
            dates; the first column is typically the index written by a
            previous ``to_csv`` call -- confirm against the input files).
        keyword: Dataset tag inserted into the output file names.
        saving_on: When True, write each seasonal array to
            ``../input/preprocessed_3d_{keyword}_{season}_data.npy`` and each
            seasonal date column to
            ``../input/dates_{keyword}_{season}_data.csv``.

    Returns:
        A pair ``(season_images, season_time)``: a list of four arrays and a
        tuple of four Series, both ordered winter, spring, summer, autumn.
    """
    days_per_year = 365

    # Half-open [start, stop) day-of-year bounds of each season (0-based):
    # 1st April = index 90, 1st July = index 181, 1st October = index 273.
    season_bounds = {
        "winter": (0, 90),
        "spring": (90, 181),
        "summer": (181, 273),
        "autumn": (273, days_per_year),
    }

    # The number of years must come from the dataset actually being split,
    # not from a notebook-level variable, otherwise train/test/proj inputs
    # are indexed with the wrong length.
    n_years = len(images) // days_per_year
    year_starts = days_per_year * np.arange(n_years)

    season_images = []
    season_time = []
    for start, stop in season_bounds.values():
        # Positions of this season's days in every year, in year-major order.
        index = (year_starts[:, None] + np.arange(start, stop)).ravel()
        season_images.append(images[index])
        # Positional selection keeps the rows aligned with ``images``
        # whatever the DataFrame's index labels are.
        season_time.append(time.iloc[index, 1])

    # save results as an input for CVAE training
    if saving_on:
        for season, subset in zip(season_bounds, season_images):
            np.save(f"../input/preprocessed_3d_{keyword}_{season}_data.npy", subset)
        for season, dates in zip(season_bounds, season_time):
            pd.DataFrame(dates).to_csv(f'../input/dates_{keyword}_{season}_data.csv')

    return season_images, tuple(season_time)

#### 3. Apply to History, Training, Test, and Projection Datasets

In [4]:
# Split the historical dataset by season and write the results to disk.
season_images, season_time = season_split(
    images=histo_images, time=histo_time, keyword='histo', saving_on=True
)

In [6]:
# Split the training dataset by season and write the results to disk.
season_images, season_time = season_split(
    images=train_images, time=train_time, keyword='train', saving_on=True
)

In [None]:
# Split the test dataset by season and write the results to disk.
season_images, season_time = season_split(
    images=test_images, time=test_time, keyword='test', saving_on=True
)

In [None]:
# Split the projection dataset by season and write the results to disk.
season_images, season_time = season_split(
    images=proj_images, time=proj_time, keyword='proj', saving_on=True
)