# Dataset Normalization

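This notebook computes the normalization terms for the N-CMAPSS dataset: for each of the nine sub-datasets it fits a min-max scaler on the development split and saves the fitted scaler to disk.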

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader
import os
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from pickle import dump

In [21]:
def get_fileloc(ds_no):
    """Return the HDF5 file path for N-CMAPSS sub-dataset number ``ds_no``.

    Args:
        ds_no: Dataset index; the known indices are 1 through 9.

    Returns:
        The absolute path of the corresponding ``.h5`` file as a string.

    Raises:
        KeyError: If ``ds_no`` is not one of the known dataset indices.
    """
    data_dir = '/data/courseac/N-CMAPSS/data_set/'

    # Only the trailing part of the file name differs between datasets.
    file_stems = {
        1: 'DS01-005',
        2: 'DS02-006',
        3: 'DS03-012',
        4: 'DS04',
        5: 'DS05',
        6: 'DS06',
        7: 'DS07',
        8: 'DS08a-009',
        9: 'DS08c-008',
    }

    return data_dir + 'N-CMAPSS_' + file_stems[ds_no] + '.h5'

def load_data(fileloc):
    """Load the development-set arrays of one HDF5 file into a single array.

    The datasets ``W_dev``, ``X_s_dev``, ``X_v_dev``, ``T_dev`` and ``A_dev``
    (W, X_s, X_v, T and auxiliary data, following the file's own naming) are
    read fully into memory and joined along axis 1, in that order.

    Args:
        fileloc: Path of the HDF5 file to read.

    Returns:
        A NumPy array holding the five development-set arrays concatenated
        along axis 1.
    """
    dev_keys = ('W_dev', 'X_s_dev', 'X_v_dev', 'T_dev', 'A_dev')

    # Materialize every dataset while the file is open; the resulting NumPy
    # arrays remain usable after the file has been closed.
    with h5py.File(fileloc, 'r') as hdf:
        dev_arrays = [np.array(hdf.get(key)) for key in dev_keys]

    return np.concatenate(dev_arrays, axis=1)

def save_normalization(ds_no):
    """Fit a ``MinMaxScaler`` on one dataset and pickle it to disk.

    The development data of dataset ``ds_no`` is loaded via ``get_fileloc``
    and ``load_data``, a scaler is fitted on it, and the fitted scaler is
    written to ``../scalers/scaler<ds_no>.pkl`` (relative to the current
    working directory).

    Args:
        ds_no: Dataset index understood by ``get_fileloc``.

    Raises:
        KeyError: If ``ds_no`` is not a known dataset index.
        OSError: If the output file cannot be opened for writing, e.g.
            because the ``../scalers`` directory does not exist.
    """
    fileloc = get_fileloc(ds_no)

    X_train = load_data(fileloc)

    scaler = MinMaxScaler()
    scaler.fit(X_train)

    # Open the output through a context manager so the file is flushed and
    # closed deterministically, rather than leaking the handle until it
    # happens to be garbage collected.
    with open('../scalers/scaler' + str(ds_no) + '.pkl', 'wb') as scaler_file:
        dump(scaler, scaler_file)

In [22]:
# Fit and save a scaler for every dataset index from 1 through 9, showing a
# progress bar while the datasets are processed.
for ds_no in tqdm(range(1,10)):
    save_normalization(ds_no)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [01:29<00:00,  9.91s/it]


## Get lengths of datasets

In [26]:
def get_lengths():
    """Return the row counts of the dev and test splits of every dataset.

    For each dataset index from 1 through 9 the file is opened and the size
    of the first axis of ``W_dev`` and ``W_test`` is recorded.

    Returns:
        A list with one ``(n_dev, n_test)`` tuple of ints per dataset, in
        dataset-index order.

    Raises:
        KeyError: If a file lacks the ``W_dev`` or ``W_test`` dataset.
    """
    lengths = []
    for ds_no in tqdm(range(1, 10)):
        fileloc = get_fileloc(ds_no)
        with h5py.File(fileloc, 'r') as hdf:
            # Only the row count is needed, so read it from the dataset's
            # shape metadata instead of loading millions of rows into memory.
            n_dev = hdf['W_dev'].shape[0]
            n_test = hdf['W_test'].shape[0]

        lengths.append((n_dev, n_test))

    return lengths

In [27]:
# Collect the (dev, test) row counts for all datasets.
lengths = get_lengths()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:05<00:00,  1.61it/s]


In [28]:
# Display the collected (dev, test) row counts, one tuple per dataset.
lengths

[(4906636, 2735232),
 (5263447, 1253743),
 (5571277, 4251560),
 (6377452, 3602561),
 (4350606, 2562046),
 (4257209, 2522447),
 (4350176, 2869786),
 (4885389, 3722997),
 (4299918, 2117819)]