In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# own Modules 
from models_mle import LstmMle_1
from data_set import DataSet
from cross_validation import CrossValidationProvider
from scaler import DataScaler
from trainer import Trainer
from loss_module import LossMle
from tester import Tester

## Parameters artificial data

In [None]:
# Configuration for the artificial two-signal dataset.
# All sections are plain dictionaries that the later cells index by key.
param = {
    # Location of the raw input file
    "data": {
        "path": '../../../../data/artifical_signals/artifical_2_signals.csv',
    },
    "preprocessing": {
        # Columns handed to CrossValidationProvider as `ignored_features`
        "droped_features": ["ID"],
        # Columns handed to DataScaler as `features_not_to_scale`
        "features_not_to_scale": []
    },
    "model": {
        "input_size": 2,
        # List-valued entries span the hyperparameter grid that is searched
        "n_hidden_lstm": [10, 20, 50],
        "sequence_size": [10, 20, 30],
        "batch_size": 8,
        "lstm_layer": [1],
        "n_hidden_fc": [20, 40, 60],
        "dropout_rate_lstm": 0.0,
        "dropout_rate_fc": 0.2
    },
    # Settings forwarded to torch.optim.lr_scheduler.CyclicLR in the training cell
    "cycling_lr": {
        "scheduler_active": True,
        # Mode can be one of {triangular, triangular2, exp_range}
        "mode": "triangular",
        "gamma": 0.9995,
        "base_lr": 0.0001,  # alternative value noted by the author: 0.016
        "max_lr": 0.0005,   # alternative value noted by the author: 0.75
    },
    "training": {
        # NOTE(review): not read by any cell visible in this notebook section
        "stake_training_data": 0.75,
        "total_number": 12000,
        "n_folds_cv": 6,
        "n_epochs": 20,
        "patience": 7,
    },
    # Output locations for saved models and result histories
    "filed_location": {
        "trained_model": "../../../../models/cross_validation/MLE/artifical_data_",
        # Fixed: the file extension was missing its dot ("..._datacsv")
        "history_validation": "../../../visualisation/files/cross_validation/MLE/validation_artifical_data.csv",
        "history_test": "../../../visualisation/files/cross_validation/MLE/test_artifical_data.csv"
    }
}

## Parameters cpps data

In [None]:
# Configuration for the CPPS degradation dataset (large degeneration, training file).
# The later cells look everything up by key in this nested dictionary.
param = {
    # Raw input file
    "data": {
        "path": "../../../../data/cpps_degradation/large_degeneration/cpps_data_large_degeneration_training.csv",
    },
    "preprocessing": {
        # Columns passed to CrossValidationProvider as `ignored_features`
        "droped_features": [
            "ID",
        ],
        # Columns passed to DataScaler as `features_not_to_scale`
        "features_not_to_scale": [],
    },
    "model": {
        "input_size": 10,
        # Lists are iterated by the grid search in the training cell
        "n_hidden_lstm": [10, 20, 50],
        "sequence_size": [10, 20, 30],
        "batch_size": 8,
        "lstm_layer": [1],
        "n_hidden_fc": [20, 40, 60],
        "dropout_rate_lstm": 0.0,
        "dropout_rate_fc": 0.2,
    },
    # Values handed to torch.optim.lr_scheduler.CyclicLR
    "cycling_lr": {
        "scheduler_active": True,
        # Mode can be one of {triangular, triangular2, exp_range}
        "mode": "triangular",
        "gamma": 0.9995,
        "base_lr": 0.0001,  # alternative value noted by the author: 0.016
        "max_lr": 0.0005,   # alternative value noted by the author: 0.75
    },
    "training": {
        # NOTE(review): not read by any cell visible in this notebook section
        "stake_training_data": 0.75,
        "total_number": 50000,
        "n_folds_cv": 6,
        "n_epochs": 20,
        "patience": 7,
    },
    # Where trained models and result histories are written
    "filed_location": {
        "trained_model": "../../../../models/cross_validation/MLE/cpps_data_",
        "history_validation": "../../../visualisation/files/cross_validation/MLE/validation_cpps_data.csv",
        "history_test": "../../../visualisation/files/cross_validation/MLE/test_cpps_data.csv",
    },
}

## Parameters phm data

In [2]:
# Configuration for the PHM data challenge dataset (one recipe per file).
# The later cells look everything up by key in this nested dictionary.
param = {
    # Raw input file
    # NOTE(review): this path points at recipe 67 while every output path below
    # is labelled recipe 66 - confirm which recipe is intended.
    "data": {
        "path": "../../../../data/phm_data_challenge/recipe/dataset_for_each_recipe/training/training_recipe_67.csv",
    },
    "preprocessing": {
        # Columns passed to CrossValidationProvider as `ignored_features`
        "droped_features": [
            "ID",
            "stage",
            "Lot",
            "runnum",
            "recipe",
            "recipe_step",
            "up time",
            "ongoing time",
            "ETCHSOURCEUSAGE",
            "ETCHAUXSOURCETIMER",
            "ETCHAUX2SOURCETIMER",
            "FIXTURESHUTTERPOSITION",
            "ROTATIONSPEED",
        ],
        # Columns passed to DataScaler as `features_not_to_scale`
        "features_not_to_scale": [],
    },
    "model": {
        "input_size": 12,
        # Lists are iterated by the grid search in the training cell
        "n_hidden_lstm": [10, 20, 50],
        "sequence_size": [10, 20, 30],
        "batch_size": 8,
        "lstm_layer": [1],
        "n_hidden_fc": [20, 40, 60],
        "dropout_rate_lstm": 0.0,
        "dropout_rate_fc": 0.2,
    },
    # Values handed to torch.optim.lr_scheduler.CyclicLR
    "cycling_lr": {
        "scheduler_active": True,
        # Mode can be one of {triangular, triangular2, exp_range}
        "mode": "triangular",
        "gamma": 0.9995,
        "base_lr": 0.0001,  # alternative value noted by the author: 0.016
        "max_lr": 0.0005,   # alternative value noted by the author: 0.75
    },
    "training": {
        "total_number": 1000,
        "n_folds_cv": 6,
        "n_epochs": 20,
        "patience": 7,
    },
    # Where trained models and result histories are written
    "filed_location": {
        "trained_model": "../../../../models/cross_validation/MLE/phm_data_recipe_66_",
        "history_validation": "../../../visualisation/files/cross_validation/MLE/validation_phm_data_recipe_66.csv",
        "history_test": "../../../visualisation/files/cross_validation/MLE/test_phm_data_recipe_66.csv",
    },
}

# Nested Cross Validation for Time Series Data

![](../../../../knowledge/pictures/nested_cv.png)

## Split Data into folds
- Ignored features are removed
- The remaining data is split into folds

In [3]:
# Configure the fold provider from the active parameter set: source file,
# number of folds, amount of data to use and the features to ignore.
cv_provider = CrossValidationProvider(
    path=param["data"]["path"],
    no_folds=param["training"]["n_folds_cv"],
    amount_data=param["training"]["total_number"],
    ignored_features=param["preprocessing"]["droped_features"],
)

# Sequence of folds that the cross-validation loop slices and indexes
folds = cv_provider.provide_data()

## Cross Validation Training

In [4]:
# Container reserved for the scaler metrics of the test dataset
scaler_metrics = dict()
# Location of the model stored in phase 1; stays None until training has saved one
current_path_model_phase_1 = None

In [6]:
# Create file where validation results are stored and add header.
# Every header field and every result value is terminated by ";".
column_names_validation = ["iteration","validation_fold","validation_loss", 
                           "n_hidden_lstm", "sequence_size", "n_lstm_layer", 
                           "n_hidden_fc", "training phase"]
with open(param["filed_location"]["history_validation"], "a+") as file:
    # The file is opened in append mode so results of earlier runs are kept.
    # Write the header only into an empty file; otherwise every re-run of this
    # cell would inject another header row between the data rows.
    file.seek(0, 2)  # whence=2: jump to the end of the file
    if file.tell() == 0:
        file.write(";".join(column_names_validation) + ";\n")

# Create file where test results are stored and add header
column_names_test = ["test_fold", "test_loss", "n_hidden_lstm", "sequence_size",
                     "n_lstm_layer", "n_hidden_fc"]
with open(param["filed_location"]["history_test"], "a+") as file:
    # Same rule as above: header only if nothing has been written yet
    file.seek(0, 2)
    if file.tell() == 0:
        file.write(";".join(column_names_test) + ";\n")
         
# Nested cross validation over the ordered folds.
# In every iteration the first `iteration` folds form the training set, the
# following fold is used for validation and the fold after that for testing.
# NOTE(review): the upper bound assumes `folds` holds n_folds_cv entries, so the
# last iteration must still leave one fold after the validation fold - confirm.
for iteration in range(2, param["training"]["n_folds_cv"] - 1):
    # Select folds for current iteration
    training_folds = folds[:iteration]
    validation_fold = folds[iteration:iteration+1]
    # The test fold is the fold AFTER the validation fold; slicing from
    # `iteration` would hand the validation data to the test step.
    test_fold = folds[iteration+1:iteration+2]
    print("Training Data : Fold 1-" + str(iteration))
    print("Validation Data : Fold "+ str(iteration+1))
    print("Test Data : Fold "+ str(iteration+2))

    # Concatenate data of training folds and unpack validation and test data
    raw_training_data = pd.concat(training_folds, axis = 0, ignore_index=True)
    raw_validation_data = validation_fold[0]
    raw_test_data = test_fold[0]
    print("Amount Training Data: {}".format(raw_training_data.shape[0]))
    print("Amount Validation Data: {}".format(raw_validation_data.shape[0]))
    print("Amount Test Data: {}".format(raw_test_data.shape[0]))
    print("- -"*30)

    # Scale training, validation and test data with one scaler (per the original
    # author's note, validation data is scaled with mean and variance of the training data)
    scaler = DataScaler(features_not_to_scale= param['preprocessing']['features_not_to_scale'])
    train_data_scaled, validation_data_scaled, test_data_scaled = scaler.scale_data(raw_training_data, raw_validation_data, raw_test_data)

    # One record per finished (hyperparameter configuration, phase) run of this
    # iteration. The best configuration is selected from this list, so the test
    # step does not depend on how the history file is parsed.
    validation_results = []

    # Training model and test hyperparameter on validation set
    for n_lstm_layer in param["model"]["lstm_layer"]:
        for sequence_size  in param["model"]["sequence_size"]:
            # Initialize DataSet
            dataset_train = DataSet(train_data_scaled, timesteps=sequence_size)
            dataset_validation = DataSet(validation_data_scaled, timesteps=sequence_size)

            # Initialize DataLoader
            data_loader_training = DataLoader(dataset_train, 
                                              batch_size=param["model"]["batch_size"], 
                                              num_workers=0, 
                                              shuffle=True, 
                                              drop_last=True
                                             )
            data_loader_validation = DataLoader(dataset_validation, 
                                                batch_size=param["model"]["batch_size"], 
                                                num_workers=0, 
                                                shuffle=True, 
                                                drop_last=True
                                               )

            for n_hidden_lstm in param["model"]["n_hidden_lstm"]:
                for n_hidden_fc in param["model"]["n_hidden_fc"]:
                    print("Start with new hyperparameters in grid search: ")
                    print("Sequence_size: {}".format(sequence_size))
                    print("Number LSTM Layers: {}".format(n_lstm_layer))
                    print("LSTM Number Hidden Dimensions: {}".format(n_hidden_lstm))
                    print("FC NN Number Hidden Dimensions: {}".format(n_hidden_fc))

                    # Two training phases per configuration; phase 2 starts from
                    # the checkpoint written in phase 1 and builds the model with K=1.
                    for phase in range(1, 3):
                        print("\nStart with phase {}:".format(phase))
                        # Create lists to save training loss and validation loss of each epoch
                        hist_loss = []
                        torch.manual_seed(0)
                        model = LstmMle_1(batch_size=param['model']['batch_size'], 
                                          input_dim=param['model']['input_size'], 
                                          n_hidden_lstm=n_hidden_lstm, 
                                          n_layers=n_lstm_layer,
                                          dropout_rate_lstm= param['model']['dropout_rate_lstm'],
                                          dropout_rate_fc= param['model']['dropout_rate_fc'],
                                          n_hidden_fc=n_hidden_fc,
                                          K=phase-1,
                                          )

                        if phase == 2:
                            checkpoint = torch.load(current_path_model_phase_1)
                            model.load_state_dict(checkpoint['model_state_dict'])
                            print("Load model for phase 2")

                        # Define Loss Function
                        criterion = LossMle(param["model"]["input_size"], param["model"]["batch_size"])

                        # Initialize Optimizer and Cyclic Learning Rate Scheduler
                        optimizer = torch.optim.SGD(model.parameters(), lr=1.)  
                        scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer=optimizer, 
                                                                      base_lr=param['cycling_lr']['base_lr'], 
                                                                      max_lr=param['cycling_lr']['max_lr'], 
                                                                      # Derived from the configured batch size instead of a hard-coded 8
                                                                      step_size_up=(raw_training_data.shape[0]/param["model"]["batch_size"])*2, 
                                                                      mode=param['cycling_lr']['mode'],
                                                                      gamma=param['cycling_lr']['gamma']
                                                                      )
                        # Initialize Trainer
                        trainer = Trainer(model=model,
                                          optimizer=optimizer,
                                          scheduler=scheduler,
                                          scheduler_active = param["cycling_lr"]["scheduler_active"],
                                          criterion=criterion, 
                                          location_model=param["filed_location"]["trained_model"], 
                                          location_stats=param["filed_location"]["history_validation"], 
                                          patience=param['training']['patience']
                                         )

                        # Start time of the current configuration
                        start = time.time()

                        # Most recent checkpoint path reported by the trainer in this phase.
                        # NOTE(review): assumes save_model returns a falsy path when
                        # nothing was written - confirm against Trainer.save_model.
                        path_model_current_phase = None

                        for epoch in range(param['training']['n_epochs']):
                            # Train
                            mean_epoch_training_loss = trainer.train(data_loader_training)

                            # Evaluate
                            mean_epoch_validation_loss = trainer.evaluate(data_loader_validation, hist_loss, epoch)

                            # Cache History
                            trainer.cache_history_training(hist_loss, epoch, mean_epoch_training_loss, mean_epoch_validation_loss)

                            # Save model if its the best one since the last change in configuration of hyperparameters
                            trainer.fold = "Fold_1-"+str(iteration)
                            status_ok, path_model = trainer.save_model(epoch, mean_epoch_validation_loss, param['model']['input_size'], 
                                                           n_lstm_layer, n_hidden_lstm, n_hidden_fc, sequence_size)
                            if path_model:
                                path_model_current_phase = path_model

                            if not status_ok or epoch == (param['training']['n_epochs'])-1:
                                # Statistics of current fold
                                statistics_validation = [iteration,"fold "+str(iteration+1),
                                                         trainer.lowest_loss, n_hidden_lstm, sequence_size,
                                                         n_lstm_layer, n_hidden_fc, phase]

                                # Save statistics to .csv file (every value terminated by ";")
                                with open(param["filed_location"]["history_validation"], "a") as file:
                                    file.write(";".join(str(value) for value in statistics_validation) + ";\n")

                                # Remember the result together with its checkpoint for model selection.
                                # NOTE(review): assumes trainer.lowest_loss is convertible to float.
                                validation_results.append({
                                    "validation_loss": float(trainer.lowest_loss),
                                    "n_hidden_lstm": n_hidden_lstm,
                                    "sequence_size": sequence_size,
                                    "n_lstm_layer": n_lstm_layer,
                                    "n_hidden_fc": n_hidden_fc,
                                    "phase": phase,
                                    "model_path": path_model_current_phase,
                                })
                                break

                        # Only phase 1 defines the checkpoint that phase 2 starts from
                        if phase == 1:
                            current_path_model_phase_1 = path_model_current_phase
                    print("\n"+"# #"*30+"\n")

    # Test model with best hyperparameters of current fold:
    # pick the run with the lowest validation loss of this iteration
    best_result = min(validation_results, key=lambda result: result["validation_loss"])
    opt_n_hidden_lstm = best_result["n_hidden_lstm"]
    opt_sequence_size = best_result["sequence_size"]
    opt_n_lstm_layer = best_result["n_lstm_layer"]
    opt_n_hidden_fc = best_result["n_hidden_fc"]

    # Load best model
    # NOTE(review): if Trainer.save_model writes phase 1 and phase 2 of the same
    # configuration to the same path, the file holds the phase 2 weights - confirm.
    torch.manual_seed(0)
    model = LstmMle_1(batch_size=param["model"]["batch_size"], 
                      input_dim=param["model"]["input_size"], 
                      n_hidden_lstm=opt_n_hidden_lstm, 
                      n_layers=opt_n_lstm_layer,
                      dropout_rate_lstm=param["model"]["dropout_rate_lstm"],
                      dropout_rate_fc=param["model"]["dropout_rate_fc"],
                      n_hidden_fc=opt_n_hidden_fc,
                      K=best_result["phase"]-1,
                      )
    checkpoint = torch.load(best_result["model_path"])
    model.load_state_dict(checkpoint['model_state_dict'])

    # Initialize DataSet
    dataset_test = DataSet(test_data_scaled, timesteps=opt_sequence_size)

    # Initialize DataLoader
    data_loader_test = DataLoader(dataset_test, 
                                  batch_size=param["model"]["batch_size"], 
                                  num_workers=0, 
                                  shuffle=True, 
                                  drop_last=True
                                 )

    # Define Loss Function
    criterion = LossMle(param["model"]["input_size"], param["model"]["batch_size"])

    # Initialize Tester
    tester = Tester(model=model, criterion=criterion)

    # Evaluate Testset
    mean_test_loss = tester.evaluate(data_loader_test)
    print("\n"+"# #"*30+"\n")
    print("Mean loss of test dataset is {}".format(mean_test_loss))
    print("\n"+"# #"*30+"\n")
    # Statistics of current fold: log the selected hyperparameters, not the
    # values the grid-search loop variables happened to end on
    statistics_test = ["fold "+str(iteration+2),
                 mean_test_loss, opt_n_hidden_lstm, opt_sequence_size,
                 opt_n_lstm_layer, opt_n_hidden_fc]

    # Save statistics to .csv file (every value terminated by ";")
    with open(param["filed_location"]["history_test"], "a") as file:
        file.write(";".join(str(value) for value in statistics_test) + ";\n")

print("Cross Validation finished")

Training Data : Fold 1-2
Validation Data : Fold 3
Test Data : Fold 4
Amount Training Data: 334
Amount Validation Data: 167
Amount Test Data: 167
- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
Start with new hyperparameters in grid search: 
Sequence_size: 20
Number LSTM Layers: 1
LSTM Number Hidden Dimensions: 10
FC NN Number Hidden Dimensions: 60

Start with phase 1:
-------- epoch_no. 0 finished with eval loss 0.7073911925156912--------
Epoch 0: best model saved with loss: 0.7073911925156912
-------- epoch_no. 1 finished with eval loss 0.6998292439513736--------
Epoch 1: best model saved with loss: 0.6998292439513736
-------- epoch_no. 2 finished with eval loss 0.6910380356841617--------
Epoch 2: best model saved with loss: 0.6910380356841617
-------- epoch_no. 3 finished with eval loss 0.6853121022383372--------
Epoch 3: best model saved with loss: 0.6853121022383372
-------- epoch_no. 4 finished with eval loss 0.6810592214266459--------
Ep

NameError: name 'statistics_folds' is not defined

## Test statistics

In [None]:
# Load the recorded test results (semicolon-separated) and show the first 20 rows
dataframe = pd.read_csv(
    param["filed_location"]["history_test"],
    sep=";",
)
dataframe.head(20)