# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sktime.datasets import load_from_ucr_tsv_to_dataframe
from sktime.datasets import load_from_tsfile
import warnings
# Suppress every warning raised from this point on (blanket "ignore" filter).
# NOTE(review): this also hides deprecation notices from pandas/torch/sktime — consider narrowing it.
warnings.filterwarnings("ignore")
# NOTE(review): star import — helper names used later that are not defined in this notebook
# (e.g. read_yaml_config, TimeSeriesDataset, pretrain_and_finetune) presumably come from here; confirm.
from utilities_helper import *
# Use the first CUDA device when torch reports one available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

import pandas as pd
import numpy as np
import os
from sktime.datasets import load_from_ucr_tsv_to_dataframe, load_from_tsfile


  from .autonotebook import tqdm as notebook_tqdm


## Paths and configuration

In [2]:
# Resolve the directories and files this notebook works with, then load the
# experiment grid, the YAML configuration, and the log of finished experiments.

# Directory the notebook is executed from.
current_dir = os.getcwd()
# Parent of the working directory (the working directory itself is not changed).
parent_dir = os.path.dirname(current_dir)
# Original datasets live one level up, in "Datasets".
original_datasets_dir = os.path.join(parent_dir, "Datasets")
# Synthetic (DGAN-generated) datasets and the results folder live next to the notebook.
synthetic_datasets_dir = os.path.join(current_dir, "DGAN_data")
results_dir = os.path.join(current_dir, "pretrained_finetuned")

csv_path = os.path.join(current_dir, "all_experiments_param.csv")
config_path = os.path.join(current_dir, "config.yaml")

config_data = read_yaml_config(config_path)
csv_data = pd.read_csv(csv_path)
# Drop every experiment that targets the DuckDuckGeese dataset.
csv_data = csv_data[csv_data['dataset_name'] != 'DuckDuckGeese']

# Log of experiments that already ran. The index column is read as text because the
# experiment loop checks membership with str(index); letting pandas infer an integer
# dtype would make that check never match. On a first run the log does not exist yet,
# so start from an empty one instead of failing.
completed_experiments_path = os.path.join(current_dir, "completed_experiments.csv")
if os.path.exists(completed_experiments_path):
    completed_experiments = pd.read_csv(completed_experiments_path,
                                        dtype={'experiment_index': str})
else:
    completed_experiments = pd.DataFrame(columns=['experiment_index'])

In [4]:
# Show the experiment grid that remains after the DuckDuckGeese rows were filtered out.
csv_data

Unnamed: 0,dataset_name,model_name,BM_batch_size_ratio,hidden_dim,num_layers,epochs,learning_rate,synthetic_num_samples,dgan_original_data_ratio,finetuning_original_data_ratio
0,ArticularyWordRecognition,inceptionTime,0.1,,,10,0.0010,2,0.7,0.7
1,ArticularyWordRecognition,inceptionTime,0.1,,,10,0.0010,2,0.7,0.9
2,ArticularyWordRecognition,inceptionTime,0.1,,,10,0.0010,2,0.9,0.9
3,ArticularyWordRecognition,inceptionTime,0.1,,,10,0.0010,4,0.7,0.7
4,ArticularyWordRecognition,inceptionTime,0.1,,,10,0.0010,4,0.7,0.9
...,...,...,...,...,...,...,...,...,...,...
4795,PenDigits,LSTM,0.2,128.0,3.0,20,0.0001,2,0.7,0.9
4796,PenDigits,LSTM,0.2,128.0,3.0,20,0.0001,2,0.9,0.9
4797,PenDigits,LSTM,0.2,128.0,3.0,20,0.0001,4,0.7,0.7
4798,PenDigits,LSTM,0.2,128.0,3.0,20,0.0001,4,0.7,0.9


# Running Experiments

In [3]:
# Run every experiment in csv_data that is not yet recorded in completed_experiments.
# For each row: load the synthetic (DGAN) data and the original train/test files,
# sub-sample the original training set for finetuning, fill in config_data,
# pretrain + finetune the model, and persist the experiment index so that an
# interrupted run can be resumed later.

def _record_experiment(log, marker, csv_file):
    """Append `marker` as a new experiment_index row, write the log to `csv_file`, return the new log."""
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
    log = pd.concat([log, pd.DataFrame([{'experiment_index': marker}])], ignore_index=True)
    log.to_csv(csv_file, index=False)
    return log


# Write the log to the same location it was read from, independent of later cwd changes.
completed_experiments_file = os.path.join(current_dir, "completed_experiments.csv")
# Normalise the recorded indices to strings once, so the membership test works whether
# pandas parsed the column as integers or as text.
completed_ids = set(completed_experiments['experiment_index'].astype(str))

for index, row in csv_data.iterrows():
    if str(index) in completed_ids:
        print("Already did this experiment !")
        continue

    # Read the row's parameters into corresponding variables
    dataset_name = row['dataset_name']
    dgan_original_data_ratio = row['dgan_original_data_ratio']
    epochs = row['epochs']
    finetuning_original_data_ratio = row['finetuning_original_data_ratio']

    print(dataset_name)
    # Synthetic data path (os.path.join keeps this portable; the previous hard-coded
    # backslashes only worked on Windows and relied on invalid escape sequences).
    pretrain_path = os.path.join(synthetic_datasets_dir, dataset_name,
                                 str(dgan_original_data_ratio), "generated_data.npy")
    # Original data + metadata paths
    train_path = os.path.join(original_datasets_dir, dataset_name, f"{dataset_name}_TRAIN.ts")
    test_path = os.path.join(original_datasets_dir, dataset_name, f"{dataset_name}_TEST.ts")

    # Loading synthetic data.
    # NOTE(review): allow_pickle=True executes pickled objects on load — only use with trusted files.
    pretraining_data = np.load(pretrain_path, allow_pickle=True)
    synthetic_payload = pretraining_data.item()
    X = synthetic_payload.get('X')
    y = synthetic_payload.get('y')

    # Create parameters for training; never let the batch size round down to 0,
    # which DataLoader rejects.
    num_samples = X.shape[0]
    batch_size = max(1, int(row['BM_batch_size_ratio'] * num_samples))

    # Reading metadata
    series_length, features_num, num_classes = extract_metadata(train_path)
    # Loading original data + preprocessing
    X_train, y_train = load_from_tsfile(train_path)
    X_test, y_test = load_from_tsfile(test_path)
    X_train = preprocess_dgan(X_train, series_length)
    X_test = preprocess_dgan(X_test, series_length)
    # Only the third value returned by map_label_int is used
    # (presumably the integer-encoded labels — confirm in utilities_helper).
    _, __, y_test = map_label_int(y_test)
    _, __, y_train = map_label_int(y_train)
    _, __, y = map_label_int(y)

    # Keep only `finetuning_original_data_ratio` of the original training set, stratified by label.
    try:
        ssf = StratifiedShuffleSplit(n_splits=1, test_size=1 - finetuning_original_data_ratio)
        train_ind, test_ind = next(ssf.split(X_train, y_train))
        X_train = X_train[train_ind]
        y_train = y_train[train_ind]
    except Exception as split_error:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the run.
        # NOTE(review): as before, the experiment then proceeds with the FULL training set
        # and a failure marker is written to the log — confirm this fallback is intended.
        print("StratifiedShuffleSplit failed")
        print(f"  reason: {split_error!r}")
        completed_experiments = _record_experiment(completed_experiments,
                                                   "StratifiedShuffleSplit failed",
                                                   completed_experiments_file)

    train_samples = X.shape[0]
    test_samples = X_test.shape[0]
    lr = row['learning_rate']
    model_type = row['model_name']

    # Writing to the YAML config
    # Experiment params
    experiment_cfg = config_data['experiment_params']
    experiment_cfg['experiment_index'] = index
    experiment_cfg['dataset_name'] = dataset_name
    experiment_cfg['num_classes'] = num_classes
    experiment_cfg['num_features'] = features_num
    experiment_cfg['sequence_length'] = series_length
    config_data['datageneration']['percentage_of_original_data'] = row['dgan_original_data_ratio']

    # Pretraining params
    pretraining_cfg = config_data['pretraining']
    pretraining_cfg['model_type'] = model_type
    if model_type == "LSTM":
        pretraining_cfg['hidden_size'] = int(row['hidden_dim'])
        pretraining_cfg['num_layers_layers_stacked'] = int(row['num_layers'])
    else:
        # Non-LSTM rows carry no hidden_dim / num_layers values.
        pretraining_cfg['hidden_size'] = 'NaN'
        pretraining_cfg['num_layers_layers_stacked'] = 'NaN'

    pretraining_cfg['batch_size'] = batch_size
    pretraining_cfg['epochs'] = epochs
    pretraining_cfg['learning_rate'] = lr
    pretraining_cfg['shuffle'] = True
    pretraining_cfg['save_each_epoch'] = False
    pretraining_cfg['optimizer'] = 'Adam'
    pretraining_cfg['criterion'] = "crossentropy"

    # Creating the dataloaders (all three follow the finetuning shuffle setting, as before).
    shuffle = config_data['finetuning']['shuffle']
    synthetic_dataloader = DataLoader(TimeSeriesDataset(X, y), batch_size=batch_size, shuffle=shuffle)
    train_dataloader = DataLoader(TimeSeriesDataset(X_train, y_train), batch_size=batch_size, shuffle=shuffle)
    validation_dataloader = DataLoader(TimeSeriesDataset(X_test, y_test), batch_size=batch_size, shuffle=shuffle)
    print('Dataloaders created!')

    # Loss function. Fail loudly on an unknown name instead of silently reusing the
    # criterion left over from a previous iteration.
    criterion_name = config_data['finetuning']['criterion'].lower()
    if criterion_name == "crossentropy":
        criterion = nn.CrossEntropyLoss()
    elif criterion_name == "bce":
        criterion = nn.BCELoss()
    else:
        raise ValueError(f"Unsupported criterion: {config_data['finetuning']['criterion']!r}")

    model = create_model_based_on_config(model_type, config_data)
    model.to(device)

    # Optimizer (same fail-loudly rule as the criterion).
    optimizer_name = pretraining_cfg['optimizer'].lower()
    if optimizer_name == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    elif optimizer_name == 'rmsprop':
        optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    else:
        raise ValueError(f"Unsupported optimizer: {pretraining_cfg['optimizer']!r}")

    # Pretraining
    save_each_epoch = pretraining_cfg['save_each_epoch'] == True

    # Patience equals the epoch count, as in the original run parameters.
    run_param = {"epochs": pretraining_cfg['epochs'],
                 "patience": pretraining_cfg['epochs'],
                 "batch_size": pretraining_cfg['batch_size'],
                 "learning_rate": pretraining_cfg['learning_rate'],
                 "criterion": pretraining_cfg['criterion'],
                 "optimizer": pretraining_cfg['optimizer']}

    experiment_param = create_experiment_param_new(config_data)

    model_path = pretrain_and_finetune(synthetic_dataloader=synthetic_dataloader,
                                       train_dataloader=train_dataloader,
                                       validation_dataloader=validation_dataloader,
                                       model=model,
                                       device=device,
                                       criterion=criterion,
                                       optimizer=optimizer,
                                       run_param=run_param,
                                       experiment_param=experiment_param)

    # The experiment finished: record its index and overwrite the log on disk.
    completed_experiments = _record_experiment(completed_experiments, index,
                                               completed_experiments_file)

ArticularyWordRecognition
Series Length: 144
Dimensions: 9
Unique labels: ['1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0', '10.0', '11.0', '12.0', '13.0', '14.0', '15.0', '16.0', '17.0', '18.0', '19.0', '20.0', '21.0', '22.0', '23.0', '24.0', '25.0']
Dataloaders created!
Pretraining : 10 epochs
https://app.neptune.ai/astarteam/FinalProject/e/FIN-1238
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
loss=3.311, 0 / 1100
train accuracy = 0.31272727272727274, val_loss = 2.629415
loss=1.968, 0 / 1100
train accuracy = 0.8263636363636364, val_loss = 1.654886
loss=1.194, 0 / 1100
train accuracy = 0.9581818181818181, val_loss = 0.910528
loss=0.579, 0 / 1100
train accuracy = 0.9927272727272727, val_loss = 0.410328
loss=0.241, 0 / 1100
train accuracy = 1.0, val_loss = 0.176759
loss=0.103, 0 / 1100
train accuracy = 1.0, val_lo

KeyboardInterrupt: 