In [12]:
#! pip install matplotlib
#!pip install scipy

In [13]:
import os, warnings
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is presumably consumed by TensorFlow inside the
# metrics modules imported further down; it is set here, ahead of those imports —
# confirm that this ordering is what makes it effective.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
# Silences every Python warning for the rest of the session. Convenient for long
# batch runs, but it also hides warnings that could point at real numerical issues.
warnings.filterwarnings('ignore') 

In [14]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import scipy
import sys 
import time
import joblib
import multiprocessing
import random


sys.path.insert(0, './metrics/')
# from discriminative_metrics3 import discriminative_score_metrics
from discriminative_metrics2 import discriminative_score_metrics

from predictive_metrics3 import predictive_score_metrics
from visualization_metrics import visualization

In [15]:
# Directory holding the original (real) data. evaluate_data() builds file names by plain
# string concatenation ("<orig_data_dir><dataset>_subsampled_train_perc_<perc>.npz"),
# so the trailing slash is required.
orig_data_dir = "../../data/processed_orig_data/"
# Directory holding generated samples, one sub-folder per model:
# "<gen_data_dir><model>/<model>_gen_samples_<dataset>_perc_<perc>.npz".
# Also concatenated as a string, so the trailing slash is required here too.
gen_data_dir = "../../data/generated_data/"

# Root folder for the score CSVs written by evaluate_data(), one sub-folder per model.
scores_dir = './scores/'

# Scaler

In [16]:
class MinMaxScaler():
    """Min-max normalizer computed along the first axis.

    `fit` stores the minimum and the (max - min) range of the data over axis 0;
    `transform` maps data to roughly [0, 1] using those statistics and
    `inverse_transform` maps scaled data back to the original scale.

    Attributes (set by `fit`):
    - mini: minimum of the fitted data over axis 0
    - range: max - min of the fitted data over axis 0
    """

    # Added to the range before dividing so constant features (range == 0)
    # do not cause a division by zero.
    _EPS = 1e-7

    def fit_transform(self, data):
        """Fit on `data` and return its scaled version.

        Args:
        - data: original data

        Returns:
        - scaled_data: normalized data
        """
        return self.fit(data).transform(data)

    def fit(self, data):
        """Record min and range of `data` over axis 0.

        Args:
        - data: original data

        Returns:
        - self, so calls can be chained
        """
        self.mini = np.min(data, 0)
        self.range = np.max(data, 0) - self.mini
        return self

    def transform(self, data):
        """Scale `data` with the fitted statistics.

        Args:
        - data: data on the original scale

        Returns:
        - scaled_data: (data - mini) / (range + eps), as a new array
        """
        return (data - self.mini) / (self.range + self._EPS)

    def inverse_transform(self, data):
        """Map scaled data back to the original scale.

        This is the exact inverse of `transform` (it uses the same `range + eps`
        denominator) and it does not modify `data`: a new array is returned.
        The previous in-place version silently changed the caller's array and
        failed on integer arrays.

        Args:
        - data: data on the scaled ([0, 1]) scale

        Returns:
        - data on the original scale, as a new array
        """
        return data * (self.range + self._EPS) + self.mini

In [17]:
def confidence_interval(data, confidence=0.95):
    """Mean and half-width of a Student-t confidence interval.

    Args:
    - data: sequence of numeric observations
    - confidence: two-sided confidence level (default 0.95)

    Returns:
    - m: mean of the observations
    - h: half-width of the interval, i.e. the interval is [m - h, m + h].
         NaN when fewer than two observations are given, because the standard
         error is undefined in that case.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` is loaded on every SciPy version.
    from scipy import stats

    a = np.asarray(data, dtype=float)
    n = len(a)
    m = np.mean(a)
    if n < 2:
        return m, np.nan

    se = stats.sem(a)
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h

In [18]:
def split_list_into_lists(list_of_items, num_splits):
    """Cut a list into at most `num_splits` consecutive chunks.

    The chunk size is the item count divided by `num_splits`, rounded up, so
    every chunk except possibly the last has the same length. Chunks that
    would be empty are dropped, which means fewer than `num_splits` lists can
    be returned.

    Args:
    - list_of_items: list to partition
    - num_splits: maximum number of chunks

    Returns:
    - list of non-empty sub-lists, in the original order
    """
    quotient, remainder = divmod(len(list_of_items), num_splits)
    chunk_size = quotient if remainder == 0 else quotient + 1

    chunks = (
        list_of_items[k * chunk_size : (k + 1) * chunk_size]
        for k in range(num_splits)
    )
    return [chunk for chunk in chunks if chunk]

# Main Calculations

In [26]:
def _save_score_csv(kind, model, dataset, training_size, iters, epochs, mean, conf_int):
    """Write a one-row summary CSV for a single metric.

    Args:
    - kind: 'pred' or 'disc'; used for the `metric` column ('<kind>_score')
            and for the file name ('<model>_<kind>_scores_<dataset>_<size>.csv')
    - model, dataset, training_size: identify the evaluated combination
    - iters: number of metric repetitions that were averaged
    - epochs: epoch count recorded alongside the score
    - mean, conf_int: mean score and confidence-interval half-width
    """
    cols = ['model', 'dataset', 'train_perc', 'iters', 'epochs', 'mean', 'conf_int']
    df = pd.DataFrame([[model, dataset, training_size, iters, epochs, mean, conf_int]],
                      columns=cols)
    df.insert(0, 'metric', f'{kind}_score')

    # Create the per-model folder on demand; exist_ok keeps this safe when several
    # worker processes reach this point at the same time.
    out_dir = os.path.join(scores_dir, model)
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, f"{model}_{kind}_scores_{dataset}_{training_size}.csv"),
              index=False, float_format='%.4f')


def evaluate_data(params, split_num):
    """Score one (model, dataset, training size) combination and save the results.

    Loads the original data and the matching generated samples, min-max scales the
    original data, then computes the predictive and discriminative scores
    `metric_iteration` times each. The mean and the 95% confidence-interval
    half-width of both scores are printed and written to two CSV files under
    `<scores_dir>/<model>/`.

    Relies on these module-level names: orig_data_dir, gen_data_dir, scores_dir,
    metric_iteration, pred_epochs, disc_epochs, print_period.

    Args:
    - params: sequence whose first three items are (model, dataset, training_size)
    - split_num: index of the work split, used only in log messages

    Returns:
    - None. If the generated-sample file does not exist the combination is skipped.
    """
    model = params[0]
    dataset = params[1]
    training_size = params[2]

    print(f"Running Model: {model}; dataset = {dataset}, perc = {training_size} on split {split_num}")

    # Check for the generated samples first so nothing is loaded for a combination
    # that cannot be scored, and say so instead of returning silently.
    sample_file_name = gen_data_dir + f'{model}/{model}_gen_samples_{dataset}_perc_{training_size}.npz'
    if not os.path.isfile(sample_file_name):
        print(f"Skipping {model}/{dataset}/{training_size}: no generated samples at {sample_file_name}")
        return

    ## original data (context manager closes the underlying .npz file handle)
    fname = f'{orig_data_dir + dataset}_subsampled_train_perc_{training_size}.npz'
    with np.load(fname) as loaded:
        ori_data = loaded['data']

    ## scale orig
    scaler_orig = MinMaxScaler()
    scaled_ori_data = scaler_orig.fit_transform(ori_data)

    with np.load(sample_file_name) as loaded:
        gen_data = loaded['data']

    # Only the 'vae_conv_I' samples are rescaled with the scaler fitted on the
    # original data; samples from every other model are used exactly as stored.
    # NOTE(review): presumably the other models already emit scaled data — confirm.
    if model == 'vae_conv_I':
        scaled_gen_data = scaler_orig.transform(gen_data)
    else:
        scaled_gen_data = gen_data

    # ---------------------------------------------------------------------------
    # Optional visual sanity check, disabled for batch runs:
    # visualization(scaled_ori_data[0:scaled_gen_data.shape[0]], scaled_gen_data, 'pca')
    # visualization(scaled_ori_data[0:scaled_gen_data.shape[0]], scaled_gen_data, 'tsne')
    # ---------------------------------------------------------------------------

    predictive_score, discriminative_score = [], []
    for tt in range(metric_iteration):
        temp_pred = predictive_score_metrics(scaled_ori_data, scaled_gen_data,
                                             predictor = 'conv', # conv, rnn, nbeats
                                             epochs = pred_epochs, print_epochs = print_period)
        predictive_score.append(temp_pred)

        # NOTE(review): disc_epochs is written to the CSV below but is not passed
        # to discriminative_score_metrics here — confirm the metric really trains
        # for that many epochs.
        temp_disc = discriminative_score_metrics(scaled_ori_data, scaled_gen_data, print_epochs=print_period)
        discriminative_score.append(temp_disc)
        print(tt, model, dataset, training_size, temp_pred, temp_disc)

    pred_mean = np.round(np.mean(predictive_score), 4)
    pred_CI = np.round(confidence_interval(predictive_score)[1], 4)

    disc_mean = np.round(np.mean(discriminative_score), 4)
    disc_CI = np.round(confidence_interval(discriminative_score)[1], 4)

    print(f"***Split/Model/Data/Perc : {split_num}/{model}/{dataset}/{training_size} Scores:",
          pred_mean, "+/-", pred_CI, disc_mean, "+/-", disc_CI)

    # ---------------------------------------------------------------------------
    # save pred and disc results
    _save_score_csv('pred', model, dataset, training_size, metric_iteration,
                    pred_epochs, pred_mean, pred_CI)
    _save_score_csv('disc', model, dataset, training_size, metric_iteration,
                    disc_epochs, disc_mean, disc_CI)
    

In [20]:
def evaluate_all_data(params_sublist, split_num):
    """Run evaluate_data() on every parameter set of one work split.

    Args:
    - params_sublist: list of parameter sets, each passed unchanged to evaluate_data
    - split_num: index of this split, forwarded to evaluate_data and used in the
                 progress messages
    """
    num = len(params_sublist)
    # Count from 1 so the progress message reads "Completed 1 of N" ... "Completed N of N"
    # (the previous `i+i` printed 0, 2, 4, ...).
    for done, params in enumerate(params_sublist, start=1):
        evaluate_data(params, split_num)
        print(f"Completed {done} of {num} on {split_num}")

In [None]:
start = time.time()

# Number of times each metric is recomputed per (model, dataset, size) combination;
# evaluate_data() reports the mean and confidence interval over these repetitions.
metric_iteration = 5

# Epoch settings read as globals by evaluate_data().
pred_epochs = 500
disc_epochs = 500

# Passed to the metric functions as `print_epochs`.
print_period = 100

# full selection of data to run
models = ['vae_conv_I', 'rcgan', 'T_forcing', 'timegan']
training_sizes = [2, 5, 10, 20, 100]
datasets = ['stocks', 'stocks2', 'air', 'sine', 'energy']


### custom selection (overrides the full selection above)
models = ['vae_conv_I']
datasets = [ 'stocks2' ]
# training_sizes = [ 2, 5, 10 ]


params_list = [ [model, data_name, p ] for model in models for p in training_sizes  for data_name in datasets ]

# Leave two cores free, cap at 8 workers, never use more workers than jobs, and
# never go below one worker (cpu_count() - 2 is <= 0 on small machines, and an
# empty job list would otherwise give 0 and break the list splitting below).
num_cpus_to_use = max(1, min(multiprocessing.cpu_count() - 2, 8, len(params_list)))
# num_cpus_to_use = 1


if num_cpus_to_use == 1:
    evaluate_all_data(params_list, 0)
else:
#     random.shuffle(params_list)

    split_params_lists = split_list_into_lists(params_list, num_cpus_to_use)
    # The splitter may return fewer chunks than requested; size the pool to match.
    num_cpus_to_use = len(split_params_lists)
    print(f"Using {num_cpus_to_use} CPUs")

    with multiprocessing.Pool(num_cpus_to_use) as pool:
        # run the scoring jobs of each split in its own worker process
        async_results = [
            pool.apply_async(evaluate_all_data, args=(sublist, split_num))
            for split_num, sublist in enumerate(split_params_lists)
        ]

        pool.close()
        pool.join()

        # apply_async swallows worker exceptions unless the result is fetched;
        # report them so a failed split does not pass unnoticed.
        for split_num, result in enumerate(async_results):
            try:
                result.get()
            except Exception as err:
                print(f"Split {split_num} failed: {err!r}")


end = time.time()
print(f"Total run time: {np.round((end - start)/60.0, 2)} minutes") 

Using 5 CPUs
Running Model: vae_conv_I; dataset = stocks2, perc = 5 on split 1
Running Model: vae_conv_I; dataset = stocks2, perc = 10 on split 2
Running Model: vae_conv_I; dataset = stocks2, perc = 2 on split 0
Running Model: vae_conv_I; dataset = stocks2, perc = 100 on split 4
Running Model: vae_conv_I; dataset = stocks2, perc = 20 on split 3
Disc Avg. train / val loss for epoch 100: 0.664 / 0.676 
Disc Avg. train / val loss for epoch 100: 0.674 / 0.663 
Disc Avg. train / val loss for epoch 100: 0.69 / 0.689 
Disc Avg. train / val loss for epoch 200: 0.598 / 0.601 
Disc Avg. train / val loss for epoch 200: 0.639 / 0.624 
Disc Avg. train / val loss for epoch 100: 0.692 / 0.693 
Disc Avg. train / val loss for epoch 300: 0.466 / 0.362 
Disc Avg. train / val loss for epoch 300: 0.609 / 0.615 
Disc Avg. train / val loss for epoch 200: 0.681 / 0.675 
Disc Avg. train / val loss for epoch 400: 0.364 / 0.276 
Disc Avg. train / val loss for epoch 400: 0.509 / 0.528 
0 vae_conv_I stocks2 100 -1