In [1]:
#! pip install matplotlib
#!pip install scipy

In [2]:
import os, warnings
# Quiet the notebook output before any heavy library is imported.
# TF_CPP_MIN_LOG_LEVEL accepts '0'..'3' (higher = less native TensorFlow logging);
# presumably the metric modules imported below use TensorFlow -- verify.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})
# Suppress every Python warning for the rest of the session.
warnings.simplefilter('ignore')

In [3]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import scipy
import sys 
import time

# from metrics.discriminative_metrics import discriminative_score_metrics
from metrics.discriminative_metrics2 import discriminative_score_metrics

# from metrics.predictive_metrics import predictive_score_metrics
from metrics.predictive_metrics2 import predictive_score_metrics

from metrics.visualization_metrics import visualization

In [4]:
# Input folders, relative to the notebook's working directory:
#   orig_data_dir -> '<dataset>_subsampled_train_perc_<size>.npz' files (original data)
#   gen_data_dir  -> '<model>/<model>_gen_samples_<dataset>_perc_<size>.npz' files (generated data)
orig_data_dir = '../data/processed_orig_data/'
gen_data_dir = '../data/generated_data/'

# Output folder for the per-run and combined score CSV files.
scores_dir = "./scores/"

# Scaler

In [5]:
class MinMaxScaler():
    """Min-max normalizer using statistics computed along axis 0.

    ``fit`` stores the minimum and the (max - min) range of the data along
    axis 0. ``transform`` maps data to ``(data - mini) / (range + eps)`` and
    ``inverse_transform`` applies the exact reverse mapping.

    Attributes set by ``fit``:
    - mini: minimum of the fitted data along axis 0
    - range: maximum minus minimum of the fitted data along axis 0
    """

    # Added to the range so that constant features do not cause division by zero.
    eps = 1e-7

    def fit_transform(self, data):
        """Fit the scaler on ``data`` and return the scaled version of it.

        Args:
        - data: array-like to fit on and scale

        Returns:
        - scaled_data: normalized data
        """
        self.fit(data)
        scaled_data = self.transform(data)
        return scaled_data

    def fit(self, data):
        """Record the axis-0 minimum and range of ``data``.

        Args:
        - data: array-like to take the statistics from

        Returns:
        - self, so calls can be chained
        """
        self.mini = np.min(data, 0)
        self.range = np.max(data, 0) - self.mini
        return self

    def transform(self, data):
        """Scale ``data`` with the statistics stored by ``fit``.

        Args:
        - data: array-like broadcastable against the fitted statistics

        Returns:
        - scaled_data: ``(data - mini) / (range + eps)``
        """
        numerator = data - self.mini
        scaled_data = numerator / (self.range + self.eps)
        return scaled_data

    def inverse_transform(self, data):
        """Map scaled data back to the original value range.

        Args:
        - data: array-like previously produced by ``transform``

        Returns:
        - a new array ``data * (range + eps) + mini``; the input is not modified
        """
        # Build a new array rather than using `*=` / `+=`: the in-place form
        # silently overwrote the caller's scaled data (and fails on integer arrays).
        # Multiply by (range + eps) so this is the exact inverse of `transform`.
        return data * (self.range + self.eps) + self.mini

# Main Calculations

In [6]:
def confidence_interval(data, confidence=0.95):
    """Return the sample mean and the half-width of its Student-t confidence interval.

    Args:
    - data: sequence of numeric samples
    - confidence: two-sided confidence level (default 0.95)

    Returns:
    - m: mean of the samples
    - h: half-width of the interval, i.e. the interval is (m - h, m + h);
         NaN when fewer than two samples are given, because the standard
         error is undefined in that case
    """
    # Imported locally: a bare `import scipy` does not guarantee that the
    # `scipy.stats` submodule is loaded on every SciPy version.
    from scipy import stats

    a = 1.0 * np.array(data)
    n = len(a)
    m = np.mean(a)
    if n < 2:
        # Same result the formula below would give (NaN), without the
        # degrees-of-freedom warnings.
        return m, np.nan
    se = stats.sem(a)
    # Critical t value for a two-sided interval with n - 1 degrees of freedom.
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h

In [7]:
# Evaluate generated samples against the original data for every selected
# (dataset, training size) pair: scale both with a scaler fitted on the
# original data, run the discriminative and predictive metrics
# `metric_iteration` times each, and write one CSV of summary scores per pair.
start = time.time()

metric_iteration = 2

# model name
model = 'vae_conv_I'      # vae_conv_I, vae_IN

disc_epochs = 30 ; pred_epochs = 30

# full selection of data to run
training_sizes = [2, 5, 10, 20, 100]
datasets = ['sine', 'stocks', 'energy', 'air']


# custom selection (overrides the full selection above; comment out to run everything)
# training_sizes = [ 5 ]
datasets = ['air']


def _load_npz_data(path):
    """Return the 'data' array stored in the .npz archive at `path`, closing the archive afterwards."""
    with np.load(path) as archive:
        return archive['data']


def _repeat_metric(metric_fn, ori, gen, epochs, n_iter, tag):
    """Run `metric_fn(ori, gen, epochs=epochs)` `n_iter` times and summarize the scores.

    Prints one progress line per iteration (labelled with `tag`) and returns
    (mean, CI half-width), both rounded to 4 decimals.
    """
    scores = []
    for it in range(n_iter):
        score = metric_fn(ori, gen, epochs=epochs)
        scores.append(score)
        print(f"----------  {tag} iter: ", it, 'score: ', score, '----------')
    mean_score = np.round(np.mean(scores), 4)
    ci_half_width = np.round(confidence_interval(scores)[1], 4)
    return mean_score, ci_half_width


cols = ['model', 'dataset', 'train_perc', 'iters',
        'disc_epochs', 'disc_mean', 'disc_CI',
        'pred_epochs', 'pred_mean', 'pred_CI']

# Create the output folder up front so a long metric run cannot fail at save time.
os.makedirs(scores_dir, exist_ok=True)

for dataset in datasets:

    for training_size in training_sizes:

        print('-'*90); print('-'*90)
        print(f"Data: {dataset}; Training Size: {training_size}")

        ## original data
        fname = f'{orig_data_dir + dataset}_subsampled_train_perc_{training_size}.npz'
        ori_data = _load_npz_data(fname)

        ## generated data
        sample_file_name = gen_data_dir + f'{model}/{model}_gen_samples_{dataset}_perc_{training_size}.npz'
        gen_data = _load_npz_data(sample_file_name)

        ## scale orig and generated data (the scaler is fitted on the original data only)
        scaler_orig = MinMaxScaler()
        scaled_ori_data = scaler_orig.fit_transform(ori_data)
        scaled_gen_data = scaler_orig.transform(gen_data)
        print('ori_data shape: ', ori_data.shape, 'gen_data shape: ', gen_data.shape)

        # ---------------------------------------------------------------------------
        print("-"*90); print('Visualizations:')
        # Visualizations are currently disabled; uncomment to plot.
        # visualization(scaled_ori_data[0:scaled_gen_data.shape[0]], scaled_gen_data, 'pca')
        # visualization(scaled_ori_data[0:scaled_gen_data.shape[0]], scaled_gen_data, 'tsne')

        # ---------------------------------------------------------------------------
        print("-"*90); print('Discrimination Score :')
        disc_mean, disc_CI = _repeat_metric(
            discriminative_score_metrics, scaled_ori_data, scaled_gen_data,
            disc_epochs, metric_iteration, 'disc')
        print("-"*90); print('Discrimination Score :')
        print('Discriminative score: ' + str(disc_mean))
        print("Discriminative score CI: ", disc_CI)

        # ---------------------------------------------------------------------------
        print("-"*90); print('Predictive Score :')
        pred_mean, pred_CI = _repeat_metric(
            predictive_score_metrics, scaled_ori_data, scaled_gen_data,
            pred_epochs, metric_iteration, 'pred')
        print('Predictive score: ' + str(pred_mean))
        print("Predictive score CI: ", pred_CI)

        print("\n")
        # ---------------------------------------------------------------------------
        # save results: one single-row CSV per (dataset, training size)
        score_row = [[model, dataset, training_size, metric_iteration,
                      disc_epochs, disc_mean, disc_CI,
                      pred_epochs, pred_mean, pred_CI]]
        df = pd.DataFrame(score_row, columns=cols)
        score_file = os.path.join(
            scores_dir, f"{model}_disc_and_pred_scores_{dataset}_{training_size}.csv")
        df.to_csv(score_file, index=False, float_format='%.4f')
end = time.time()
print(f"Total run time: {np.round((end - start)/60.0, 2)} minutes") 

------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------
Data: air; Training Size: 2
ori_data shape:  (163, 24, 13) gen_data shape:  (163, 24, 13)
------------------------------------------------------------------------------------------
Visualizations:
------------------------------------------------------------------------------------------
Discrimination Score :
Avg. train / val loss for epoch 0: 0.773 / 0.692 
----------  disc iter:  0 score:  0.48484848484848486 ----------
Avg. train / val loss for epoch 0: 0.802 / 1.008 
----------  disc iter:  1 score:  0.4696969696969697 ----------
------------------------------------------------------------------------------------------
Discrimination Score :
Discriminative score: 0.4773
Discriminative score CI:  0.0963
------------------------------------------------------------------------------------------
Predictive 



KeyboardInterrupt: 

# Combine all score files into a single file

In [8]:
# Gather every per-(dataset, training size) score file that exists on disk;
# combinations that were never run (no file) are skipped.
score_frames = []
for dataset in datasets:
    for training_size in training_sizes:
        fname = f"./{scores_dir}/{model}_disc_and_pred_scores_{dataset}_{training_size}.csv"
        if os.path.exists(fname):
            score_frames.append(pd.read_csv(fname))

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail with a message that says what is actually missing.
if not score_frames:
    raise FileNotFoundError(
        f"No score files found for model '{model}' in '{scores_dir}'; "
        "run the main calculations cell first."
    )

all_scores = pd.concat(score_frames, ignore_index=True).round(3)
all_scores.head()

Unnamed: 0,model,dataset,train_perc,iters,disc_epochs,disc_mean,disc_CI,pred_epochs,pred_mean,pred_CI
0,vae_conv_I,air,2,2,30,0.477,0.096,30,0.205,0.741
1,vae_conv_I,air,5,2,30,0.497,0.036,30,0.11,0.153


In [9]:
# Write the combined score table next to the per-run score files (row index omitted).
all_scores_file = f"./{scores_dir}/{model}_disc_and_pred_scores_ALL.csv"
all_scores.to_csv(all_scores_file, index=False)