In [1]:
! pip install matplotlib
!pip install scipy

You should consider upgrading via the '/usr/local/bin/python3.7 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/local/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
import os, warnings
import joblib
# Set TensorFlow's C++ log-level environment variable.
# NOTE(review): presumably the `metrics` modules imported later use TensorFlow,
# which is why this is set before those imports — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
# Suppress every Python warning for the rest of the session (this also hides
# deprecation and pandas copy warnings raised by later cells).
warnings.filterwarnings('ignore') 

In [3]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import scipy
import sys 
import time

# from metrics.discriminative_metrics import discriminative_score_metrics
from metrics.discriminative_metrics2 import discriminative_score_metrics

# from metrics.predictive_metrics import predictive_score_metrics
from metrics.predictive_metrics2 import predictive_score_metrics

from metrics.visualization_metrics import visualization

In [4]:
# Input folders. Both are used by plain string concatenation further down
# (e.g. `orig_data_dir + dataset`), so the trailing slash is required.
orig_data_dir = "../data/processed_orig_data/"   # sub-sampled original .npz files
gen_data_dir = "../data/generated_data/"         # per-model folders of generated .npz samples

# Output folder for the per-run score CSVs and the combined / pivoted tables.
scores_dir = './scores/'

# Scaler

In [5]:
class MinMaxScaler():
    """Min Max normalizer using statistics taken along axis 0.

    ``fit`` stores the minimum and the (max - min) range obtained with
    ``np.min(data, 0)`` / ``np.max(data, 0)``. ``transform`` then maps data to
    ``(data - min) / (range + eps)`` and ``inverse_transform`` applies the
    exact reverse mapping.

    Args:
    - data: original data

    Returns:
    - norm_data: normalized data
    """

    # Added to the range so that a constant feature (range == 0) does not
    # trigger a division by zero in ``transform``.
    _EPS = 1e-7

    def fit_transform(self, data):
        """Fit the scaler on ``data`` and return the scaled ``data``."""
        return self.fit(data).transform(data)

    def fit(self, data):
        """Store the axis-0 minimum and range of ``data``; returns ``self``."""
        self.mini = np.min(data, 0)
        self.range = np.max(data, 0) - self.mini
        return self

    def transform(self, data):
        """Scale ``data`` with the statistics stored by ``fit``."""
        return (data - self.mini) / (self.range + self._EPS)

    def inverse_transform(self, data):
        """Map scaled ``data`` back to the original value range.

        Returns a new array. The input is left untouched: the previous
        in-place ``*=`` / ``+=`` silently modified the caller's array and
        failed for integer inputs. The same epsilon as in ``transform`` is
        used so that ``inverse_transform(transform(x))`` reproduces ``x``.
        """
        return np.asarray(data) * (self.range + self._EPS) + self.mini

# Main Calculations

In [6]:
def confidence_interval(data, confidence=0.95):
    """Return the mean of ``data`` and the half-width of its confidence interval.

    The half-width is ``sem * t.ppf((1 + confidence) / 2, n - 1)``, i.e. a
    two-sided Student-t interval with ``n - 1`` degrees of freedom, where
    ``n = len(data)``.

    Args:
    - data: sequence of numeric samples
    - confidence: confidence level of the interval (default 0.95)

    Returns:
    - (mean, half_width); ``half_width`` is NaN when fewer than two samples
      are given, because the standard error is undefined in that case.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` has been loaded on older SciPy releases.
    from scipy import stats

    samples = 1.0 * np.array(data)
    n = len(samples)
    if n < 2:
        # Same result as before (NaN half-width), without the runtime warnings.
        mean = np.mean(samples) if n else float('nan')
        return mean, float('nan')

    mean = np.mean(samples)
    std_err = stats.sem(samples)
    half_width = std_err * stats.t.ppf((1 + confidence) / 2., n - 1)
    return mean, half_width

In [11]:
start = time.time()

# Number of times each metric is re-computed; the repeated scores are
# summarised below as a mean and a confidence-interval half-width.
metric_iteration = 5

# model name
model = 'vae_conv_I'      # vae_conv_I, vae_IN, tforcing

disc_epochs = 500 ; pred_epochs = 500

# full selection of data to run
training_sizes = [2, 5, 10, 20, 100]
datasets = ['sine', 'stocks', 'energy', 'air']


# custom selection: overrides the full selection above (remove to run everything)
training_sizes = [ 2 ]
datasets = ['stocks']

# Column layout of the one-row score file written per (dataset, training size).
cols = ['model', 'dataset', 'train_perc', 'iters',
        'disc_epochs', 'disc_mean', 'disc_CI',
        'pred_epochs', 'pred_mean', 'pred_CI']

# `to_csv` does not create missing folders; make sure the target exists up
# front instead of failing only after the metrics have been computed.
os.makedirs(scores_dir, exist_ok=True)

for dataset in datasets:

    for training_size in training_sizes:

        print('-'*90); print('-'*90)
        print(f"Data: {dataset}; Training Size: {training_size}")

        ## original data
        fname = f'{orig_data_dir + dataset}_subsampled_train_perc_{training_size}.npz'
        # np.load on an .npz returns an archive object holding an open file
        # handle; the context manager closes it once the array has been read.
        with np.load(fname) as loaded:
            ori_data = loaded['data']

        # generated data
        sample_file_name = gen_data_dir + f'{model}/{model}_gen_samples_{dataset}_perc_{training_size}.npz'
        with np.load(sample_file_name) as loaded:
            gen_data = loaded['data']

        # Alternative loader for samples stored with joblib (disabled):
        # sample_file_name = gen_data_dir + f'{model}/{model}_gen_samples_{dataset}_{training_size}'
        # gen_data = joblib.load(sample_file_name)
        # gen_data = gen_data[0:ori_data.shape[0]]

        ## scale orig and generated data
        # The scaler is fitted on the original data only and re-used for the
        # generated data, so both are expressed on the same scale.
        scaler_orig = MinMaxScaler()
        scaled_ori_data = scaler_orig.fit_transform(ori_data)
        scaled_gen_data = scaler_orig.transform(gen_data)
        print('ori_data shape: ', ori_data.shape, 'gen_data shape: ', gen_data.shape)

        # ---------------------------------------------------------------------------
        # Optional visual comparison (disabled):
        # print("-"*90); print('Visualizations:')
        # visualization(scaled_ori_data[0:scaled_gen_data.shape[0]], scaled_gen_data, 'pca')
        # visualization(scaled_ori_data[0:scaled_gen_data.shape[0]], scaled_gen_data, 'tsne')

        # ---------------------------------------------------------------------------
        print("-"*90); print('Discrimination Score :')
        discriminative_score = list()
        for tt in range(metric_iteration):
            temp_disc = discriminative_score_metrics(scaled_ori_data, scaled_gen_data,  epochs = disc_epochs)
            discriminative_score.append(temp_disc)
            print("----------  disc iter: ", tt, 'score: ', temp_disc, '----------')

        disc_mean = np.round(np.mean(discriminative_score), 4)
        # confidence_interval returns (mean, half_width); only the half-width is kept.
        disc_CI = np.round(confidence_interval(discriminative_score)[1], 4)
        print("-"*90); print('Discrimination Score :')
        print('Discriminative score: ' + str(disc_mean))
        print("Discriminative score CI: ", disc_CI)

        # ---------------------------------------------------------------------------
        print("-"*90); print('Predictive Score :')
        predictive_score = list()
        for tt in range(metric_iteration):
            temp_pred = predictive_score_metrics(scaled_ori_data, scaled_gen_data, epochs = pred_epochs)
            predictive_score.append(temp_pred)
            print("----------  pred iter: ", tt, 'score: ', temp_pred, '----------')

        pred_mean = np.round(np.mean(predictive_score), 4)
        pred_CI = np.round(confidence_interval(predictive_score)[1], 4)
        print('Predictive score: ' + str(pred_mean))
        print("Predictive score CI: ", pred_CI)

        print("\n")
        # ---------------------------------------------------------------------------
        # save results: one single-row CSV per (model, dataset, training size)
        data = [[model, dataset, training_size, metric_iteration,
                 disc_epochs, disc_mean, disc_CI,
                 pred_epochs, pred_mean, pred_CI]]
        df = pd.DataFrame(data, columns = cols)
        # os.path.join avoids the "././scores//" form produced by formatting
        # scores_dir (which already starts with "./" and ends with "/") into
        # an f-string; it resolves to the same file.
        out_path = os.path.join(scores_dir, f"{model}_disc_and_pred_scores_{dataset}_{training_size}.csv")
        df.to_csv(out_path, index=False, float_format='%.4f')
end = time.time()
print(f"Total run time: {np.round((end - start)/60.0, 2)} minutes") 

------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------
Data: stocks; Training Size: 2
ori_data shape:  (49, 24, 6) gen_data shape:  (49, 24, 6)
------------------------------------------------------------------------------------------
Discrimination Score :
Avg. train / val loss for epoch 0: 0.642 / 0.525 
Avg. train / val loss for epoch 50: 0.438 / 0.372 
Avg. train / val loss for epoch 100: 0.276 / 0.201 
Avg. train / val loss for epoch 150: 0.192 / 0.089 
Avg. train / val loss for epoch 200: 0.172 / 0.064 
Avg. train / val loss for epoch 250: 0.105 / 0.171 
Avg. train / val loss for epoch 300: 0.104 / 0.059 
Avg. train / val loss for epoch 350: 0.102 / 0.032 
Avg. train / val loss for epoch 400: 0.087 / 0.03 
Avg. train / val loss for epoch 450: 0.083 / 0.027 
----------  disc iter:  0 score:  0.5 ----------
Avg. train / val loss for epoch 0: 0.491 / 0.448 


# Combine all score files into a single file

In [22]:
model = 'vae_conv_I'

training_sizes = [2, 5, 10, 20, 100]

datasets = ['sine', 'stocks', 'stocks2', 'energy', 'air']

# Collect whichever per-run score files exist for this model; combinations
# without a file are skipped silently.
score_frames = []
for dataset in datasets:
    for training_size in training_sizes:
        fname = os.path.join(scores_dir, f"{model}_disc_and_pred_scores_{dataset}_{training_size}.csv")
        if os.path.exists(fname):
            print(fname)
            score_frames.append(pd.read_csv(fname))

# pd.concat raises an uninformative "No objects to concatenate" on an empty
# list; fail with a message that says what is actually missing.
if not score_frames:
    raise FileNotFoundError(
        f"No score files found for model '{model}' in '{scores_dir}'. "
        "Run the score calculation cell first."
    )

all_scores = pd.concat(score_frames, ignore_index=True)
all_scores = all_scores.round(3)
all_scores.head()

././scores//vae_conv_I_disc_and_pred_scores_sine_2.csv
././scores//vae_conv_I_disc_and_pred_scores_sine_5.csv
././scores//vae_conv_I_disc_and_pred_scores_sine_10.csv
././scores//vae_conv_I_disc_and_pred_scores_sine_20.csv
././scores//vae_conv_I_disc_and_pred_scores_sine_100.csv
././scores//vae_conv_I_disc_and_pred_scores_stocks_2.csv
././scores//vae_conv_I_disc_and_pred_scores_stocks_5.csv
././scores//vae_conv_I_disc_and_pred_scores_stocks_10.csv
././scores//vae_conv_I_disc_and_pred_scores_stocks_20.csv
././scores//vae_conv_I_disc_and_pred_scores_stocks_100.csv
././scores//vae_conv_I_disc_and_pred_scores_stocks2_2.csv
././scores//vae_conv_I_disc_and_pred_scores_stocks2_5.csv
././scores//vae_conv_I_disc_and_pred_scores_stocks2_10.csv
././scores//vae_conv_I_disc_and_pred_scores_stocks2_20.csv
././scores//vae_conv_I_disc_and_pred_scores_stocks2_100.csv
././scores//vae_conv_I_disc_and_pred_scores_energy_2.csv
././scores//vae_conv_I_disc_and_pred_scores_energy_5.csv
././scores//vae_conv_I_d

Unnamed: 0,model,dataset,train_perc,iters,disc_epochs,disc_mean,disc_CI,pred_epochs,pred_mean,pred_CI
0,vae_conv_I,sine,2,5,500,0.472,0.039,500,0.28,0.034
1,vae_conv_I,sine,5,5,500,0.041,0.025,500,0.218,0.0
2,vae_conv_I,sine,10,5,500,0.089,0.037,500,0.215,0.001
3,vae_conv_I,sine,20,5,500,0.112,0.03,500,0.216,0.0
4,vae_conv_I,sine,100,5,500,0.228,0.089,500,0.214,0.0


In [23]:
# Write the combined score table for `model` to a single CSV in scores_dir
# (the DataFrame index is not written).
all_scores.to_csv(f"./{scores_dir}/{model}_disc_and_pred_scores_ALL.csv", index=False)

## Nicely organized score table 

In [24]:
def _format_score(value):
    """Format ``value`` with three decimals, dropping the zero before the point.

    Only a leading ``0`` is removed (``0.472`` -> ``.472``). Values whose
    integer part is not zero, negative values and NaN keep all their
    characters; blindly slicing off the first character would corrupt them.
    """
    text = f'{value:0.3f}'
    if text.startswith('0.'):
        return text[1:]
    if text.startswith('-0.'):
        return '-' + text[2:]
    return text


score_columns = ['disc_mean', 'disc_CI', 'pred_mean', 'pred_CI']

# .copy() gives an independent frame, so the assignments below neither trigger
# SettingWithCopyWarning nor risk writing through to all_scores.
new_df = all_scores[['model', 'dataset', 'train_perc'] + score_columns].copy()

for score_column in score_columns:
    new_df[score_column] = new_df[score_column].apply(_format_score)
new_df.head()

Unnamed: 0,model,dataset,train_perc,disc_mean,disc_CI,pred_mean,pred_CI
0,vae_conv_I,sine,2,0.472,0.039,0.28,0.034
1,vae_conv_I,sine,5,0.041,0.025,0.218,0.0
2,vae_conv_I,sine,10,0.089,0.037,0.215,0.001
3,vae_conv_I,sine,20,0.112,0.03,0.216,0.0
4,vae_conv_I,sine,100,0.228,0.089,0.214,0.0


In [25]:
# Join the already-formatted mean and CI strings into one "mean +/- CI"
# display column per metric (element-wise string concatenation).
new_df['disc_score'] = new_df['disc_mean'] + ' +/- ' + new_df['disc_CI']
new_df['pred_score'] = new_df['pred_mean'] + ' +/- ' + new_df['pred_CI']

# The separate mean / CI columns are no longer needed once combined.
new_df = new_df.drop(columns=['disc_mean', 'disc_CI', 'pred_mean', 'pred_CI'])

new_df.head()

Unnamed: 0,model,dataset,train_perc,disc_score,pred_score
0,vae_conv_I,sine,2,.472 +/- .039,.280 +/- .034
1,vae_conv_I,sine,5,.041 +/- .025,.218 +/- .000
2,vae_conv_I,sine,10,.089 +/- .037,.215 +/- .001
3,vae_conv_I,sine,20,.112 +/- .030,.216 +/- .000
4,vae_conv_I,sine,100,.228 +/- .089,.214 +/- .000


In [26]:

non_pivoted_columns = ['model', 'train_perc']
pivoting_column = ['dataset']
metrics = ['disc_score', 'pred_score']


def _pivot_metric(metric):
    """Return one wide table for ``metric``: a row per (model, train_perc), a column per dataset."""
    selected = new_df[non_pivoted_columns + pivoting_column + [metric]]
    wide = selected.pivot_table(
        index=non_pivoted_columns,
        columns=pivoting_column,
        values=metric,
        # The values are strings, so entries sharing a cell are joined with a space.
        aggfunc=lambda x: ' '.join(x),
    ).reset_index()
    # Tag the rows so the metrics remain distinguishable after stacking.
    wide.insert(0, 'metric', metric)
    return wide


# Stack the per-metric tables on top of each other.
final_df = pd.concat([_pivot_metric(metric) for metric in metrics], axis=0, ignore_index=True)
final_df.head(10)

dataset,metric,model,train_perc,air,energy,sine,stocks,stocks2
0,disc_score,vae_conv_I,2,.497 +/- .008,.485 +/- .015,.472 +/- .039,.450 +/- .000,.482 +/- .031
1,disc_score,vae_conv_I,5,.401 +/- .271,.498 +/- .003,.041 +/- .025,.403 +/- .025,.460 +/- .055
2,disc_score,vae_conv_I,10,.498 +/- .003,.500 +/- .000,.089 +/- .037,.422 +/- .124,.297 +/- .215
3,disc_score,vae_conv_I,20,.500 +/- .000,.500 +/- .000,.112 +/- .030,.496 +/- .005,.431 +/- .126
4,disc_score,vae_conv_I,100,.499 +/- .000,.500 +/- .000,.228 +/- .089,.374 +/- .054,.396 +/- .065
5,pred_score,vae_conv_I,2,.069 +/- .005,.266 +/- .003,.280 +/- .034,.193 +/- .103,.163 +/- .016
6,pred_score,vae_conv_I,5,.046 +/- .002,.295 +/- .003,.218 +/- .000,.128 +/- .005,.122 +/- .019
7,pred_score,vae_conv_I,10,.005 +/- .000,.292 +/- .002,.215 +/- .001,.103 +/- .004,.074 +/- .000
8,pred_score,vae_conv_I,20,.011 +/- .001,.319 +/- .002,.216 +/- .000,.104 +/- .001,.051 +/- .001
9,pred_score,vae_conv_I,100,.008 +/- .001,.342 +/- .014,.214 +/- .000,.040 +/- .000,.018 +/- .000


In [27]:
# Write the pivoted table (one row per metric / model / training size, one
# column per dataset) to a CSV in scores_dir, without the DataFrame index.
final_df.to_csv(f"./{scores_dir}/{model}_disc_and_pred_scores_ALL_PIVOTED.csv", index=False)