# VAE Time Series Generation with top 10 Variables

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys

In [None]:
# Make the parent of the current working directory importable, adding it to
# sys.path only if it is not already listed.
module_path = os.path.abspath(os.pardir)
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [None]:
from config import Config
from dataset import DataManager
from models import VAE
from train import ModelManager
from generate import Generator
from evaluate import Evaluator
from visualize import Visualizer
from utils import Utils

In [None]:
pd.set_option('display.max_columns', None)
%matplotlib inline

# Configs

In [None]:
# Re-execute the config module so that edits made to it on disk are picked up
# without restarting the kernel, then rebind Config to the reloaded class.
# NOTE: the next cell rebinds the name `config` to a Config instance, so the
# `import config` line here must run again before any later reload.
import importlib
import config
importlib.reload(config)
from config import Config

In [None]:
# Start from the project's Config and override selected settings for this run.
config = Config()

_overrides = {
    'BETA': 0.7,
    'FINAL_BETA': 0.7,
    'EPOCHS': 2000,
    'LEARNING_RATE': 8e-5,
    'BATCH_SIZE': {"FULL": 80},
}
for _name, _value in _overrides.items():
    setattr(config, _name, _value)

# Hand the configured seed to the project helper before any data is loaded.
Utils.set_seed(config.SEED)

# Load Data

In [None]:
# Build the data manager, load the data and prepare the data loaders.
data_manager = DataManager(config)
data_manager.load_data()
data_manager.prepare_dataloaders()

# Report the shape of each of the P1/P2/P3 datasets.
for _policy in ('P1', 'P2', 'P3'):
    _dataset = getattr(data_manager, f"{_policy}_dataset")
    print(f"Size of {_policy} dataset: {_dataset.shape}")

# Train Conditional VAE

In [None]:
# Train on the loader exposed by the data manager as `full_loader` and keep
# the returned model for generation and saving below.
model_manager = ModelManager(config)

print("Start training Conditional VAE...")
train_loader = data_manager.full_loader
cvae_model = model_manager.train_model(train_loader)

# Generate Synthetic Data

In [None]:
# Override GEN_SAMPLES on the shared config before the Generator is built from it.
# Presumably the number of synthetic samples to draw — confirm in generate.py.
config.GEN_SAMPLES = 1000

In [None]:

# The scalers are handed over in P1, P2, P3 order.
policy_scalers = [
    data_manager.scaler_P1,
    data_manager.scaler_P2,
    data_manager.scaler_P3,
]
generator = Generator(model=cvae_model, scalers=policy_scalers, config=config)

# generate_all() returns the combined data and labels plus one output per policy.
gen_data, gen_labels, gen_P1, gen_P2, gen_P3 = generator.generate_all()

In [None]:
# Rebind the per-policy names to the attributes stored on the generator.
gen_P1, gen_P2, gen_P3 = generator.gen_P1, generator.gen_P2, generator.gen_P3

# Save Outcome

In [None]:
# Persist the generated samples and their labels as .npy files in the working directory.
for _filename, _array in (
    ('generated_data.npy', gen_data),
    ('generated_labels.npy', gen_labels),
):
    np.save(_filename, _array)

In [None]:
# Save only the model's state_dict (not the module object itself) to the working directory.
torch.save(cvae_model.state_dict(), 'Policy-CVAE.pth')

In [None]:
# Names of the exported variables; position j in this list labels gen_data[:, :, j].
variables = [
    'Emissions|CO2',
    'Final Energy|Industry|Solids|Coal',
    'Final Energy|Industry|Solids|Biomass',
    'Final Energy|Residential and Commercial|Solids|Coal',
    'Emissions|CO2|Energy|Demand|Industry',
    'Secondary Energy|Electricity|Coal',
    'Emissions|CO2|Energy|Supply|Electricity',
    'Primary Energy|Coal',
    'Emissions|CO2|Energy and Industrial Processes',
    'Emissions|CO2|AFOLU',
]
# Column headers 2020, 2025, ..., 2100 (17 entries).
# Assumes gen_data.shape[1] == 17 — confirm against the generator output.
years = list(map(str, range(2020, 2101, 5)))

# One row per (sample, variable) pair: the variable name, the values along
# axis 1 of gen_data for that variable, and the sample's integer label.
rows = [
    [var_name, *gen_data[sample_idx, :, var_idx].tolist(), int(gen_labels[sample_idx])]
    for sample_idx in range(gen_data.shape[0])
    for var_idx, var_name in enumerate(variables)
]

df = pd.DataFrame(rows, columns=['Variable', *years, 'Label'])
df.to_csv('synthetic_data.csv', index=False)

# Data Quality Estimation based on RF

In [None]:
# Re-execute the evaluate module so that edits made to it on disk are picked up
# without restarting the kernel, then rebind Evaluator to the reloaded class.
import importlib
import evaluate
importlib.reload(evaluate)
from evaluate import Evaluator

In [None]:
# Re-execute the utils module so that edits made to it on disk are picked up
# without restarting the kernel, then rebind Utils to the reloaded class.
# (A `del Utils` before the re-import was tried earlier and is left disabled.)
import importlib
import utils
importlib.reload(utils)
from utils import Utils

In [None]:
# Rebind the per-policy names to the attributes stored on the generator.
gen_P1, gen_P2, gen_P3 = generator.gen_P1, generator.gen_P2, generator.gen_P3

In [None]:
# Inputs for the evaluator: the data manager's full data and labels on one
# side, the generator's stored samples and labels on the other.
X_test, Y_test = data_manager.all_data, data_manager.all_labels
gen_samples, gen_labels = generator.gen_data, generator.gen_labels

In [None]:
# Run the evaluation on the real data/labels and the generated samples/labels.
# It returns two classifiers and two DataFrames (names suggest one per
# direction, real->generated and generated->real — confirm in evaluate.py).
evaluator = Evaluator(config)
clf_rg, clf_gr, real_df, gen_df = evaluator.evaluate(X_test, Y_test, gen_samples, gen_labels)

In [None]:
# Request the top 10 features; the helper returns two results, bound here as
# the real-to-gen and gen-to-real rankings (presumably one per classifier — confirm).
top_real_to_gen, top_gen_to_real = evaluator.print_feature_importance(top_n=10)

# Similarity Assessment

In [None]:
from similarity import rmse_mean, mape_mean, dtw_distance_mean, wasserstein_distance_mean

In [None]:
from dataset import TimeSeriesScaler

In [None]:
# Real per-policy data that the generated samples are compared against.
# NOTE(review): real_P1/real_P2/real_P3 were never assigned anywhere in this
# notebook, so a fresh top-to-bottom run raised NameError in this cell. They
# are bound here to the per-policy datasets held by data_manager — this assumes
# those are the arrays TimeSeriesScaler expects and that they share the layout
# of gen_P1/gen_P2/gen_P3; confirm against dataset.py.
real_P1 = data_manager.P1_dataset
real_P2 = data_manager.P2_dataset
real_P3 = data_manager.P3_dataset

# One independent scaler per policy, each fitted on that policy's real data
# with a (0, 1) target range.
scaler_real_P1 = TimeSeriesScaler(range_values=(0, 1))
scaler_real_P2 = TimeSeriesScaler(range_values=(0, 1))
scaler_real_P3 = TimeSeriesScaler(range_values=(0, 1))

scaled_real_P1 = scaler_real_P1.fit_transform(real_P1)
scaled_real_P2 = scaler_real_P2.fit_transform(real_P2)
scaled_real_P3 = scaler_real_P3.fit_transform(real_P3)

In [None]:
# One independent scaler per policy for the generated data, with a (0, 1)
# target range.
scaler_gen_P1 = TimeSeriesScaler(range_values=(0, 1))
scaler_gen_P2 = TimeSeriesScaler(range_values=(0, 1))
scaler_gen_P3 = TimeSeriesScaler(range_values=(0, 1))

# Fit the *generated-data* scalers here. The previous code called
# scaler_real_*.fit_transform on the generated arrays, which left the
# scaler_gen_* objects unused and overwrote the real-data fit stored in
# scaler_real_*. Each call still fits on the generated array it transforms,
# so the scaled outputs are unchanged.
scaled_gen_P1 = scaler_gen_P1.fit_transform(gen_P1)
scaled_gen_P2 = scaler_gen_P2.fit_transform(gen_P2)
scaled_gen_P3 = scaler_gen_P3.fit_transform(gen_P3)

## RMSE

In [None]:
# RMSE table: one row per policy, one column per feature index 0-9.
features = range(10)
policy_pairs = {
    'P1': (scaled_real_P1, scaled_gen_P1),
    'P2': (scaled_real_P2, scaled_gen_P2),
    'P3': (scaled_real_P3, scaled_gen_P3),
}
df_rmse = pd.DataFrame(index=list(policy_pairs), columns=features)

# Fill the table cell by cell with the metric for each (policy, feature) pair.
for policy_name, (real, gen) in policy_pairs.items():
    for feat in features:
        df_rmse.loc[policy_name, feat] = rmse_mean(real, gen, feature_idx=feat)

In [None]:
# df_rmse is created empty and filled cell by cell, which leaves its columns
# with object dtype; to_csv applies float_format only to float columns, so
# cast to float first or the 4-decimal formatting is silently skipped.
df_rmse.astype(float).to_csv('rmse_results.csv', float_format='%.4f')

## MAPE

In [None]:
# MAPE table: one row per policy, one column per feature index 0-9.
features = range(10)
policy_pairs = {
    'P1': (scaled_real_P1, scaled_gen_P1),
    'P2': (scaled_real_P2, scaled_gen_P2),
    'P3': (scaled_real_P3, scaled_gen_P3),
}
df_mape = pd.DataFrame(index=list(policy_pairs), columns=features)

# Fill the table cell by cell with the metric for each (policy, feature) pair.
for policy_name, (real, gen) in policy_pairs.items():
    for feat in features:
        df_mape.loc[policy_name, feat] = mape_mean(real, gen, feature_idx=feat)

In [None]:
# df_mape is created empty and filled cell by cell, which leaves its columns
# with object dtype; to_csv applies float_format only to float columns, so
# cast to float first or the 4-decimal formatting is silently skipped.
df_mape.astype(float).to_csv('mape_results.csv', float_format='%.4f')

## DTW

In [None]:
# DTW table: one row per policy, one column per feature index 0-9.
features = range(10)
policy_pairs = {
    'P1': (scaled_real_P1, scaled_gen_P1),
    'P2': (scaled_real_P2, scaled_gen_P2),
    'P3': (scaled_real_P3, scaled_gen_P3),
}
df_dtw = pd.DataFrame(index=list(policy_pairs), columns=features)

# Fill the table cell by cell with the metric for each (policy, feature) pair.
for policy_name, (real, gen) in policy_pairs.items():
    for feat in features:
        df_dtw.loc[policy_name, feat] = dtw_distance_mean(real, gen, feature_idx=feat)

In [None]:
# df_dtw is created empty and filled cell by cell, which leaves its columns
# with object dtype; to_csv applies float_format only to float columns, so
# cast to float first or the 4-decimal formatting is silently skipped.
df_dtw.astype(float).to_csv('dtw_results.csv', float_format='%.4f')

## Wasserstein Distance

In [None]:
# Wasserstein-distance table: one row per policy, one column per feature index 0-9.
features = range(10)
policy_pairs = {
    'P1': (scaled_real_P1, scaled_gen_P1),
    'P2': (scaled_real_P2, scaled_gen_P2),
    'P3': (scaled_real_P3, scaled_gen_P3),
}
df_w = pd.DataFrame(index=list(policy_pairs), columns=features)

# Fill the table cell by cell with the metric for each (policy, feature) pair.
for policy_name, (real, gen) in policy_pairs.items():
    for feat in features:
        df_w.loc[policy_name, feat] = wasserstein_distance_mean(real, gen, feature_idx=feat)

In [None]:
# df_w is created empty and filled cell by cell, which leaves its columns
# with object dtype; to_csv applies float_format only to float columns, so
# cast to float first or the 4-decimal formatting is silently skipped.
df_w.astype(float).to_csv('wasserstein_results.csv', float_format='%.4f')