In [None]:
""" Dimensionality Reduction

    This notebook compares the results of the dimensionality reduction performed
    by two methods on the same dataset:
    PCA vs Auto-Encoders.

"""
## LIBRARIES AND DATA

# Generic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from datetime import datetime as dt
from datetime import date
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras



In [None]:
### DATA EXTRACTION AND PREPARATION

# Column layout of the LVSM export. Judging by the names: identification
# fields first, then seven measurements per phase, the ambient temperature,
# and finally six energy counters per phase.
id_fields = ["time", "branch", "organization", "substation", "transformer_code", "App SW"]
phases = ("L1", "L2", "L3")
phase_quantities = ("V", "I", "W", "QL", "QC", "cos", "angle")
phase_counters = ("aplus", "aminus", "RplusL", "RminusL", "RplusC", "RminusC")

values_column_names = (
    id_fields
    + [f"{quantity}_{phase}" for phase in phases for quantity in phase_quantities]
    + ["temp_amb"]
    + [f"{counter}_{phase}" for phase in phases for counter in phase_counters]
)

# Load the values from the local, semicolon-separated CSV file. The file's
# own header row (row 0) is replaced by the column names built above.
# script_path = os.path.dirname(__file__)
data_lvsm = pd.read_csv(
    '../DATA/LVSM_Def.csv',
    sep=';',
    header=0,
    names=values_column_names,
)

# Read csv from GitLab
# url_data = 'https://gitlab.com/Ander_gargas/tfm-cic/-/raw/master/Listado_Trafos.csv'
# data_lvsm = pd.read_csv(url_data,  sep = ';', header=0, names=values_column_names, encoding='latin-1')

# Discard the per-phase energy-counter columns and renumber the rows from 0.
dropped_columns = [
    f"{counter}_{phase}"
    for phase in ("L1", "L2", "L3")
    for counter in ("aplus", "aminus", "RplusL", "RminusL", "RplusC", "RminusC")
]
data = data_lvsm.drop(columns=dropped_columns).reset_index(drop=True)

# Cast every remaining column to its working type: the identification
# fields (including the raw timestamp) become strings, the per-phase
# measurements and the ambient temperature become floats.
text_columns = ["time", "branch", "organization", "substation", "transformer_code", "App SW"]
float_columns = [
    f"{quantity}_{phase}"
    for phase in ("L1", "L2", "L3")
    for quantity in ("V", "I", "W", "QL", "QC", "cos", "angle")
] + ["temp_amb"]

data = data.astype({column: str for column in text_columns})
data[float_columns] = data[float_columns].astype(float)

### Deal with the "24:00" problem. Adapt BOTH the hour and the day.
def _normalize_24h_timestamps(raw_time):
    """Parse 'YYYY-MM-DD HH:MM' strings, rolling '24:00' over to the next day.

    A value such as '2020-01-31 24:00' is not a valid clock time, so it is
    rewritten as '00:00' and shifted one day forward (2020-02-01 00:00).

    Parameters
    ----------
    raw_time : pd.Series
        Timestamps as strings in '%Y-%m-%d %H:%M' format.

    Returns
    -------
    pd.Series
        Parsed datetimes, aligned with the index of ``raw_time``.
    """
    # Hour field of every row: second whitespace-separated token, first
    # ':'-separated piece. Rows lacking that token are simply not flagged.
    hour_field = raw_time.str.split().str[1].str.split(':').str[0]
    ends_at_24 = hour_field.eq('24')

    # Only the flagged rows are rewritten; every other row is parsed as-is.
    rewritten = raw_time.str.replace("24:00", "00:00", regex=False)
    parsed = pd.to_datetime(raw_time.mask(ends_at_24, rewritten), format='%Y-%m-%d %H:%M')

    # Move the flagged rows to the following calendar day.
    return parsed.mask(ends_at_24, parsed + pd.Timedelta(days=1))


# Vectorised on purpose: no loop variable shadows the imported `date`, no
# dependence on a 0..n-1 index, and one format for the whole column.
data['time'] = _normalize_24h_timestamps(data['time'])

# Build a separate frame in which the timestamp is replaced by two leading
# columns, 'date' and 'hour', for diagram's input preparation.
timestamps = data['time']

# drop() returns a new frame, so `data` itself is left untouched.
data_new = data.drop(columns=["time"])

# Insert 'hour' first and 'date' second, both at position 0, so the final
# column order is: date, hour, <all remaining columns>.
data_new.insert(0, 'hour', timestamps.dt.time)
data_new.insert(0, 'date', timestamps.dt.date)

# Largest share of rows that a cleaning step may remove before it is
# considered suspicious and aborted.
MAX_DROPPED_FRACTION = .10

# Cleaning NA values.
# Count ROWS containing at least one NaN (what dropna() actually removes),
# not individual NaN cells, so both sides of the comparison are row counts.
rows_with_na = data_new.isna().any(axis=1).sum()
if rows_with_na < MAX_DROPPED_FRACTION * len(data_new):
    data_new = data_new.dropna()
else:
    raise ValueError("Careful! Deleting NaN values would cut most of the dataset")

# Remove duplicates.
# The guard is evaluated on the same frame and the same key columns that
# drop_duplicates() uses, so it measures exactly the rows about to be removed.
duplicate_key = ['date', 'hour', 'substation', 'App SW']
duplicated_rows = data_new.duplicated(subset=duplicate_key).sum()
if duplicated_rows < MAX_DROPPED_FRACTION * len(data_new):
    data_new = data_new.drop_duplicates(subset=duplicate_key)
else:
    raise ValueError("Careful! Deleting duplicated values would cut most of the dataset")

# Prepare the train (learn) and test (performance evaluation) datasets.
# NOTE: no separate validation set is created here.
SPLIT_SEED = 42        # fixed seed so Restart & Run All reproduces the same split
TRAIN_FRACTION = 0.98  # expected share of rows assigned to the training set

# Each row is assigned independently, so the realised train share is only
# approximately TRAIN_FRACTION; the original row order is preserved.
split_rng = np.random.default_rng(SPLIT_SEED)
msk = split_rng.random(len(data_new)) < TRAIN_FRACTION

df_train = data_new[msk]
df_test = data_new[~msk]


In [None]:
### Part 1: PCA analysis

"""
Principal Component Analysis (PCA) is one of the most popular dimensionality reduction algorithms. 
PCA works by finding mutually orthogonal axes that account for the largest amount of variance in the data.

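The steps to perform PCA are:
    - Standardize the data.
    - Obtain the Eigenvectors and Eigenvalues from the covariance matrix or correlation matrix, or perform Singular Value Decomposition.
    - Sort the Eigenvalues in descending order and keep the k Eigenvectors associated with the k largest Eigenvalues.
    - Project the standardized data onto the subspace spanned by those k Eigenvectors.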
"""

In [None]:
### Part 2: Simple undercomplete AE

"""
An Autoencoder (AE), on the other hand, is a special kind of neural network which is trained to copy its input to its output.
First, it maps the input to a latent space of reduced dimension; then it decodes the latent representation back to the output.
An AE learns to compress data by minimizing the reconstruction error.
"""

In [None]:
### Part 3: Stacked Linear AE

In [None]:
### Part 4: Non-Linear AE