# Exploratory Data Analysis 

## Performed on the energy network dataset, to verify outliers, distribution and meaningful graphs

In [14]:
# pip install sweetviz
# conda install -c conda-forge lux-api

# Specific libraries
# import sweetviz as sv
import autoviz
# from autoviz.AutoViz_Class import AutoViz_Class
import lux

# General libraries
import sys, os
from os import system
import warnings

import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt

from datetime import datetime as dt
from datetime import date
from datetime import timedelta

### Part 0: Integrating the data

In [5]:
# Constants
values_column_names = ["time", "branch" , "organization", "substation", "transformer_code", "App SW", 
                        "V_L1", "I_L1", "W_L1", "QL_L1", "QC_L1","cos_L1", "angle_L1",
                        "V_L2", "I_L2", "W_L2", "QL_L2", "QC_L2","cos_L2", "angle_L2",
                        "V_L3", "I_L3", "W_L3", "QL_L3", "QC_L3","cos_L3", "angle_L3",
                        "temp_amb",
                        "aplus_L1", "aminus_L1", "RplusL_L1", "RminusL_L1", "RplusC_L1", "RminusC_L1", 
                        "aplus_L2", "aminus_L2", "RplusL_L2", "RminusL_L2", "RplusC_L2", "RminusC_L2",
                        "aplus_L3", "aminus_L3", "RplusL_L3", "RminusL_L3", "RplusC_L3", "RminusC_L3"]

script_path = os.getcwd()
data = pd.read_csv('../DATA/LVSM_Def.csv',  sep = ';', header=0, names=values_column_names)

# Cleaning data table
data = data.drop(["aminus_L1", "RminusL_L1", "RplusC_L1", 
                  "aminus_L2", "RminusL_L2", "RplusC_L2",
                  "aminus_L3", "RminusL_L3", "RplusC_L3"], axis=1)

data = data.reset_index(drop = True)

# Change column types to appropiate
data = data.astype({"time": str, "branch": str , "organization": str, "substation": str, "transformer_code": str, "App SW": str})

data[["V_L1", "I_L1", "W_L1", "QL_L1", "QC_L1","cos_L1", "angle_L1",
      "V_L2", "I_L2", "W_L2", "QL_L2", "QC_L2","cos_L2", "angle_L2",
      "V_L3", "I_L3", "W_L3", "QL_L3", "QC_L3","cos_L3", "angle_L3",
      "temp_amb"]] = data[["V_L1", "I_L1", "W_L1", "QL_L1", "QC_L1","cos_L1", "angle_L1",
                           "V_L2", "I_L2", "W_L2", "QL_L2", "QC_L2","cos_L2", "angle_L2",
                           "V_L3", "I_L3", "W_L3", "QL_L3", "QC_L3","cos_L3", "angle_L3",
                           "temp_amb"]].astype(float)


### Deal with the "24:00" problem. Adapt BOTH the hour and the day.
# Get the indexes and replace hour
for i, date in enumerate(data['time']):
    if date.split()[1].split(':')[0] == '24':
        data.loc[i, 'time'] = data.loc[i, 'time'].replace("24:00","00:00")
        data.loc[i, 'time'] = pd.to_datetime(data.loc[i, 'time'], format = '%Y-%m-%d %H:%M') + timedelta(days = 1)

# Update the format
data['time'] = pd.to_datetime(data['time'], format = '%Y-%m-%d %H:%M:%S')


In [6]:
# Copy of the dataframe to split date and hour
data_new = data.copy(deep=True)

In [7]:
### Preformat

# Split the time column into date and hour columns, for diagram's input preparation
data_new['date'] = (data_new['time']).dt.date
data_new['hour'] = (data_new['time']).dt.time

# Delete the old time column
data_new = data_new.drop(["time"], axis=1)

# Put both columns at the start
data_new = pd.concat([data_new['hour'], data_new.drop('hour',axis=1)], axis=1)
data_new = pd.concat([data_new['date'], data_new.drop('date',axis=1)], axis=1)

# Cleaning NA values
if data_new.isna().sum().sum() < .10 * len(data_new): 
    print ("Cleaning NA values from dataset")
    data_new = data_new.dropna()
else:
    raise Exception("Careful! Deleting NaN values would cut most of the dataset")

# Remove duplicates
if data.duplicated().sum() < .10 * len(data_new): 
    print ("Cleaning duplicate values from dataset")
    data_new = data_new.drop_duplicates(subset=['date', 'hour', 'substation', 'App SW'])
else:
    raise Exception("Careful! Deleting duplicated values would cut most of the dataset")




Cleaning NA values from dataset
Cleaning duplicate values from dataset


In [8]:
warnings.filterwarnings('ignore')
data_new.head()

Unnamed: 0,date,hour,branch,organization,substation,transformer_code,App SW,V_L1,I_L1,W_L1,...,temp_amb,aplus_L1,RplusL_L1,RminusC_L1,aplus_L2,RplusL_L2,RminusC_L2,aplus_L3,RplusL_L3,RminusC_L3
0,2019-06-16,01:00:00,AE,SZZ,S201,TR1,003F,234.0,65.0,14964.0,...,30.0,16082.0,1983.0,0.0,16736.0,1620.0,0.0,23015.0,2179.0,0.0
1,2019-06-16,02:00:00,AE,SZZ,S201,TR1,003F,233.0,57.0,13091.0,...,29.0,14342.0,1441.0,0.0,14545.0,1057.0,28.0,23764.0,2906.0,0.0
2,2019-06-16,03:00:00,AE,SZZ,S201,TR1,003F,236.0,55.0,12847.0,...,29.0,13543.0,1381.0,0.0,14073.0,1141.0,0.0,22147.0,2942.0,0.0
3,2019-06-16,04:00:00,AE,SZZ,S201,TR1,003F,234.0,135.0,30517.0,...,29.0,20757.0,2954.0,0.0,22059.0,2021.0,0.0,27317.0,3701.0,0.0
4,2019-06-16,05:00:00,AE,SZZ,S201,TR1,003F,235.0,102.0,23069.0,...,29.0,29753.0,5054.0,0.0,31259.0,3121.0,2.0,33013.0,3778.0,0.0




In [9]:
# Prepare the train and test dataset
msk = np.random.rand(len(data_new)) < 0.98

df_train = data_new[msk]
df_test = data_new[~msk]

### Part 0: Automating EDA - Pandas methods

In [None]:
data_new.describe()

### Part 1: Automating EDA - Using Sweetviz

In [None]:
# Create analysis report
analyze_report = sv.analyze(data_new)
analyze_report.show_html()

In [None]:
# Create a Train and Test Data Comparison - Differences and Similarities
compare = sv.compare([df_train, "Training Data"], [df_test, "Test Data"], "W_L1")
compare.show_html()

In [12]:
# Create a Comparison - Particular column

intra_com = sv.compare_intra(data_new[["date", "hour", "substation", "App SW", 
                        "V_L1", "I_L1", "W_L1", "QL_L1", "QC_L1","cos_L1", "angle_L1",
                        "V_L2", "I_L2", "W_L2", "QL_L2", "QC_L2","cos_L2", "angle_L2",
                        "V_L3", "I_L3", "W_L3", "QL_L3", "QC_L3","cos_L3", "angle_L3",
                        "temp_amb"]], data_new["substation"] == "S242", ["S201", "S2274", "S242", "S286", "S287", "S406", "S480", "S499", "S531", "S612", "S68638", "S7116", "S733", "S740", "S744", "S76020", "S813", "S820", "S850", "S868"])
intra_com.show_html(filepath='Compare_Intra.html', open_browser=True, layout='widescreen', scale=None)

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:04 -> (00:00 left)
Report Compare_Intra.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [None]:
# Display the info on the output cell, not the browser
# report.show_notebook(w=None, h=None, scale=None,layout='widescreen',filepath=None)

### Part 2: Automating EDA - Using Autoviz

In [15]:
AV = AutoViz_Class()

df1 = AV.AutoViz('../DATA/LVSM_Def.xlsx')

NameError: name 'AutoViz_Class' is not defined

### Part 3: Automating EDA - Using Lux

In [None]:
data_new

In [None]:
data_new.intent = ["I_L1", "W_L1"]
data_new

In [None]:
interestingVis = data_new.exported
interestingVis

In [None]:
interestingVis[0]

In [None]:
X.recommendation["Enhance"]

In [None]:
print(interestingVis[0].to_matplotlib())

### Part 4: Filter useful info

In [None]:
data = data.drop(["aminus_L1", "RminusL_L1", "RplusC_L1", 
                  "aminus_L2", "RminusL_L2", "RplusC_L2",
                  "aminus_L3", "RminusL_L3", "RplusC_L3"], axis=1)


In [None]:
# Randomly shuffle a dataframe
data.reindex(np.random.permutation(data.index))