# Notebook developing main.py

In [1]:
import datetime
import pandas as pd
import os
from plotly.subplots import make_subplots

### import the project scripts that implement each EDA step
from scripts import ydata_profiling as dp
from scripts import univariate_analysis as uv
from scripts import bivariate_analysis as bv
from scripts import data_segmentation_analysis as ds

In [2]:
### read json config
# Load the report configuration into `config`, which every later cell reads.
import json
path_json = 'config.json'
# JSON is UTF-8 by specification; passing the encoding explicitly keeps the
# platform's locale encoding from being used to decode the file.
with open(path_json, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

### 0. Define parameters of the report

In [3]:
# Load the pickled dataset whose file name is given in the report config.
# NOTE(review): pd.read_pickle unpickles arbitrary objects - the configured
# file should only ever be a trusted one.
name_data_pkl = config['config_report']['name_data_pkl']
path_data_pkl = ''.join(('data/', name_data_pkl))
data = pd.read_pickle(path_data_pkl)
# last expression of the cell: the notebook displays the first two rows
data.head(2)

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2009-01-01 00:10:00,996.52,-8.02,265.4,-8.9,93.3,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3
2009-01-01 00:20:00,996.57,-8.41,265.01,-9.28,93.4,3.23,3.02,0.21,1.89,3.03,1309.8,0.72,1.5,136.1


In [4]:
# Build the report id as "<name_report>-<timestamp>"; it is used to name the
# output folder of this run.
name_report = config['config_report']['name_report']
datetime_report = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
id_report = '-'.join((name_report, datetime_report))
# last expression of the cell: the notebook displays the generated id
id_report

'dataset-climate-tf-2024-01-11-20-48-21'

In [5]:
# Create the output folder of this report and one sub-folder per analysis.
# exist_ok=True makes the cell safe to re-run with the same `id_report`
# (without it a second run raises FileExistsError).
path_report = 'output_eda/' + id_report
os.makedirs(path_report, exist_ok=True)
for name_section in (
    'ydata_profiling',
    'univariate_analysis',
    'bivariate_analysis',
    'data_segmentation_analysis',
):
    os.makedirs(path_report + '/' + name_section, exist_ok=True)

### 1. Define reports to show

In [6]:
# Read the on/off flag of every report section from the config and echo the
# flags so the run log shows which sections were requested.
reports_to_show = config['reports_to_show']
show_ydata_profiling = reports_to_show['ydata_profiling']
show_univariate_analysis = reports_to_show['univariate_analysis']
show_bivariate_analysis = reports_to_show['bivariate_analysis']
show_data_segmentation_analysis = reports_to_show['data_segmentation_analysis']

print('--- repots to show ---')
print('show_ydata_profiling: ', show_ydata_profiling)
print('show_univariate_analysis: ', show_univariate_analysis)
print('show_bivariate_analysis: ', show_bivariate_analysis)
print('show_data_segmentation_analysis: ', show_data_segmentation_analysis)
print('--- --- --- --- --- ---')

--- repots to show ---
show_ydata_profiling:  False
show_univariate_analysis:  False
show_bivariate_analysis:  True
show_data_segmentation_analysis:  False
--- --- --- --- --- ---


### 2. ydata-profiling

In [7]:
if show_ydata_profiling:
    # Section 2: delegate the whole profiling report to the `dp` helper script.

    # `minimal` is forwarded unchanged to the report generator
    param_minimal = config['ydata_profiling']['minimal']

    # progress line with the start time of this step
    started_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f'ydata-profiling... time:{started_at}')

    dp.generate_report_ydata_profiling(
        df=data,
        minimal=param_minimal,
        id_report=id_report,
    )

### 3. univariate_analysis
The code is divided into two parts:
- read the params used to generate the plots
- generate an individual plotly figure for each plot

In [8]:
if show_univariate_analysis:
    # Section 3: univariate report. Every step prints a timestamped progress
    # line, builds one figure through a `uv` helper and saves it in the
    # univariate_analysis folder of this report.

    # ---------------- PARAMS ----------------
    # read params number columns to plot
    param_number_columns = config['univariate_analysis']['number_columns']

    # read params zoom tendency
    param_zoom = config['univariate_analysis']['zoom_tendency']['zoom']
    param_zoom_start_date = config['univariate_analysis']['zoom_tendency']['start_date']
    param_zoom_end_date = config['univariate_analysis']['zoom_tendency']['end_date']

    # read params smooth data
    param_smooth_ma_window = config['univariate_analysis']['smooth_data']['moving_average']['window']
    param_smooth_wma_weights = config['univariate_analysis']['smooth_data']['weighted_moving_average']['weights']
    param_smooth_ema_aplha = config['univariate_analysis']['smooth_data']['exponential_moving_average']['alpha']

    # read params acf/pacf
    param_lags = config['univariate_analysis']['acf_pacf']['lags']

    # folder that receives every file written by this section
    path_univariate = f"output_eda/{id_report}/univariate_analysis"

    # ---------------- PLOTS ----------------
    ################### fig descriptive statistics ###################
    print(f'statistics... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    fig_statistics = uv.generate_descriptive_statistics(df = data)
    fig_statistics.write_html(f"{path_univariate}/statistics.html")

    ################### fig histogram all features ###################
    print(f'histogram... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    fig_hist_all = uv.plot_multiple_hist(df = data, number_columns = param_number_columns)
    fig_hist_all.write_html(f"{path_univariate}/histograms.html")

    # saved with savefig (png) instead of write_html
    fig_hist_kde_all = uv.plot_kde_hist(df = data, number_columns = param_number_columns)
    fig_hist_kde_all.savefig(f"{path_univariate}/histograms_kde.png", dpi = 300)

    ################### zoom data for the tendency / smoothing plots - zoom to reduce cost to plot ###################
    if param_zoom:
        # label-based slice of the index between the configured dates
        data_tendency = data.loc[param_zoom_start_date:param_zoom_end_date]
        data_smooth = data.loc[param_zoom_start_date:param_zoom_end_date]
    else:
        data_tendency = data.copy()
        data_smooth = data.copy()

    ################### fig tendency all features ###################
    print(f'trend... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    fig_tendency_all = uv.plot_multiple_tendency(df = data_tendency, number_columns = param_number_columns)
    fig_tendency_all.write_html(f"{path_univariate}/tendency.html")

    ################### fig boxplot for each month and year ###################
    print(f'boxplots... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    # always 1 boxplot per column because there are 12 months; uses the full data, not the zoom
    fig_boxplot_all = uv.plot_multiple_boxplot_months(df = data, number_columns = 1)
    fig_boxplot_all.write_html(f"{path_univariate}/boxplots.html")

    ################### fig smooth data ###################
    # each smoother receives a copy of `data_smooth`, which is also passed
    # as the original series to the comparison plot

    ## moving average
    print(f'moving average... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    data_moving_average = uv.apply_moving_average(df = data_smooth.copy(), window_size = param_smooth_ma_window)
    fig_moving_average = uv.plot_compare_tendencias(
        df_original = data_smooth,
        df_smoothed = data_moving_average,
        number_columns = param_number_columns,
        kind_smooth = f'moving average - window: {param_smooth_ma_window}',
    )
    fig_moving_average.write_html(f"{path_univariate}/moving_average.html")

    ## weighted moving average
    print(f'weighted moving average... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    data_weighted_moving_average = uv.apply_weighted_moving_average(df = data_smooth.copy(), weights = param_smooth_wma_weights)
    # NOTE(review): if `weights` is a list, the label below renders with
    # doubled brackets ("[[...]]") - confirm the intended title text.
    fig_weighted_moving_average = uv.plot_compare_tendencias(
        df_original = data_smooth,
        df_smoothed = data_weighted_moving_average,
        number_columns = param_number_columns,
        kind_smooth = f'weighted moving average - weights: [{param_smooth_wma_weights}]',
    )
    fig_weighted_moving_average.write_html(f"{path_univariate}/weighted_moving_average.html")

    ## exponential moving average
    print(f'exponential moving average... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    data_exponential_moving_average = uv.apply_exponential_moving_average(df = data_smooth.copy(), alpha = param_smooth_ema_aplha)
    fig_exponential_moving_average = uv.plot_compare_tendencias(
        df_original = data_smooth,
        df_smoothed = data_exponential_moving_average,
        number_columns = param_number_columns,
        kind_smooth = f'exponential moving average - alpha: {param_smooth_ema_aplha}',
    )
    fig_exponential_moving_average.write_html(f"{path_univariate}/exponential_moving_average.html")

    ################### fig acf ###################
    print(f'acf... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    fig_acf = uv.plot_all_acf(df = data, lags = param_lags, number_columns = param_number_columns)
    fig_acf.write_html(f"{path_univariate}/acf.html")

    print(f'acf stats models... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    fig_acf_stats = uv.plot_all_acf_stats(df = data, lags = param_lags, number_columns = param_number_columns)  # v2 statsmodels
    fig_acf_stats.savefig(f"{path_univariate}/acf_stats.png", dpi = 300)

    ################### fig pacf ###################
    print(f'pacf... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    fig_pacf = uv.plot_all_pacf(df = data, lags = param_lags, number_columns = param_number_columns)
    fig_pacf.write_html(f"{path_univariate}/pacf.html")

    print(f'pacf stats models... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    # own name for the PACF figure so the ACF figure handle is not overwritten
    fig_pacf_stats = uv.plot_all_pacf_stats(df = data, lags = param_lags, number_columns = param_number_columns)  # v2 statsmodels
    fig_pacf_stats.savefig(f"{path_univariate}/pacf_stats.png", dpi = 300)

    ################### end ###################
    print(f'fin... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

### 4. bivariate_analysis

In [9]:
# Development placeholder: only confirms that the bivariate section is
# switched on in the config.
if show_bivariate_analysis:
    print("go")

go


In [10]:
# Section 4: bivariate report. It runs only when the section is enabled in the
# config, like the ydata-profiling and univariate sections, so a disabled
# section neither computes correlations nor writes files.
if show_bivariate_analysis:

    # ---------------- PARAMS ----------------
    # read params feature target
    target = config['config_report']['feature_target']

    # read param correlations
    param_theshold_corr = config['bivariate_analysis']['correlations']['threshold_corr']  # threshold in correlations between each feature
    param_theshold_corr_target = config['bivariate_analysis']['correlations']['threshold_corr_target']  # threshold in correlations against the target

    # ---------------- PLOTS ----------------
    ################### fig correlations ###################
    print(f'correlations... time:{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

    # correlations all features
    # the helper returns two values; only the second one is used here
    # (presumably the upper-triangular correlation matrix - confirm in `bv`)
    _, df_corr_upper = bv.calculate_correlations_triu(data)
    df_corr_upper_filtered = bv.filter_correlations_by_threshold(df_corr_upper, param_theshold_corr)
    fig_corr_all = bv.plot_heatmap(df_corr = df_corr_upper_filtered)
    fig_corr_all.write_html(f"output_eda/{id_report}/bivariate_analysis/corr_all.html")

    # correlations against the target
    corr_target = bv.calculate_correlations_target(data, target)
    corr_target_filtered = bv.filter_correlations_by_threshold(corr_target, param_theshold_corr_target)
    fig_corr_target = bv.plot_heatmap(df_corr = corr_target_filtered)
    fig_corr_target.write_html(f"output_eda/{id_report}/bivariate_analysis/corr_target.html")

correlations... time:2024-01-11 20:48:22


### 5. data_segmentation_analysis

In [11]:
# Development placeholder: only confirms that the data segmentation section is
# switched on in the config.
if show_data_segmentation_analysis:
    print("go")