In [None]:
#default_exp analysis

# Analysis

> The data collected in the `data` module is used to understand the features and generate profile reports. The reports are saved in the `nbs/reports` folder of the repo.

In [None]:
#hide
import os
import pandas as pd
import pandas_profiling
from nbdev.showdoc import *

# multiple outputs for cell: display the value of every top-level expression, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# plotting
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# plotting options
plt.style.use('default')

# plotting output options: draw figures inline, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# NOTE(review): presumably loaded for black code-formatting cell magics -- confirm
%load_ext blackcellmagic

# switch for the data-loading and profiling cells of this notebook: each of them
# is wrapped in `if profile_data:` and does nothing when this is False
profile_data = True

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


## Load data

Load the data from the `nbs/data/processed` folder

In [None]:
#hide
# Read the three processed parquet tables (unscaled, standard-scaled and
# robust-scaled) in one go; skipped entirely when profiling is switched off.
if profile_data:
    final_df, final_df_std_scaling, final_df_robust_scaling = map(
        pd.read_parquet,
        (
            'data/processed/final_df.parquet',
            'data/processed/final_df_std_scaling.parquet',
            'data/processed/final_df_robust_scaling.parquet',
        ),
    )

## Profile

All the pandas profiler reports are saved in the `nbs/reports/pandas_profiler` folder

### No scaling

Profile the data without scaling. The report is saved in `nbs/reports/pandas_profiler/final_df_profile_no_scaling.html`

In [None]:
#hide
# Build the pandas-profiling report for the unscaled data and export it as HTML.
if profile_data:
    from pandas_profiling import ProfileReport

    # Create the output folder before the slow profiling run: the report is
    # written at the very end, so a missing folder would otherwise only
    # surface as an error after all the work has been done.
    os.makedirs("reports/pandas_profiler", exist_ok=True)

    profile = ProfileReport(final_df, title='Final df - No scaling')
    profile.to_file("reports/pandas_profiler/final_df_profile_no_scaling.html")

Summarize dataset: 100%|██████████| 394/394 [00:55<00:00,  7.04it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:06<00:00,  6.15s/it]
Render HTML: 100%|██████████| 1/1 [00:09<00:00,  9.89s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 28.82it/s]


### Standard scaling

Profile the data with standard scaling. The report is saved in `nbs/reports/pandas_profiler/final_df_std_scaling_profile.html`

In [None]:
#hide
# Build the pandas-profiling report for the standard-scaled data and export it as HTML.
if profile_data:
    from pandas_profiling import ProfileReport

    # Create the output folder before the slow profiling run: the report is
    # written at the very end, so a missing folder would otherwise only
    # surface as an error after all the work has been done.
    os.makedirs("reports/pandas_profiler", exist_ok=True)

    profile = ProfileReport(final_df_std_scaling, title='Final df - Standard scaling')
    profile.to_file("reports/pandas_profiler/final_df_std_scaling_profile.html")

Summarize dataset: 100%|██████████| 393/393 [00:44<00:00,  8.93it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.66s/it]
Render HTML: 100%|██████████| 1/1 [00:09<00:00,  9.79s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 21.72it/s]


### Robust scaling

Profile the data with robust scaling. The report is saved in `nbs/reports/pandas_profiler/final_df_robust_scaling_profile.html`

In [None]:
#hide
# Build the pandas-profiling report for the robust-scaled data and export it as HTML.
if profile_data:
    from pandas_profiling import ProfileReport

    # Create the output folder before the slow profiling run: the report is
    # written at the very end, so a missing folder would otherwise only
    # surface as an error after all the work has been done.
    os.makedirs("reports/pandas_profiler", exist_ok=True)

    profile = ProfileReport(final_df_robust_scaling, title='Final df - Robust scaling')
    profile.to_file("reports/pandas_profiler/final_df_robust_scaling_profile.html")

Summarize dataset: 100%|██████████| 393/393 [00:47<00:00,  8.35it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]
Render HTML: 100%|██████████| 1/1 [00:09<00:00,  9.39s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 30.55it/s]


In [None]:
#hide
# nbdev export step, run as the last cell of the notebook; the output it prints
# lists every notebook that was converted (00_data ... 04_validation, index).
from nbdev.export import notebook2script
notebook2script()

Converted 00_data.ipynb.
Converted 01_analysis.ipynb.
Converted 02_models.ipynb.
Converted 03_feature_importance.ipynb.
Converted 04_validation.ipynb.
Converted index.ipynb.
