Author: Kevin ALBERT  

Created: September 2020  

# Datareport
_**How to generate an interactive report (D-Tale) and a static report (pandas-profiling).**_

In [None]:
# install python modules
!conda install -y -c conda-forge pandas-profiling dtale seaborn pyarrow fastparquet xlrd

In [1]:
from pathlib import Path

import dtale
import pandas as pd
import pandas_profiling as pp
from IPython.display import Javascript

In [2]:
# check versions
!conda -V
!python -V
!conda list |grep pandas
!conda list |grep pandas_profiling
!conda list |grep dtale

conda 4.8.4
Python 3.8.5
pandas                    1.1.0            py38h950e882_0    conda-forge
pandas-profiling          2.8.0              pyh9f0ad1d_1    conda-forge
dtale                     1.15.2             pyh9f0ad1d_1    conda-forge


### load data

In [3]:
# demo input: the cleaned '*.parquet' file from the last ("platinum") stage of
# the data folders (bronze -> silver -> gold -> platinum)
parquet_path = '../../data/platinum/diabetes.parquet'
df = pd.read_parquet(parquet_path)

### summary

In [4]:
# concise summary of the DataFrame: index range, column names, non-null counts,
# data types and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   PatientID               10000 non-null  int64  
 1   Pregnancies             10000 non-null  int64  
 2   PlasmaGlucose           10000 non-null  int64  
 3   DiastolicBloodPressure  10000 non-null  int64  
 4   TricepsThickness        10000 non-null  int64  
 5   SerumInsulin            10000 non-null  int64  
 6   BMI                     10000 non-null  float64
 7   DiabetesPedigree        10000 non-null  float64
 8   Age                     10000 non-null  int64  
 9   Diabetic                10000 non-null  int64  
dtypes: float64(2), int64(8)
memory usage: 781.4 KB


### interactive report

In [5]:
# start the D-Tale webapp
# DTALE_HOST / DTALE_PORT are machine specific: set them to an address of the
# machine running this kernel and to a free port on it
DTALE_HOST = "52.174.238.247"
DTALE_PORT = 40000  # an integer, as the D-Tale API documents for `port`
d = dtale.show(
    df,
    host=DTALE_HOST,
    port=DTALE_PORT,
    ignore_duplicate=True,
    drop_index=True,
    reaper_on=False,  # no automatic shutdown; the server is stopped explicitly below
)

In [6]:
# print the URL under which this D-Tale instance can be opened in a browser
# (this is the URL of the instance `d` only; see dtale.instances() for a list
# of every running instance)
d.main_url()

http://52.174.238.247:40000/dtale/main/1


In [7]:
# stop the webapp: shut down the D-Tale server behind the instance `d`
d.kill()

Executing shutdown...


2020-09-27 15:38:50,014 - INFO     - Executing shutdown...


### static report

In [8]:
# path of the static HTML report; the cells below write the report to it and open it
reportFile = "../../data/report/diabetes_report.html"

In [9]:
# quick report on 100% of the records (minimal mode, Cramer's V correlation
# explicitly switched off); lower SAMPLE_FRACTION to profile only a subset --
# the fixed seed keeps the selected rows identical from run to run
SAMPLE_FRACTION = 1
SAMPLE_SEED = 42

# make sure the target folder exists, so the export does not fail on a fresh checkout
Path(reportFile).parent.mkdir(parents=True, exist_ok=True)

profile = pp.ProfileReport(
    df=df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED),
    minimal=True,
    correlations={"cramers": {"calculate": False}},
)
profile.to_file(reportFile)

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=20.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [10]:
# open the generated report (*.html) in a new browser window / tab by running
# a small piece of JavaScript in the notebook front end
open_report_js = f'window.open("{reportFile}");'
display(Javascript(open_report_js))

<IPython.core.display.Javascript object>

#### Next step: go on to generate the code that cleans the dataset...