extra notebook worked by: Andre Fontes

In [1]:
import pandas as pd
import dtale
import pandas_profiling as pp
from IPython.display import Javascript

In [2]:
# check versions
!conda -V
!python -V
!conda list |grep pandas
!conda list |grep pandas_profiling
!conda list |grep dtale

conda 4.9.2
Python 3.8.6
pandas                    1.1.4            py38h0ef3d22_0    conda-forge/label/main
pandas-profiling          2.9.0              pyh9f0ad1d_0    conda-forge/label/main
dtale                     1.24.0             pyhd3deb0d_0    conda-forge/label/main


### load data

In [3]:
# Data source: https://biolib.com/shd2020/Diabetes-Bioinformatics-Data/
# The CSV files were copied by hand into the folder referenced below.
synthetic_df = pd.read_csv(
    filepath_or_buffer="../../data/bronze/synthetic_data.csv",
)
features_description = pd.read_csv(
    filepath_or_buffer="../../data/bronze/feature_descriptions.csv",
)

### summary

In [4]:
# Concise summary of the frame: index range, column names, non-null counts
# and dtypes (use it to spot missing values and columns read as `object`).
synthetic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78441 entries, 0 to 78440
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      78441 non-null  object
 1   gender                    78441 non-null  object
 2   age                       78441 non-null  object
 3   time_in_hospital          78441 non-null  int64 
 4   num_lab_procedures        78441 non-null  int64 
 5   num_procedures            78441 non-null  int64 
 6   num_medications           78441 non-null  int64 
 7   number_outpatient         78441 non-null  int64 
 8   number_emergency          78441 non-null  int64 
 9   number_inpatient          78441 non-null  int64 
 10  number_diagnoses          78441 non-null  int64 
 11  max_glu_serum             78441 non-null  object
 12  A1Cresult                 78441 non-null  object
 13  metformin                 78441 non-null  object
 14  repaglinide           

### static report

In [5]:
# Path of the static HTML profiling report; the cells below write the report
# to this path and then open it in the browser.
reportFile = "../../data/report/synthetic_diabetes_report.html"

In [6]:
%%time
# full report on 100% records (no correlation matrix stuff)
pp.ProfileReport(df=synthetic_df.sample(frac=1),
                 minimal=False,
                 progress_bar=False,
                 correlations={"cramers": {"calculate": False}}).to_file(reportFile)

In [7]:
# Ask the browser to open the generated HTML report in a new window/tab.
open_report_js = f'window.open("{reportFile}");'
display(Javascript(open_report_js))

<IPython.core.display.Javascript object>

### interactive report

In [8]:
# Launch the D-Tale web app on the frame.
# NOTE(review): host and port are hard-coded for one specific machine --
# change both before running this anywhere else.
d = dtale.show(
    synthetic_df,
    host="13.93.37.217",
    port="40000",
    ignore_duplicate=True,
    drop_index=True,
    reaper_on=False,
)

In [9]:
# Show the main URL of the D-Tale instance started above
# (this is the address of this one instance, not a list of all running ones).
d.main_url()

http://13.93.37.217:40000/dtale/main/1


In [None]:
# stop webapp
# d.kill()

### data checks

In [10]:
# Number of distinct values per column, counting NaN as its own value,
# listed from the highest cardinality to the lowest.
# NOTE(review): the earlier note here mentioned "high cardinality > 390";
# the origin of that threshold is unclear -- confirm.
(
    synthetic_df
    .nunique(dropna=False)
    .sort_values(ascending=False)
)

num_lab_procedures          109
num_medications              71
number_outpatient            27
number_emergency             25
_diag_3                      22
_diag_2                      22
_diag_1                      22
number_inpatient             18
time_in_hospital             14
number_diagnoses             14
age                          10
num_procedures                7
race                          5
repaglinide                   4
glipizide                     4
glyburide-metformin           4
insulin                       4
max_glu_serum                 4
A1Cresult                     4
metformin                     4
nateglinide                   4
glimepiride                   4
glyburide                     4
pioglitazone                  4
rosiglitazone                 4
acarbose                      4
tolazamide                    3
miglitol                      3
readmitted                    3
change                        2
chlorpropamide                2
tolbutam

In [None]:
# def is_outlier(s):
   # lower_limit = s.mean() - (s.std() * 3)
   # upper_limit = s.mean() + (s.std() * 3)
   # return ~s.between(lower_limit, upper_limit)

#df = df[~df.groupby('Group')['count'].apply(is_outlier)]

In [None]:
# (rows, columns) of the subset whose `readmitted` value is '<30'.
synthetic_df.loc[synthetic_df["readmitted"].eq('<30')].shape