# test 2 profiling tool with plotly

In this notebook, i will transfer what have been done previously in `matplotlib` to `plotly`.



In [180]:
import pandas as pd
import os
import pathlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import plotly.express as px
import plotly.graph_objects as go


## File Input

In [181]:
# read all files matching *.csv* in the directory

path = "../rawdata"
data = pathlib.Path(path)
# recursively read files
csv_files = list(data.rglob("*.csv*"))
df = pd.read_csv(csv_files[0])
df.tail()

Unnamed: 0,diagnosisid,patientunitstayid,activeupondischarge,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
24973,43897237,3158501,True,11,neurologic|altered mental status / pain|enceph...,"348.31, G93.41",Other
24974,44151864,3158501,True,11,hematology|white blood cell disorders|leukocyt...,"288.8, D72.829",Other
24975,44379558,3158501,True,11,pulmonary|disorders of acid base|respiratory a...,"276.2, E87.2",Other
24976,44000639,3158501,True,11,neurologic|altered mental status / pain|delirium,"293.0, F05",Other
24977,44496559,3173599,True,36,cardiovascular|chest pain / ASHD|acute coronar...,"410.90, I21.3",Primary


## % of null
this section is dedicated to test the percentage of null .

$$
\begin{align}
\%\,of\,missing\,values= \frac{num_{null}}{num_{rows}}\\
\end{align}
$$
where $num_{row}$ is the total number of row in this column, $num_{null}$ the number of missing values for that particular column



In [182]:
percentage_of_null = list(df.isnull().sum()/len(df.index))

fig = go.Figure()
fig.add_trace(go.Bar(
    y=df.columns,
    x=percentage_of_null,
    name=r'% of null',
    orientation='h',
    marker=dict(
        color='rgba(246, 78, 139, 0.6)',
        line=dict(color='rgba(246, 78, 139, 1.0)', width=3)
    ),
    showlegend=True
))

fig.update_layout(
                  title_text=r"% of null",
                  title_font_size=20,
                  xaxis_tickformat = ',.1%'
                  )


fig.show()

## Uniquiness
This section is dedicated to test uniquesness from
$$
\begin{align}
Uniqueness = \frac{num_{dis}-1}{num_{non-missing}-1}
\end{align}
$$
where $num_{non-missing}$ is the number of non-missing values in that column



In [183]:
uniqueness = (df.nunique() - 1)/(len(df.index) - df.isnull().sum() - 1) * 100

fig2 = go.Figure()
fig2.add_trace(go.Bar(
    y=df.columns,
    x=uniqueness,
    name=r'uniquiness',
    orientation='h',
    marker=dict(
        color='rgba(246, 78, 139, 0.6)',
        line=dict(color='rgba(246, 78, 139, 1.0)', width=3)
    ),
    showlegend=True
))

fig2.update_layout(
                  title_text=r"uniquiness",
                  title_font_size=20,
                  xaxis_tickformat = ',.1%'
                  )


fig2.show()


## Distribution
In this section, we will split the data file into categorical and numerical. Since pandas uses numpy's primitive type, therefore
- categorical (object, datetime, bool)
- numerical (float64,int64)

> Note: if pandas infer object, it refers to string. Also not sure about placing datetime and bool here.


Let's start by desgining a function to plot its type

In [184]:
def get_num_cat_dtype(input_df):
    """
    return the two list containing columns of categorical data and numerical data
    Args:
        input_df (pandas dataframe): dataframe you wish to determine 
        whether it's cateogircal or numerical

    Returns:
        numerical_col (list): a list of columnnames for numerical data 
        categorical_col (list): a list of columnnames for categorical data 
    Note:
        is implemented with selected_dtypes better? revisit later
    """
    numerical_col = []
    categorical_col = []
    
    for col in input_df.columns:
        curr_type = input_df[col].dtype
        if curr_type in ["object","bool"]:
            categorical_col.append(col)
        else:
            numerical_col.append(col)
        print(curr_type)
        
    return numerical_col, categorical_col


numerical_columns,cate_columns = get_num_cat_dtype(df)
# print(numerical_columns)
# print(cate_columns)
    

int64
int64
bool
int64
object
object
object


### Categorical distribution

In [185]:
def get_categorical_distribution(input_df,cat_col_list):
    """
    get the minimum and maximum for every column in df[[a,b,c]] and export
    in json format
    Args:
        input_df (_type_): _description_
        cat_list (_type_): a list of categorical distribution
    Returns:
        res (dict): a nested json containing result
    Example:
        output looks like
        {'activeupondischarge': {'min': 4, 'max': 5},
        'diagnosisstring': {'min': 24, 'max': 146},
        'icd9code': {'min': 3, 'max': 35},
        'diagnosispriority': {'min': 5, 'max': 7}}
    """
    res = {}
    
    for col in cat_col_list:
        res[col] = {"min":input_df[col].apply(str).apply(len).min(),
                    "max":input_df[col].apply(str).apply(len).max()}
        

    return res
        

cat_plot = get_categorical_distribution(df,cate_columns)
cat_plot

{'activeupondischarge': {'min': 4, 'max': 5},
 'diagnosisstring': {'min': 24, 'max': 146},
 'icd9code': {'min': 3, 'max': 35},
 'diagnosispriority': {'min': 5, 'max': 7}}

In [186]:
fig3 = go.Figure()

# Use x instead of y argument for horizontal plot

for i in range(len(cate_columns)):
      fig3.add_trace(go.Box(x=df[cate_columns].applymap(str).applymap(len).iloc[:,i],
                        name = cate_columns[i]
                        ),
                  )

fig3.update_layout(
                  title_text=r"categorical value distribution",
                  title_font_size=20
                  )

fig3.show()

### Numerical distribution



In [187]:
fig4 = go.Figure()

# Use x instead of y argument for horizontal plot

for i in range(len(numerical_columns)):
    curr_column = df[numerical_columns].iloc[:,i]
    fig4.add_trace(go.Box(x=curr_column/curr_column.max(),
                        name = numerical_columns[i]
                        ),
                  )

fig4.update_layout(
                  title_text=r"numerical value distribution",
                  title_font_size=20,
                  xaxis_tickformat = ',.1%'
                  )

fig4.show()

## Pattern

In this section, i need to determine if it contains certain pattern or not? For the simplicity of the project, I am just going to list:
- contains digits 0-9
- contains lower a-z
- contains upper A-Z

In [188]:
def get_pattern(input_df):
    """
    find the patterns for this this dataset
    Args:
        input_df (_type_): the dataframe you wish to compute pattern

    Returns:
        res (dataframe): a boolean dataframe for heatmap plot
    """
    # define your local methods
    contains_digit = lambda x: any([char.isdigit() for char in x])
    contains_lower = lambda x: any([char.islower() for char in x])
    contains_upper = lambda x: any([char.isupper() for char in x])

    methods_list = [contains_digit,contains_lower,contains_upper]
    
    res = pd.DataFrame()
    
    for i in range(3):
        temp = df.applymap(str).applymap(methods_list[i]).apply(any)
        res = pd.concat([res,temp],axis=1)

    # rename the columns    
    res.set_axis(["0","a","A"],axis=1,copy=False)
    
    return res

df_pattern = get_pattern(df)
df_pattern

Unnamed: 0,0,0.1,0.2
diagnosisid,True,False,False
patientunitstayid,True,False,False
activeupondischarge,False,True,True
diagnosisoffset,True,False,False
diagnosisstring,True,True,True
icd9code,True,True,True
diagnosispriority,False,True,True


In [189]:
df_pattern.to_numpy().astype(dtype=int)

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 1],
       [1, 0, 0],
       [1, 1, 1],
       [1, 1, 1],
       [0, 1, 1]])

In [190]:
fig5 = go.Figure()

fig5.add_trace(go.Heatmap(
                   z=df_pattern.to_numpy().astype(dtype=int),
                   x=["0","a","A"],
                   y=list(df_pattern.index),
                   hoverongaps = False,
                   showlegend=True,
                   colorscale = 'Greys',
                   name = ""
                   ),)
# did some trick to set block size to be same
fig5.update_layout(
                  title_text=r"pattern",
                  title_font_size=20,
                  xaxis = dict(
                      side = "top"
                  )
                  )

fig5.show()