In [2]:
import altair as alt
import pandas as pd

# Load Data

In [3]:
df = pd.read_csv('health_data.csv')
df.head()

Unnamed: 0,PatientID,AgeNBR,GenderCD,RelationshipDSC,ClinicID,ModalityDSC,VisitsCNT,RiskNBR
0,42534,39,F,Employee,A,Primary,1,
1,169233,46,F,Dependent,A,Nurse Visit,3,0.165685
2,42880,59,F,Employee,B,|||Preventive,1,1.133132
3,118044,36,F,Employee,B,Health Coach,1,0.701008
4,36774,39,F,Dependent,A,Massage,1,1.221869


## dataset metadata

In [5]:
df.describe()

Unnamed: 0,PatientID,AgeNBR,VisitsCNT,RiskNBR
count,7273.0,7273.0,7273.0,6937.0
mean,102272.12952,58.753609,2.374123,0.989636
std,58336.218918,23.7433,3.599425,0.988562
min,85.0,18.0,1.0,7.2e-05
25%,51925.0,38.0,1.0,0.285526
50%,103086.0,59.0,1.0,0.682753
75%,152914.0,79.0,2.0,1.357584
max,202870.0,99.0,74.0,9.312341


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7273 entries, 0 to 7272
Data columns (total 8 columns):
PatientID          7273 non-null int64
AgeNBR             7273 non-null int64
GenderCD           6688 non-null object
RelationshipDSC    7273 non-null object
ClinicID           7273 non-null object
ModalityDSC        7273 non-null object
VisitsCNT          7273 non-null int64
RiskNBR            6937 non-null float64
dtypes: float64(1), int64(3), object(4)
memory usage: 454.7+ KB


# Clean up data

## clean up values in `ModalityDSC` column by removing `|||` chars

In [14]:
df.ModalityDSC = df.ModalityDSC.apply(lambda x: x.replace('|||',''))

In [15]:
categorical_columns = ['GenderCD', 'RelationshipDSC', 'ClinicID', 'ModalityDSC']

for column in categorical_columns:
    unique_values = df[column].unique()
    print(f"{column} unique values: ")
    for value in unique_values:
        print(f'\t\t\t\t {value}')

GenderCD unique values: 
				 F
				 M
				 nan
RelationshipDSC unique values: 
				 Employee
				 Dependent
ClinicID unique values: 
				 A
				 B
				 C
ModalityDSC unique values: 
				 Primary
				 Nurse Visit
				 Preventive
				 Health Coach
				 Massage
				 Lab Visit
				 Optometry
				 Chiropractic
				 Acupuncture
				 Physical Therapy
				 Dental
				 Nurse Practitioner
				 Dermatology
				 Mental Health
				 Screening
				 Bod Pod
				 Psychiatry


# Graph data

In [46]:
df_modality_group = df.groupby(['ModalityDSC', 'ClinicID', 'GenderCD'])[['VisitsCNT']].sum().reset_index()
df_modality_group.head()

Unnamed: 0,ModalityDSC,ClinicID,GenderCD,VisitsCNT
0,Acupuncture,A,F,198
1,Acupuncture,A,M,190
2,Acupuncture,B,F,216
3,Acupuncture,B,M,185
4,Acupuncture,C,F,228


In [55]:

brush = alt.selection(type='interval')

visits = alt.Chart(df_modality_group).mark_bar().encode(
    x='sum(VisitsCNT):Q',
    y=alt.Y('ModalityDSC:N', sort='-x'),
    color=alt.condition(brush, 'ClinicID:N', alt.value('lightgray')),
    tooltip = ['ClinicID','sum(VisitsCNT):Q']
).properties(
    width=800,
    height=500).add_selection(
    brush
)

bars = alt.Chart(df_modality_group).mark_bar().encode(
    y='GenderCD:N',
    color='GenderCD:N',
    x='count(GenderCD):Q'
).transform_filter(
    brush
)

visits & bars