In [31]:
import altair as alt
import pandas as pd
from helpers import RAW_DATA, TRANSFORMED_DATA, find_csv_files

# Table of contents
1. [Introduction](#introduction)
2. [Some paragraph](#paragraph1)
    1. [Sub paragraph](#subparagraph1)
3. [Another paragraph](#paragraph2)

## This is the introduction <a name="introduction"></a>
Some introduction text, formatted in heading 2 style

## Some paragraph <a name="paragraph1"></a>
The first paragraph text

### Sub paragraph <a name="subparagraph1"></a>
This is a sub paragraph, formatted in heading 3 style

## Another paragraph <a name="paragraph2"></a>
The second paragraph text

# TODO 
- Convert columns to categorical 
- Compare time/size for csv and parquet and partquet gzipped files 

# Load Data

In [36]:
# fetch files 
files = find_csv_files(RAW_DATA)

In [38]:
# load files 
df = pd.DataFrame()
for file in files:
    temp_df = pd.read_csv(file)
    df = pd.concat([df,temp_df], ignore_index=True)
df.head()

Unnamed: 0,PatientID,AgeNBR,GenderCD,RelationshipDSC,ClinicID,ModalityDSC,VisitsCNT,RiskNBR
0,42534,39,F,Employee,A,Primary,1,
1,169233,46,F,Dependent,A,Nurse Visit,3,0.165685
2,42880,59,F,Employee,B,|||Preventive,1,1.133132
3,118044,36,F,Employee,B,Health Coach,1,0.701008
4,36774,39,F,Dependent,A,Massage,1,1.221869


## dataset metadata

In [17]:
df.describe()

Unnamed: 0,PatientID,AgeNBR,VisitsCNT,RiskNBR
count,7273.0,7273.0,7273.0,6937.0
mean,102272.12952,58.753609,2.374123,0.989636
std,58336.218918,23.7433,3.599425,0.988562
min,85.0,18.0,1.0,7.2e-05
25%,51925.0,38.0,1.0,0.285526
50%,103086.0,59.0,1.0,0.682753
75%,152914.0,79.0,2.0,1.357584
max,202870.0,99.0,74.0,9.312341


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7273 entries, 0 to 7272
Data columns (total 8 columns):
PatientID          7273 non-null int64
AgeNBR             7273 non-null int64
GenderCD           6688 non-null object
RelationshipDSC    7273 non-null object
ClinicID           7273 non-null object
ModalityDSC        7273 non-null object
VisitsCNT          7273 non-null int64
RiskNBR            6937 non-null float64
dtypes: float64(1), int64(3), object(4)
memory usage: 454.7+ KB


# Clean up data

## clean up values in `ModalityDSC` column by removing `|||` chars

In [19]:
df.ModalityDSC = df.ModalityDSC.apply(lambda x: x.replace('|||',''))

In [21]:
categorical_columns = ['GenderCD', 'RelationshipDSC', 'ClinicID', 'ModalityDSC']

for column in categorical_columns:
    # set columns as category type 
    df[column] = df[column].astype('category')
    unique_values = df[column].unique()
    print(f"{column} unique values: ")
    for value in unique_values:
        print(f'\t\t\t\t {value}')

GenderCD unique values: 
				 F
				 M
				 nan
RelationshipDSC unique values: 
				 Employee
				 Dependent
ClinicID unique values: 
				 A
				 B
				 C
ModalityDSC unique values: 
				 Primary
				 Nurse Visit
				 Preventive
				 Health Coach
				 Massage
				 Lab Visit
				 Optometry
				 Chiropractic
				 Acupuncture
				 Physical Therapy
				 Dental
				 Nurse Practitioner
				 Dermatology
				 Mental Health
				 Screening
				 Bod Pod
				 Psychiatry


# Save Data 

In [29]:
df.to_parquet(TRANSFORMED_DATA + 'cleaned_up_health.parquet.gzip', compression = 'gzip')

# Graph data

In [24]:
df_modality_group = df.groupby(['ModalityDSC', 'ClinicID'])[['VisitsCNT']].sum().reset_index()
df_modality_group.head()

Unnamed: 0,ModalityDSC,ClinicID,VisitsCNT
0,Acupuncture,A,411.0
1,Acupuncture,B,421.0
2,Acupuncture,C,408.0
3,Bod Pod,A,50.0
4,Bod Pod,B,


In [25]:
visits = (
    alt.Chart(df_modality_group)
    .mark_bar()
    .encode(
        x="sum(VisitsCNT):Q",
        y=alt.Y("ModalityDSC:N", sort="-x"),
        color="ClinicID:N",
        tooltip=["ClinicID", "sum(VisitsCNT):Q"],
    )
    .properties(width=700, height=500)
)
visits

In [None]:
visits.save('figures/visits.html')

# Explore Risk 

In [None]:
df.head()

In [None]:
alt.Chart(df[0:5000]).mark_area().encode(
    x='AgeNBR:O',
    y=alt.Y(
        'sum(VisitsCNT):Q',
        title='Number of Visits',
        axis=alt.Axis(format='~s')
    ),
    facet=alt.Facet('ClinicID:N', columns=3),
).properties(
    title='Number of visitors by Age and Clinic',
    width=200,
    height=80
)

In [None]:
risk = (
    alt.Chart(df[0:5000])
    .transform_density(
        "RiskNBR", groupby=["ModalityDSC"], as_=["RiskNBR", "density"], extent=[0, 8],
    )
    .mark_area()
    .encode(x="RiskNBR:Q", y="density:Q",)
    .properties(title="Number of visitors by Age and Clinic", width=200, height=80)
    .facet("ModalityDSC:N", columns=3)
)
risk

In [None]:
risk = (
    alt.Chart(df[0:5000])
    .transform_density(
        "AgeNBR", groupby=["ModalityDSC"], as_=["AgeNBR", "density"], extent=[0, 99],
    )
    .mark_area()
    .encode(x="AgeNBR:O", y="density:Q",)
    .properties(title="Number of visitors by Age and Clinic", width=200, height=80)
    .facet("ModalityDSC:N", columns=3)
)
risk

Need a histogram of risk versus Age per modality 