# Clustering Analysis

In [None]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load CSV files

In [None]:
# Locations of the two input CSVs: the merged hospital core records and the
# transformed severity file.
hospital_core_file = (
    "C:/Users/Lenovo/OneDrive - Drexel University/Fall 2023/BSAN-710/"
    "Capstone Project/Production (Codes and EDA)/Dataset/Data Cleaning/"
    "Merged/Hospital_Core.csv"
)
hospital_severity_file = (
    "C:/Users/Lenovo/OneDrive - Drexel University/Fall 2023/BSAN-710/"
    "Capstone Project/Production (Codes and EDA)/Dataset/Data Cleaning/"
    "Transformed/KID_2019_Severity.csv"
)

## Define dictionary

In [None]:
# Explicit dtypes handed to the CSV readers: every column listed here is read
# as text ('str'), except FEMALE which is declared as 'object'.
_text_columns = [
    'AGE_NEONATE', 'AMONTH', 'ELECTIVE', 'LOS', 'PAY1', 'RACE', 'TOTCHG',
    'ZIPINC_QRTL', 'DQTR', 'TRAN_IN', 'TRAN_OUT', 'DISPUNIFORM', 'DIED',
]
# PRDAY2 .. PRDAY25; PRDAY7 is skipped to reproduce the original mapping exactly.
_prday_columns = [f'PRDAY{n}' for n in range(2, 26) if n != 7]

dtype_dict = dict.fromkeys(_text_columns + _prday_columns, 'str')
dtype_dict['FEMALE'] = 'object'


## Read the datasets, keep only type 1 diabetes records, and collect the matching diagnosis codes into one column

In [None]:
# Lazily read both CSVs with dask, forcing the columns named in dtype_dict
# to the dtypes declared there.
hospital_core_df = dd.read_csv(hospital_core_file, dtype=dtype_dict)
hospital_severity_df = dd.read_csv(hospital_severity_file, dtype=dtype_dict)

# Keep records whose AGE lies between 0 and 18 inclusive.
age_condition = (hospital_core_df['AGE'] >= 0) & (hospital_core_df['AGE'] <= 18)

# Pre-existing type 1 diabetes diagnosis codes (O24.01x, O24.02, O24.03).
# The third entry used to read 'O23013', which falls outside the O24.0 family
# enumerated here; the third-trimester code is 'O24013'.
type1_preexist = ['O24011', 'O24012', 'O24013', 'O24019', 'O2402', 'O2403']

# A record qualifies when any of I10_DX1..I10_DX5 is one of the codes above
# or starts with 'E10'. Missing values count as "no match".
dx_conditions = False
for column in ['I10_DX1', 'I10_DX2', 'I10_DX3', 'I10_DX4', 'I10_DX5']:
    dx_conditions |= hospital_core_df[column].isin(type1_preexist).fillna(False)
    dx_conditions |= hospital_core_df[column].str.startswith('E10').fillna(False)

# Both the age filter and the diagnosis filter must hold.
combined_condition = age_condition & dx_conditions

hospital_core_df = hospital_core_df[combined_condition]

def extract_diagnosis_codes(row):
    """Return the type 1 diabetes codes found in one row of diagnosis values.

    A value is kept when it is non-null and either starts with 'E10' or is
    listed in ``type1_preexist``. The order of the input values is preserved.
    """
    matches = []
    for code in row:
        if pd.isnull(code):
            continue
        if code.startswith('E10') or code in type1_preexist:
            matches.append(code)
    return matches

# Collect each record's matching diagnosis codes into one list-valued column.
dx_columns = ['I10_DX1', 'I10_DX2', 'I10_DX3', 'I10_DX4', 'I10_DX5']
hospital_core_df['TYPE1_DIABETES_CODES'] = hospital_core_df[dx_columns].apply(
    extract_diagnosis_codes,
    axis=1,
    meta=('DIABETES_CODES', 'object'),
)

# Materialise the dask dataframe as a pandas dataframe and replace its index
# with a fresh 0..n-1 range so every row has a unique label.
hospital_core_df = hospital_core_df.compute().reset_index(drop=True)

#hospital_core_df.head()


In [None]:
# Number of records left after the age and diagnosis filters
len(hospital_core_df)

In [None]:
# Materialise the severity table, then inner-join it to the filtered core
# records on RECNUM (only records present in both tables are kept).
hospital_severity_df = hospital_severity_df.compute()

hospital_core_severity_df = pd.merge(
    hospital_core_df,
    hospital_severity_df,
    how='inner',
    on='RECNUM',
)
hospital_core_severity_df.head()


In [None]:
# Print a summary of the merged dataframe (columns and dtypes)
hospital_core_severity_df.info()

In [None]:
# Percentage of missing values in each column of the merged dataframe
total_rows = hospital_core_severity_df.shape[0]
null_counts = hospital_core_severity_df.isnull().sum()
null_counts * 100 / total_rows

In [None]:
def categorize_age(age):
    """Map an age to its age-group label.

    Returns "Infant" for ages below 1, "Children" for ages from 1 to 13
    inclusive, and "Adolescents" for everything else.
    """
    if age < 1:
        return "Infant"
    return "Children" if 1 <= age <= 13 else "Adolescents"

# Derive the AGE_GROUP label from AGE
hospital_core_severity_df['AGE_GROUP'] = hospital_core_severity_df['AGE'].apply(categorize_age)


def _first_code(codes):
    """Return the first element of a non-empty list, otherwise None."""
    if isinstance(codes, list) and len(codes) > 0:
        return codes[0]
    return None


# Reduce the list of matching codes to its first entry
hospital_core_severity_df['TYPE1_DIABETES_CODES'] = hospital_core_severity_df['TYPE1_DIABETES_CODES'].apply(_first_code)

# AGE is replaced by AGE_GROUP from here on
hospital_core_severity_df = hospital_core_severity_df.drop(columns='AGE')

In [None]:
# Columns kept for the clustering step
select_diagnosis_columns = [
    'AGE_GROUP',
    'GENDER',
    'HOSPITAL_REGION',
    'LOS',
    'ALL_PATIENT_SEVERITY',
    'RACE_DESC',
    'TYPE1_DIABETES_CODES',
]

hospital_core_severity_df = hospital_core_severity_df.loc[:, select_diagnosis_columns]
hospital_core_severity_df.head()

In [None]:
# Report the (rows, columns) shape of the clustering input
print(f"Total number of records for clustering: {hospital_core_severity_df.shape}")

### Copy Data

In [None]:
# Snapshot of the clustering input; the frame that receives the cluster
# labels is rebuilt from this copy later on.
hospital_core_severity_df_copy = hospital_core_severity_df.copy(deep=True)

In [None]:
# Preview the first rows of the clustering input
hospital_core_severity_df.head()

## Find Optimal k by Elbow Method

In [None]:
from kmodes.kmodes import KModes
import matplotlib.pyplot as plt

np.random.seed(42)

# Fit one Cao-initialised KModes model for every candidate k in 1..7 and
# record the clustering cost of each fit.
candidate_ks = range(1, 8)
cost = []
for num_clusters in candidate_ks:
    kmode = KModes(n_clusters=num_clusters, init="Cao", n_init=5, verbose=0)
    kmode.fit_predict(hospital_core_severity_df)
    cost.append(kmode.cost_)

y = np.array(list(candidate_ks))

# Cost against k: the "elbow" of this curve suggests the number of clusters.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(y, cost, marker='o')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Cost')
ax.set_title('Elbow Method For Optimal k')
ax.grid(False)
plt.show()



## Model fitting

In [None]:
# Number of clusters read off the elbow plot
optimal_k = 3

In [None]:
# Fit the final KModes model with `optimal_k` clusters (Huang initialisation,
# 5 initialisation runs) and obtain one cluster label per record.
# The seed is also passed to the estimator itself, so the fit no longer
# depends on whatever consumed the global NumPy RNG before this cell ran.
np.random.seed(42)
kmd = KModes(n_clusters=optimal_k, init="Huang", n_init=5, verbose=0, random_state=42)
kmd_clusters = kmd.fit_predict(hospital_core_severity_df)

In [None]:
# Combining predicted clusters with the original dataframe
# reset_index() is called without drop=True, so the old index is kept as an
# extra 'index' column; the following cell drops it again.
hospital_core_severity_df = hospital_core_severity_df_copy.reset_index()

In [None]:
# One-column frame holding the predicted cluster label of each record
kmd_clusters_df = pd.DataFrame(kmd_clusters)
kmd_clusters_df.columns = ['CLUSTERS']

# Attach the labels column-wise, then remove the helper columns that the two
# reset_index() calls leave behind ('index' and 'level_0').
kmd_clusters_df = pd.concat([hospital_core_severity_df, kmd_clusters_df], axis = 1).reset_index()
kmd_clusters_df = kmd_clusters_df.drop(['index', 'level_0'], axis = 1)

# 1-based row numbers
kmd_clusters_df['ROWNUM'] = range(1, len(kmd_clusters_df) + 1)

# Shift cluster labels from 0-based to 1-based. The mapping is derived from
# optimal_k rather than hard-coded for three clusters: with a fixed
# {0: 1, 1: 2, 2: 3} mapping, any label >= 3 would be turned into NaN by
# .map() as soon as the number of clusters is changed.
label_mapping = {label: label + 1 for label in range(optimal_k)}
kmd_clusters_df['CLUSTERS'] = kmd_clusters_df['CLUSTERS'].map(label_mapping)

### Extract the clusters dataset

In [None]:
import os

# Destination of the clustered dataset
output_file = (
    'C:/Users/Lenovo/OneDrive - Drexel University/Fall 2023/BSAN-710/'
    'Capstone Project/Production (Codes and EDA)/Dataset/Data Cleaning/'
    'Merged/HCUP_Clustering.csv'
)

# Write HCUP_Clustering.csv only when it is not there yet; an existing file
# is left untouched.
output_already_written = os.path.exists(output_file)
if not output_already_written:
    kmd_clusters_df.to_csv(output_file, index=False)

kmd_clusters_df.head()

## Descriptive Analysis

### Cluster Proportion

In [None]:

# Share of records per cluster, in percent, ordered by cluster label
np.random.seed(42)
cluster_size = kmd_clusters_df['CLUSTERS'].value_counts().sort_index()
cluster_size_proportion = cluster_size.div(cluster_size.sum()).mul(100)

# Bar chart of the proportions
fig, ax = plt.subplots(figsize=(10, 6))
cluster_size_proportion.plot(kind='bar', ax=ax)

ax.set_title('Cluster Size Proportions (%)')
ax.set_xlabel('Cluster')
ax.set_ylabel('Proportion (%)')
plt.setp(ax.get_xticklabels(), rotation=0)

# Label every bar with its percentage, slightly above the bar top
for position, percent in enumerate(cluster_size_proportion):
    ax.text(position, percent + 0.5, f"{percent:.2f}%", ha='center', va='bottom')

fig.tight_layout()
plt.show()


In [None]:
# Preview the first rows of the labelled dataset
kmd_clusters_df.head()

In [None]:
def _plot_cluster_heatmap(column, title):
    """Draw a heatmap of how each category of `column` is split across clusters.

    Rows are the categories of ``kmd_clusters_df[column]`` and columns are
    the cluster labels. Each cell shows the percentage of that category's
    records assigned to the cluster, so every row sums to 100%.

    Parameters:
        column: name of the categorical column in ``kmd_clusters_df``.
        title: title shown above the heatmap.
    """
    crosstab = pd.crosstab(kmd_clusters_df[column], kmd_clusters_df['CLUSTERS'], normalize='index') * 100

    # Cell annotations: the same values formatted with a percent sign
    annotations = [[f'{val:.2f}%' for val in row] for row in crosstab.values]

    plt.figure(figsize=(10, 6))
    sns.heatmap(crosstab, annot=annotations, cmap="YlGnBu", fmt="")
    plt.title(title)
    plt.show()


# One heatmap per categorical feature, in the original plotting order
for _column, _title in [
    ('AGE_GROUP', "Distribution of Age Group across Clusters"),
    ('GENDER', "Distribution of Gender across Clusters"),
    ('RACE_DESC', "Distribution of Race across Clusters"),
    ('HOSPITAL_REGION', "Distribution of Hospital Region across Clusters"),
    ('ALL_PATIENT_SEVERITY', "Distribution of Health Severity across Clusters"),
]:
    _plot_cluster_heatmap(_column, _title)

# Heatmap of LOS
# dtype_dict declares LOS as 'str', so it is cast to int here before the
# numeric comparisons below. NOTE(review): astype(int) raises on missing or
# non-integer text — confirm LOS has no such values at this point.
kmd_clusters_df['LOS'] = kmd_clusters_df['LOS'].astype(int)

def categorize_los(los):
    """Map a length of stay to its duration label.

    Returns "Short-term" for values up to 5, "Mid-term" for values above 5
    and up to 15, and "Long-term" for everything else.
    """
    if los <= 5:
        return "Short-term"
    if 5 < los <= 15:
        return "Mid-term"
    return "Long-term"

# Bucket every length of stay into Short-/Mid-/Long-term
kmd_clusters_df['LOS_Category'] = kmd_clusters_df['LOS'].apply(categorize_los)

# Record counts per (LOS category, cluster) pair
crosstab = pd.crosstab(kmd_clusters_df['LOS_Category'], kmd_clusters_df['CLUSTERS'])
# Express each cell as a percentage of ALL records: the whole table sums to
# 100%, unlike a per-row normalisation (normalize='index').
crosstab_percentage = (crosstab / crosstab.sum().sum()) * 100

# Cell annotations with a percent sign. Built with a plain comprehension
# instead of DataFrame.applymap, which is deprecated in recent pandas
# releases; this form works on every pandas version.
annotations = [[f"{val:.2f}%" for val in row] for row in crosstab_percentage.values]

plt.figure(figsize=(10, 6))
sns.heatmap(crosstab_percentage, annot=annotations, cmap="YlGnBu", fmt="")
plt.title("Distribution of LOS across Clusters in Percentages")
plt.xlabel("Cluster")
plt.ylabel("LOS Category")
plt.show()



# Heatmap of TYPE1_DIABETES_CODES: for every diagnosis code, the percentage
# of its records that falls into each cluster (rows sum to 100%)
crosstab = pd.crosstab(
    index=kmd_clusters_df['TYPE1_DIABETES_CODES'],
    columns=kmd_clusters_df['CLUSTERS'],
    normalize='index',
) * 100

# Cell annotations: the same values formatted with a percent sign
annot_array = crosstab.values
annotations = []
for row in annot_array:
    annotations.append([f'{val:.2f}%' for val in row])

plt.figure(figsize=(10, 6))
sns.heatmap(data=crosstab, annot=annotations, fmt="", cmap="YlGnBu")
plt.title("Distribution of Type1 Diabetes Diagnosis across Clusters")
plt.show()


