# Cluster comparison
This will be accomplished in two steps: 
 1. running a statistical test on each variable, comparing the patients in a given cluster against the rest of the patient population 
 2. computing summary statistics (e.g. the mean value) of each of these measurements for both groups.  
 
The result will be a report for each patient subgroup displaying its expected values for a particular measurement, the expected values for the same measurement for the rest of the patient population, the difference, and whether or not that difference is statistically significant. 

In [65]:
import os
import re
import pandas as pd
import numpy as np

# Statistical tests
from scipy.stats import ttest_ind, chisquare, chi2_contingency

# user scripts
from dfGenUtils import convertCategorical, genPatientDF, dropMissings

In [18]:
# Locations the notebook reads from
graph_dir = "../data/graphs/"          # cluster assignment files
patient_dir = "../data/patientData/"   # per-patient source files
input_dir = "../data/heorData/"

# Location the final report is written to
output_dir = "../data/heorData/results/"

## Generate Dataframe and Metadata Variables
  1. Need to compute descriptive stats and statistical tests before imputation of missing values
  2. Need to identify which columns belong to which data subgroups

In [81]:
# Per-subject cluster assignments (columns: subject_id, cluster).
cluster_file = graph_dir + "cosine_cluster5.csv"
cluster_df = pd.read_csv(cluster_file)
cluster_df.head()

Unnamed: 0,subject_id,cluster
0,4,0
1,29573,0
2,4699,0
3,31535,0
4,57317,1


In [82]:
# Per-subject mortality outcomes (includes time2event and the mortality30 flag).
mortality_file = patient_dir + "mortality.csv"
mortality = pd.read_csv(mortality_file)
mortality.head()

Unnamed: 0,subject_id,last_admission,death_date,time2event,mortality30
0,7275,2140-04-12,2140-05-08,26.0,1
1,357,2199-12-21,2201-08-02,589.0,0
2,878,2137-11-24,2137-12-11,17.0,1
3,794,2190-11-29,2191-09-19,294.0,0
4,1457,2189-01-25,,,0


In [193]:
# Build the patient dataframe from the chart-event and cross-sectional files,
# then run it through dropMissings. Nothing is imputed at this stage.
# NOTE(review): dropMissings(..., .5) presumably drops data that is more than
# 50% missing -- confirm against dfGenUtils.
source_pattern = re.compile("chart events|cross sect")
files = [name for name in os.listdir(patient_dir) if source_pattern.search(name)]

patient_df = dropMissings(
    genPatientDF(patient_dir, files, cluster_df, mortality, time2event = True),
    .5,
)
patient_df.head()

Unnamed: 0,subject_id,activity,activity_tolerance,braden_activity,braden_mobility,family_communication,marital_status,race,gender,braden_nutrition,...,avg_ld,min_lactate,max_lactate,avg_lactate,avg_phosphate,avg_sodium,avg_potassium,cluster,mortality30,time2event
0,4,Bedrest,Tolerated Well,Walks Occasionally,Slight Limitations,Family Visited,Single,Unknown / Not Specified,F,Probably Inadequate,...,330.0,2.1,2.1,2.1,3.45,137.75,3.5375,0,0,
1,52,Bedrest,Tolerated Well,Bedfast,Slight Limitations,Family Visited,Single,Unknown / Not Specified,M,Probably Inadequate,...,285.888889,,,,3.47,134.916667,3.6,0,0,548.0
2,78,Bedrest,Tolerated Well,Bedfast,No Limitations,Family Visited,A,Unknown / Not Specified,M,Adequate,...,,,,,3.166667,137.0,3.58,0,0,1083.0
3,117,Bedrest,Tolerated Well,Bedfast,Very Limited,Family Visited,Single,Unknown / Not Specified,F,Probably Inadequate,...,577.473684,1.2,21.9,8.90625,4.576087,137.472727,4.426316,0,1,18.0
4,140,Bedrest,Tolerated Well,Chairfast,No Limitations,Family Called,Divorced,Unknown / Not Specified,M,Adequate,...,,,,,3.133333,137.666667,4.333333,0,0,677.0


In [84]:
# The cluster count is taken as the largest label + 1, which assumes the labels
# are 0-based and contiguous.
num_clusters = max(cluster_df["cluster"]) + 1
clusters = range(num_clusters)
num_clusters

2

In [192]:
# identify which columns belong to which data subgroups
# format is dictionary{column name: data_group}
def extractColumnGroupings(patient_dir):
    """Map every column found in the patient files to its data subgroup.

    The subgroup name is parsed from each file name: it is the text between
    the last underscore and the following dot (e.g. a name ending in
    ``_demographics.csv`` yields ``demographics``). Entries whose name does
    not follow that pattern are skipped without being opened.

    For files whose name contains ``chart events_categorical`` the column
    names are not the CSV header; they are the distinct values of the
    ``label`` column, lower-cased with whitespace runs replaced by ``_``.

    Parameters
    ----------
    patient_dir : str
        Directory holding the patient CSV files. A trailing path separator
        is accepted but not required.

    Returns
    -------
    dict
        ``{column name: data subgroup}``. ``mortality30`` and ``time2event``
        are always present and mapped to ``'mortality'``.
    """
    columnMapping = {}

    for f in os.listdir(patient_dir):
        # extract data subgroup name; check the name before reading so that
        # sub-directories and unrelated files are never handed to read_csv
        m = re.search(r"_.*_(.*?)\.", f)
        if not m:
            continue
        data_subgroup = m.group(1)

        curr_df = pd.read_csv(os.path.join(patient_dir, f))

        if re.search("chart events_categorical", f):
            # chart event files have the column names in the label variable;
            # missing labels cannot be turned into a column name, so drop them
            labels = curr_df.label.dropna().unique()
            columns = ['_'.join(label.lower().split()) for label in labels]
        else:
            columns = curr_df.columns

        columnMapping.update(dict.fromkeys(columns, data_subgroup))

    # manually add mortality
    columnMapping['mortality30'] = 'mortality'
    columnMapping['time2event'] = 'mortality'

    return columnMapping

# Build the column -> data-subgroup lookup once; it is passed to genReport later on.
columnMapping = extractColumnGroupings(patient_dir)
columnMapping

{'activity': 'activity',
 'activity_tolerance': 'activity',
 'braden_activity': 'activity',
 'braden_mobility': 'activity',
 'family_communication': 'demographics',
 'marital_status': 'demographics',
 'gender': 'demographics',
 'race': 'demographics',
 'braden_nutrition': 'diet',
 'diet_type': 'diet',
 'appetite': 'diet',
 'special_diet': 'diet',
 'heart_rhythm': 'heart lung',
 'lll_lung_sounds': 'heart lung',
 'lul_lung_sounds': 'heart lung',
 'rll_lung_sounds': 'heart lung',
 'rul_lung_sounds': 'heart lung',
 'respiratory_pattern': 'heart lung',
 'respiratory_effort': 'heart lung',
 'mental_status': 'mental drug',
 'recreational_drug_use': 'mental drug',
 'pain_location': 'pain',
 'pain_present': 'pain',
 'pain_cause': 'pain',
 'pain_type': 'pain',
 'abdominal_assessment': 'physical assessment',
 'bowel_sounds': 'physical assessment',
 'braden_moisture': 'physical assessment',
 'oral_cavity': 'physical assessment',
 'skin_color': 'physical assessment',
 'skin_condition': 'physical as

In [121]:
# Define our variable groups by dtype: numeric columns and object (string) columns.
# subject_id and cluster are left out of the numeric list.
# Open questions from the original notes: maybe subset to the top 50 most
# important columns? The groupings will need to be extracted.
excluded = {"subject_id", "cluster"}
num_vars = [
    name
    for name in patient_df.select_dtypes(include=np.number).columns.tolist()
    if name not in excluded
]

cat_vars = patient_df.select_dtypes(include=["object"]).columns.tolist()
num_vars, cat_vars

(['age',
  'max_heart_rate',
  'min_heart_rate',
  'avg_heart_rate',
  'max_resp_rate',
  'min_resp_rate',
  'avg_resp_rate',
  'Abdominal',
  'Back',
  'Both Legs',
  'Chest Pain',
  'Generalized',
  'Headache',
  'Incisional',
  'Jaw',
  'Left Arm',
  'Left Chest',
  'Left Elbow',
  'Left Flank',
  'Left Foot',
  'Left Hip',
  'Left Leg',
  'Left Lower Quad',
  'Left Lower Quadrant',
  'Left Shoulder',
  'Left Upper Quad',
  'Left Upper Quadrant',
  'Mediastinal',
  'Midscapular',
  'Neck',
  'Not Indicated',
  'Perineum',
  'Periumbilical',
  'Right Arm',
  'Right Chest',
  'Right Elbow',
  'Right Flank',
  'Right Foot',
  'Right Hip',
  'Right Leg',
  'Right Lower Quad',
  'Right Lower Quadrant',
  'Right Shoulder',
  'Right Upper Quad',
  'Right Upper Quadrant',
  'max_wgt',
  'min_wgt',
  'wgt_change',
  'loss_perc',
  'min_fibrinogen',
  'max_fibrinogen',
  'avg_fibrinogen',
  'min_platelet',
  'max_platelet',
  'avg_platelet',
  'min_inr',
  'max_inr',
  'avg_inr',
  'min_pt',


## Statistical Tests
I take a one vs. all approach here. For each cluster, I compare the samples from the selected cluster to all other data. I do this for each cluster.

For numeric data, I compare the means using a t-test. For categorical data, I compare using the chi-squared test. 

In [152]:
# run ttests for all numeric variables for a given cluster
def tTests(cols, patient_df, cluster):
    """Compare one cluster against all other rows with a t-test per column.

    For every column in ``cols`` the non-missing values of the rows where
    ``patient_df.cluster == cluster`` are tested against the non-missing
    values of the remaining rows using ``ttest_ind`` with its default
    settings.

    Returns a list with one entry per column, in the order of ``cols``:
    ``[column, {"mean", "var"} of the cluster, {"mean", "var"} of the rest,
    p-value]``. Means and variances are rounded to 2 decimals and the
    p-value to 4.
    """
    def summarize(values):
        # mean / variance summary of one sample
        return {"mean": round(values.mean(), 2),
                "var": round(values.var(), 2)}

    def compare(name):
        # split the column into the selected cluster and everything else
        inside = patient_df.loc[patient_df.cluster == cluster, name].dropna()
        outside = patient_df.loc[patient_df.cluster != cluster, name].dropna()
        p_value = ttest_ind(inside, outside).pvalue
        return [name, summarize(inside), summarize(outside), round(p_value, 4)]

    return [compare(name) for name in cols]

# Example run for a single cluster. `c` is otherwise only assigned by the
# `for c in range(num_clusters)` loop near the end of the notebook, so a fresh
# top-to-bottom run would raise a NameError here; pin it explicitly.
c = 0
numeric_res = tTests(num_vars, patient_df, c)
numeric_res[:5]

[['age', {'mean': 58.78, 'var': 697.69}, -0.86, 0.3892],
 ['max_heart_rate', {'mean': 120.93, 'var': 618.54}, -1.0, 0.3187],
 ['min_heart_rate', {'mean': 60.17, 'var': 244.9}, -5.11, 0.0],
 ['avg_heart_rate', {'mean': 86.21, 'var': 169.96}, -1.05, 0.295],
 ['max_resp_rate', {'mean': 34.45, 'var': 169.7}, -1.25, 0.2112]]

In [148]:
# extract the top three most frequent categorical values and their percent frequency
def mostFrequentCats(frequencies, n = 3):
    """Return the first ``n`` categories with their share of all observations.

    Parameters
    ----------
    frequencies : pandas.Series
        Counts indexed by category label. The entries are taken in the order
        given, so the Series must already be sorted by descending count (as
        ``value_counts()`` returns it) for the result to be the most frequent
        categories.
    n : int
        Number of leading categories to keep.

    Returns
    -------
    dict
        ``{label: count / total count}`` rounded to 2 decimals, for the first
        ``n`` entries.
    """
    freq_ratio = frequencies / frequencies.sum()
    # Series.iteritems() was removed in pandas 2.0; items() yields the same pairs
    leading = freq_ratio.iloc[:max(n, 0)]
    return {label: round(ratio, 2) for label, ratio in leading.items()}

In [151]:
# run our chisquared tests for all categorical variables leveraging contingency tables for a given cluster
def chi2Tests(cols, patient_df, cluster):
    """Compare one cluster against all other rows with a chi-squared test per column.

    For every column in ``cols`` a contingency table is built from the value
    counts of the rows where ``patient_df.cluster == cluster`` and the value
    counts of the remaining rows, and passed to ``chi2_contingency``.

    Returns a list with one entry per column, in the order of ``cols``:
    ``[column, top 3 labels of the cluster, top 3 labels of the rest,
    p-value]``. The p-value is rounded to 4 decimals and is NaN when the test
    cannot be computed for that column.
    """
    test_res = []
    for col in cols:
        # generate our contingency table
        main = patient_df.loc[patient_df.cluster == cluster, col].value_counts()
        other = patient_df.loc[patient_df.cluster != cluster, col].value_counts()

        # Align on the union of categories. A left join on `main` would drop
        # every category that never occurs inside the cluster, although those
        # rows (0 vs. k observations) are part of the comparison.
        contingency_tbl = pd.concat([main, other], axis = 1, keys = ["main", "other"]).fillna(0)

        # run tests and append
        try:
            _chi2, p, _dof, _expected = chi2_contingency(contingency_tbl)
        except ValueError:
            # raised for an empty table or one with a zero expected frequency
            # (e.g. no observations on one side); report the column as untestable
            # instead of aborting the remaining columns
            p = float("nan")

        test_res.append([col, # column
                         mostFrequentCats(main, 3), # top 3 cat labels and their frequencies
                         mostFrequentCats(other, 3), # top 3 labels for the other group
                         round(p, 4)]) # p value
    return test_res

# Example run of the categorical tests for cluster `c`.
# NOTE(review): `c` is not assigned in this cell; it has to exist in the session already.
cat_res = chi2Tests(cat_vars, patient_df, c)
cat_res[:5]


[['activity', {'Bedrest': 0.99, 'Chair': 0.0, 'Commode': 0.0}, 4.24, 0.3743],
 ['activity_tolerance',
  {'Tolerated Well': 0.93, 'Good': 0.04, 'Fair': 0.02},
  133.02,
  0.0],
 ['braden_activity',
  {'Bedfast': 0.92, 'Chairfast': 0.04, 'Walks Occasionally': 0.04},
  4.37,
  0.224],
 ['braden_mobility',
  {'Slight Limitations': 0.49,
   'Very Limited': 0.38,
   'Completely Immobile': 0.07},
  18.39,
  0.0004],
 ['family_communication',
  {'Family Visited': 0.66, 'Family Called': 0.27, 'Family Talked to MD': 0.03},
  289.62,
  0.0]]

## Generate the Report


In [189]:
def genReport(num_res, cat_res, columnMapping, cluster):
    """Combine numeric and categorical test results into one report frame.

    ``num_res`` and ``cat_res`` are lists of 4-element rows
    (variable, summary of the cluster, summary of the rest, p-value). The
    rows are stacked with the numeric results first, labelled with
    ``cluster``, and each variable is looked up in ``columnMapping`` to fill
    ``data_subgroup`` (variables missing from the mapping get NaN).

    Returns a DataFrame with a fresh 0..n-1 index and the columns
    variable, main_summary_stats, other_summary_stats, p_value, cluster,
    data_subgroup.
    """
    # stack numeric rows on top of categorical rows, renumbering as we go
    report = pd.concat(
        [pd.DataFrame(num_res), pd.DataFrame(cat_res)],
        axis = 0,
        ignore_index = True,
    )
    report.columns = ['variable', 'main_summary_stats', 'other_summary_stats', 'p_value']

    # tag every row with the cluster it describes and its data subgroup
    report['cluster'] = cluster
    report['data_subgroup'] = report['variable'].map(columnMapping)

    return report

# Assemble the report for the example cluster `c` from the numeric and categorical results.
# NOTE(review): `c` is not assigned in this cell; it has to exist in the session already.
report = genReport(numeric_res, cat_res, columnMapping, c)
report.head()

Unnamed: 0,variable,mean_var_or_top_labels,test_statistic,p_value,cluster,data_subgroup
0,age,"{'mean': 58.78, 'var': 697.69}",-0.86,0.3892,0,demographics
1,max_heart_rate,"{'mean': 120.93, 'var': 618.54}",-1.0,0.3187,0,heart lung rate
2,min_heart_rate,"{'mean': 60.17, 'var': 244.9}",-5.11,0.0,0,heart lung rate
3,avg_heart_rate,"{'mean': 86.21, 'var': 169.96}",-1.05,0.295,0,heart lung rate
4,max_resp_rate,"{'mean': 34.45, 'var': 169.7}",-1.25,0.2112,0,heart lung rate


In [194]:
# run through the process for every cluster and then concatenate
# NOTE: this loop rebinds the notebook-level names `c`, `num_res`, `cat_res` and
# `report`; after it finishes they hold the values of the last cluster.
cluster_report_list = []
for c in range(num_clusters):
    # run tests (one-vs-rest: cluster c against all other rows)
    num_res = tTests(num_vars, patient_df, c)
    cat_res = chi2Tests(cat_vars, patient_df, c)
    
    # gen report
    report = genReport(num_res, cat_res, columnMapping, c)
    cluster_report_list.append(report)
    
# concatenate
# each per-cluster report carries its own 0-based index, so the index values
# repeat in the combined frame; use the `cluster` column to tell rows apart
cluster_reports = pd.concat(cluster_report_list, axis = 0)
cluster_reports

Unnamed: 0,variable,mean_var_or_top_labels,test_statistic,p_value,cluster,data_subgroup
0,age,"{'mean': 58.78, 'var': 697.69}",-0.86,0.3892,0,demographics
1,max_heart_rate,"{'mean': 120.93, 'var': 618.54}",-1.00,0.3187,0,heart lung rate
2,min_heart_rate,"{'mean': 60.17, 'var': 244.9}",-5.11,0.0000,0,heart lung rate
3,avg_heart_rate,"{'mean': 86.21, 'var': 169.96}",-1.05,0.2950,0,heart lung rate
4,max_resp_rate,"{'mean': 34.45, 'var': 169.7}",-1.25,0.2112,0,heart lung rate
...,...,...,...,...,...,...
122,gag_reflex,"{'Intact': 0.76, 'Impaired': 0.19, 'Absent': 0...",16.41,0.0003,1,physical assessment
123,oral_cavity,"{'Teeth/Tissue WNL': 0.91, 'Bleeding Gum': 0.0...",26.03,0.0001,1,physical assessment
124,skin_color,"{'Normal for Race': 0.75, 'Jaundiced': 0.19, '...",64.94,0.0000,1,physical assessment
125,skin_condition,"{'Dry ': 0.99, 'Dry': 0.0, 'Diaphoretic': 0.0}",1138.80,0.0000,1,physical assessment


In [195]:
# to_csv does not create missing directories, so make sure the target exists first
os.makedirs(output_dir, exist_ok=True)
cluster_reports.to_csv(output_dir + "Cluster Reports (cosine_knn5_k2).csv")