In [None]:
import pandas as pd

In [None]:
# 1. Load patients
# Read the patient table and discard the columns this analysis does not use.
data_dir = 'data/physionet.org/files/mimiciii/1.4/'
patient_file = 'PATIENTS.csv'
df_patients = pd.read_csv(data_dir + patient_file).drop(
    columns=['ROW_ID', 'DOD_HOSP', 'DOD_SSN']
)

# Parse the date-of-birth / date-of-death strings into datetimes.
# errors='coerce' turns anything that does not match the format into NaT
# instead of raising.
df_patients = df_patients.assign(**{
    col: pd.to_datetime(df_patients[col], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    for col in ('DOB', 'DOD')
})

# Read the admissions table, keeping only the columns used downstream.
admissions_file = 'ADMISSIONS.csv'
df_admissions = pd.read_csv(data_dir + admissions_file).drop(
    columns=['ROW_ID', 'RELIGION', 'LANGUAGE', 'MARITAL_STATUS', 'ETHNICITY']
)

# Parse every timestamp column; values that do not match the format become NaT.
df_admissions = df_admissions.assign(**{
    col: pd.to_datetime(df_admissions[col], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    for col in ('ADMITTIME', 'DISCHTIME', 'EDREGTIME', 'EDOUTTIME')
})

# Length of the hospital stay, expressed in (fractional) days.
df_admissions['HOSPITAL_DAYS'] = (
    df_admissions['DISCHTIME']
    .sub(df_admissions['ADMITTIME'])
    .dt.total_seconds()
    .div(24*60*60)
)

# A negative stay length is treated as "dead on arrival"; those admissions are
# removed. Rows whose stay length is NaN are kept (NaN < 0 is False).
doa_idx = df_admissions.index[df_admissions['HOSPITAL_DAYS'] < 0]
df_admissions = df_admissions.drop(index=doa_idx)

# Merge patients with their admissions: one row per (patient, admission).
# how='left' keeps patients that have no remaining admission row; their
# admission columns (and therefore ADMIT_AGE) are NaN/NaT.
df_patient_admit = df_patients.merge(df_admissions, how='left', on='SUBJECT_ID')

# Age at admission as a difference of calendar years. Month and day are
# ignored, so the value can overstate the true age by up to one year.
# NOTE(review): presumably computed on years (rather than subtracting the full
# timestamps) to avoid Timedelta overflow on date-shifted DOBs -- confirm
# before "improving" this.
df_patient_admit['ADMIT_AGE'] = df_patient_admit['ADMITTIME'].dt.year - df_patient_admit['DOB'].dt.year

# 2. Remove patients < age
# Not necessary, but useful to limit the analysis to non-pediatric issues.
# With age = 0 nothing is removed.
age = 0
child_idx = df_patient_admit[df_patient_admit['ADMIT_AGE']<age].index
# child_idx holds index *labels*, so the lookup must be label-based (.loc).
# The previous .iloc call only worked because the merge result happens to
# carry a fresh 0..n-1 index.
child_patients = df_patient_admit.loc[child_idx, 'SUBJECT_ID'].unique()
# NOTE(review): only the under-age admission rows are dropped here, while
# child_patients (used later to filter ICU stays) is keyed by SUBJECT_ID, so
# all ICU stays of such a patient are excluded -- confirm this is intended
# when age > 0.
df_patient_admit = df_patient_admit.drop(child_idx, axis=0)

# 3. Load icustays
# Read the ICU stays table.
icustays_file = 'ICUSTAYS.csv'
df_icustays = pd.read_csv(data_dir + icustays_file)

# Discard every ICU stay belonging to a patient flagged as under the age
# cutoff, then drop the ROW_ID column, which is not used.
child_idx = df_icustays.index[df_icustays['SUBJECT_ID'].isin(child_patients)]
df_icustays = df_icustays.drop(index=child_idx).drop(columns=['ROW_ID'])

# Parse ICU in/out timestamps; values that do not match the format become NaT.
df_icustays = df_icustays.assign(**{
    col: pd.to_datetime(df_icustays[col], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    for col in ('INTIME', 'OUTTIME')
})

# Join patients/admissions with ICU stays. how='left' keeps admissions that
# have no ICU stay; their ICU columns are NaN/NaT.
df_patient_admit_icu = df_patient_admit.merge(df_icustays, how='left', on=['SUBJECT_ID', 'HADM_ID'])

# Days from hospital admission to ICU admission (fractional). This was
# previously computed twice with identical statements; once is enough.
df_patient_admit_icu['DAYS_ADM_TO_ICU'] = (df_patient_admit_icu['INTIME'] - df_patient_admit_icu['ADMITTIME']).dt.total_seconds()/(24*60*60)

# 1 if the patient entered the ICU within 24 hours of hospital admission, else 0.
# Vectorised comparison instead of a per-row apply; the result is the same,
# including for rows with no ICU stay: NaN <= 1 is False, so they get 0 and are
# not distinguishable here from "admitted to ICU later than 24 hours".
df_patient_admit_icu['SAMEDAY_ADM_TO_ICU'] = (df_patient_admit_icu['DAYS_ADM_TO_ICU'] <= 1).astype('int64')

# clean up diagnoses: remove leading/trailing whitespace from the free-text field
df_patient_admit_icu['DIAGNOSIS'] = df_patient_admit_icu['DIAGNOSIS'].str.strip()

import re

# 1. save "raw" diagnoses from df to map back to later
diagnoses_raw = df_patient_admit_icu['DIAGNOSIS'].dropna().tolist()

# 2. create a "split" diagnoses list of lists that is a 1-to-1 match to "raw"
#    (a single field may hold several diagnoses separated by ';' or ',')
diagnoses_split = [re.split(';|,',x) for x in diagnoses_raw]

# 3. flatten "split" into one list of individual diagnoses, stripping again
#    because splitting can leave leading/trailing whitespace
all_diagnoses = [d.strip() for sublist in diagnoses_split for d in sublist]

# 4. unique, non-empty diagnoses.
#    Empty strings (e.g. from a trailing separator) are filtered out
#    explicitly. The previous `list(set(...)).pop(0)` assumed '' was the first
#    element, but set order is arbitrary, so it could delete a real diagnosis
#    and leave '' in the list. Sorting makes the order reproducible across runs.
unique_diagnoses = sorted({d for d in all_diagnoses if d})

# 5. create 'clean' unique diagnoses that fix data by removing non-alphanumeric
#    characters appropriately. The result stays index-aligned with
#    unique_diagnoses, so entries are NOT de-duplicated or dropped here; a
#    diagnosis made only of punctuation becomes an empty/blank string.
# replace these characters with whitespace first, as they are consistently
# used to separate terms
unique_diagnoses_clean = [re.sub(r'[\\\|/.-]+', ' ', i) for i in unique_diagnoses]

# remove any other non-alphanumerics entirely as these are typically erroneous
unique_diagnoses_clean = [re.sub(r'[^a-zA-Z\d\s ]', '', i) for i in unique_diagnoses_clean]



In [None]:
# 1a. play with thresholds, determine best one
#
# For each candidate score cutoff, every cleaned diagnosis is fuzzy-matched
# against the diagnoses that follow it in the list. The substring it shares
# with *all* of its matches is measured as a fraction of its own length, and
# those fractions are averaged per cutoff into max_len_prcts.
from fuzzywuzzy import fuzz, process
import numpy as np
from difflib import SequenceMatcher

thresholds = np.linspace(70,95,6)
# mean_scores is kept (empty) so the name still exists for any later cell.
# The earlier, disabled mean-score computation produced NaNs because its guard
# (`if not len(...)`) was inverted, so it only ran on empty score lists. It is
# not used to pick the threshold and is left out.
mean_scores = []
max_len_prcts = []
for thresh in thresholds:
    max_thresh = []
    for ix,d in enumerate(unique_diagnoses_clean):
        # A diagnosis that cleaned down to nothing has no length to normalise
        # by (would be a ZeroDivisionError) and nothing to match on; skip it.
        if not d.strip():
            continue

        candidates = unique_diagnoses_clean[ix+1:]
        if candidates:
            try:
                tmp_tup = process.extractBests(d,candidates,scorer=fuzz.token_set_ratio,score_cutoff=thresh,limit=200)
            except Exception:
                # Best-effort fallback kept from the original code (one empty
                # match, which yields a fraction of 0). Narrowed from a bare
                # `except:` so Ctrl-C can still interrupt this long loop.
                tmp_tup = [('',0)]
        else:
            # Last diagnosis: nothing left to compare against.
            # NOTE(review): the original bare except presumably guarded this
            # case, where some fuzzywuzzy versions raise on an empty choice
            # list -- confirm.
            tmp_tup = []

        tmp_d = [item[0] for item in tmp_tup] # these are all the similar diagnoses to d

        # get longest matching string across tmp_d
        lcs = d # initialize longest common substring to current diagnosis
        for d2 in tmp_d:
            if lcs=='': # nothing common among ALL diagnoses, stop early
                break

            # each comparison is made against the running lcs, so it can only
            # stay the same or get shorter
            s = SequenceMatcher(None, lcs, d2)
            match = s.find_longest_match(0,len(lcs),0,len(d2))
            lcs = lcs[match.a:match.a+match.size].strip()

        # Measure AFTER the loop. Previously the length was only assigned
        # inside the loop, so a diagnosis with no matches silently reused the
        # previous diagnosis's value (or raised NameError on the very first).
        # With no matches lcs is still d, so the fraction is 1.0.
        lcsl = len(lcs)
        max_thresh.append(lcsl/len(d))

    # calculate uber-longest length across d's (as % of original length of d);
    # NaN if there was nothing to average
    max_len_prcts.append(np.mean(max_thresh) if max_thresh else np.nan)
    

In [None]:
import matplotlib.pyplot as plt

# Mean longest-common-substring fraction (averaged over diagnoses) plotted
# against the fuzzy-match score cutoff.
fig, ax = plt.subplots()
ax.plot(thresholds, max_len_prcts)
ax.set_xlabel('Score Threshold')
ax.set_ylabel('Mean Max Length')
# Kinks in the curve are used to pick an appropriate threshold.

# seems like optimal score threshold is 90