In [2]:
import pandas as pd
train_df = pd.read_csv('clinical-outcome-prediction/saved_files/DIA_GROUPS_3_DIGITS_adm_train.csv')

test_df = pd.read_csv('clinical-outcome-prediction/saved_files/DIA_GROUPS_3_DIGITS_adm_test.csv')

train_df.head()

Unnamed: 0,id,text,long_texts,short_texts,discharge_summary,short_codes
0,159643,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: This 60 ...,"Aortic valve disorders,Congestive heart failur...","Aortic valve disorder,CHF NOS,Hyperpotassemia,...",Admission Date: [**2141-3-13**] Dischar...,"424,428,276,997,427,414,412,V458"
1,147171,CHIEF COMPLAINT: Substernal Chest Pain\n\nPRES...,Acute myocardial infarction of other anterior ...,"AMI anterior wall, init,Ac systolic hrt failur...",Admission Date: [**2102-9-26**] ...,410428997427414458
2,199961,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: The pati...,"Ankylosing spondylitis,Hypertensive chronic ki...","Ankylosing spondylitis,Hyp kid NOS w cr kid V,...",Admission Date: [**2115-6-29**] Dischar...,"720,403,805,847,E885,780,285,250"
3,136812,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: This is ...,Obstructive chronic bronchitis with (acute) ex...,"Obs chr bronc w(ac) exac,Pneumonia, organism N...",Admission Date: [**2106-4-14**] Dischar...,491486280427426438729244
4,175700,CHIEF COMPLAINT: s/p rollover MVC with prolong...,"Closed fracture of shaft of fibula with tibia,...","Fx shaft fib w tib-clos,Pneumococcal pneumonia...",Admission Date: [**2159-2-9**] D...,"823,481,861,518,285,813,807,E816,883,799,V641,..."


In [3]:
from sklearn.model_selection import train_test_split

train_subset, val_subset = train_test_split(
    train_df,
    test_size=0.1,  # 10% for validation
    random_state=345  
)


print(f"Original train set shape: {train_df.shape}")
print(f"New train subset shape: {train_subset.shape}")
print(f"Validation set shape: {val_subset.shape}")
print(f"Test set shape: {test_df.shape}")

Original train set shape: (33994, 6)
New train subset shape: (30594, 6)
Validation set shape: (3400, 6)
Test set shape: (9829, 6)


In [4]:
train_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30594 entries, 32361 to 3608
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 30594 non-null  int64 
 1   text               30594 non-null  object
 2   long_texts         30594 non-null  object
 3   short_texts        30594 non-null  object
 4   discharge_summary  30594 non-null  object
 5   short_codes        30594 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.6+ MB


In [5]:
val_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3400 entries, 22471 to 2489
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 3400 non-null   int64 
 1   text               3400 non-null   object
 2   long_texts         3400 non-null   object
 3   short_texts        3400 non-null   object
 4   discharge_summary  3400 non-null   object
 5   short_codes        3400 non-null   object
dtypes: int64(1), object(5)
memory usage: 185.9+ KB


In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9829 entries, 0 to 9828
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 9829 non-null   int64 
 1   text               9829 non-null   object
 2   long_texts         9829 non-null   object
 3   short_texts        9829 non-null   object
 4   discharge_summary  9829 non-null   object
 5   short_codes        9829 non-null   object
dtypes: int64(1), object(5)
memory usage: 460.9+ KB


In [12]:
import os
os.makedirs('new_split')

FileExistsError: [Errno 17] File exists: 'new_split'

In [None]:
train_subset.to_csv('new_split/train_split.csv')
val_subset.to_csv('new_split/val_split.csv')
test_df.to_csv('new_split/test_split.csv')

In [9]:
train_unique_short_codes = pd.Series(
    code for codes in train_subset['short_codes'] for code in codes.split(',')
).nunique()

train_unique_short_codes

1186

In [10]:
val_unique_short_codes = pd.Series(
    code for codes in val_subset['short_codes'] for code in codes.split(',')
).nunique()

val_unique_short_codes

852

In [11]:
test_unique_short_codes = pd.Series(
    code for codes in test_df['short_codes'] for code in codes.split(',')
).nunique()

test_unique_short_codes

1029

In [12]:
combined_unique_short_codes = pd.Series(
    code for codes in pd.concat([train_subset['short_codes'], val_subset['short_codes'], test_df['short_codes']]) for code in codes.split(',')
).nunique()

combined_unique_short_codes

1224

In [13]:
len(set(pd.Series(
    code for codes in train_df['short_codes'] for code in codes.split(',')
)).intersection(pd.Series(
    code for codes in test_df['short_codes'] for code in codes.split(',')
)))

1000

In [8]:
unique_short_codes = pd.Series(
    code for codes in pd.concat([train_df['short_codes'], test_df['short_codes']]) for code in codes.split(',')
).unique()


with open('unique_short_codes.txt', 'w') as f:
    for code in unique_short_codes:
        f.write(f"{code}\n")


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33994 entries, 0 to 33993
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 33994 non-null  int64 
 1   text               33994 non-null  object
 2   long_texts         33994 non-null  object
 3   short_texts        33994 non-null  object
 4   discharge_summary  33994 non-null  object
 5   short_codes        33994 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.6+ MB


In [10]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9829 entries, 0 to 9828
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 9829 non-null   int64 
 1   text               9829 non-null   object
 2   long_texts         9829 non-null   object
 3   short_texts        9829 non-null   object
 4   discharge_summary  9829 non-null   object
 5   short_codes        9829 non-null   object
dtypes: int64(1), object(5)
memory usage: 460.9+ KB


In [11]:
# Function to filter codes based on the required group
def filter_codes_neurology(codes):
    neurology_codes = [code for code in codes if code.isdigit() and 320 <= int(code) <= 389]
    return neurology_codes


# Split short_codes into lists, then filter by Neurology codes
train_df['short_codes_list'] = train_df['short_codes'].apply(lambda x: x.split(','))
train_df['neurology_codes'] = train_df['short_codes_list'].apply(filter_codes)


neurology_codes_list = train_df['neurology_codes'].tolist()
neurology_codes_filtered = [codes for codes in neurology_codes_list if codes]


print(neurology_codes_filtered)

NameError: name 'filter_codes' is not defined

In [None]:
from itertools import chain
all_neurology_codes = list(set(chain(*neurology_codes_list)))
print(all_neurology_codes)


['375', '348', '364', '343', '362', '380', '340', '353', '349', '358', '337', '357', '345', '369', '336', '351', '379', '361', '386', '325', '359', '388', '360', '341', '372', '331', '344', '350', '352', '324', '371', '342', '338', '368', '363', '333', '373', '367', '347', '377', '327', '370', '385', '365', '334', '322', '366', '320', '321', '376', '356', '335', '381', '326', '387', '378', '323', '346', '374', '384', '355', '383', '332', '382', '339', '354', '389', '330']


In [None]:
len(all_neurology_codes)

68

In [None]:
# Function to filter codes based on the Neurology range (320-389)
def filter_codes(codes):
    neurology_codes = [code for code in codes if code.isdigit() and 320 <= int(code) <= 389]
    return neurology_codes

# Split short_codes into lists, then filter by Neurology codes
train_df['short_codes_list'] = train_df['short_codes'].apply(lambda x: x.split(','))
train_df['neurology_codes'] = train_df['short_codes_list'].apply(filter_codes)

# Create a new column indicating whether the row contains any Neurology codes
train_df['has_neurology_codes'] = train_df['neurology_codes'].apply(lambda x: bool(x))

# Count how many rows have Neurology codes
count_neurology_rows = train_df['has_neurology_codes'].sum()

# Display the dataframe with the new columns and the count of rows with Neurology codes
neurology_df = train_df[train_df.has_neurology_codes].reset_index(drop = True)

In [None]:
neurology_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10515 entries, 0 to 10514
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   10515 non-null  int64 
 1   text                 10515 non-null  object
 2   long_texts           10515 non-null  object
 3   short_texts          10515 non-null  object
 4   discharge_summary    10515 non-null  object
 5   short_codes          10515 non-null  object
 6   short_codes_list     10515 non-null  object
 7   neurology_codes      10515 non-null  object
 8   has_neurology_codes  10515 non-null  bool  
dtypes: bool(1), int64(1), object(7)
memory usage: 667.6+ KB


In [None]:
neurology_df.head()

Unnamed: 0,id,text,long_texts,short_texts,discharge_summary,short_codes,short_codes_list,neurology_codes,has_neurology_codes
0,175700,CHIEF COMPLAINT: s/p rollover MVC with prolong...,"Closed fracture of shaft of fibula with tibia,...","Fx shaft fib w tib-clos,Pneumococcal pneumonia...",Admission Date: [**2159-2-9**] D...,"823,481,861,518,285,813,807,E816,883,799,V641,...","[823, 481, 861, 518, 285, 813, 807, E816, 883,...",[327],True
1,140536,CHIEF COMPLAINT: Presented with respiratory di...,"Subendocardial infarction, initial episode of ...","Subendo infarct, initial,Pneumonia, organism N...",Admission Date: [**2183-1-7**] D...,"410,486,276,425,414,427,695,401,416,496,356,43...","[410, 486, 276, 425, 414, 427, 695, 401, 416, ...",[356],True
2,120286,CHIEF COMPLAINT: Intracranial Hemmorrhage s/p ...,"Intracerebral hemorrhage,Other convulsions,Obs...","Intracerebral hemorrhage,Convulsions NEC,Obstr...",Admission Date: [**2136-3-30**] ...,"431,780,331,486,322,305,348,730,719,E884,555,569","[431, 780, 331, 486, 322, 305, 348, 730, 719, ...","[331, 322, 348]",True
3,109206,CHIEF COMPLAINT: Right hip pain / Osteoarthrit...,"Osteoarthrosis, localized, not specified wheth...","Loc osteoarth NOS-pelvis,Acute kidney failure ...",Admission Date: [**2107-5-24**] ...,"715,584,518,790,276,428,272,403,585,305,327,28...","[715, 584, 518, 790, 276, 428, 272, 403, 585, ...",[327],True
4,133127,"CHIEF COMPLAINT: Atrial fibrillation w/ RVR, V...",Subdural hemorrhage following injury without m...,"Subdural hem w/o coma,Compression of brain,Chr...",Admission Date: [**2121-9-19**] ...,"852,348,428,425,427,E885,434,244,733,443,V450,...","[852, 348, 428, 425, 427, E885, 434, 244, 733,...",[348],True


In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33994 entries, 0 to 33993
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   33994 non-null  int64 
 1   text                 33994 non-null  object
 2   long_texts           33994 non-null  object
 3   short_texts          33994 non-null  object
 4   discharge_summary    33994 non-null  object
 5   short_codes          33994 non-null  object
 6   short_codes_list     33994 non-null  object
 7   neurology_codes      33994 non-null  object
 8   has_neurology_codes  33994 non-null  bool  
dtypes: bool(1), int64(1), object(7)
memory usage: 2.1+ MB


In [None]:


# Function to filter codes based on a specific range
def filter_codes(codes, lower, upper):
    return [code for code in codes if code.isdigit() and lower <= int(code) <= upper]


def creating_cols(df : pd.DataFrame) -> pd.DataFrame : 
    # Split short_codes into lists
    df['short_codes_list'] = df['short_codes'].apply(lambda x: x.split(','))

    # Filter by each range and create new columns
    df['neurology_codes'] = df['short_codes_list'].apply(lambda codes: filter_codes(codes, 320, 389))
    df['cardiovascular_codes'] = df['short_codes_list'].apply(lambda codes: filter_codes(codes, 390, 459))
    df['respiratory_codes'] = df['short_codes_list'].apply(lambda codes: filter_codes(codes, 460, 519))

    # Create boolean columns to indicate the presence of codes for each group
    df['has_neurology_codes'] = df['neurology_codes'].apply(lambda x: bool(x))
    df['has_cardiovascular_codes'] = df['cardiovascular_codes'].apply(lambda x: bool(x))
    df['has_respiratory_codes'] = df['respiratory_codes'].apply(lambda x: bool(x))

    return df

train_df = creating_cols(train_df)

# Count how many rows have codes in each group
count_neurology_rows = train_df['has_neurology_codes'].sum()
count_cardiovascular_rows = train_df['has_cardiovascular_codes'].sum()
count_respiratory_rows = train_df['has_respiratory_codes'].sum()



print(f"Number of rows with Neurology codes: {count_neurology_rows}")
print(f"Number of rows with Cardiovascular codes: {count_cardiovascular_rows}")
print(f"Number of rows with Respiratory codes: {count_respiratory_rows}")


Number of rows with Neurology codes: 10515
Number of rows with Cardiovascular codes: 28474
Number of rows with Respiratory codes: 16504


In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9829 entries, 0 to 9828
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 9829 non-null   int64 
 1   text               9829 non-null   object
 2   long_texts         9829 non-null   object
 3   short_texts        9829 non-null   object
 4   discharge_summary  9829 non-null   object
 5   short_codes        9829 non-null   object
dtypes: int64(1), object(5)
memory usage: 460.9+ KB


In [None]:
test_df = creating_cols(test_df)

# Count how many rows have codes in each group
count_neurology_rows = test_df['has_neurology_codes'].sum()
count_cardiovascular_rows = test_df['has_cardiovascular_codes'].sum()
count_respiratory_rows = test_df['has_respiratory_codes'].sum()



print(f"Number of rows with Neurology codes: {count_neurology_rows}")
print(f"Number of rows with Cardiovascular codes: {count_cardiovascular_rows}")
print(f"Number of rows with Respiratory codes: {count_respiratory_rows}")

Number of rows with Neurology codes: 2995
Number of rows with Cardiovascular codes: 8328
Number of rows with Respiratory codes: 4724


# Notes 

- Symptoms increased (14k)
- fine tune llm on the trainig data
- bert wala baseline : clinical besrt
- Seq2care and Dr knows : go through it
- BioBert fine tuning
- sparese indexing (initiallly) : try it out
- statisitcal analysis (T-test)
- how many ICD-9 codes vs how many after preprocessing
- ttransfer not relevant to another folder (compress it) <--------
- build the MIMIC KG with the 3 digit codes
- CPU server index build
-> need the validation set : for e.g. while training the BERT model for supervised training task


-> or for setting the threshold for maximising hte f1 score

-> Keep the 10% as validation set