In [2]:
import pandas as pd
import polars as pl

In [3]:
mimic_discharge = pl.read_csv('physionet.org/files/mimic-iv-note/2.2/note/unzipped_discharge.csv').to_pandas()
mimic_discharge.head()

Unnamed: 0,Unnamed: 1,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...
1,1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...
2,2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
3,3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...
4,4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...


In [8]:
mimic_discharge = mimic_discharge.sort_values(by=["charttime"])
mimic_discharge = mimic_discharge.drop_duplicates(subset=["text"], keep="last")
mimic_discharge.head(1)

Unnamed: 0,Unnamed: 1,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
228408,228408,16904137-DS-2,16904137,21081215,DS,2,2105-10-12 00:00:00,2105-10-12 14:34:00,\nName: ___ Unit No: ___...


In [10]:
combined_adm_texts = mimic_discharge.groupby('hadm_id')['text'].apply(lambda x: '\n\n'.join(x)).reset_index()
notes_df = mimic_discharge[["hadm_id", "note_id", "subject_id", "charttime"]]
notes_df = notes_df.drop_duplicates(subset=["hadm_id"], keep="last")
notes_df = pd.merge(combined_adm_texts, notes_df, on="hadm_id", how="inner")
notes_df.head(1)

Unnamed: 0,hadm_id,text,note_id,subject_id,charttime
0,20000019,\nName: ___ Unit No: ___\n \nA...,10467237-DS-7,10467237,2159-03-23 00:00:00


In [11]:
notes_df["text"] = notes_df["text"].str.strip()

# remove entries without admission id, subject id or text
notes_df = notes_df.dropna(subset=["hadm_id", "subject_id", "text"])

notes_df

Unnamed: 0,hadm_id,text,note_id,subject_id,charttime
0,20000019,Name: ___ Unit No: ___\n \nAdmi...,10467237-DS-7,10467237,2159-03-23 00:00:00
1,20000024,Name: ___ Unit No: ___\n \nAdm...,16925328-DS-6,16925328,2151-05-26 00:00:00
2,20000034,Name: ___ Unit No: ___\n...,19430048-DS-12,19430048,2174-05-24 00:00:00
3,20000041,Name: ___ Unit No: ___\n \nAdm...,18910522-DS-17,18910522,2143-09-06 00:00:00
4,20000057,Name: ___ Unit No: ___...,11146739-DS-13,11146739,2190-01-18 00:00:00
...,...,...,...,...,...
331787,29999670,Name: ___ Unit No: ___\...,16289688-DS-10,16289688,2174-02-02 00:00:00
331788,29999723,Name: ___ Unit No: ___\n...,10382924-DS-16,10382924,2170-08-10 00:00:00
331789,29999745,Name: ___ Unit No: ___\n \n...,11326722-DS-21,11326722,2160-07-12 00:00:00
331790,29999809,Name: ___ Unit No: ___\n \...,12133002-DS-8,12133002,2172-09-30 00:00:00


In [12]:
import pandas as pd
import re

def extract_sections(text):
    
    sections = {
        "CHIEF_COMPLAINT": r"chief complaint:\s*([\s\S]*?)(?:\n\n|\Z)",
        "PRESENT_ILLNESS": r"present illness:\s*([\s\S]*?)(?:\n\n|\Z)",
        "MEDICAL_HISTORY": r"medical history:\s*([\s\S]*?)(?:\n\n|\Z)",
        "MEDICATION_ADM": r"medications on admission:\s*([\s\S]*?)(?:\n\n|\Z)",
        "ALLERGIES": r"allergies:\s*([\s\S]*?)(?:\n\n|\Z)",
        "PHYSICAL_EXAM": r"physical exam:\s*([\s\S]*?)(?:\n\n|\Z)",
        "FAMILY_HISTORY": r"family history:\s*([\s\S]*?)(?:\n\n|\Z)",
        "SOCIAL_HISTORY": r"social history:\s*([\s\S]*?)(?:\n\n|\Z)"
    }
    
    extracted = {}
    for key, pattern in sections.items():
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        if match:
            extracted[key] = match.group(1).strip()
        else:
            extracted[key] = ""
    
    return extracted




In [13]:
# Assuming you have a DataFrame called notes_df with a 'TEXT' column

notes_df['extracted'] = notes_df['text'].apply(extract_sections)

# Expand the dictionary into separate columns
notes_df = pd.concat([notes_df, notes_df['extracted'].apply(pd.Series)], axis=1).drop('extracted', axis =1)


In [14]:
notes_df[(notes_df.CHIEF_COMPLAINT != "") | (notes_df.PRESENT_ILLNESS != "") |
                    (notes_df.MEDICAL_HISTORY != "")]

Unnamed: 0,hadm_id,text,note_id,subject_id,charttime,CHIEF_COMPLAINT,PRESENT_ILLNESS,MEDICAL_HISTORY,MEDICATION_ADM,ALLERGIES,PHYSICAL_EXAM,FAMILY_HISTORY,SOCIAL_HISTORY
0,20000019,Name: ___ Unit No: ___\n \nAdmi...,10467237-DS-7,10467237,2159-03-23 00:00:00,"fever, nausea/vomiting, flank pain\n \nMajor S...",HISTORY OF PRESENT ILLNESS: \nMs. ___ is a __...,Type 2 diabetes \nAsthma \nHyperlipidemia \...,The Preadmission Medication list is accurate a...,No Known Allergies / Adverse Drug Reactions\n ...,"ADMISSION EXAM: \nVitals: tmax 101.2, tc 98.__...",She has a sister deceased with endometrial can...,___\nFamily History:\nShe has a sister decease...
1,20000024,Name: ___ Unit No: ___\n \nAdm...,16925328-DS-6,16925328,2151-05-26 00:00:00,"weakness, diarrhea\n \nMajor Surgical or Invas...",Ms. ___ is a ___ year-old woman with PMH signi...,allergic rhinitis\n anemia\nirritable bowel sy...,The Preadmission Medication list is accurate a...,Aspirin\n \nAttending: ___.\n \nChief Complain...,ADMISSION PHYSICAL EXAM: \n Vital Signs: 149/...,"Significant for mother with liver cancer, fath...",___\nFamily History:\nSignificant for mother w...
3,20000041,Name: ___ Unit No: ___\n \nAdm...,18910522-DS-17,18910522,2143-09-06 00:00:00,L knee pain\n \nMajor Surgical or Invasive Pro...,Patient presented with L knee pain that was no...,"dyslipid, h/o RUE DVT ___ -> coumadin complete...",The Preadmission Medication list is accurate a...,Latex\n \nAttending: ___.\n \nChief Complaint:...,"General: NAD, A&O x3, non labored breathing\n_...",nc,___\nFamily History:\nnc
4,20000057,Name: ___ Unit No: ___...,11146739-DS-13,11146739,2190-01-18 00:00:00,"ankle pain (s/p mechanical fall), cough\n \nMa...",___ h/o hypothyroidism s/p mechanical fall tod...,"Right hip replacement ___\n Hypertension, ess...",The Preadmission Medication list is accurate a...,No Known Allergies / Adverse Drug Reactions\n ...,Admission exam:\nVitals- 98.2 ___ 34 (from 18)...,Sister: ___ disease\nGoals of care: \nDo you ...,___\nFamily History:\nSister: ___ disease\nGoa...
5,20000094,Name: ___ Unit No: ___\...,14046553-DS-11,14046553,2150-03-03 00:00:00,Cardiogenic shock\n \nMajor Surgical or Invasi...,The patient is an ___ y/o M with PMHx signific...,-CAD (s/p stenting of the distal right coronar...,The Preadmission Medication list is accurate a...,No Known Allergies / Adverse Drug Reactions\n ...,ADMISSION PHYSICAL EXAM\n=====================...,Noncontributory\n \nPhysical Exam:\nADMISSION ...,___\nFamily History:\nNoncontributory\n \nPhys...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
331787,29999670,Name: ___ Unit No: ___\...,16289688-DS-10,16289688,2174-02-02 00:00:00,Chest Pain\n \nMajor Surgical or Invasive Proc...,"Mr. ___ is an ___ M w/ CAD, hx diastolic CHF, ...","- CAD\n- CHF, hx diastolic\n- Diabetes mellitu...",The Preadmission Medication list is accurate a...,Iodinated Contrast Media - IV Dye\n \nAttendin...,ADMISSION PHYSICAL EXAM:\n====================...,Father died of cancer. Mother died of lung can...,___\nFamily History:\nFather died of cancer. M...
331788,29999723,Name: ___ Unit No: ___\n...,10382924-DS-16,10382924,2170-08-10 00:00:00,"Dizziness, chest pressure, n/v\n \nMajor Surgi...",Mr. ___ is a ___ w/ hx of MS who ___ dizziness...,"- Multiple sclerosis, diagnosed roughly ___ ye...","CYANOCOBALAMIN 500 mcg Spray, Non-Aerosol - 1 ...",Patient recorded as having No Known Allergies ...,Vitals: 98.2F P79 BP157/75 R20 97% on RA\nGene...,No known history of MS or other auto-immune di...,___\nFamily History:\nNo known history of MS o...
331789,29999745,Name: ___ Unit No: ___\n \n...,11326722-DS-21,11326722,2160-07-12 00:00:00,"""I can't stop being moody and irritable and I'...","___ with borderline personality disorder, BPAD...",Past Medical History:\nh/o hypercholesterolemi...,Citalopram 40mg PO daily (increased from 30mg ...,Zocor\n \nAttending: ___\n \nChief Complaint:\...,"Appearance: ___ man, well-groomed, clean-shave...","Per OMR, 2 sisters and niece w/bipolar d/o. Si...",Social Hx:\nPt was born in ___. Has 7 brothers...
331790,29999809,Name: ___ Unit No: ___\n \...,12133002-DS-8,12133002,2172-09-30 00:00:00,,___ year old M with history of CAD s/p MI in _...,"1. CARDIAC RISK FACTORS: Diabetes, Dyslipidemi...",The Preadmission Medication list is accurate a...,Lipitor / Pravachol\n \nAttending: ___\n \n___...,ADMISSION PHYSICAL EXAMINATION: \nVS: 98.3 12...,Father had a stroke at ___. \n \nPhysical Exam...,___\nFamily History:\nFather had a stroke at _...


In [15]:
# add section headers and combine into TEXT_ADMISSION
notes_df = notes_df.assign(TEXT="CHIEF COMPLAINT: " + notes_df.CHIEF_COMPLAINT.astype(str)
                                + '\n\n' +
                                "PRESENT ILLNESS: " + notes_df.PRESENT_ILLNESS.astype(str)
                                + '\n\n' +
                                "MEDICAL HISTORY: " + notes_df.MEDICAL_HISTORY.astype(str)
                                + '\n\n' +
                                "MEDICATION ON ADMISSION: " + notes_df.MEDICATION_ADM.astype(str)
                                + '\n\n' +
                                "ALLERGIES: " + notes_df.ALLERGIES.astype(str)
                                + '\n\n' +
                                "PHYSICAL EXAM: " + notes_df.PHYSICAL_EXAM.astype(str)
                                + '\n\n' +
                                "FAMILY HISTORY: " + notes_df.FAMILY_HISTORY.astype(str)
                                + '\n\n' +
                                "SOCIAL HISTORY: " + notes_df.SOCIAL_HISTORY.astype(str))

notes_df.head(1)

Unnamed: 0,hadm_id,text,note_id,subject_id,charttime,CHIEF_COMPLAINT,PRESENT_ILLNESS,MEDICAL_HISTORY,MEDICATION_ADM,ALLERGIES,PHYSICAL_EXAM,FAMILY_HISTORY,SOCIAL_HISTORY,TEXT
0,20000019,Name: ___ Unit No: ___\n \nAdmi...,10467237-DS-7,10467237,2159-03-23 00:00:00,"fever, nausea/vomiting, flank pain\n \nMajor S...",HISTORY OF PRESENT ILLNESS: \nMs. ___ is a __...,Type 2 diabetes \nAsthma \nHyperlipidemia \...,The Preadmission Medication list is accurate a...,No Known Allergies / Adverse Drug Reactions\n ...,"ADMISSION EXAM: \nVitals: tmax 101.2, tc 98.__...",She has a sister deceased with endometrial can...,___\nFamily History:\nShe has a sister decease...,"CHIEF COMPLAINT: fever, nausea/vomiting, flank..."


In [16]:
notes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331792 entries, 0 to 331791
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   hadm_id          331792 non-null  int64 
 1   text             331792 non-null  object
 2   note_id          331792 non-null  object
 3   subject_id       331792 non-null  int64 
 4   charttime        331792 non-null  object
 5   CHIEF_COMPLAINT  331792 non-null  object
 6   PRESENT_ILLNESS  331792 non-null  object
 7   MEDICAL_HISTORY  331792 non-null  object
 8   MEDICATION_ADM   331792 non-null  object
 9   ALLERGIES        331792 non-null  object
 10  PHYSICAL_EXAM    331792 non-null  object
 11  FAMILY_HISTORY   331792 non-null  object
 12  SOCIAL_HISTORY   331792 non-null  object
 13  TEXT             331792 non-null  object
dtypes: int64(2), object(12)
memory usage: 35.4+ MB


In [18]:
notes_df.to_csv('mimic_iv_preprocessed.csv')

In [19]:
mimic_discharge.note_type.value_counts()

note_type
DS    331792
Name: count, dtype: int64

In [20]:
mimic_icd = pd.read_csv('/home/ashish/llama_inference/physionet.org/files/mimiciv/3.0/hosp/d_icd_diagnoses.csv')
mimic_icd.head()

Unnamed: 0.1,Unnamed: 0,icd_code,icd_version,long_title
0,0,10,9,Cholera due to vibrio cholerae
1,1,11,9,Cholera due to vibrio cholerae el tor
2,2,19,9,"Cholera, unspecified"
3,3,20,9,Typhoid fever
4,4,21,9,Paratyphoid fever A


In [21]:
icd = pl.read_csv('physionet.org/files/mimiciv/3.0/hosp/diagnoses_icd.csv').to_pandas()
icd.head()

Unnamed: 0,Unnamed: 1,subject_id,hadm_id,seq_num,icd_code,icd_version
0,0,10000032,22595853,1,5723,9
1,1,10000032,22595853,2,78959,9
2,2,10000032,22595853,3,5715,9
3,3,10000032,22595853,4,7070,9
4,4,10000032,22595853,5,496,9


In [22]:
icd = icd[icd.icd_version == 9]
icd = icd[['subject_id','hadm_id','icd_code']]
icd["SHORT_CODE"] = icd.icd_code.astype(str)


In [23]:
icd.loc[
    icd['SHORT_CODE'].str.startswith("V"), 'SHORT_CODE'] = icd.SHORT_CODE.apply(
    lambda x: x[:4])
icd.loc[
    icd['SHORT_CODE'].str.startswith("E"), 'SHORT_CODE'] = icd.SHORT_CODE.apply(
    lambda x: x[:4])
icd.loc[(~icd.SHORT_CODE.str.startswith("E")) & (
    ~icd.SHORT_CODE.str.startswith("V")), 'SHORT_CODE'] = icd.SHORT_CODE.apply(
    lambda x: x[:3])

In [24]:
icd = icd.drop_duplicates(
    ["hadm_id", "SHORT_CODE"])

# store all ICD codes for vectorization
icd9_codes = icd.SHORT_CODE.unique().tolist()

grouped_codes = icd.groupby(['hadm_id', 'subject_id'])['SHORT_CODE'].apply(
    lambda d: ",".join(d.astype(str))).reset_index()

In [25]:
grouped_codes = grouped_codes.rename(columns={'SHORT_CODE': 'SHORT_CODES'})



In [26]:
icd

Unnamed: 0,subject_id,hadm_id,icd_code,SHORT_CODE
0,10000032,22595853,5723,572
1,10000032,22595853,78959,789
2,10000032,22595853,5715,571
3,10000032,22595853,07070,070
4,10000032,22595853,496,496
...,...,...,...,...
6364515,19999987,23865745,41401,414
6364516,19999987,23865745,78039,780
6364517,19999987,23865745,0413,041
6364518,19999987,23865745,36846,368


In [27]:
grouped_codes

Unnamed: 0,hadm_id,subject_id,SHORT_CODES
0,20000019,10467237,"038,590,753,584,276,995,250,401,493,272,V164,285"
1,20000041,18910522,"715,250,V854,401,272,V458,530,V104,V125,278"
2,20000057,11146739,"719,996,E885,E849,465,493,401,365,362,780,244,..."
3,20000102,13074106,"644,V235,V270"
4,20000159,18949662,998285780785573E878
...,...,...,...
291125,29999692,15040495,"575,V130"
291126,29999723,10382924,"401,340,788,E947,780,459,530,296"
291127,29999745,11326722,"296,272,301,V154,309,V113"
291128,29999809,12133002,"414,038,599,995,596,496,411,562,305,250,401,27..."


In [34]:
grouped_codes.subject_id.value_counts()

subject_id
15464144    169
10714009    161
16662316    134
15229574    104
17517983     95
           ... 
14303891      1
13193917      1
10687335      1
16788749      1
12640657      1
Name: count, Length: 131078, dtype: int64

In [36]:
# merge discharge summaries into diagnosis table
notes_diagnoses_df = pd.merge(
    grouped_codes[['hadm_id', 'SHORT_CODES']], notes_df, how='inner', on='hadm_id')

notes_diagnoses_df

Unnamed: 0,hadm_id,SHORT_CODES,text,note_id,subject_id,charttime,CHIEF_COMPLAINT,PRESENT_ILLNESS,MEDICAL_HISTORY,MEDICATION_ADM,ALLERGIES,PHYSICAL_EXAM,FAMILY_HISTORY,SOCIAL_HISTORY,TEXT
0,20000019,"038,590,753,584,276,995,250,401,493,272,V164,285",Name: ___ Unit No: ___\n \nAdmi...,10467237-DS-7,10467237,2159-03-23 00:00:00,"fever, nausea/vomiting, flank pain\n \nMajor S...",HISTORY OF PRESENT ILLNESS: \nMs. ___ is a __...,Type 2 diabetes \nAsthma \nHyperlipidemia \...,The Preadmission Medication list is accurate a...,No Known Allergies / Adverse Drug Reactions\n ...,"ADMISSION EXAM: \nVitals: tmax 101.2, tc 98.__...",She has a sister deceased with endometrial can...,___\nFamily History:\nShe has a sister decease...,"CHIEF COMPLAINT: fever, nausea/vomiting, flank..."
1,20000041,"715,250,V854,401,272,V458,530,V104,V125,278",Name: ___ Unit No: ___\n \nAdm...,18910522-DS-17,18910522,2143-09-06 00:00:00,L knee pain\n \nMajor Surgical or Invasive Pro...,Patient presented with L knee pain that was no...,"dyslipid, h/o RUE DVT ___ -> coumadin complete...",The Preadmission Medication list is accurate a...,Latex\n \nAttending: ___.\n \nChief Complaint:...,"General: NAD, A&O x3, non labored breathing\n_...",nc,___\nFamily History:\nnc,CHIEF COMPLAINT: L knee pain\n \nMajor Surgica...
2,20000057,"719,996,E885,E849,465,493,401,365,362,780,244,...",Name: ___ Unit No: ___...,11146739-DS-13,11146739,2190-01-18 00:00:00,"ankle pain (s/p mechanical fall), cough\n \nMa...",___ h/o hypothyroidism s/p mechanical fall tod...,"Right hip replacement ___\n Hypertension, ess...",The Preadmission Medication list is accurate a...,No Known Allergies / Adverse Drug Reactions\n ...,Admission exam:\nVitals- 98.2 ___ 34 (from 18)...,Sister: ___ disease\nGoals of care: \nDo you ...,___\nFamily History:\nSister: ___ disease\nGoa...,CHIEF COMPLAINT: ankle pain (s/p mechanical fa...
3,20000102,"644,V235,V270",Name: ___ Unit No: ___\n \n...,13074106-DS-18,13074106,2135-10-28 00:00:00,preterm labor\n \nMajor Surgical or Invasive P...,Patient is a ___ yo G2P0 at ___ who presented ...,PNC: ___ ___\nLabs: O+/Ab-/HBsAg-/RPRNR/RI/ GB...,none\n \nDischarge Medications:\n1. breast pum...,Patient recorded as having No Known Allergies ...,PE: 97.8 79 18 120/65\nGen: NAD\nAbd: So...,noncontributory\n \nPhysical Exam:\nPE: 97.8 ...,___\nFamily History:\nnoncontributory\n \nPhys...,CHIEF COMPLAINT: preterm labor\n \nMajor Surgi...
4,20000235,"572,428,585,682,112,571,397,284,456,V451,459,4...",Name: ___ Unit No: ___\n \nA...,12640657-DS-11,12640657,2139-12-03 00:00:00,Altered mental status,___ year-old male with cirrhosis of unknown et...,Cirrhosis -- Patient believes etiology is not ...,Synthroid 25 mcg PO daily \nAllopurinol ___ m...,Prednisone\n \nAttending: ___\n \nChief Compla...,"VITALS - T 98.2, BP 90/45, HR 62, RR 20, SpO2 ...",Mother with lung cancer at age ___.\nFather wi...,___\nFamily History:\nMother with lung cancer ...,CHIEF COMPLAINT: Altered mental status\n\nPRES...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209317,29999670,"410,427,428,250,272,285,414,403,585,438,369,V1...",Name: ___ Unit No: ___\...,16289688-DS-10,16289688,2174-02-02 00:00:00,Chest Pain\n \nMajor Surgical or Invasive Proc...,"Mr. ___ is an ___ M w/ CAD, hx diastolic CHF, ...","- CAD\n- CHF, hx diastolic\n- Diabetes mellitu...",The Preadmission Medication list is accurate a...,Iodinated Contrast Media - IV Dye\n \nAttendin...,ADMISSION PHYSICAL EXAM:\n====================...,Father died of cancer. Mother died of lung can...,___\nFamily History:\nFather died of cancer. M...,CHIEF COMPLAINT: Chest Pain\n \nMajor Surgical...
209318,29999723,"401,340,788,E947,780,459,530,296",Name: ___ Unit No: ___\n...,10382924-DS-16,10382924,2170-08-10 00:00:00,"Dizziness, chest pressure, n/v\n \nMajor Surgi...",Mr. ___ is a ___ w/ hx of MS who ___ dizziness...,"- Multiple sclerosis, diagnosed roughly ___ ye...","CYANOCOBALAMIN 500 mcg Spray, Non-Aerosol - 1 ...",Patient recorded as having No Known Allergies ...,Vitals: 98.2F P79 BP157/75 R20 97% on RA\nGene...,No known history of MS or other auto-immune di...,___\nFamily History:\nNo known history of MS o...,"CHIEF COMPLAINT: Dizziness, chest pressure, n/..."
209319,29999745,"296,272,301,V154,309,V113",Name: ___ Unit No: ___\n \n...,11326722-DS-21,11326722,2160-07-12 00:00:00,"""I can't stop being moody and irritable and I'...","___ with borderline personality disorder, BPAD...",Past Medical History:\nh/o hypercholesterolemi...,Citalopram 40mg PO daily (increased from 30mg ...,Zocor\n \nAttending: ___\n \nChief Complaint:\...,"Appearance: ___ man, well-groomed, clean-shave...","Per OMR, 2 sisters and niece w/bipolar d/o. Si...",Social Hx:\nPt was born in ___. Has 7 brothers...,"CHIEF COMPLAINT: ""I can't stop being moody and..."
209320,29999809,"414,038,599,995,596,496,411,562,305,250,401,27...",Name: ___ Unit No: ___\n \...,12133002-DS-8,12133002,2172-09-30 00:00:00,,___ year old M with history of CAD s/p MI in _...,"1. CARDIAC RISK FACTORS: Diabetes, Dyslipidemi...",The Preadmission Medication list is accurate a...,Lipitor / Pravachol\n \nAttending: ___\n \n___...,ADMISSION PHYSICAL EXAMINATION: \nVS: 98.3 12...,Father had a stroke at ___. \n \nPhysical Exam...,___\nFamily History:\nFather had a stroke at _...,CHIEF COMPLAINT: \n\nPRESENT ILLNESS: ___ year...


In [37]:
notes_diagnoses_df.to_csv('mimic_iv_preprocessed_icd_codes.csv')

In [62]:
sampled_df = notes_diagnoses_df.sample(n=2000, weights='hadm_id')


In [63]:
sampled_df.to_csv('mimic-iv-sampled.csv')

In [67]:
sampled_df = sampled_df.drop_duplicates()

In [100]:
symptom_data = []
count = 0
for i in tqdm(os.listdir('mimic_iv_train_symptoms')) : 
    symptom = {}
    subject_id = int(i.split('.json')[0])
    with open(f'mimic_iv_train_symptoms/{i}','r') as f : 
        
        try : 
            file_content = json.load(f)
            if file_content is not None : 
                symptom['subject_id']  = subject_id
                symptom['Symptoms'] = file_content['Symptoms']
                symptom_data.append(symptom)

        except : 
            count +=1
            continue

    


 37%|███▋      | 730/1980 [00:00<00:00, 7010.37it/s]

100%|██████████| 1980/1980 [00:00<00:00, 7912.30it/s]


In [101]:
count

31

In [102]:
extracted_symptoms = pd.DataFrame(symptom_data)
extracted_symptoms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   subject_id  1949 non-null   int64 
 1   Symptoms    1949 non-null   object
dtypes: int64(1), object(1)
memory usage: 30.6+ KB


In [103]:
extracted_symptoms

Unnamed: 0,subject_id,Symptoms
0,13589930,"[Chills, Anorexia, Nausea, Urinary frequency, ..."
1,13511794,"[Right upper quadrant pain, Epigastric pain, N..."
2,16572655,"[Shortness of breath, Fevers, Decreased urine ..."
3,11035448,"[Nausea, Vomiting, Fever, Suprapubic discomfor..."
4,15919557,"[Hematuria, Urethral pain, Penile pain]"
...,...,...
1944,10252334,"[unresponsiveness, transient right-sided weakn..."
1945,15758946,"[left upper back pain, left sided sharp interm..."
1946,17118056,"[infected wound right lower extremity, swollen..."
1947,18497825,"[Abdominal pain, Emesis, Decreased bowel funct..."


In [104]:
extracted_symptoms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   subject_id  1949 non-null   int64 
 1   Symptoms    1949 non-null   object
dtypes: int64(1), object(1)
memory usage: 30.6+ KB


In [105]:
full_sampled_df = pd.merge(sampled_df, extracted_symptoms , how = 'inner', on = 'subject_id')
full_sampled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   hadm_id          1969 non-null   int64 
 1   SHORT_CODES      1969 non-null   object
 2   text             1969 non-null   object
 3   note_id          1969 non-null   object
 4   subject_id       1969 non-null   int64 
 5   charttime        1969 non-null   object
 6   CHIEF_COMPLAINT  1969 non-null   object
 7   PRESENT_ILLNESS  1969 non-null   object
 8   MEDICAL_HISTORY  1969 non-null   object
 9   MEDICATION_ADM   1969 non-null   object
 10  ALLERGIES        1969 non-null   object
 11  PHYSICAL_EXAM    1969 non-null   object
 12  FAMILY_HISTORY   1969 non-null   object
 13  SOCIAL_HISTORY   1969 non-null   object
 14  TEXT             1969 non-null   object
 15  Symptoms         1969 non-null   object
dtypes: int64(2), object(14)
memory usage: 246.2+ KB


In [107]:
full_sampled_df.to_csv('mimic-iv-preprocessed-icd-symptoms.csv')