### Importing Libraries

In [1]:
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import ast
import os
import json
import csv
import re
from tqdm import tqdm

### Loading Parsed Dataset

In [2]:
def get_data_path(filename):
    """
    Build the location of a file inside the project's data/clean folder.

    The path is formed relative to the current working directory as
    ``<cwd>/../data/clean/<filename>``; it is neither normalised nor
    checked for existence.

    Args:
        filename (str): File name, extension included.

    Returns:
        str: The joined path.
    """
    clean_dir_parts = ('..', 'data', 'clean')
    return os.path.join(os.getcwd(), *clean_dir_parts, filename)

In [3]:
# Resolve the CSV location via get_data_path (note: this variable holds a
# path string, not the loaded data)
final_parsed_dataset = get_data_path('final_parsed_dataset.csv')
# Load the CSV, using the file's first column as the row index
df = pd.read_csv(final_parsed_dataset, index_col=0)

In [4]:
# Display the loaded frame as the cell output
df

Unnamed: 0,idx,visit motivation,admission,patient information,patient medical history,surgeries,symptoms,medical examinations,diagnosis tests,treatments,discharge,_raw_summary
0,155216,"Discomfort in the neck and lower back, restric...","[{'reason': 'None', 'date': 'None', 'duration'...","{'age': 'Sixteen years old', 'sex': 'Female', ...","{'physiological context': 'None', 'psychologic...","[{'reason': 'None', 'Type': 'None', 'time': 'N...",[{'name of symptom': 'Discomfort in the neck a...,"[{'name': 'None', 'result': 'None', 'details':...","[{'test': 'None', 'severity': 'None', 'result'...","[{'name': 'Olanzapine tablets', 'related condi...","{'reason': 'None', 'referral': 'None', 'follow...",
2,133948,Pain and restricted range of motion in the lef...,[{'reason': 'Idiopathic osteonecrosis of the f...,"{'age': '36 years old', 'sex': 'Female', 'ethn...","{'physiological context': 'None', 'psychologic...",[{'reason': 'Idiopathic osteonecrosis of the f...,"[{'name of symptom': 'Pain', 'intensity of sym...","[{'name': 'Physical examination', 'result': 'S...",[{'test': 'Magnetic resonance imaging (MRI) sc...,,"{'reason': 'Good condition post-surgery', 'ref...",
3,80176,Pain in the left proximal forearm after a fall,"[{'reason': 'None', 'date': 'None', 'duration'...","{'age': '49', 'sex': 'male', 'ethnicity': 'Non...",{'physiological context': 'History of left elb...,"[{'reason': 'Posttraumatic arthritis', 'Type':...","[{'name of symptom': 'Pain', 'intensity of sym...","[{'name': 'Physical examination', 'result': 'N...","[{'test': 'Radiographs', 'severity': 'Minimall...","[{'name': 'Closed treatment in a cast', 'relat...","{'reason': 'None', 'referral': 'None', 'follow...",
4,72232,Recurrent attacks of pain in both knees over 1...,"[{'reason': 'None', 'date': 'None', 'duration'...","{'age': '47', 'sex': 'Male', 'ethnicity': 'Non...","{'physiological context': 'None', 'psychologic...","[{'reason': 'None', 'Type': 'None', 'time': 'N...","[{'name of symptom': 'Pain', 'intensity of sym...","[{'name': 'Examination', 'result': 'Severe ten...","[{'test': 'MRI', 'severity': 'Moderate-sized',...","[{'name': 'Diclofenac sodium', 'related condit...","{'reason': 'None', 'referral': 'None', 'follow...",
5,31864,Inability to walk and a long history of osteom...,"[{'reason': 'None', 'date': 'None', 'duration'...","{'age': '24 years', 'sex': 'Female', 'ethnicit...",{'physiological context': 'Inability to walk s...,"[{'reason': 'Contracted pelvis', 'Type': 'Cesa...","[{'name of symptom': 'Inability to walk', 'int...","[{'name': 'Nerve conduction study', 'result': ...","[{'test': 'None', 'severity': 'None', 'result'...","[{'name': 'Calcium and vitamin D', 'related co...","{'reason': 'None', 'referral': 'None', 'follow...",
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,39279,Stab wound under left nipple,[{'reason': '5 cm stab wound under left nipple...,"{'age': '28', 'sex': 'male', 'ethnicity': 'Non...","{'physiological context': 'None', 'psychologic...",[{'reason': 'Stab wound under left nipple with...,"[{'name of symptom': 'Dyspnoea', 'intensity of...","[{'name': 'Glasgow coma score', 'result': '12'...","[{'test': 'Blood pressure measurement', 'sever...","[{'name': 'Intravenous fluid therapy', 'relate...","{'reason': 'None', 'referral': 'None', 'follow...",
29996,137017,Diagnosed with falcine meningioma,"[{'reason': 'Craniotomy', 'date': 'None', 'dur...","{'age': '82', 'sex': 'Male', 'ethnicity': 'Non...","{'physiological context': ""Atrial fibrillation...","[{'reason': 'Prostate issue', 'Type': 'Prostat...","[{'name of symptom': 'Left-sided weakness', 'i...","[{'name': 'Preoperative thoracic radiographs',...",[{'test': 'Low-dose thoracic computed tomograp...,"[{'name': 'Anesthesia', 'related condition': '...","{'reason': 'None', 'referral': 'None', 'follow...",
29997,98004,Cardiac sounding chest pain,[{'reason': 'ST elevation myocardial infarctio...,"{'age': '54', 'sex': 'Male', 'ethnicity': 'Non...",{'physiological context': 'No past medical his...,[{'reason': 'Inferior segment elevation (ST) e...,"[{'name of symptom': 'Chest pain', 'intensity ...","[{'name': 'Electrocardiogram', 'result': 'ST e...","[{'test': 'Coronary angiography', 'severity': ...","[{'name': 'Primary coronary intervention', 're...","{'reason': 'None', 'referral': 'None', 'follow...",
29998,133320,Mass in her right thigh,"[{'reason': 'Diagnosis of leiomyosarcoma', 'da...","{'age': '49', 'sex': 'Woman', 'ethnicity': 'No...",{'physiological context': 'Noticed the mass fo...,"[{'reason': 'Leiomyosarcoma', 'Type': 'Wide tu...","[{'name of symptom': 'Mass in right thigh', 'i...","[{'name': 'Physical examination', 'result': 'C...","[{'test': 'Needle biopsy', 'severity': 'None',...","[{'name': 'Systemic chemotherapy', 'related co...","{'reason': 'None', 'referral': 'None', 'follow...",


### Exploding Patient Information & Medical History Columns

In [18]:
def parse_patient_profile(val):
    """
    Convert one raw cell into a ``(profile_dict, found)`` tuple.

    Accepted inputs:
    - NaN / None / pd.NA: returns ``({}, False)``
    - an already-parsed dict: returned unchanged with ``found=True``
    - an already-parsed list or tuple: its dict items are merged
    - a string holding a single dict, a list of dicts, or several
      comma-separated dicts (which evaluates to a tuple)
    - a string holding several dicts with other text between them
      (regex fallback; this fallback cannot recover nested dicts)

    When several dicts are merged, a later key overwrites an earlier one.

    Args:
        val: Raw cell value (string, dict, list/tuple of dicts, or missing).

    Returns:
        tuple: ``(dict, bool)``. The bool is True when a dict was recovered.
        A single parsed dict always yields True (even if empty); a merged
        list/tuple yields True only when the merged result is non-empty.
    """
    # Errors ast.literal_eval is documented to raise on malformed input
    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    def _merge_dicts(items):
        # Merge every dict found in `items`; non-dict items are ignored
        merged_items = {}
        for item in items:
            if isinstance(item, dict):
                merged_items.update(item)
        return merged_items

    if isinstance(val, dict):
        return val, True
    if isinstance(val, (list, tuple)):
        # Must be checked before pd.isna: pd.isna on a list returns an array,
        # whose truth value is ambiguous and raises ValueError
        merged = _merge_dicts(val)
        return merged, bool(merged)
    if val is None or pd.isna(val):
        return {}, False  # Return empty dict, flag as missing

    try:
        parsed = ast.literal_eval(val)
    except parse_errors:
        parsed = None
    if isinstance(parsed, dict):
        return parsed, True
    if isinstance(parsed, (list, tuple)):
        # "{...}, {...}" evaluates to a tuple; merging it here keeps nested
        # dicts intact, which the regex fallback below cannot do
        merged = _merge_dicts(parsed)
        return merged, bool(merged)

    # Fallback: pull out every brace-delimited chunk and parse each on its own
    merged = {}
    found = False
    for dict_str in re.findall(r"\{.*?\}", str(val)):
        try:
            candidate = ast.literal_eval(dict_str)
        except parse_errors:
            continue
        if isinstance(candidate, dict):
            merged.update(candidate)
            found = True
    return merged, found

In [23]:
# Select and copy
df_patient_info = df[['idx','patient information']].copy()

# Parse/merge and create the flag
parsed_results = df_patient_info['patient information'].apply(parse_patient_profile)
df_patient_info['parsed_dict'] = parsed_results.apply(lambda x: x[0])

# Expand as columns
info_df = df_patient_info['parsed_dict'].apply(pd.Series)

# Join idx and expanded fields
df_info_expanded = pd.concat([df_patient_info['idx'], info_df], axis=1)

# Replace 'None' value with NA for consistency
df_info_expanded.replace('None', pd.NA, inplace=True)

# Select only age and sex for futher processing
df_info_expanded = df_info_expanded[['idx', 'age', 'sex']]
df_info_expanded.head(20)

Unnamed: 0,idx,age,sex
0,155216,Sixteen years old,Female
2,133948,36 years old,Female
3,80176,49,male
4,72232,47,Male
5,31864,24 years,Female
6,26809,24-day-old,Female
7,149866,16 years old,Female
8,87064,Seventy-three years old,Male
9,123006,23,female
10,119317,32,Female


In [24]:
# Save the age/sex table as CSV; the relative path resolves against the
# current working directory and the index is written too (to_csv default)
df_info_expanded.to_csv('df_info_expanded.csv')

In [25]:
# Select and copy
df_medical_history = df[['idx','patient medical history']].copy()

# Parse/merge and create the flag
parsed_results = df_medical_history['patient medical history'].apply(parse_patient_profile)
df_medical_history['parsed_dict'] = parsed_results.apply(lambda x: x[0])
df_medical_history['has_medical_history'] = parsed_results.apply(lambda x: x[1])

# Expand as columns
history_df = df_medical_history['parsed_dict'].apply(pd.Series)

# Join idx, flag, and expanded fields
df_medical_expanded = pd.concat([df_medical_history[['idx', 'has_medical_history']], history_df], axis=1)

# Replace 'None' value with NA for consistency
df_medical_expanded.replace('None', pd.NA, inplace=True)

# View result
df_medical_expanded.head(20)


Unnamed: 0,idx,has_medical_history,physiological context,psychological context,vaccination history,allergies,exercise frequency,nutrition,sexual history,alcohol consumption,drug usage,smoking status
0,155216,True,,Diagnosed with bipolar affective disorder at t...,,,,,,,,
2,133948,True,,Intensifying feelings of helplessness,,,,,,,,
3,80176,True,History of left elbow arthrodesis performed fo...,,,,,,,,,
4,72232,True,,,,,,,,,,
5,31864,True,"Inability to walk since babyhood, did not walk...",,,,,,Got married at the age of 15 and became pregna...,,,
6,26809,True,"Normal Apgar score, no resuscitation required ...",,,,,,,,,
7,149866,True,"Coxa vara deformity of bilateral hips, bilater...",,,,,,,,,
8,87064,True,,Patient could not realize that his symptoms mi...,,,,,,,,
9,123006,True,,,,,,,,,,
10,119317,True,Born at full term by spontaneous vaginal deliv...,Mentally healthy,,,,,,,,


In [20]:
# Save the medical-history table as CSV; the relative path resolves against the
# current working directory and the index is written too (to_csv default)
df_medical_expanded.to_csv('df_medical_expanded.csv')

### Exploding Surgeries Column

In [8]:
def to_list(x):
    """
    Normalise one raw cell into a list suitable for ``DataFrame.explode``.

    - a list is returned as-is
    - a tuple is converted to a list
    - a dict is wrapped in a one-element list
    - None / NaN / pd.NA becomes ``[None]``
    - any other value is parsed with ``ast.literal_eval``: a list or tuple
      result becomes a list, a dict result is wrapped, anything else (or a
      parse failure) becomes ``[None]``

    Args:
        x: Raw cell value.

    Returns:
        list: The entries held by the cell, or ``[None]`` when there are none
        that can be recovered.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        return list(x)
    if isinstance(x, dict):
        return [x]
    if x is None or pd.isna(x):
        return [None]  # Important: use [None] instead of []
    try:
        evaluated = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Errors ast.literal_eval is documented to raise on malformed input
        return [None]
    if isinstance(evaluated, list):
        return evaluated
    if isinstance(evaluated, tuple):
        # "{...}, {...}" (dicts without surrounding brackets) evaluates to a
        # tuple; keep its entries instead of discarding them
        return list(evaluated)
    if isinstance(evaluated, dict):
        return [evaluated]
    return [None]

In [9]:
# Turn each cell into a list of surgery entries, then emit one row per entry
df_surgery = df[['idx', 'surgeries']].copy()
df_surgery['surgeries'] = df_surgery['surgeries'].apply(to_list)
df_surgery_exploded = df_surgery.explode('surgeries').reset_index(drop=True)

# Add a has_surgery flag: True only when the entry is a dict with at least one
# value other than pd.NA, None, '' or the literal string 'None'
df_surgery_exploded['has_surgery'] = df_surgery_exploded['surgeries'].apply(
    lambda x: isinstance(x, dict) and any(v is not pd.NA and v not in [None, '', 'None'] for v in x.values())
)

# Expand only flagged surgeries; every other entry becomes {} so its fields end
# up missing (they represent "no surgery"). The flag computed above is reused
# instead of re-evaluating the predicate, and the frame is built once from the
# list of dicts rather than through a per-row `.apply(pd.Series)`
surgery_records = [
    entry if has_content else {}
    for entry, has_content in zip(df_surgery_exploded['surgeries'], df_surgery_exploded['has_surgery'])
]
surgery_fields = pd.DataFrame(surgery_records, index=df_surgery_exploded.index)

df_surgery_expanded = pd.concat([df_surgery_exploded[['idx', 'has_surgery']], surgery_fields], axis=1)

# Replace missing values and the strings 'None'/'NaN' with NA for consistency
df_surgery_expanded = df_surgery_expanded.fillna(pd.NA).replace(['None','NaN'], pd.NA)

df_surgery_expanded.head(50)

Unnamed: 0,idx,has_surgery,reason,Type,time,outcome,details
0,155216,False,,,,,
1,133948,True,Idiopathic osteonecrosis of the femoral head,Total Hip Arthroplasty (THA),After diagnosis,Discharged in good condition without specific ...,First THA on the left hip
2,133948,True,Pain and limited ROM in the contralateral hip ...,Total Hip Arthroplasty (THA),One year after the first THA,Discharged in good condition without specific ...,Second THA on the contralateral hip
3,80176,True,Posttraumatic arthritis,Left elbow arthrodesis,At the age of 18,,Elbow was fused at 90 degrees
4,80176,True,Hypertrophic nonunion of ulnar shaft fracture ...,Repair of nonunion and conversion of elbow art...,Three months after the fall and subsequent con...,,The stem of the ulnar component would act as a...
5,72232,False,,,,,
6,31864,True,Contracted pelvis,Cesarean section was planned but not performed,,Delivered vaginally and developed a fracture o...,
7,26809,True,Left-sided diaphragmatic defect with herniatio...,Surgical repair of diaphragmatic defect,,"Uneventful recovery, successful reduction of h...",Procedure performed in left lateral decubitus ...
8,149866,True,"Correction of deformity to realign the head, n...",Oblique osteotomy,,,"Performed on the right side, first oblique ost..."
9,87064,False,,,,,


In [10]:
# Save the surgeries table as CSV; the relative path resolves against the
# current working directory and the index is written too (to_csv default)
df_surgery_expanded.to_csv('df_surgery_expanded.csv')

### Exploding Diagnosis Tests Column

In [11]:
# Turn each cell into a list of diagnosis-test entries, then emit one row per entry
df_diagnosis = df[['idx', 'diagnosis tests']].copy()
df_diagnosis['diagnosis tests'] = df_diagnosis['diagnosis tests'].apply(to_list)
df_diagnosis_exploded = df_diagnosis.explode('diagnosis tests').reset_index(drop=True)

# Add a has_diagnosis flag: True only when the entry is a dict with at least one
# value other than pd.NA, None, '' or the literal string 'None'
df_diagnosis_exploded['has_diagnosis'] = df_diagnosis_exploded['diagnosis tests'].apply(
    lambda x: isinstance(x, dict) and any(v is not pd.NA and v not in [None, '', 'None'] for v in x.values())
)

# Expand only flagged diagnosis tests; every other entry becomes {} so its
# fields end up missing (they represent "no diagnosis test"). The flag computed
# above is reused instead of re-evaluating the predicate, and the frame is built
# once from the list of dicts rather than through a per-row `.apply(pd.Series)`
diagnosis_records = [
    entry if has_content else {}
    for entry, has_content in zip(df_diagnosis_exploded['diagnosis tests'], df_diagnosis_exploded['has_diagnosis'])
]
diagnosis_fields = pd.DataFrame(diagnosis_records, index=df_diagnosis_exploded.index)

df_diagnosis_expanded = pd.concat([df_diagnosis_exploded[['idx', 'has_diagnosis']], diagnosis_fields], axis=1)

# Replace missing values and the strings 'None'/'NaN' with NA for consistency
df_diagnosis_expanded = df_diagnosis_expanded.fillna(pd.NA).replace(['None','NaN'], pd.NA)

df_diagnosis_expanded

Unnamed: 0,idx,has_diagnosis,test,severity,result,condition,time,details
0,155216,False,,,,,,
1,133948,True,Magnetic resonance imaging (MRI) scan,,Increased amount of joint fluid and bone marro...,Idiopathic osteonecrosis of the femoral head,,Patient did not complain of any pain on the co...
2,133948,True,Repeat MRI,,Similar findings to those noted previously in ...,,One year after the initial surgery and symptom...,
3,80176,True,Radiographs,Minimally displaced,Proximal ulnar shaft fracture,"Proximal ulnar shaft fracture, hypertrophic no...",,Elbow arthrodesis at 90 degrees with retained ...
4,72232,True,MRI,Moderate-sized,Focal area of marrow edema/contusion involving...,Bone marrow edema,"September 2016, three months later, April 2017...",Involvement of medial femoral condyle in mid a...
...,...,...,...,...,...,...,...,...
61350,133320,True,Histopathological examination,,Consistent with lung metastasis of leiomyosarcoma,Lung metastasis of leiomyosarcoma,One year and 3 months postoperatively,
61351,97973,True,Electrocardiogram (ECG),,Diffuse ST depressions in all precordial leads,Consistent with an acute coronary syndrome,,
61352,97973,True,Transthoracic echocardiogram,Ejection fraction (EF) of 45% with severe aort...,Torn right coronary cusp,Severe aortic insufficiency,,Emergent transthoracic echocardiogram performed
61353,97973,True,Blood cultures,,Positive for S.\nlugdunensis in both bottles,,,


In [12]:
# Inspect the distinct values found in the diagnosis `time` column
df_diagnosis_expanded['time'].unique()

array([<NA>,
       'One year after the initial surgery and symptoms continued for two months and increased over the following three weeks',
       'September 2016, three months later, April 2017, four months later',
       ...,
       'One year after the first performance of IgG and IgA antibody testing',
       'Two samples collected 1 h apart from two different sites under aseptic precautions',
       'One year and 3 months postoperatively'], dtype=object)

In [13]:
# Save the diagnosis-tests table as CSV; the relative path resolves against the
# current working directory and the index is written too (to_csv default)
df_diagnosis_expanded.to_csv('df_diagnosis_expanded.csv')

### Exploding Symptoms Column

In [14]:
# Turn each cell into a list of symptom entries, then emit one row per entry
df_symptoms = df[['idx', 'symptoms']].copy()
df_symptoms['symptoms'] = df_symptoms['symptoms'].apply(to_list)
df_symptoms_exploded = df_symptoms.explode('symptoms').reset_index(drop=True)

# Add a has_symptom flag: True only when the entry is a dict with at least one
# value other than pd.NA, None, '' or the literal string 'None'
df_symptoms_exploded['has_symptom'] = df_symptoms_exploded['symptoms'].apply(
    lambda x: isinstance(x, dict) and any(v is not pd.NA and v not in [None, '', 'None'] for v in x.values())
)

# Expand only flagged symptoms; every other entry becomes {} so its fields end
# up missing (they represent "no symptom"). The flag computed above is reused
# instead of re-evaluating the predicate, and the frame is built once from the
# list of dicts rather than through a per-row `.apply(pd.Series)`
symptoms_records = [
    entry if has_content else {}
    for entry, has_content in zip(df_symptoms_exploded['symptoms'], df_symptoms_exploded['has_symptom'])
]
symptoms_fields = pd.DataFrame(symptoms_records, index=df_symptoms_exploded.index)

df_symptoms_expanded = pd.concat([df_symptoms_exploded[['idx', 'has_symptom']], symptoms_fields], axis=1)

# Replace missing values and the strings 'None'/'NaN' with NA for consistency
df_symptoms_expanded = df_symptoms_expanded.fillna(pd.NA).replace(['None','NaN'], pd.NA)

df_symptoms_expanded

Unnamed: 0,idx,has_symptom,name of symptom,intensity of symptom,location,time,temporalisation,behaviours affecting the symptom,details
0,155216,True,"Discomfort in the neck and lower back, restric...",,Neck and lower back,Past four months,,Standing up from a sitting position,Head turned to the right and upwards due to su...
1,133948,True,Pain,Severe,Left hip joint,Persisting for two months,Increased over the following three weeks,Aggravated by hip joint flexion or rotation,Also complained of pain and limited ROM in the...
2,133948,True,Restricted range of motion,,Left hip joint,Persisting for two months,,,
3,133948,True,Gait disturbance,Severe,,,,Secondary to hip pain,Continued for two months and increased over th...
4,133948,True,Moderate moon face,Moderate,Face,At the time of the second surgery,,,Initially overlooked as weight gain
...,...,...,...,...,...,...,...,...,...
54939,137017,True,Left-sided weakness,,Left side,,,,
54940,98004,True,Chest pain,,Chest,,,,Cardiac sounding
54941,133320,True,Mass in right thigh,,Lateral side of the right thigh,Noticed four years prior to presentation,,,"Diameter of 4 cm, no adhesion with skin and no..."
54942,97973,True,Crushing substernal chest pressure,Acute onset,Substernal,,Following 1-week-long febrile illness,,Accompanied by dyspnea and profuse sweating


In [15]:
# Save the symptoms table as CSV; the relative path resolves against the
# current working directory and the index is written too (to_csv default)
df_symptoms_expanded.to_csv('df_symptoms_expanded.csv')

In [16]:
# Inspect the distinct values found in the symptoms `time` column
df_symptoms_expanded['time'].unique()

array(['Past four months', 'Persisting for two months', <NA>, ...,
       'Presented for two days prior to admission',
       'After playing the drums and during activities of daily life, such as when opening a bottle lid',
       'From the last one month'], dtype=object)

In [17]:
# Inspect the distinct values found in the symptoms `temporalisation` column
df_symptoms_expanded['temporalisation'].unique()

array([<NA>, 'Increased over the following three weeks',
       'Recurrent attacks', ...,
       'Started at nine years of age and gradually increased',
       'Accelerating over the past 2 years',
       'Following 1-week-long febrile illness'], dtype=object)

### Exploding Treatments Column

In [26]:
# Turn each cell into a list of treatment entries, then emit one row per entry
df_treatments = df[['idx', 'treatments']].copy()
df_treatments['treatments'] = df_treatments['treatments'].apply(to_list)
df_treatments_exploded = df_treatments.explode('treatments').reset_index(drop=True)

# Add a has_treatments flag: True only when the entry is a dict with at least
# one value other than pd.NA, None, '' or the literal string 'None'
df_treatments_exploded['has_treatments'] = df_treatments_exploded['treatments'].apply(
    lambda x: isinstance(x, dict) and any(v is not pd.NA and v not in [None, '', 'None'] for v in x.values())
)

# Expand only flagged treatments; every other entry becomes {} so its fields end
# up missing (they represent "no treatment"). The flag computed above is reused
# instead of re-evaluating the predicate, and the frame is built once from the
# list of dicts rather than through a per-row `.apply(pd.Series)`
treatments_records = [
    entry if has_content else {}
    for entry, has_content in zip(df_treatments_exploded['treatments'], df_treatments_exploded['has_treatments'])
]
treatments_fields = pd.DataFrame(treatments_records, index=df_treatments_exploded.index)

df_treatments_expanded = pd.concat([df_treatments_exploded[['idx', 'has_treatments']], treatments_fields], axis=1)

# Replace missing values and the strings 'None'/'NaN' with NA for consistency
df_treatments_expanded = df_treatments_expanded.fillna(pd.NA).replace(['None','NaN'], pd.NA)

df_treatments_expanded.head(20)

Unnamed: 0,idx,has_treatments,name,related condition,dosage,time,frequency,duration,reason for taking,reaction to treatment,details
0,155216,True,Olanzapine tablets,Bipolar affective disorder,5 mg per day,Past four months,Daily,,Control of exacerbated mental illness,"Pain and discomfort in neck, sustained and abn...",Previously managed with olanzapine tablets in ...
1,155216,True,Trihexyphenidyl,Rigidity in upper limbs,4 mg per day,Brief period of around three weeks,Daily,,Rigidity in upper limbs,Good response,
2,133948,False,,,,,,,,,
3,80176,True,Closed treatment in a cast,Proximal ulnar shaft fracture,,Initially after the fall,,,To treat the ulnar shaft fracture,Developed a hypertrophic nonunion,
4,80176,True,Conservative treatment,Ulna nonunion,,Three months after the fall,,An additional three months,To treat the ulna nonunion,Worsening motion through the nonunion site,
5,72232,True,Diclofenac sodium,Bone marrow edema,50 mg,September 2016,Twice daily,,To treat knee pain,Pain subsided and resolved,Advised to avoid prolonged weight-bearing acti...
6,72232,True,NSAIDs and physiotherapy,Bone marrow edema,,Three months after September 2016,,,Treatment for new onset of pain involving the ...,,Advised to use cane to minimize weight bearing...
7,72232,True,Conservative treatment,Bone marrow edema,,April 2017,,,Treatment for gradual pain over the medial sid...,,
8,31864,True,Calcium and vitamin D,Osteomalacia,,,,,To treat osteomalacia,No improvement,Became totally bedridden
9,31864,True,Calcitriol,Osteomalacia,0.25 mg daily,,Small doses,Short periods of time,To treat osteomalacia,"No improvement, eventually stopped all treatments",
