In [13]:
import pandas as pd
import os
import ast
import json
import csv
import re
from tqdm import tqdm
from pathlib import Path

In [7]:
# Let pandas show up to 500 characters per cell so long text columns are
# readable when a frame is displayed.
pd.set_option('display.max_colwidth', 500)

In [8]:
def clean_json_text(s: str) -> str:
    """Best-effort repair of JSON text damaged by CSV-style quoting.

    The steps are applied in this order:

    1. Remove the outer quotes if the whole string is wrapped in one pair.
    2. Escape raw line breaks inside string literals; replace line breaks
       between tokens with a single space. A CR LF pair is one line break.
    3. Escape every other control character (below 0x20) as a Unicode escape.
    4. Collapse doubled quotes around a token (""foo"" -> "foo"), leaving
       two adjacent empty strings of already-valid JSON untouched.

    Args:
        s: Raw text. Non-string and blank values are returned unchanged.

    Returns:
        The repaired text. It is not guaranteed to be valid JSON.
    """
    if not isinstance(s, str) or not s.strip():
        return s

    # 1) Unwrap outer quotes
    if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
        s = s[1:-1]

    out = []
    in_string = False  # currently inside a double-quoted literal
    esc = False        # previous character was a backslash awaiting its target
    after_cr = False   # previous character was a CR already emitted as a break
    for ch in s:
        if after_cr:
            after_cr = False
            if ch == '\n':
                # Second half of a CR LF pair: the break was already emitted,
                # so emitting it again would double the line break.
                continue
        if esc:
            out.append(ch)
            esc = False
        elif ch == '\\':
            out.append(ch)
            esc = True
        elif ch == '"':
            in_string = not in_string
            out.append(ch)
        elif ch in ('\n', '\r'):
            # 2) Raw line breaks are illegal inside JSON strings.
            out.append('\\n' if in_string else ' ')
            after_cr = ch == '\r'
        elif ord(ch) < 0x20:
            # 3) Remaining control characters become \uXXXX escapes.
            out.append(f'\\u{ord(ch):04x}')
        else:
            out.append(ch)
    fixed = ''.join(out)

    def _collapse(match):
        inner = match.group(1)
        # `"", ""` or `"": ""` is two adjacent empty strings in valid JSON,
        # not a doubled-quote token; collapsing it would corrupt the text.
        if re.fullmatch(r'\s*[,:]\s*', inner):
            return match.group(0)
        return f'"{inner}"'

    # 4) Collapse ""foo"" → "foo"
    return re.sub(r'""([^\"]*?)""', _collapse, fixed)


def process_json_string(json_str):
    """Parse a JSON document embedded in a string (e.g. the summary field).

    The text is parsed as-is first; only if that fails is it passed through
    ``clean_json_text`` and parsed again.

    Args:
        json_str: Candidate JSON text.

    Returns:
        Whatever ``json.loads`` produces for the text (not necessarily a
        dict), or ``{}`` when the input is empty, not a string, or still
        invalid after cleaning.
    """
    if not json_str or not isinstance(json_str, str):
        return {}
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        pass  # fall through to the cleaning pass below
    try:
        return json.loads(clean_json_text(json_str))
    except json.JSONDecodeError:
        # Only decode failures are treated as "unparseable"; anything else
        # (e.g. KeyboardInterrupt) must not be swallowed here.
        return {}


def flatten_json(nested, prefix=''):
    """Flatten nested dicts/lists into a single-level dict.

    Nested levels are joined with dots and list elements use their position
    as the path segment. Leaf values are stringified, with ``None`` becoming
    an empty string. Empty containers contribute no keys, and a non-container
    input yields an empty dict.
    """
    if isinstance(nested, dict):
        # Top-level dict keys are kept as-is; deeper ones get the dotted prefix.
        entries = (
            (f"{prefix}{name}" if prefix else name, value)
            for name, value in nested.items()
        )
    elif isinstance(nested, list):
        entries = (
            (f"{prefix}{position}", value)
            for position, value in enumerate(nested)
        )
    else:
        return {}

    result = {}
    for path, value in entries:
        if isinstance(value, (dict, list)):
            result.update(flatten_json(value, f"{path}."))
        elif value is None:
            result[path] = ''
        else:
            result[path] = str(value)
    return result


def iter_jsonl_records(fp, cleaner=None):
    """Yield JSON values from a line-oriented source, buffering multiline ones.

    Lines are appended to a buffer until the buffer starts with a complete
    JSON value; every complete value at the front of the buffer is yielded
    and the unparsed remainder is carried over to the next line.

    Args:
        fp: Iterable of text lines (e.g. an open text file).
        cleaner: Optional function applied to the buffered text before each
            decode attempt.

    Yields:
        Each decoded JSON value, in input order.

    NOTE: text that never becomes decodable stays in the buffer, so records
    following a permanently malformed line are not yielded.
    """
    decoder = json.JSONDecoder()
    buffer = ''
    for line in fp:
        buffer += line
        if not buffer.strip():
            # Blank lines carry no partial record; drop them so they cannot
            # end up in front of the next record.
            buffer = ''
            continue
        # raw_decode() does not skip leading whitespace, so strip it first;
        # otherwise one blank/indented line blocks every later record.
        text = (cleaner(buffer) if cleaner else buffer).lstrip()
        while text:
            try:
                obj, end = decoder.raw_decode(text)
            except json.JSONDecodeError:
                break  # incomplete so far: wait for more lines
            yield obj
            text = text[end:].lstrip()
            buffer = text
    # Final attempt
    text = (cleaner(buffer) if cleaner else buffer).lstrip()
    if text:
        try:
            obj, _ = decoder.raw_decode(text)
        except json.JSONDecodeError:
            return
        yield obj


def _flattened_row(data):
    """Turn one record into a fully flattened row (summary parsed first)."""
    # The summary field may itself hold JSON text; expand it before flattening.
    if 'summary' in data and isinstance(data['summary'], str):
        data['summary'] = process_json_string(data['summary'])
    return flatten_json(data)


def _preserved_row(data):
    """Turn one record into a row that keeps nested values as JSON text."""
    row = {}
    for k, v in data.items():
        if isinstance(v, (dict, list)):
            row[k] = json.dumps(v, ensure_ascii=False)
        else:
            row[k] = str(v) if v is not None else ''
    return row


def _jsonl_to_csv(input_file, output_file, max_rows, to_row):
    """Read records from *input_file*, convert each with *to_row*, write a CSV.

    Records for which *to_row* raises are counted as errors and skipped.
    The CSV header is the sorted union of all row keys; cells missing from a
    row are written as empty strings.
    """
    rows, keys, errors = [], set(), 0
    with open(input_file, 'r', encoding='utf-8') as f:
        # First pass only counts lines so the progress bar has a total.
        total = sum(1 for _ in f)
        f.seek(0)
        if max_rows:
            total = min(total, max_rows)
        records = iter_jsonl_records(f, cleaner=clean_json_text)
        for i, data in enumerate(tqdm(records, total=total, desc='Reading')):
            if max_rows and i >= max_rows:
                break
            try:
                row = to_row(data)
            except Exception:
                # Best effort: one bad record must not abort the export.
                errors += 1
                continue
            keys.update(row.keys())
            rows.append(row)
    # write CSV
    fieldnames = sorted(keys)
    with open(output_file, 'w', newline='', encoding='utf-8') as csvf:
        # restval fills the columns a given row does not have.
        writer = csv.DictWriter(csvf, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(tqdm(rows, desc='Writing'))
    print(f"Done: {len(rows)} rows, {errors} errors.")


def parse_jsonl_to_csv(input_file, output_file, max_rows=None):
    """Parse JSONL to a flattened CSV using robust cleaning.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the CSV file to write (UTF-8).
        max_rows: Optional cap on the number of records read; a falsy value
            means no limit.
    """
    _jsonl_to_csv(input_file, output_file, max_rows, _flattened_row)


def parse_jsonl_to_csv_preserved(input_file, output_file, max_rows=None):
    """Parse JSONL to CSV, keeping nested top-level values as JSON strings.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the CSV file to write (UTF-8).
        max_rows: Optional cap on the number of records read; a falsy value
            means no limit.
    """
    _jsonl_to_csv(input_file, output_file, max_rows, _preserved_row)


def parse_json_series(series):
    """Parse an iterable (e.g. a pandas Series) of JSON strings.

    Each string is parsed as-is first; ``clean_json_text`` is only used as a
    fallback, because cleaning text that is already valid JSON can alter it.

    Args:
        series: Iterable of values, normally JSON text.

    Returns:
        A list with one dict per input item: the parsed object if it is a
        dict, ``{'value': obj}`` for any other parsed JSON value, or
        ``{'_raw_summary': ...}`` holding the original text (or ``str(item)``
        for non-string items) when parsing fails.
    """
    parse_errors = (ValueError, RecursionError)
    out = []
    for item in series:
        if not isinstance(item, str):
            out.append({'_raw_summary': str(item)})
            continue
        try:
            obj = json.loads(item)
        except parse_errors:
            try:
                obj = json.loads(clean_json_text(item))
            except parse_errors:
                out.append({'_raw_summary': item})
                continue
        out.append(obj if isinstance(obj, dict) else {'value': obj})
    return out

In [19]:
# Project root is the parent of the directory the notebook runs in.
ROOT = Path.cwd().parent

# Source JSONL and the CSV produced from it both live under data/raw.
input_file = ROOT.joinpath("data", "raw", "augmented_notes_30K.jsonl")
output_file = ROOT.joinpath("data", "raw", "medical_data.csv")

# Fail early with a clear message instead of deep inside the parser.
if input_file.exists():
    parse_jsonl_to_csv_preserved(input_file, output_file)
else:
    raise FileNotFoundError(f"Input not found: {input_file}")


Reading: 100%|██████████| 30000/30000 [01:01<00:00, 486.60it/s]
Writing: 100%|██████████| 30000/30000 [00:06<00:00, 4513.69it/s]


Done: 30000 rows, 0 errors.


In [20]:
# Load the CSV at output_file into a DataFrame.
df = pd.read_csv(output_file)

In [21]:
# Display the loaded frame.
df

Unnamed: 0,conversation,full_note,idx,note,summary
0,"Doctor: Good morning, what brings you to the Outpatient department today?\nPatient: Good morning doctor, I have some discomfort in my neck and lower back, and I'm not able to maintain an erect posture.\nDoctor: Hmm, okay. Can you tell me more about the discomfort?\nPatient: Yes, I tend to fall on either side when I stand up from a sitting position, and my head is always turned to the right and upwards.\nDoctor: I see. Are you experiencing any pain in your neck?\nPatient: Yes, I have pain and...","A a sixteen year-old girl, presented to our Outpatient department with the complaints of discomfort in the neck and lower back as well as restriction of body movements. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the sustained contraction of the neck muscles. There was a sideways bending of the back in the lumbar region. To counter the abnormal pos...",155216,"A a sixteen year-old girl, presented to our Outpatient department with the complaints of discomfort in the neck and lower back as well as restriction of body movements. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the sustained contraction of the neck muscles. There was a sideways bending of the back in the lumbar region. 
To counter the abnormal pos...","{\n""visit motivation"": ""Discomfort in the neck and lower back, restriction of body movements, inability to maintain an erect posture, and requiring assistance in standing and walking."",\n""admission"": [\n{\n""reason"": ""None"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""Sixteen years old"",\n""sex"": ""Female"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n..."
1,"Doctor: Hi, how are you feeling today?\nPatient: Hi doctor, I have this dump pain on my right back and a swelling right there for several weeks.\nDoctor: Hmm, I see. Can you describe the pain a little more?\nPatient: It's like a dull ache and it hurts more when I move.\nDoctor: Okay. I understand. And you said you're in good state and very active?\nPatient: Yes, I am.\nDoctor: That's good to hear. Have you had any health problems in the past?\nPatient: Well, I had a thoracic trauma at work o...","This is the case of a 56-year-old man that was complaining of a dump pain on the right back and a swelling right in this place for several weeks. The patient was in good state and very active. There was not any health problem in the past except a thoracic trauma at work one year ago. In that time the patient was diagnosed with a simple fracture of the 9th right rib without any other consequences.\nOn the X-ray was seen a shadow in the lower part of the right hemithorax. After that, it was de...",77465,"This is the case of a 56-year-old man that was complaining of a dump pain on the right back and a swelling right in this place for several weeks. The patient was in good state and very active. There was not any health problem in the past except a thoracic trauma at work one year ago. In that time the patient was diagnosed with a simple fracture of the 9th right rib without any other consequences.\nOn the X-ray was seen a shadow in the lower part of the right hemithorax. 
After that, it was de...","{\n""visit motivation"": ""Complaints of a dull pain on the right back and a swelling in the same area"",\n""admission"": [\n{\n""reason"": ""None"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""56"",\n""sex"": ""Male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiolog..."
2,"Doctor: Hello, what brings you to the hospital today?\nPatient: Hi, I have been experiencing pain and restricted range of motion in my left hip joint for two months now. I also have a gait disturbance.\nDoctor: I see. Can you tell me more about your chief complaint?\nPatient: Sure. I have been having severe pain in my left hip that gets worse when I try to flex or rotate it. And my gait is also affected because of the pain.\nDoctor: Hmm, I understand. Can you tell me if you have any lifestyl...",A 36-year old female patient visited our hospital with a chief complaint of pain and restricted range of motion (ROM) in the left hip joint persisting for two months. Physical examination of the patient revealed severe gait disturbance secondary to hip pain aggravated by hip joint flexion or rotation. The patient had no lifestyle habits or specific comorbidities thought to cause femoral head AVN. An anteroposterior view of the left hip revealed sclerosis and collapse of the femoral head and ...,133948,A 36-year old female patient visited our hospital with a chief complaint of pain and restricted range of motion (ROM) in the left hip joint persisting for two months. Physical examination of the patient revealed severe gait disturbance secondary to hip pain aggravated by hip joint flexion or rotation. The patient had no lifestyle habits or specific comorbidities thought to cause femoral head AVN. 
An anteroposterior view of the left hip revealed sclerosis and collapse of the femoral head and ...,"{\n""visit motivation"": ""Pain and restricted range of motion in the left hip joint"",\n""admission"": [\n{\n""reason"": ""Idiopathic osteonecrosis of the femoral head"",\n""date"": ""None"",\n""duration"": ""Three weeks"",\n""care center details"": ""None""\n},\n{\n""reason"": ""Pain and limited ROM in the contralateral hip joint and gait disturbance"",\n""date"": ""One year after the initial surgery"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""36 years old"",\n""sex"": ..."
3,"Doctor: Good morning, Mr. [Patient's Name]. I'm Dr. [Doctor's Name]. What brings you in today?\nPatient: Good morning, doctor. I have a pain in my left arm after a fall.\nDoctor: I see. Can you tell me more about the fall and the pain you're experiencing?\nPatient: I fell and hit my left arm about three months ago. The pain has been getting worse and I can't move my arm as well as I used to.\nDoctor: Hmm, I understand. I see from your medical history that you had a surgery on your left elbow...","A 49-year-old male presented with a complaint of pain in the left proximal forearm after a fall. The patient had a history of left elbow arthrodesis performed for posttraumatic arthritis at the age of 18. On physical examination he was tender at the proximal ulna. He had no active flexion or extension at his elbow, which was fused at 90 degrees but achieved 40 degrees of pronation and 60 degrees of supination. His motor and sensory exam was normal at the hand. Radiographs of the forearm and ...",80176,"A 49-year-old male presented with a complaint of pain in the left proximal forearm after a fall. The patient had a history of left elbow arthrodesis performed for posttraumatic arthritis at the age of 18. On physical examination he was tender at the proximal ulna. He had no active flexion or extension at his elbow, which was fused at 90 degrees but achieved 40 degrees of pronation and 60 degrees of supination. His motor and sensory exam was normal at the hand. 
Radiographs of the forearm and ...","{\n""visit motivation"": ""Pain in the left proximal forearm after a fall"",\n""admission"": [\n{\n""reason"": ""None"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""49"",\n""sex"": ""male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiological context"": ""History of le..."
4,"Doctor: Good morning, how are you feeling today?\nPatient: Hi doctor, I'm not feeling great. I have been experiencing pain in my knees for over a year now.\nDoctor: I see. You were referred to our clinic for recurrent attacks of pain in both knees, correct?\nPatient: Yes, that's correct.\nDoctor: Can you tell me more about the pain you have been experiencing?\nPatient: Well, it started about a year ago with severe pain over the medial aspect of my left knee. It lasted for two weeks and it wa...","A 47-year-old male patient was referred to the rheumatology clinic because of recurrent attacks of pain in both knees over 1 year.\nIn September 2016, the patient presented with severe pain over the medial aspect of the left knee for a two-week duration which prevented him from ambulation. The pain increased with weight-bearing physical activity. The patient reported no history of trauma before the onset of the knee pain. Examination showed severe tenderness over the medial side of the knee ...",72232,"A 47-year-old male patient was referred to the rheumatology clinic because of recurrent attacks of pain in both knees over 1 year.\nIn September 2016, the patient presented with severe pain over the medial aspect of the left knee for a two-week duration which prevented him from ambulation. The pain increased with weight-bearing physical activity. The patient reported no history of trauma before the onset of the knee pain. 
Examination showed severe tenderness over the medial side of the knee ...","{\n""visit motivation"": ""Recurrent attacks of pain in both knees over 1 year"",\n""admission"": [\n{\n""reason"": ""None"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""Rheumatology clinic""\n}\n],\n""patient information"": {\n""age"": ""47"",\n""sex"": ""Male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiological cont..."
...,...,...,...,...,...
29995,"Doctor: Good morning, sir. Can you tell me what happened?\nPatient: I was stabbed.\nDoctor: I see. Where were you stabbed?\nPatient: Under my left nipple.\nDoctor: Okay. And how big was the stab wound?\nPatient: It was about 5 cm.\nDoctor: I see. And what kind of treatment did you receive before coming here?\nPatient: They inserted a chest drain because I was having trouble breathing.\nDoctor: I see. And was the chest drain clamped during transport?\nPatient: Yes, because of massive hemorrha...","A 28-year-old male was admitted to the emergency department (ED) with a 5 cm stab wound (SW) under his left nipple. Pre-hospital treatment included insertion of a left chest drain due to dyspnoea, but this was clamped during transport because of massive hemorrhage. On admission, he was self-ventilating, with palpable carotid pulses, but without a measurable blood pressure. He was agitated and pale with a Glasgow coma score of 12 since he could open his eyes, localize pain and speak. The bloo...",39279,"A 28-year-old male was admitted to the emergency department (ED) with a 5 cm stab wound (SW) under his left nipple. Pre-hospital treatment included insertion of a left chest drain due to dyspnoea, but this was clamped during transport because of massive hemorrhage. On admission, he was self-ventilating, with palpable carotid pulses, but without a measurable blood pressure. He was agitated and pale with a Glasgow coma score of 12 since he could open his eyes, localize pain and speak. 
The bloo...","{\n""visit motivation"": ""Stab wound under left nipple"",\n""admission"": [\n{\n""reason"": ""5 cm stab wound under left nipple with complications including massive hemorrhage and dyspnoea"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""Emergency department (ED)""\n}\n],\n""patient information"": {\n""age"": ""28"",\n""sex"": ""male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""o..."
29996,"Doctor: Good morning, sir. I am Dr. John. How are you feeling today?\nPatient: I am okay, thank you.\nDoctor: Can you tell me what's wrong with you and why you are here today?\nPatient: Yes, I was diagnosed with falcine meningioma and admitted for craniotomy.\nDoctor: I see. And how long have you been experiencing weakness on the left side?\nPatient: It's been a few weeks now.\nDoctor: Okay. Do you have any other medical conditions besides falcine meningioma and left-sided weakness?\nPatient...","An 82-year-old man (64.5 kg, 175 cm) diagnosed with falcine meningioma was admitted for craniotomy. The patient presented left-sided weakness. Comorbidities included atrial fibrillation and Parkinson's disease. He had undergone prostate surgery under spinal anesthesia 1 year previously. Preoperative thoracic radiographs revealed several small nodular opacities in the upper lobe of the right lung. Low-dose thoracic computed tomography (CT) was performed to further evaluate the possibility of ...",137017,"An 82-year-old man (64.5 kg, 175 cm) diagnosed with falcine meningioma was admitted for craniotomy. The patient presented left-sided weakness. Comorbidities included atrial fibrillation and Parkinson's disease. He had undergone prostate surgery under spinal anesthesia 1 year previously. Preoperative thoracic radiographs revealed several small nodular opacities in the upper lobe of the right lung. 
Low-dose thoracic computed tomography (CT) was performed to further evaluate the possibility of ...","{\n""visit motivation"": ""Diagnosed with falcine meningioma"",\n""admission"": [\n{\n""reason"": ""Craniotomy"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""82"",\n""sex"": ""Male"",\n""ethnicity"": ""None"",\n""weight"": ""64.5 kg"",\n""height"": ""175 cm"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiological context"": ""Atrial fibrilla..."
29997,"Doctor: Good morning, how are you feeling today?\nPatient: I'm feeling a bit better, thank you.\nDoctor: I see that you were brought to the hospital with cardiac sounding chest pain. Can you tell me more about that?\nPatient: Yes, I had a sharp pain in my chest that wouldn't go away.\nDoctor: I understand. And I see that you underwent an electrocardiogram. Can you tell me what the results showed?\nPatient: Yes, the results showed an elevation in leads II, III, and aVF.\nDoctor: Okay. Based o...","A 54 year-old man with no past medical history was brought to hospital with cardiac sounding chest pain and an electrocardiogram revealing ST elevation in leads II, III and aVF. He was taken for immediate primary coronary intervention for an inferior segment elevation (ST) elevation myocardial infarction via right radial approach. During diagnostic imaging of his left mainstem artery, he suffered a Ventricular Fibrillation (VF) cardiac arrest on table. He had immediate CPR by hand and then t...",98004,"A 54 year-old man with no past medical history was brought to hospital with cardiac sounding chest pain and an electrocardiogram revealing ST elevation in leads II, III and aVF. He was taken for immediate primary coronary intervention for an inferior segment elevation (ST) elevation myocardial infarction via right radial approach. During diagnostic imaging of his left mainstem artery, he suffered a Ventricular Fibrillation (VF) cardiac arrest on table. 
He had immediate CPR by hand and then t...","{\n""visit motivation"": ""Cardiac sounding chest pain"",\n""admission"": [\n{\n""reason"": ""ST elevation myocardial infarction and cardiac arrest"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""Hospital with facilities for primary coronary intervention and intensive care unit""\n}\n],\n""patient information"": {\n""age"": ""54"",\n""sex"": ""Male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic contex..."
29998,"Doctor: Good morning, how can I help you today?\nPatient: Good morning, Doctor. I have a mass in my right thigh.\nDoctor: Okay, when did you first notice the mass?\nPatient: I first noticed it four years ago.\nDoctor: I see. And did you have a biopsy done on the mass?\nPatient: Yes, I did a needle biopsy.\nDoctor: And what was the result of the biopsy?\nPatient: I was referred to your hospital with a diagnosis of leiomyosarcoma.\nDoctor: Okay, I understand. Can you tell me more about the mas...","A 49-year-old woman visited the clinic due to a mass in her right thigh. The patient first noticed the mass four years prior to presentation. After needle biopsy was performed, she was referred to our hospital with a diagnosis of leiomyosarcoma.\nA physical examination confirmed the mass with a diameter of 4 cm on the lateral side of the right thigh. There was no adhesion with skin and no tenderness.\nMagnetic resonance imaging (MRI) identified a subcutaneous soft tissue mass, which exhibite...",133320,"A 49-year-old woman visited the clinic due to a mass in her right thigh. The patient first noticed the mass four years prior to presentation. After needle biopsy was performed, she was referred to our hospital with a diagnosis of leiomyosarcoma.\nA physical examination confirmed the mass with a diameter of 4 cm on the lateral side of the right thigh. 
There was no adhesion with skin and no tenderness.\nMagnetic resonance imaging (MRI) identified a subcutaneous soft tissue mass, which exhibite...","{\n""visit motivation"": ""Mass in her right thigh"",\n""admission"": [\n{\n""reason"": ""Diagnosis of leiomyosarcoma"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""Referred to our hospital""\n}\n],\n""patient information"": {\n""age"": ""49"",\n""sex"": ""Woman"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiological con..."


In [40]:
# Show the row(s) whose 'idx' value equals 14.
df[df['idx'] == 14]

Unnamed: 0,conversation,full_note,idx,note,summary
10219,"Doctor: Good morning, how are you feeling today?\nPatient: I'm feeling a little bit better, but still in some pain.\nDoctor: I understand. Can you tell me about your past medical history?\nPatient: Yes, I have liver cirrhosis secondary to hepatitis C, and I used to smoke. I also have post-stent coronary artery disease.\nDoctor: Okay, thank you for letting me know. We are here today for your elective left and right heart catheterization as a pre-transplant evaluation, correct?\nPatient: Yes, ...","The patient was a 62-year-old male with a past medical history of liver cirrhosis secondary to hepatitis C, tobacco use, and post-stent coronary artery disease, who initially came to the hospital for elective left and right heart catheterization as a pre-transplant evaluation. Physical examination showed abdominal distension and diffuse tenderness with the presence of prominent superficial abdominal veins. A computed tomography (CT) scan of the abdomen with contrast was obtained immediately....",14,"The patient was a 62-year-old male with a past medical history of liver cirrhosis secondary to hepatitis C, tobacco use, and post-stent coronary artery disease, who initially came to the hospital for elective left and right heart catheterization as a pre-transplant evaluation. Physical examination showed abdominal distension and diffuse tenderness with the presence of prominent superficial abdominal veins. 
A computed tomography (CT) scan of the abdomen with contrast was obtained immediately....","{\n""visit motivation"": ""Elective left and right heart catheterization as a pre-transplant evaluation"",\n""admission"": [\n{\n""reason"": ""Pre-transplant evaluation"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""62"",\n""sex"": ""male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical h..."


In [31]:
# Save the frame, without its index column, to a relative path (resolved
# against the current working directory).
df.to_csv('augmented_notes_30K.csv', index=False)

In [22]:
if 'summary' in df.columns:
    # Keep each record's id next to its parsed summary; fall back to the
    # frame's index when there is no 'idx' column.
    idx = df['idx'] if 'idx' in df.columns else df.index
    parsed = parse_json_series(df['summary'])
    df_sum = pd.DataFrame(parsed).fillna('')
    df_result = pd.concat(
        [pd.Series(idx).reset_index(drop=True), df_sum.reset_index(drop=True)],
        axis=1,
    )
    # Write to data/clean under ROOT, which is where the file is read back
    # from later; a bare filename would land in the working directory.
    summary_csv = ROOT / "data" / "clean" / "parsed_summary_output.csv"
    summary_csv.parent.mkdir(parents=True, exist_ok=True)
    df_result.to_csv(summary_csv, index=False)

In [23]:
# Load the parsed-summary CSV from data/clean under the project root.
parsed_summary_output = ROOT / "data" / "clean" / 'parsed_summary_output.csv'
df2 = pd.read_csv(parsed_summary_output)

In [30]:
# Display the parsed-summary frame.
df2

Unnamed: 0,idx,visit motivation,admission,patient information,patient medical history,surgeries,symptoms,medical examinations,diagnosis tests,treatments,discharge,_raw_summary
0,155216,"Discomfort in the neck and lower back, restriction of body movements, inability to maintain an erect posture, and requiring assistance in standing and walking.","[{'reason': 'None', 'date': 'None', 'duration': 'None', 'care center details': 'None'}]","{'age': 'Sixteen years old', 'sex': 'Female', 'ethnicity': 'None', 'weight': 'None', 'height': 'None', 'family medical history': 'None', 'recent travels': 'None', 'socio economic context': 'None', 'occupation': 'None'}","{'physiological context': 'None', 'psychological context': 'Diagnosed with bipolar affective disorder at the age of eleven, first episode was that of mania.', 'vaccination history': 'None', 'allergies': 'None', 'exercise frequency': 'None', 'nutrition': 'None', 'sexual history': 'None', 'alcohol consumption': 'None', 'drug usage': 'None', 'smoking status': 'None'}","[{'reason': 'None', 'Type': 'None', 'time': 'None', 'outcome': 'None', 'details': 'None'}]","[{'name of symptom': 'Discomfort in the neck and lower back, restriction of body movements, inability to maintain an erect posture', 'intensity of symptom': 'None', 'location': 'Neck and lower back', 'time': 'Past four months', 'temporalisation': 'None', 'behaviours affecting the symptom': 'Standing up from a sitting position', 'details': 'Head turned to the right and upwards due to sustained contraction of neck muscles, sideways bending of the back in the lumbar region, limbs positioned to ...","[{'name': 'None', 'result': 'None', 'details': 'None'}]","[{'test': 'None', 'severity': 'None', 'result': 'None', 'condition': 'None', 'time': 'None', 'details': 'None'}]","[{'name': 'Olanzapine tablets', 'related condition': 'Bipolar affective disorder', 'dosage': '5 mg per day', 'time': 'Past four months', 'frequency': 'Daily', 'duration': 'None', 'reason for taking': 'Control of exacerbated mental illness', 'reaction to treatment': 'Pain and discomfort in neck, sustained and abnormal contraction of neck muscles, requiring assistance in daily 
chores', 'details': 'Previously managed with olanzapine tablets in 2.5–10 mg doses per day at different times over the...","{'reason': 'None', 'referral': 'None', 'follow up': 'None', 'discharge summary': 'None'}",
1,77465,,,,,,,,,,,"{\n""visit motivation"": ""Complaints of a dull pain on the right back and a swelling in the same area"",\n""admission"": [\n{\n""reason"": ""None"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""56"",\n""sex"": ""Male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiolog..."
2,133948,Pain and restricted range of motion in the left hip joint,"[{'reason': 'Idiopathic osteonecrosis of the femoral head', 'date': 'None', 'duration': 'Three weeks', 'care center details': 'None'}, {'reason': 'Pain and limited ROM in the contralateral hip joint and gait disturbance', 'date': 'One year after the initial surgery', 'duration': 'None', 'care center details': 'None'}]","{'age': '36 years old', 'sex': 'Female', 'ethnicity': 'None', 'weight': '7 kg heavier than at the time of the first procedure', 'height': 'None', 'family medical history': 'None', 'recent travels': 'None', 'socio economic context': 'None', 'occupation': 'None'}","{'physiological context': 'None', 'psychological context': 'Intensifying feelings of helplessness', 'vaccination history': 'None', 'allergies': 'None', 'exercise frequency': 'None', 'nutrition': 'None', 'sexual history': 'None', 'alcohol consumption': 'None', 'drug usage': 'None', 'smoking status': 'None'}","[{'reason': 'Idiopathic osteonecrosis of the femoral head', 'Type': 'Total Hip Arthroplasty (THA)', 'time': 'After diagnosis', 'outcome': 'Discharged in good condition without specific complications', 'details': 'First THA on the left hip'}, {'reason': 'Pain and limited ROM in the contralateral hip joint', 'Type': 'Total Hip Arthroplasty (THA)', 'time': 'One year after the first THA', 'outcome': 'Discharged in good condition without specific complications', 'details': 'Second THA on the cont...","[{'name of symptom': 'Pain', 'intensity of symptom': 'Severe', 'location': 'Left hip joint', 'time': 'Persisting for two months', 'temporalisation': 'Increased over the following three weeks', 'behaviours affecting the symptom': 'Aggravated by hip joint flexion or rotation', 'details': 'Also complained of pain and limited ROM in the contralateral hip joint one year after initial surgery'}, {'name of symptom': 'Restricted range of motion', 'intensity of symptom': 'None', 'location': 'Left hip...","[{'name': 'Physical 
examination', 'result': 'Severe gait disturbance secondary to hip pain', 'details': 'Aggravated by hip joint flexion or rotation'}, {'name': 'Anteroposterior view of the left hip', 'result': 'Sclerosis and collapse of the femoral head and dysplasia of the hip', 'details': 'None'}]","[{'test': 'Magnetic resonance imaging (MRI) scan', 'severity': 'None', 'result': 'Increased amount of joint fluid and bone marrow edema in the left hip, and femoral head necrosis on the contralateral side', 'condition': 'Idiopathic osteonecrosis of the femoral head', 'time': 'None', 'details': 'Patient did not complain of any pain on the contralateral side at the time of the first MRI'}, {'test': 'Repeat MRI', 'severity': 'None', 'result': 'Similar findings to those noted previously in the l...",,"{'reason': 'Good condition post-surgery', 'referral': 'Referred to the Department of Cardiology due to a progressive worsening of central', 'follow up': 'Regular outpatient visits', 'discharge summary': 'Discharged in good condition after both surgeries without specific complications'}",
3,80176,Pain in the left proximal forearm after a fall,"[{'reason': 'None', 'date': 'None', 'duration': 'None', 'care center details': 'None'}]","{'age': '49', 'sex': 'male', 'ethnicity': 'None', 'weight': 'None', 'height': 'None', 'family medical history': 'None', 'recent travels': 'None', 'socio economic context': 'None', 'occupation': 'None'}","{'physiological context': 'History of left elbow arthrodesis performed for posttraumatic arthritis at the age of 18', 'psychological context': 'None', 'vaccination history': 'None', 'allergies': 'None', 'exercise frequency': 'None', 'nutrition': 'None', 'sexual history': 'None', 'alcohol consumption': 'None', 'drug usage': 'None', 'smoking status': 'None'}","[{'reason': 'Posttraumatic arthritis', 'Type': 'Left elbow arthrodesis', 'time': 'At the age of 18', 'outcome': 'None', 'details': 'Elbow was fused at 90 degrees'}, {'reason': 'Hypertrophic nonunion of ulnar shaft fracture and functional limitations of elbow arthrodesis', 'Type': 'Repair of nonunion and conversion of elbow arthrodesis to arthroplasty', 'time': 'Three months after the fall and subsequent conservative treatment period', 'outcome': 'None', 'details': 'The stem of the ulnar comp...","[{'name of symptom': 'Pain', 'intensity of symptom': 'None', 'location': 'Left proximal forearm', 'time': 'After a fall', 'temporalisation': 'None', 'behaviours affecting the symptom': 'None', 'details': 'Patient was tender at the proximal ulna'}]","[{'name': 'Physical examination', 'result': 'No active flexion or extension at elbow, 40 degrees of pronation, 60 degrees of supination, normal motor and sensory exam at the hand', 'details': 'Elbow was fused at 90 degrees'}]","[{'test': 'Radiographs', 'severity': 'Minimally displaced', 'result': 'Proximal ulnar shaft fracture', 'condition': 'Proximal ulnar shaft fracture, hypertrophic nonunion', 'time': 'None', 'details': 'Elbow arthrodesis at 90 degrees with retained hardware was also noted'}]","[{'name': 'Closed treatment in a 
cast', 'related condition': 'Proximal ulnar shaft fracture', 'dosage': 'None', 'time': 'Initially after the fall', 'frequency': 'None', 'duration': 'None', 'reason for taking': 'To treat the ulnar shaft fracture', 'reaction to treatment': 'Developed a hypertrophic nonunion', 'details': 'None'}, {'name': 'Conservative treatment', 'related condition': 'Ulna nonunion', 'dosage': 'None', 'time': 'Three months after the fall', 'frequency': 'None', 'duration': 'An ...","{'reason': 'None', 'referral': 'None', 'follow up': 'None', 'discharge summary': 'None'}",
4,72232,Recurrent attacks of pain in both knees over 1 year,"[{'reason': 'None', 'date': 'None', 'duration': 'None', 'care center details': 'Rheumatology clinic'}]","{'age': '47', 'sex': 'Male', 'ethnicity': 'None', 'weight': 'None', 'height': 'None', 'family medical history': 'None', 'recent travels': 'None', 'socio economic context': 'None', 'occupation': 'None'}","{'physiological context': 'None', 'psychological context': 'None', 'vaccination history': 'None', 'allergies': 'None', 'exercise frequency': 'None', 'nutrition': 'None', 'sexual history': 'None', 'alcohol consumption': 'None', 'drug usage': 'None', 'smoking status': 'None'}","[{'reason': 'None', 'Type': 'None', 'time': 'None', 'outcome': 'None', 'details': 'None'}]","[{'name of symptom': 'Pain', 'intensity of symptom': 'Severe', 'location': 'Medial aspect of the left knee, lateral aspect of the left knee, medial side of the right knee', 'time': 'Over 1 year', 'temporalisation': 'Recurrent attacks', 'behaviours affecting the symptom': 'Increased with weight-bearing physical activity', 'details': 'Prevented ambulation, no history of trauma, mild effusion, moderate limitation of range of motion, no erythema or increased warmth'}]","[{'name': 'Examination', 'result': 'Severe tenderness over the medial side of the knee with mild effusion and moderate limitation of range of motion', 'details': 'No erythema or increased warmth of the knee'}]","[{'test': 'MRI', 'severity': 'Moderate-sized', 'result': 'Focal area of marrow edema/contusion involving the medial femoral condyle, bone marrow edema involving the lateral femoral condyle, extensive marrow edema involving the medial femoral condyle', 'condition': 'Bone marrow edema', 'time': 'September 2016, three months later, April 2017, four months later', 'details': 'Involvement of medial femoral condyle in mid and anterior parts predominantly along the articular surface, complete resol...","[{'name': 'Diclofenac sodium', 'related condition': 'Bone marrow edema', 
'dosage': '50 mg', 'time': 'September 2016', 'frequency': 'Twice daily', 'duration': 'None', 'reason for taking': 'To treat knee pain', 'reaction to treatment': 'Pain subsided and resolved', 'details': 'Advised to avoid prolonged weight-bearing activities'}, {'name': 'NSAIDs and physiotherapy', 'related condition': 'Bone marrow edema', 'dosage': 'None', 'time': 'Three months after September 2016', 'frequency': 'None', '...","{'reason': 'None', 'referral': 'None', 'follow up': 'None', 'discharge summary': 'None'}",
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,39279,Stab wound under left nipple,"[{'reason': '5 cm stab wound under left nipple with complications including massive hemorrhage and dyspnoea', 'date': 'None', 'duration': 'None', 'care center details': 'Emergency department (ED)'}]","{'age': '28', 'sex': 'male', 'ethnicity': 'None', 'weight': 'None', 'height': 'None', 'family medical history': 'None', 'recent travels': 'None', 'socio economic context': 'None', 'occupation': 'None'}","{'physiological context': 'None', 'psychological context': 'None', 'vaccination history': 'None', 'allergies': 'None', 'exercise frequency': 'None', 'nutrition': 'None', 'sexual history': 'None', 'alcohol consumption': 'None', 'drug usage': 'None', 'smoking status': 'None'}","[{'reason': 'Stab wound under left nipple with complications', 'Type': 'Median sternotomy incision', 'time': 'None', 'outcome': 'The wound was not bleeding post-surgery', 'details': 'A 6 cm cut in the lateral pericardium corresponding to the stab wound in the chest and a 7 cm, almost transmural wound in the left ventricle, parallel to a major diagonal branch'}]","[{'name of symptom': 'Dyspnoea', 'intensity of symptom': 'None', 'location': 'Chest', 'time': 'None', 'temporalisation': 'None', 'behaviours affecting the symptom': 'Stab wound', 'details': 'Chest drain was inserted pre-hospital due to dyspnoea'}, {'name of symptom': 'Agitation', 'intensity of symptom': 'None', 'location': 'None', 'time': 'None', 'temporalisation': 'None', 'behaviours affecting the symptom': 'None', 'details': 'Patient was agitated on admission'}, {'name of symptom': 'Pallor...","[{'name': 'Glasgow coma score', 'result': '12', 'details': 'Patient could open eyes, localize pain and speak'}]","[{'test': 'Blood pressure measurement', 'severity': 'None', 'result': 'Ranged from 80/60 to 100/60 mmHg after intravenous fluid therapy', 'condition': 'Hypotension', 'time': 'None', 'details': 'Blood pressure was not measurable upon admission'}, {'test': 'Heart rate monitoring', 'severity': 
'None', 'result': 'Tachycardia of 100–120 beats per minute', 'condition': 'Tachycardia', 'time': 'None', 'details': 'None'}, {'test': 'Chest x-ray', 'severity': 'None', 'result': 'Persisting hemothorax an...","[{'name': 'Intravenous fluid therapy', 'related condition': 'Hypotension and hemorrhage', 'dosage': 'None', 'time': 'None', 'frequency': 'None', 'duration': 'None', 'reason for taking': 'To manage blood pressure and compensate for blood loss', 'reaction to treatment': 'Blood pressure increased but decreased again when therapy was reduced', 'details': 'Included transfusion of 1500 ml of crystalloid fluid and 250 ml of red cells'}, {'name': 'Chest drain insertion', 'related condition': 'Hemoth...","{'reason': 'None', 'referral': 'None', 'follow up': 'None', 'discharge summary': 'None'}",
29996,137017,Diagnosed with falcine meningioma,"[{'reason': 'Craniotomy', 'date': 'None', 'duration': 'None', 'care center details': 'None'}]","{'age': '82', 'sex': 'Male', 'ethnicity': 'None', 'weight': '64.5 kg', 'height': '175 cm', 'family medical history': 'None', 'recent travels': 'None', 'socio economic context': 'None', 'occupation': 'None'}","{'physiological context': ""Atrial fibrillation, Parkinson's disease, previous prostate surgery under spinal anesthesia"", 'psychological context': 'None', 'vaccination history': 'None', 'allergies': 'None', 'exercise frequency': 'None', 'nutrition': 'None', 'sexual history': 'None', 'alcohol consumption': 'None', 'drug usage': 'None', 'smoking status': 'Non-smoker'}","[{'reason': 'Prostate issue', 'Type': 'Prostate surgery', 'time': '1 year previously', 'outcome': 'None', 'details': 'Performed under spinal anesthesia'}]","[{'name of symptom': 'Left-sided weakness', 'intensity of symptom': 'None', 'location': 'Left side', 'time': 'None', 'temporalisation': 'None', 'behaviours affecting the symptom': 'None', 'details': 'None'}]","[{'name': 'Preoperative thoracic radiographs', 'result': 'Several small nodular opacities in the upper lobe of the right lung', 'details': 'None'}, {'name': 'Low-dose thoracic computed tomography (CT)', 'result': 'Fibro-atelectatic changes, bronchiectasis, calcifications, and small nodules', 'details': 'Considered to be sequelae to resolved pulmonary tuberculosis'}, {'name': 'Pulse oximetry', 'result': '100% in room air', 'details': 'None'}]","[{'test': 'Low-dose thoracic computed tomography (CT)', 'severity': 'None', 'result': 'Fibro-atelectatic changes, bronchiectasis, calcifications, and small nodules', 'condition': 'Sequelae to resolved pulmonary tuberculosis', 'time': 'None', 'details': 'None'}]","[{'name': 'Anesthesia', 'related condition': 'Craniotomy', 'dosage': 'None', 'time': 'None', 'frequency': 'None', 'duration': 'None', 'reason for taking': 'To induce unconsciousness for surgery', 
'reaction to treatment': 'Insufficient tidal volume and reduced breathing sounds post-intubation, leading to re-intubation', 'details': 'Induced via intravenous propofol and remifentanil infusion, followed by 50 mg of rocuronium and intubation with a reinforced tracheal tube'}]","{'reason': 'None', 'referral': 'None', 'follow up': 'None', 'discharge summary': 'None'}",
29997,98004,Cardiac sounding chest pain,"[{'reason': 'ST elevation myocardial infarction and cardiac arrest', 'date': 'None', 'duration': 'None', 'care center details': 'Hospital with facilities for primary coronary intervention and intensive care unit'}]","{'age': '54', 'sex': 'Male', 'ethnicity': 'None', 'weight': 'None', 'height': 'None', 'family medical history': 'None', 'recent travels': 'None', 'socio economic context': 'None', 'occupation': 'None'}","{'physiological context': 'No past medical history', 'psychological context': 'None', 'vaccination history': 'None', 'allergies': 'None', 'exercise frequency': 'None', 'nutrition': 'None', 'sexual history': 'None', 'alcohol consumption': 'None', 'drug usage': 'None', 'smoking status': 'None'}","[{'reason': 'Inferior segment elevation (ST) elevation myocardial infarction', 'Type': 'Primary percutaneous coronary intervention (drug-eluting stent)', 'time': 'None', 'outcome': 'Successful treatment of right coronary artery culprit lesion', 'details': 'Procedure complicated by Ventricular Fibrillation cardiac arrest; required immediate CPR and five shocks to restore spontaneous circulation'}]","[{'name of symptom': 'Chest pain', 'intensity of symptom': 'None', 'location': 'Chest', 'time': 'None', 'temporalisation': 'None', 'behaviours affecting the symptom': 'None', 'details': 'Cardiac sounding'}]","[{'name': 'Electrocardiogram', 'result': 'ST elevation in leads II, III and aVF', 'details': 'None'}, {'name': 'Arterial blood gases', 'result': 'Hb of 69 g/l after procedure, down from admission Hb of 112 g/l', 'details': 'Significant drop in Hb likely due to 3 l of IV fluid during resuscitation and angiogram'}, {'name': 'Repeat arterial blood gas testing', 'result': 'Hb of 45 g/l', 'details': 'None'}, {'name': 'Bedside FAST scan', 'result': 'Free fluid observed in Morison’s space with vary...","[{'test': 'Coronary angiography', 'severity': 'None', 'result': 'Right coronary artery culprit lesion', 'condition': 'Inferior 
segment elevation (ST) elevation myocardial infarction', 'time': 'None', 'details': 'Continued during CPR'}]","[{'name': 'Primary coronary intervention', 'related condition': 'Inferior segment elevation (ST) elevation myocardial infarction', 'dosage': 'None', 'time': 'None', 'frequency': 'None', 'duration': 'None', 'reason for taking': 'To treat right coronary artery culprit lesion', 'reaction to treatment': 'Successful with restoration of spontaneous circulation', 'details': 'Right radial approach used'}, {'name': 'CPR', 'related condition': 'Ventricular Fibrillation cardiac arrest', 'dosage': 'None...","{'reason': 'None', 'referral': 'None', 'follow up': 'None', 'discharge summary': 'None'}",
29998,133320,Mass in her right thigh,"[{'reason': 'Diagnosis of leiomyosarcoma', 'date': 'None', 'duration': 'None', 'care center details': 'Referred to our hospital'}]","{'age': '49', 'sex': 'Woman', 'ethnicity': 'None', 'weight': 'None', 'height': 'None', 'family medical history': 'None', 'recent travels': 'None', 'socio economic context': 'None', 'occupation': 'None'}","{'physiological context': 'Noticed the mass four years prior to presentation', 'psychological context': 'None', 'vaccination history': 'None', 'allergies': 'None', 'exercise frequency': 'None', 'nutrition': 'None', 'sexual history': 'None', 'alcohol consumption': 'None', 'drug usage': 'None', 'smoking status': 'None'}","[{'reason': 'Leiomyosarcoma', 'Type': 'Wide tumor resection', 'time': 'None', 'outcome': 'Successful with no adjuvant chemotherapy and radiotherapy administered due to the small (<5 cm) and superficial tumor', 'details': 'None'}, {'reason': 'Lung nodules', 'Type': 'Excisional biopsy', 'time': 'One year and 3 months postoperatively', 'outcome': 'Histopathological diagnosis was consistent with lung metastasis of leiomyosarcoma', 'details': 'None'}, {'reason': 'Bone metastasis of the right femu...","[{'name of symptom': 'Mass in right thigh', 'intensity of symptom': 'None', 'location': 'Lateral side of the right thigh', 'time': 'Noticed four years prior to presentation', 'temporalisation': 'None', 'behaviours affecting the symptom': 'None', 'details': 'Diameter of 4 cm, no adhesion with skin and no tenderness'}]","[{'name': 'Physical examination', 'result': 'Confirmed the mass with a diameter of 4 cm', 'details': 'On the lateral side of the right thigh, no adhesion with skin and no tenderness'}, {'name': 'MRI', 'result': 'Identified a subcutaneous soft tissue mass', 'details': 'Exhibited low signal intensity on T1-weighted images and heterogeneous signal intensity on T2-weighted images, tumor was enhanced by a gadolinium injection'}, {'name': 'CT of the chest, abdomen, and pelvis', 
'result': 'Did not ...","[{'test': 'Needle biopsy', 'severity': 'None', 'result': 'Diagnosis of leiomyosarcoma', 'condition': 'Leiomyosarcoma', 'time': 'None', 'details': 'None'}, {'test': 'Histopathological examination', 'severity': 'None', 'result': 'Consistent with lung metastasis of leiomyosarcoma', 'condition': 'Lung metastasis of leiomyosarcoma', 'time': 'One year and 3 months postoperatively', 'details': 'None'}]","[{'name': 'Systemic chemotherapy', 'related condition': 'Lung and bone metastases', 'dosage': 'None', 'time': 'None', 'frequency': 'None', 'duration': 'None', 'reason for taking': 'To treat lung and bone metastases', 'reaction to treatment': 'None', 'details': 'Using doxorubicin and ifosfamide'}]","{'reason': 'None', 'referral': 'None', 'follow up': 'None', 'discharge summary': 'None'}",


In [24]:
# 1) Keep only rows that still carry a non-empty '_raw_summary' value, then
# 2) attach diagnostic columns derived from that raw text:
#      snippet       - first 80 characters, newlines shown as '⏎'
#      char_count    - total length of the raw text
#      quote_count   - number of double-quote characters
#      braces_diff   - count of '{' minus count of '}' (0 means balanced)
#      newline_count - number of '\n' characters
fail_df = (
    df2.loc[lambda d: d['_raw_summary'].notna() & (d['_raw_summary'] != '')]
    .copy()
    .assign(
        snippet=lambda d: d['_raw_summary'].str.slice(0, 80).str.replace('\n', '⏎'),
        char_count=lambda d: d['_raw_summary'].str.len(),
        quote_count=lambda d: d['_raw_summary'].str.count('"'),
        braces_diff=lambda d: (
            d['_raw_summary'].str.count(r'{') - d['_raw_summary'].str.count(r'}')
        ),
        newline_count=lambda d: d['_raw_summary'].str.count('\n'),
    )
)

# 3) Helper that reports why json.loads rejects a given value
def capture_error(msg):
    """Return the text of the exception raised by ``json.loads(msg)``.

    Args:
        msg: The value to feed to ``json.loads``.

    Returns:
        An empty string when ``msg`` parses successfully; otherwise
        ``str(exc)`` for whatever exception ``json.loads`` raised.
    """
    try:
        json.loads(msg)
    except Exception as exc:
        # Any failure is reported as text rather than propagated, so the
        # helper can be applied to a whole column without aborting.
        return str(exc)
    else:
        return ''

# Record the json.loads error text for every row of fail_df
fail_df['error_message'] = fail_df['_raw_summary'].apply(capture_error)

# 4) Attach how many rows share each exact error text, then order the rows by
#    most frequent error first and, within equal frequency, longest text first
fail_df['error_frequency'] = (
    fail_df.groupby('error_message')['error_message'].transform('count')
)
report = fail_df.sort_values(
    by=['error_frequency', 'char_count'],
    ascending=False,
)

# 5) Print the first 300 rows of the ranked report to the console
print(
    report.loc[
        :,
        [
            'idx',
            'snippet',
            'char_count',
            'newline_count',
            'quote_count',
            'braces_diff',
            'error_message',
            'error_frequency',
        ],
    ]
    .head(300)
    .to_string(index=False)
)

# 6) Persist the full report (all columns, no index) for deeper inspection
report.to_csv('parsing_diagnostics.csv', index=False)
print("\nDetailed diagnostics written to parsing_diagnostics.csv")

   idx                                                                          snippet  char_count  newline_count  quote_count  braces_diff                                                                  error_message  error_frequency
  2706 {⏎"visit motivation": "Bleeding per rectum and mild abdominal pain",⏎"admission"        4946            158          468            0                         Expecting ',' delimiter: line 50 column 39 (char 1409)                3
 16046 {⏎"visit motivation": "Evaluation of severe dry eye in the left eye",⏎"admission        4511            161          468            0                         Expecting ',' delimiter: line 50 column 39 (char 1409)                3
  6566 {⏎"visit motivation": "Right elbow mass",⏎"admission": [⏎{⏎"reason": "Right elbo        2840             97          276            0                         Expecting ',' delimiter: line 50 column 39 (char 1409)                3
 89035 {⏎"visit motivation": "Recurrent disseminated

In [25]:
# Keep only the rows whose '_raw_summary' is missing (NaN/None) as an independent copy.
# NOTE(review): rows where '_raw_summary' is an empty string are not selected here — confirm none exist.
df3 = df2.loc[df2['_raw_summary'].isna()].copy()


In [26]:
# Write df3 to disk; index=True (the pandas default) also writes the row index as the first column.
df3.to_csv(path_or_buf='final_parsed_dataset.csv', index=True)