In [1]:
import pandas as pd
import os
import ast
import json
import csv
import re
from tqdm import tqdm

In [2]:
# Raise pandas' per-cell display width to 500 characters for DataFrame output.
pd.set_option('display.max_colwidth', 500)

In [3]:
def clean_json_text(s: str) -> str:
    r"""Repair common defects in almost-JSON text so ``json.loads`` can parse it.

    The steps run in this order:

    1. Strip one pair of outer double quotes if the whole string is wrapped
       in them.
    2. Inside string literals, escape a raw line feed as ``\n`` and a raw
       carriage return as ``\r`` so the original character survives parsing.
    3. Inside string literals, escape any other control character
       (code point below 0x20) as ``\uXXXX``.
    4. Outside string literals, replace every control character (newlines
       and tabs included) with a single space; an escape sequence is not
       valid JSON there.
    5. Collapse doubled quotes around a value (``""foo""`` -> ``"foo"``).

    Args:
        s: Text to clean. Non-string or whitespace-only input is returned
            unchanged.

    Returns:
        The cleaned text, or ``s`` itself when it is not a non-blank string.

    Note:
        Step 5 is a regex over the whole text with no notion of string
        boundaries, so two adjacent empty strings such as ``["", ""]`` are
        rewritten as well (to ``[", "]``).
    """
    if not isinstance(s, str) or not s.strip():
        return s

    # 1) Unwrap outer quotes
    if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
        s = s[1:-1]

    # Control characters that have a short JSON escape we want to keep.
    short_escapes = {'\n': '\\n', '\r': '\\r'}

    out = []
    in_string = False
    escaped = False
    for ch in s:
        if escaped:
            # Character following a backslash is copied verbatim.
            out.append(ch)
            escaped = False
        elif ch == '\\':
            out.append(ch)
            escaped = True
        elif ch == '"':
            # An unescaped quote opens or closes a string literal.
            in_string = not in_string
            out.append(ch)
        elif ord(ch) < 0x20:
            if in_string:
                out.append(short_escapes.get(ch) or f'\\u{ord(ch):04x}')
            else:
                out.append(' ')
        else:
            out.append(ch)
    fixed = ''.join(out)

    # 5) Collapse ""foo"" -> "foo"
    return re.sub(r'""([^\"]*?)""', r'"\1"', fixed)


def process_json_string(json_str):
    """Parse an embedded JSON string (e.g. a ``summary`` field), best effort.

    The text is first parsed as-is. If that fails with a JSON decode error it
    is passed through ``clean_json_text`` and parsed once more.

    Args:
        json_str: The candidate JSON text.

    Returns:
        The decoded value (whatever ``json.loads`` produces), or an empty
        dict when ``json_str`` is empty, not a string, or still unparseable
        after cleaning.
    """
    if not json_str or not isinstance(json_str, str):
        return {}
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        pass  # fall through to the cleaning fallback

    cleaned = clean_json_text(json_str)
    try:
        return json.loads(cleaned)
    except (ValueError, RecursionError):
        # Only parse failures are treated as "no data"; KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare except.
        return {}


def flatten_json(nested, prefix=''):
    """Flatten nested dicts/lists into a single-level dict of strings.

    Nested paths are joined with ``.``; list elements use their position as
    the path component. Leaf values are converted with ``str``, except
    ``None`` which becomes ``''``. Empty containers contribute no keys, and
    any input that is neither a dict nor a list yields an empty dict.

    Args:
        nested: The dict or list to flatten.
        prefix: Path prefix (including its trailing ``.``) for every key.

    Returns:
        A flat dict mapping path to string value.
    """
    if isinstance(nested, dict):
        # A top-level dict key (no prefix) is kept as-is rather than formatted.
        entries = (
            (f"{prefix}{name}" if prefix else name, value)
            for name, value in nested.items()
        )
    elif isinstance(nested, list):
        entries = (
            (f"{prefix}{position}", value)
            for position, value in enumerate(nested)
        )
    else:
        return {}

    result = {}
    for path, value in entries:
        if isinstance(value, (dict, list)):
            result.update(flatten_json(value, f"{path}."))
        elif value is None:
            result[path] = ''
        else:
            result[path] = str(value)
    return result


def iter_jsonl_records(fp, cleaner=None):
    """Yield JSON values from a line iterable, buffering multi-line records.

    Lines are appended to a buffer; after each line the buffer is (optionally
    cleaned and) decoded from its start, yielding every complete JSON value
    found. Text that does not decode yet stays in the buffer and is retried
    once the next line has been appended.

    Leading whitespace is stripped before decoding because
    ``JSONDecoder.raw_decode`` does not skip it; otherwise a blank or
    indented line would sit at the front of the buffer and block every
    following record.

    Args:
        fp: Iterable of text lines (e.g. an open text file).
        cleaner: Optional function applied to the buffered text before each
            decode attempt.

    Yields:
        Each decoded JSON value, in input order.

    Note:
        A malformed (not merely incomplete) record is never removed from the
        buffer, so records after it are not yielded.
    """
    decoder = json.JSONDecoder()
    buffer = ''
    for line in fp:
        buffer += line
        text = (cleaner(buffer) if cleaner else buffer).lstrip()
        if not text:
            # Only whitespace so far: nothing worth keeping.
            buffer = ''
            continue
        while text:
            try:
                obj, end = decoder.raw_decode(text)
            except json.JSONDecodeError:
                break  # incomplete record: wait for more lines
            yield obj
            text = text[end:].lstrip()
            buffer = text
    # Final attempt on whatever is left after the last line.
    text = (cleaner(buffer) if cleaner else buffer).lstrip()
    if text:
        try:
            obj, _ = decoder.raw_decode(text)
        except json.JSONDecodeError:
            return
        yield obj


def parse_jsonl_to_csv(input_file, output_file, max_rows=None):
    """Convert a JSONL file into a CSV with one column per flattened path.

    Records are read with ``iter_jsonl_records`` using ``clean_json_text``
    as the cleaner. A string ``summary`` field is parsed with
    ``process_json_string`` before the record is flattened with
    ``flatten_json``. Columns are the sorted union of all flattened keys;
    keys missing from a row are written as ``''``.

    Args:
        input_file: Path of the UTF-8 JSONL input.
        output_file: Path of the CSV to write (UTF-8).
        max_rows: Stop after this many records; a falsy value means no limit.
    """
    flat_rows = []
    seen_columns = set()
    failed = 0

    with open(input_file, 'r', encoding='utf-8') as src:
        # First pass only counts lines so the progress bar has a total.
        line_total = sum(1 for _line in src)
        src.seek(0)
        records = iter_jsonl_records(src, cleaner=clean_json_text)
        progress = tqdm(records, total=line_total, desc='Reading')
        for position, record in enumerate(progress):
            if max_rows and position >= max_rows:
                break
            try:
                # apply summary parsing if present
                if 'summary' in record and isinstance(record['summary'], str):
                    record['summary'] = process_json_string(record['summary'])
                flattened = flatten_json(record)
                seen_columns.update(flattened.keys())
                flat_rows.append(flattened)
            except Exception:
                failed += 1

    # write CSV
    fieldnames = sorted(seen_columns)
    with open(output_file, 'w', newline='', encoding='utf-8') as dest:
        writer = csv.DictWriter(dest, fieldnames=fieldnames)
        writer.writeheader()
        for flattened in tqdm(flat_rows, desc='Writing'):
            writer.writerow(
                {column: flattened.get(column, '') for column in fieldnames}
            )
    print(f"Done: {len(flat_rows)} rows, {failed} errors.")


def parse_jsonl_to_csv_preserved(input_file, output_file, max_rows=None):
    """Convert a JSONL file into a CSV with one column per top-level key.

    Records are read with ``iter_jsonl_records`` using ``clean_json_text``
    as the cleaner. Dict and list values are kept as JSON text
    (``json.dumps`` with ``ensure_ascii=False``), ``None`` becomes ``''``
    and every other value is converted with ``str``. Columns are the sorted
    union of all top-level keys; keys missing from a row are written as ``''``.

    Args:
        input_file: Path of the UTF-8 JSONL input.
        output_file: Path of the CSV to write (UTF-8).
        max_rows: Stop after this many records; a falsy value means no limit.
    """
    csv_rows = []
    seen_columns = set()
    failed = 0

    with open(input_file, 'r', encoding='utf-8') as src:
        # First pass only counts lines so the progress bar has a total.
        line_total = sum(1 for _line in src)
        src.seek(0)
        records = iter_jsonl_records(src, cleaner=clean_json_text)
        progress = tqdm(records, total=line_total, desc='Reading')
        for position, record in enumerate(progress):
            if max_rows and position >= max_rows:
                break
            try:
                csv_row = {
                    name: (
                        json.dumps(value, ensure_ascii=False)
                        if isinstance(value, (dict, list))
                        else ('' if value is None else str(value))
                    )
                    for name, value in record.items()
                }
                seen_columns.update(csv_row.keys())
                csv_rows.append(csv_row)
            except Exception:
                failed += 1

    fieldnames = sorted(seen_columns)
    with open(output_file, 'w', newline='', encoding='utf-8') as dest:
        writer = csv.DictWriter(dest, fieldnames=fieldnames)
        writer.writeheader()
        for csv_row in tqdm(csv_rows, desc='Writing'):
            writer.writerow(
                {column: csv_row.get(column, '') for column in fieldnames}
            )
    print(f"Done: {len(csv_rows)} rows, {failed} errors.")


def parse_json_series(series):
    """Parse each item of an iterable of JSON strings, keeping failures.

    String items are cleaned with ``clean_json_text`` and decoded. A decoded
    dict is used as-is; any other decoded value is wrapped as
    ``{'value': ...}``. Items that fail to decode are returned as
    ``{'_raw_summary': <original text>}`` and non-string items as
    ``{'_raw_summary': str(item)}``.

    Args:
        series: Iterable of items (e.g. a pandas Series).

    Returns:
        A list with one dict per input item, in input order.
    """
    results = []
    for entry in series:
        if not isinstance(entry, str):
            results.append({'_raw_summary': str(entry)})
            continue

        candidate = clean_json_text(entry)
        try:
            decoded = json.loads(candidate)
        except Exception:
            results.append({'_raw_summary': entry})
            continue

        if isinstance(decoded, dict):
            results.append(decoded)
        else:
            results.append({'value': decoded})
    return results

In [4]:
# Capture the current working directory; later cells build their data paths from it.
cwd = os.getcwd()

In [7]:
# Parse the JSONL file and convert to CSV.
# Both paths live under <cwd>/data/raw; output_file is reused by a later cell
# to load the CSV back into pandas.
if __name__ == "__main__":
    input_file = os.path.join(cwd,'data', 'raw', 'augmented_notes_30K.jsonl')
    output_file = os.path.join(cwd, 'data', 'raw', 'medical_data.csv')

    # Preserves original structure (keeps original JSON for complex fields)
    parse_jsonl_to_csv_preserved(input_file, output_file)


Reading: 100%|██████████| 30000/30000 [00:39<00:00, 752.50it/s]
Writing: 100%|██████████| 30000/30000 [00:06<00:00, 4476.55it/s]

Done: 30000 rows, 0 errors.





In [8]:
# Load the CSV written by parse_jsonl_to_csv_preserved back into a DataFrame.
df = pd.read_csv(output_file)

In [9]:
# Display the loaded DataFrame as the cell output.
df

Unnamed: 0,conversation,full_note,idx,note,summary
0,"Doctor: Good morning, what brings you to the Outpatient department today?\nPatient: Good morning doctor, I have some discomfort in my neck and lower back, and I'm not able to maintain an erect posture.\nDoctor: Hmm, okay. Can you tell me more about the discomfort?\nPatient: Yes, I tend to fall on either side when I stand up from a sitting position, and my head is always turned to the right and upwards.\nDoctor: I see. Are you experiencing any pain in your neck?\nPatient: Yes, I have pain and...","A a sixteen year-old girl, presented to our Outpatient department with the complaints of discomfort in the neck and lower back as well as restriction of body movements. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the sustained contraction of the neck muscles. There was a sideways bending of the back in the lumbar region. To counter the abnormal pos...",155216,"A a sixteen year-old girl, presented to our Outpatient department with the complaints of discomfort in the neck and lower back as well as restriction of body movements. She was not able to maintain an erect posture and would tend to fall on either side while standing up from a sitting position. She would keep her head turned to the right and upwards due to the sustained contraction of the neck muscles. There was a sideways bending of the back in the lumbar region. 
To counter the abnormal pos...","{\n""visit motivation"": ""Discomfort in the neck and lower back, restriction of body movements, inability to maintain an erect posture, and requiring assistance in standing and walking."",\n""admission"": [\n{\n""reason"": ""None"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""Sixteen years old"",\n""sex"": ""Female"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n..."
1,"Doctor: Hi, how are you feeling today?\nPatient: Hi doctor, I have this dump pain on my right back and a swelling right there for several weeks.\nDoctor: Hmm, I see. Can you describe the pain a little more?\nPatient: It's like a dull ache and it hurts more when I move.\nDoctor: Okay. I understand. And you said you're in good state and very active?\nPatient: Yes, I am.\nDoctor: That's good to hear. Have you had any health problems in the past?\nPatient: Well, I had a thoracic trauma at work o...","This is the case of a 56-year-old man that was complaining of a dump pain on the right back and a swelling right in this place for several weeks. The patient was in good state and very active. There was not any health problem in the past except a thoracic trauma at work one year ago. In that time the patient was diagnosed with a simple fracture of the 9th right rib without any other consequences.\nOn the X-ray was seen a shadow in the lower part of the right hemithorax. After that, it was de...",77465,"This is the case of a 56-year-old man that was complaining of a dump pain on the right back and a swelling right in this place for several weeks. The patient was in good state and very active. There was not any health problem in the past except a thoracic trauma at work one year ago. In that time the patient was diagnosed with a simple fracture of the 9th right rib without any other consequences.\nOn the X-ray was seen a shadow in the lower part of the right hemithorax. 
After that, it was de...","{\n""visit motivation"": ""Complaints of a dull pain on the right back and a swelling in the same area"",\n""admission"": [\n{\n""reason"": ""None"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""56"",\n""sex"": ""Male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiolog..."
2,"Doctor: Hello, what brings you to the hospital today?\nPatient: Hi, I have been experiencing pain and restricted range of motion in my left hip joint for two months now. I also have a gait disturbance.\nDoctor: I see. Can you tell me more about your chief complaint?\nPatient: Sure. I have been having severe pain in my left hip that gets worse when I try to flex or rotate it. And my gait is also affected because of the pain.\nDoctor: Hmm, I understand. Can you tell me if you have any lifestyl...",A 36-year old female patient visited our hospital with a chief complaint of pain and restricted range of motion (ROM) in the left hip joint persisting for two months. Physical examination of the patient revealed severe gait disturbance secondary to hip pain aggravated by hip joint flexion or rotation. The patient had no lifestyle habits or specific comorbidities thought to cause femoral head AVN. An anteroposterior view of the left hip revealed sclerosis and collapse of the femoral head and ...,133948,A 36-year old female patient visited our hospital with a chief complaint of pain and restricted range of motion (ROM) in the left hip joint persisting for two months. Physical examination of the patient revealed severe gait disturbance secondary to hip pain aggravated by hip joint flexion or rotation. The patient had no lifestyle habits or specific comorbidities thought to cause femoral head AVN. 
An anteroposterior view of the left hip revealed sclerosis and collapse of the femoral head and ...,"{\n""visit motivation"": ""Pain and restricted range of motion in the left hip joint"",\n""admission"": [\n{\n""reason"": ""Idiopathic osteonecrosis of the femoral head"",\n""date"": ""None"",\n""duration"": ""Three weeks"",\n""care center details"": ""None""\n},\n{\n""reason"": ""Pain and limited ROM in the contralateral hip joint and gait disturbance"",\n""date"": ""One year after the initial surgery"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""36 years old"",\n""sex"": ..."
3,"Doctor: Good morning, Mr. [Patient's Name]. I'm Dr. [Doctor's Name]. What brings you in today?\nPatient: Good morning, doctor. I have a pain in my left arm after a fall.\nDoctor: I see. Can you tell me more about the fall and the pain you're experiencing?\nPatient: I fell and hit my left arm about three months ago. The pain has been getting worse and I can't move my arm as well as I used to.\nDoctor: Hmm, I understand. I see from your medical history that you had a surgery on your left elbow...","A 49-year-old male presented with a complaint of pain in the left proximal forearm after a fall. The patient had a history of left elbow arthrodesis performed for posttraumatic arthritis at the age of 18. On physical examination he was tender at the proximal ulna. He had no active flexion or extension at his elbow, which was fused at 90 degrees but achieved 40 degrees of pronation and 60 degrees of supination. His motor and sensory exam was normal at the hand. Radiographs of the forearm and ...",80176,"A 49-year-old male presented with a complaint of pain in the left proximal forearm after a fall. The patient had a history of left elbow arthrodesis performed for posttraumatic arthritis at the age of 18. On physical examination he was tender at the proximal ulna. He had no active flexion or extension at his elbow, which was fused at 90 degrees but achieved 40 degrees of pronation and 60 degrees of supination. His motor and sensory exam was normal at the hand. 
Radiographs of the forearm and ...","{\n""visit motivation"": ""Pain in the left proximal forearm after a fall"",\n""admission"": [\n{\n""reason"": ""None"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""49"",\n""sex"": ""male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiological context"": ""History of le..."
4,"Doctor: Good morning, how are you feeling today?\nPatient: Hi doctor, I'm not feeling great. I have been experiencing pain in my knees for over a year now.\nDoctor: I see. You were referred to our clinic for recurrent attacks of pain in both knees, correct?\nPatient: Yes, that's correct.\nDoctor: Can you tell me more about the pain you have been experiencing?\nPatient: Well, it started about a year ago with severe pain over the medial aspect of my left knee. It lasted for two weeks and it wa...","A 47-year-old male patient was referred to the rheumatology clinic because of recurrent attacks of pain in both knees over 1 year.\nIn September 2016, the patient presented with severe pain over the medial aspect of the left knee for a two-week duration which prevented him from ambulation. The pain increased with weight-bearing physical activity. The patient reported no history of trauma before the onset of the knee pain. Examination showed severe tenderness over the medial side of the knee ...",72232,"A 47-year-old male patient was referred to the rheumatology clinic because of recurrent attacks of pain in both knees over 1 year.\nIn September 2016, the patient presented with severe pain over the medial aspect of the left knee for a two-week duration which prevented him from ambulation. The pain increased with weight-bearing physical activity. The patient reported no history of trauma before the onset of the knee pain. 
Examination showed severe tenderness over the medial side of the knee ...","{\n""visit motivation"": ""Recurrent attacks of pain in both knees over 1 year"",\n""admission"": [\n{\n""reason"": ""None"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""Rheumatology clinic""\n}\n],\n""patient information"": {\n""age"": ""47"",\n""sex"": ""Male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiological cont..."
...,...,...,...,...,...
29995,"Doctor: Good morning, sir. Can you tell me what happened?\nPatient: I was stabbed.\nDoctor: I see. Where were you stabbed?\nPatient: Under my left nipple.\nDoctor: Okay. And how big was the stab wound?\nPatient: It was about 5 cm.\nDoctor: I see. And what kind of treatment did you receive before coming here?\nPatient: They inserted a chest drain because I was having trouble breathing.\nDoctor: I see. And was the chest drain clamped during transport?\nPatient: Yes, because of massive hemorrha...","A 28-year-old male was admitted to the emergency department (ED) with a 5 cm stab wound (SW) under his left nipple. Pre-hospital treatment included insertion of a left chest drain due to dyspnoea, but this was clamped during transport because of massive hemorrhage. On admission, he was self-ventilating, with palpable carotid pulses, but without a measurable blood pressure. He was agitated and pale with a Glasgow coma score of 12 since he could open his eyes, localize pain and speak. The bloo...",39279,"A 28-year-old male was admitted to the emergency department (ED) with a 5 cm stab wound (SW) under his left nipple. Pre-hospital treatment included insertion of a left chest drain due to dyspnoea, but this was clamped during transport because of massive hemorrhage. On admission, he was self-ventilating, with palpable carotid pulses, but without a measurable blood pressure. He was agitated and pale with a Glasgow coma score of 12 since he could open his eyes, localize pain and speak. 
The bloo...","{\n""visit motivation"": ""Stab wound under left nipple"",\n""admission"": [\n{\n""reason"": ""5 cm stab wound under left nipple with complications including massive hemorrhage and dyspnoea"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""Emergency department (ED)""\n}\n],\n""patient information"": {\n""age"": ""28"",\n""sex"": ""male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""o..."
29996,"Doctor: Good morning, sir. I am Dr. John. How are you feeling today?\nPatient: I am okay, thank you.\nDoctor: Can you tell me what's wrong with you and why you are here today?\nPatient: Yes, I was diagnosed with falcine meningioma and admitted for craniotomy.\nDoctor: I see. And how long have you been experiencing weakness on the left side?\nPatient: It's been a few weeks now.\nDoctor: Okay. Do you have any other medical conditions besides falcine meningioma and left-sided weakness?\nPatient...","An 82-year-old man (64.5 kg, 175 cm) diagnosed with falcine meningioma was admitted for craniotomy. The patient presented left-sided weakness. Comorbidities included atrial fibrillation and Parkinson's disease. He had undergone prostate surgery under spinal anesthesia 1 year previously. Preoperative thoracic radiographs revealed several small nodular opacities in the upper lobe of the right lung. Low-dose thoracic computed tomography (CT) was performed to further evaluate the possibility of ...",137017,"An 82-year-old man (64.5 kg, 175 cm) diagnosed with falcine meningioma was admitted for craniotomy. The patient presented left-sided weakness. Comorbidities included atrial fibrillation and Parkinson's disease. He had undergone prostate surgery under spinal anesthesia 1 year previously. Preoperative thoracic radiographs revealed several small nodular opacities in the upper lobe of the right lung. 
Low-dose thoracic computed tomography (CT) was performed to further evaluate the possibility of ...","{\n""visit motivation"": ""Diagnosed with falcine meningioma"",\n""admission"": [\n{\n""reason"": ""Craniotomy"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""None""\n}\n],\n""patient information"": {\n""age"": ""82"",\n""sex"": ""Male"",\n""ethnicity"": ""None"",\n""weight"": ""64.5 kg"",\n""height"": ""175 cm"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiological context"": ""Atrial fibrilla..."
29997,"Doctor: Good morning, how are you feeling today?\nPatient: I'm feeling a bit better, thank you.\nDoctor: I see that you were brought to the hospital with cardiac sounding chest pain. Can you tell me more about that?\nPatient: Yes, I had a sharp pain in my chest that wouldn't go away.\nDoctor: I understand. And I see that you underwent an electrocardiogram. Can you tell me what the results showed?\nPatient: Yes, the results showed an elevation in leads II, III, and aVF.\nDoctor: Okay. Based o...","A 54 year-old man with no past medical history was brought to hospital with cardiac sounding chest pain and an electrocardiogram revealing ST elevation in leads II, III and aVF. He was taken for immediate primary coronary intervention for an inferior segment elevation (ST) elevation myocardial infarction via right radial approach. During diagnostic imaging of his left mainstem artery, he suffered a Ventricular Fibrillation (VF) cardiac arrest on table. He had immediate CPR by hand and then t...",98004,"A 54 year-old man with no past medical history was brought to hospital with cardiac sounding chest pain and an electrocardiogram revealing ST elevation in leads II, III and aVF. He was taken for immediate primary coronary intervention for an inferior segment elevation (ST) elevation myocardial infarction via right radial approach. During diagnostic imaging of his left mainstem artery, he suffered a Ventricular Fibrillation (VF) cardiac arrest on table. 
He had immediate CPR by hand and then t...","{\n""visit motivation"": ""Cardiac sounding chest pain"",\n""admission"": [\n{\n""reason"": ""ST elevation myocardial infarction and cardiac arrest"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""Hospital with facilities for primary coronary intervention and intensive care unit""\n}\n],\n""patient information"": {\n""age"": ""54"",\n""sex"": ""Male"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic contex..."
29998,"Doctor: Good morning, how can I help you today?\nPatient: Good morning, Doctor. I have a mass in my right thigh.\nDoctor: Okay, when did you first notice the mass?\nPatient: I first noticed it four years ago.\nDoctor: I see. And did you have a biopsy done on the mass?\nPatient: Yes, I did a needle biopsy.\nDoctor: And what was the result of the biopsy?\nPatient: I was referred to your hospital with a diagnosis of leiomyosarcoma.\nDoctor: Okay, I understand. Can you tell me more about the mas...","A 49-year-old woman visited the clinic due to a mass in her right thigh. The patient first noticed the mass four years prior to presentation. After needle biopsy was performed, she was referred to our hospital with a diagnosis of leiomyosarcoma.\nA physical examination confirmed the mass with a diameter of 4 cm on the lateral side of the right thigh. There was no adhesion with skin and no tenderness.\nMagnetic resonance imaging (MRI) identified a subcutaneous soft tissue mass, which exhibite...",133320,"A 49-year-old woman visited the clinic due to a mass in her right thigh. The patient first noticed the mass four years prior to presentation. After needle biopsy was performed, she was referred to our hospital with a diagnosis of leiomyosarcoma.\nA physical examination confirmed the mass with a diameter of 4 cm on the lateral side of the right thigh. 
There was no adhesion with skin and no tenderness.\nMagnetic resonance imaging (MRI) identified a subcutaneous soft tissue mass, which exhibite...","{\n""visit motivation"": ""Mass in her right thigh"",\n""admission"": [\n{\n""reason"": ""Diagnosis of leiomyosarcoma"",\n""date"": ""None"",\n""duration"": ""None"",\n""care center details"": ""Referred to our hospital""\n}\n],\n""patient information"": {\n""age"": ""49"",\n""sex"": ""Woman"",\n""ethnicity"": ""None"",\n""weight"": ""None"",\n""height"": ""None"",\n""family medical history"": ""None"",\n""recent travels"": ""None"",\n""socio economic context"": ""None"",\n""occupation"": ""None""\n},\n""patient medical history"": {\n""physiological con..."


In [10]:
# Expand the `summary` JSON strings into columns, keyed by the record `idx`
# (or the row index when there is no `idx` column). The result is written to
# <cwd>/data/clean/parsed_summary_output.csv, the path the next cell reads;
# writing a bare filename only matched that path by accident of the
# process working directory.
if 'summary' in df.columns:
    idx = df['idx'] if 'idx' in df.columns else df.index
    parsed = parse_json_series(df['summary'])
    # Rows lacking a key get '' so the CSV has no NaN placeholders.
    df_sum = pd.DataFrame(parsed).fillna('')
    df_result = pd.concat(
        [pd.Series(idx).reset_index(drop=True), df_sum.reset_index(drop=True)],
        axis=1,
    )
    clean_dir = os.path.join(cwd, 'data', 'clean')
    os.makedirs(clean_dir, exist_ok=True)
    df_result.to_csv(os.path.join(clean_dir, 'parsed_summary_output.csv'), index=False)

In [12]:
# Load the parsed-summary CSV from <cwd>/data/clean.
parsed_summary_output = os.path.join(cwd, 'data', 'clean', 'parsed_summary_output.csv')
df2 = pd.read_csv(parsed_summary_output)

In [13]:
# Diagnostics for summaries that could not be parsed.
# 1) Filter only the rows that failed parsing (non-empty `_raw_summary`)
fail_df = df2[df2['_raw_summary'].notna() & (df2['_raw_summary'] != '')].copy()

# 2) Compute diagnostic columns
# snippet: first 80 characters with newlines made visible as '⏎'
fail_df['snippet']      = fail_df['_raw_summary'].str.slice(0, 80).str.replace('\n', '⏎')
fail_df['char_count']   = fail_df['_raw_summary'].str.len()
fail_df['quote_count']  = fail_df['_raw_summary'].str.count('"')
# braces_diff: number of '{' minus number of '}' (non-zero means unbalanced)
fail_df['braces_diff']  = (
    fail_df['_raw_summary'].str.count(r'{') - fail_df['_raw_summary'].str.count(r'}')
)
fail_df['newline_count']= fail_df['_raw_summary'].str.count('\n')

# 3) Capture the json.loads error for each
def capture_error(msg):
    """Return the error text from json.loads(msg), or '' if it parses.

    The raw text is parsed directly here; no cleaning is applied first.
    """
    try:
        json.loads(msg)
        return ''
    except Exception as e:
        return str(e)

fail_df['error_message'] = fail_df['_raw_summary'].apply(capture_error)

# 4) Compute frequency of each error and sort
#    (most frequent error first, then longest text first)
fail_df['error_frequency'] = fail_df.groupby('error_message')['error_message']\
                                   .transform('count')
report = fail_df.sort_values(['error_frequency', 'char_count'],
                             ascending=[False, False])

# 5) Show the top 300 in console
print(report[['idx', 'snippet', 'char_count', 'newline_count',
              'quote_count', 'braces_diff', 'error_message',
              'error_frequency']].head(300).to_string(index=False))

# 6) Also write out to CSV for deeper inspection
report.to_csv('parsing_diagnostics.csv', index=False)
print("\nDetailed diagnostics written to parsing_diagnostics.csv")

   idx                                                                          snippet  char_count  newline_count  quote_count  braces_diff                                                                  error_message  error_frequency
  2706 {⏎"visit motivation": "Bleeding per rectum and mild abdominal pain",⏎"admission"        4946            158          468            0                         Expecting ',' delimiter: line 50 column 39 (char 1409)                3
 16046 {⏎"visit motivation": "Evaluation of severe dry eye in the left eye",⏎"admission        4511            161          468            0                         Expecting ',' delimiter: line 50 column 39 (char 1409)                3
  6566 {⏎"visit motivation": "Right elbow mass",⏎"admission": [⏎{⏎"reason": "Right elbo        2840             97          276            0                         Expecting ',' delimiter: line 50 column 39 (char 1409)                3
 89035 {⏎"visit motivation": "Recurrent disseminated

In [14]:
# Keep only the rows whose `_raw_summary` is missing, i.e. rows that parsed.
df3 = df2[df2['_raw_summary'].isna()].copy()


In [15]:
# Write the successfully parsed rows. index=False matches the other exports
# in this notebook and avoids an extra unnamed index column in the CSV.
df3.to_csv('final_parsed_dataset.csv', index=False)