# Patient Data Analysis
This notebook analyzes transcripts, limited chart files, and patient profiles.

In [2]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import re
import textstat
from collections import Counter
import math

%matplotlib inline

base_path = Path("~/canvas-hyperscribe/evaluations/cases/synthetic_unit_cases/med_management_tpc_o3").expanduser()


## Transcript Analysis

In [8]:
def analyze_transcript(transcript_path: Path):
    """Return one dict with transcript + spec metrics for a single patient."""
    spec_path = transcript_path.parent / "spec.json"
    raw_path  = transcript_path.parent / "transcript.json"  # optional audit

    with open(transcript_path) as f:
        turns = json.load(f)

    with open(spec_path) as f:
        spec = json.load(f)

    # --- basic counts --------------------------------------------------------
    total_turns = len(turns)
    total_chars = sum(len(t["text"]) for t in turns)
    total_words = sum(len(t["text"].split()) for t in turns)

    words_by   = Counter()
    turns_by   = Counter()
    max_turn_len = 0
    speaker_seq = []

    for t in turns:
        spk = t["speaker"]
        speaker_seq.append(spk)
        wc = len(t["text"].split())
        words_by[spk]  += wc
        turns_by[spk]  += 1
        max_turn_len    = max(max_turn_len, wc)

    ct = turns_by.get("Clinician", 0)
    pt = turns_by.get("Patient",   0)
    cw = words_by.get("Clinician", 0)
    pw = words_by.get("Patient",   0)

    # avoid divide-by-zero
    ct = ct or 1
    pt = pt or 1
    cw = cw or 1
    pw = pw or 1

    # --- spec alignment ------------------------------------------------------
    spec_turns   = spec["turn_total"]
    spec_ratio   = spec["ratio"]
    requested_seq = spec["speaker_sequence"]

    turn_count_dev   = total_turns - spec_turns
    ratio_actual     = cw / pw
    ratio_dev_pct    = 100 * (ratio_actual - spec_ratio) / spec_ratio if spec_ratio else math.nan

    # full-sequence match (length must match too)
    seq_match = requested_seq[:total_turns] == speaker_seq and total_turns == spec_turns
    mismatch_positions = sum(a != b for a, b in zip(requested_seq, speaker_seq))

    # longest consecutive run by same speaker
    longest_run = 1
    current_run = 1
    for prev, nxt in zip(speaker_seq, speaker_seq[1:]):
        if prev == nxt:
            current_run += 1
            longest_run = max(longest_run, current_run)
        else:
            current_run = 1

    # --- assemble row --------------------------------------------------------
    row = {
        "Patient": transcript_path.parent.name,
        "Total Turns": total_turns,
        "Total Words": total_words,
        "Total Characters": total_chars,
        "Clinician Turns": ct,
        "Patient Turns": pt,
        "Words/Turn": total_words / total_turns if total_turns else 0,
        "C:P Turn Ratio": ct / pt,
        "C:P Word Ratio": ratio_actual,
        "C:P Words/Turn Ratio": (cw/ct) / (pw/pt),
        "Clinician Word %": 100 * cw / (cw + pw),
        "Max Turn Length": max_turn_len,
        "Longest Same-Speaker Run": longest_run,
        "First Speaker": speaker_seq[0] if speaker_seq else None,
        # spec alignment
        "Spec Turn Target": spec_turns,
        "Turn Delta": turn_count_dev,
        "Spec Word Ratio Target": spec_ratio,
        "Word Ratio Delta%": ratio_dev_pct,
        "Seq Perfect Match": seq_match,
        "Seq Mismatch Positions": mismatch_positions,
        # spec meta
        "Bucket": spec.get("bucket"),
        "Moods": ", ".join(spec.get("mood", [])),
        "Pressure": spec.get("pressure"),
        "Clinician Persona": spec.get("clinician_style"),
        "Patient Persona": spec.get("patient_style"),
    }
    return row

# ---------- run across patient folders --------------------------------------
base_path = Path("~/canvas-hyperscribe/evaluations/cases/synthetic_unit_cases/med_management_tpc_o3").expanduser()

transcript_rows = []
for p in sorted(base_path.glob("Patient_*"), key=lambda x: int(re.search(r'\d+', x.name).group())):
    f = p / "transcript.json"
    if f.exists():
        transcript_rows.append(analyze_transcript(f))

transcript_df = pd.DataFrame(transcript_rows)
transcript_df.to_csv("transcript_analysis_o3.csv", index=False)

display(transcript_df.head())
display(transcript_df.describe(include='all'))


Unnamed: 0,Patient,Total Turns,Total Words,Total Characters,Clinician Turns,Patient Turns,Words/Turn,C:P Turn Ratio,C:P Word Ratio,C:P Words/Turn Ratio,...,Turn Delta,Spec Word Ratio Target,Word Ratio Delta%,Seq Perfect Match,Seq Mismatch Positions,Bucket,Moods,Pressure,Clinician Persona,Patient Persona
0,Patient_1,10,369,2112,3,7,36.9,0.428571,1.170588,2.731373,...,0,1.09,7.393416,True,0,long,"patient is frustrated, clinician is brief",formulary change,over‑explainer,confused and forgetful
1,Patient_2,8,251,1406,7,1,31.375,7.0,1.51,0.215714,...,0,1.52,-0.657895,True,0,medium,"clinician is concerned, patient is frustrated",time pressure on the visit,warm and chatty,assertive and informed
2,Patient_3,12,720,4513,5,7,60.0,0.714286,1.352941,1.894118,...,0,1.35,0.217865,True,0,long,"clinician is rushed, patient is defensive",insurance denied prior authorization,over‑explainer,assertive and informed
3,Patient_4,4,207,1188,1,3,51.75,0.333333,0.754237,2.262712,...,0,0.76,-0.758252,True,0,short,"patient is defensive, clinician is concerned",patient traveling soon,over‑explainer,anxious and talkative
4,Patient_5,8,329,1943,4,4,41.125,1.0,1.788136,1.788136,...,0,1.82,-1.750792,True,0,medium,"clinician is warm, patient is defensive",refill limit reached,brief and efficient,assertive and informed


Unnamed: 0,Patient,Total Turns,Total Words,Total Characters,Clinician Turns,Patient Turns,Words/Turn,C:P Turn Ratio,C:P Word Ratio,C:P Words/Turn Ratio,...,Turn Delta,Spec Word Ratio Target,Word Ratio Delta%,Seq Perfect Match,Seq Mismatch Positions,Bucket,Moods,Pressure,Clinician Persona,Patient Persona
count,40,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,...,40.0,40.0,40.0,40,40.0,40,40,40,40,40
unique,40,,,,,,,,,,...,,,,1,,3,28,6,4,4
top,Patient_1,,,,,,,,,,...,,,,True,,medium,"clinician is concerned, patient is frustrated",formulary change,over‑explainer,assertive and informed
freq,1,,,,,,,,,,...,,,,40,,19,3,9,13,12
mean,,7.325,307.4,1759.625,3.8,3.625,46.802113,1.603285,6.727246,2.589132,...,0.0,1.33775,938.661495,,0.0,,,,,
std,,3.611857,160.648015,940.312187,2.543972,2.283581,23.003955,1.771937,34.590456,5.915531,...,0.0,0.457628,5982.826403,,0.0,,,,,
min,,2.0,62.0,329.0,1.0,1.0,16.333333,0.090909,0.005917,0.016,...,0.0,0.51,-99.583333,,0.0,,,,,
25%,,4.0,196.75,1174.5,2.0,2.0,31.28125,0.5,1.019939,0.722093,...,0.0,1.06,-3.617575,,0.0,,,,,
50%,,7.0,312.5,1735.0,3.0,3.0,43.5,1.0,1.367647,1.17959,...,0.0,1.37,-0.23107,,0.0,,,,,
75%,,8.5,359.0,2011.75,5.0,5.0,53.375,1.85,1.666492,2.298084,...,0.0,1.705,3.915499,,0.0,,,,,


## Chart Analysis

In [9]:
def analyze_chart(chart_path):
    with open(chart_path, 'r') as f:
        chart = json.load(f)

    return {
        "Patient": chart_path.parent.name,
        "Has Demographics": int(bool(chart.get("demographicStr"))),
        "Conditions": len(chart.get("currentConditions", [])),
        "Medications": len(chart.get("currentMedications", [])),
        "Allergies": len(chart.get("currentAllergies", [])),
        "Condition History": len(chart.get("conditionHistory", [])),
        "Family History": len(chart.get("familyHistory", [])),
        "Surgical History": len(chart.get("surgeryHistory", [])),
    }

chart_rows = []
for p in sorted(base_path.glob("Patient_*")):
    f = p / "limited_chart.json"
    if f.exists():
        chart_rows.append(analyze_chart(f))

print(chart_rows)

chart_df = pd.DataFrame(chart_rows)
chart_df.to_csv("chart_analysis.csv")
chart_df.describe()


[{'Patient': 'Patient_1', 'Has Demographics': 1, 'Conditions': 0, 'Medications': 1, 'Allergies': 1, 'Condition History': 0, 'Family History': 0, 'Surgical History': 0}, {'Patient': 'Patient_10', 'Has Demographics': 1, 'Conditions': 1, 'Medications': 2, 'Allergies': 0, 'Condition History': 0, 'Family History': 0, 'Surgical History': 0}, {'Patient': 'Patient_11', 'Has Demographics': 1, 'Conditions': 1, 'Medications': 2, 'Allergies': 0, 'Condition History': 0, 'Family History': 0, 'Surgical History': 0}, {'Patient': 'Patient_12', 'Has Demographics': 1, 'Conditions': 1, 'Medications': 2, 'Allergies': 0, 'Condition History': 0, 'Family History': 0, 'Surgical History': 0}, {'Patient': 'Patient_13', 'Has Demographics': 1, 'Conditions': 2, 'Medications': 1, 'Allergies': 1, 'Condition History': 0, 'Family History': 0, 'Surgical History': 0}, {'Patient': 'Patient_14', 'Has Demographics': 1, 'Conditions': 0, 'Medications': 5, 'Allergies': 0, 'Condition History': 0, 'Family History': 0, 'Surgical 

Unnamed: 0,Has Demographics,Conditions,Medications,Allergies,Condition History,Family History,Surgical History
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,1.0,1.4,2.65,0.225,0.0,0.0,0.025
std,0.0,0.955416,1.459715,0.422902,0.0,0.0,0.158114
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,2.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,2.0,0.0,0.0,0.0,0.0
75%,1.0,2.0,3.25,0.0,0.0,0.0,0.0
max,1.0,5.0,6.0,1.0,0.0,0.0,1.0


## Profile Analysis

In [10]:
def analyze_profile(patient_key, text):
    sentences = re.split(r'[.!?]', text)
    words = text.split()

    return {
        "Patient": patient_key,
        "Profile Word Count": len(words),
        "Profile Sentence Count": len([s for s in sentences if s.strip()]),
        "Words per Sentence": len(words) / len(sentences) if sentences else 0,
        "Reading Ease": textstat.flesch_reading_ease(text),
        "Grade Level": textstat.flesch_kincaid_grade(text)
    }

profile_rows = []

# Iterate over Patient folders
for patient_dir in sorted(base_path.glob("Patient_*")):
    profile_path = patient_dir / "profile.json"
    if profile_path.exists():
        with open(profile_path) as f:
            data = json.load(f)
            # Expecting format: { "Patient 1": "profile text" }
            for patient_key, profile_text in data.items():
                profile_rows.append(analyze_profile(patient_key, profile_text))

# Construct dataframe
profile_df = pd.DataFrame(profile_rows)
print(profile_df.head())
profile_df.to_csv("profile_analysis.csv")
profile_df.describe()


      Patient  Profile Word Count  Profile Sentence Count  Words per Sentence  \
0   Patient 1                  54                       3                13.5   
1  Patient 10                  56                       4                11.2   
2  Patient 11                  53                       4                10.6   
3  Patient 12                  54                       4                10.8   
4  Patient 13                  50                       4                10.0   

   Reading Ease  Grade Level  
0     39.731667    12.189259  
1     41.553571    10.941429  
2     54.514552     8.947311  
3     44.299167    10.434259  
4     41.867500    10.525000  


Unnamed: 0,Profile Word Count,Profile Sentence Count,Words per Sentence,Reading Ease,Grade Level
count,40.0,40.0,40.0,40.0,40.0
mean,52.9,3.875,11.032917,41.067684,11.122235
std,4.289522,0.647975,1.645054,11.488754,1.818225
min,45.0,3.0,8.0,11.389005,6.767092
25%,50.0,3.0,10.0,36.296196,10.372917
50%,52.0,4.0,10.8,42.093717,10.901923
75%,55.0,4.0,12.4,47.57056,12.210406
max,65.0,5.0,14.5,68.364515,15.143103
