# Patient Data Analysis
This notebook analyzes transcripts, limited chart files, and patient profiles.

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import re
import textstat
from collections import Counter

%matplotlib inline

base_path = Path("~/canvas-hyperscribe/evaluations/cases/synthetic_unit_cases/med_management_v2").expanduser()


## Transcript Analysis

In [None]:
def analyze_transcript(transcript_path):
    with open(transcript_path, 'r') as f:
        turns = json.load(f)

    data = {
        "Patient": transcript_path.parent.name,
        "Total Turns": len(turns),
        "Total Characters": sum(len(t["text"]) for t in turns),
        "Total Words": sum(len(t["text"].split()) for t in turns),
    }

    words_by = Counter()
    turns_by = Counter()
    max_turn_length = 0
    first_speaker = turns[0]["speaker"] if turns else None

    for t in turns:
        speaker = t["speaker"]
        word_count = len(t["text"].split())
        words_by[speaker] += word_count
        turns_by[speaker] += 1
        max_turn_length = max(max_turn_length, word_count)

    pt = turns_by.get("Patient", 1)
    ct = turns_by.get("Clinician", 1)
    pw = words_by.get("Patient", 1)
    cw = words_by.get("Clinician", 1)

    data.update({
        "Clinician Turns": ct,
        "Patient Turns": pt,
        "Words/Turn": data["Total Words"] / data["Total Turns"] if data["Total Turns"] else 0,
        "C:P Turn Ratio": ct / pt,
        "C:P Word Ratio": cw / pw,
        "C:P Words/Turn Ratio": (cw / ct) / (pw / pt),
        "First Speaker": first_speaker,
        "Max Turn Length": max_turn_length,
        "Clinician Word %": 100 * cw / (cw + pw)
    })

    return data

transcript_rows = []
for p in sorted(base_path.glob("Patient_*")):
    f = p / "transcript.json"
    if f.exists():
        transcript_rows.append(analyze_transcript(f))

transcript_df = pd.DataFrame(transcript_rows)
transcript_df.to_csv("transcript_analysis.csv")
transcript_df.describe(include='all')



## Chart Analysis

In [3]:
def analyze_chart(chart_path):
    with open(chart_path, 'r') as f:
        chart = json.load(f)

    return {
        "Patient": chart_path.parent.name,
        "Has Demographics": int(bool(chart.get("demographicStr"))),
        "Conditions": len(chart.get("currentConditions", [])),
        "Medications": len(chart.get("currentMedications", [])),
        "Allergies": len(chart.get("currentAllergies", [])),
        "Condition History": len(chart.get("conditionHistory", [])),
        "Family History": len(chart.get("familyHistory", [])),
        "Surgical History": len(chart.get("surgeryHistory", [])),
    }

chart_rows = []
for p in sorted(base_path.glob("Patient_*")):
    f = p / "limited_chart.json"
    if f.exists():
        chart_rows.append(analyze_chart(f))

print(chart_rows)

chart_df = pd.DataFrame(chart_rows)
chart_df.to_csv("chart_analysis")
chart_df.describe()


[{'Patient': 'Patient_1', 'Has Demographics': 1, 'Conditions': 2, 'Medications': 2, 'Allergies': 0, 'Condition History': 0, 'Family History': 1, 'Surgical History': 0}, {'Patient': 'Patient_10', 'Has Demographics': 1, 'Conditions': 4, 'Medications': 1, 'Allergies': 0, 'Condition History': 0, 'Family History': 0, 'Surgical History': 0}, {'Patient': 'Patient_11', 'Has Demographics': 1, 'Conditions': 1, 'Medications': 1, 'Allergies': 0, 'Condition History': 0, 'Family History': 1, 'Surgical History': 0}, {'Patient': 'Patient_12', 'Has Demographics': 1, 'Conditions': 2, 'Medications': 2, 'Allergies': 1, 'Condition History': 1, 'Family History': 0, 'Surgical History': 0}, {'Patient': 'Patient_13', 'Has Demographics': 1, 'Conditions': 1, 'Medications': 1, 'Allergies': 0, 'Condition History': 0, 'Family History': 0, 'Surgical History': 0}, {'Patient': 'Patient_14', 'Has Demographics': 1, 'Conditions': 1, 'Medications': 1, 'Allergies': 0, 'Condition History': 0, 'Family History': 1, 'Surgical 

Unnamed: 0,Has Demographics,Conditions,Medications,Allergies,Condition History,Family History,Surgical History
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,1.0,1.475,1.675,0.2,0.025,0.55,0.0
std,0.0,0.750641,0.693837,0.405096,0.158114,0.503831,0.0
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,2.0,0.0,0.0,1.0,0.0
75%,1.0,2.0,2.0,0.0,0.0,1.0,0.0
max,1.0,4.0,3.0,1.0,1.0,1.0,0.0


## Profile Analysis

In [5]:
def analyze_profile(patient_key, text):
    sentences = re.split(r'[.!?]', text)
    words = text.split()

    return {
        "Patient": patient_key,
        "Profile Word Count": len(words),
        "Profile Sentence Count": len([s for s in sentences if s.strip()]),
        "Words per Sentence": len(words) / len(sentences) if sentences else 0,
        "Reading Ease": textstat.flesch_reading_ease(text),
        "Grade Level": textstat.flesch_kincaid_grade(text)
    }

profile_rows = []

# Iterate over Patient folders
for patient_dir in sorted(base_path.glob("Patient_*")):
    profile_path = patient_dir / "profile.json"
    if profile_path.exists():
        with open(profile_path) as f:
            data = json.load(f)
            # Expecting format: { "Patient 1": "profile text" }
            for patient_key, profile_text in data.items():
                profile_rows.append(analyze_profile(patient_key, profile_text))

# Construct dataframe
profile_df = pd.DataFrame(profile_rows)
print(profile_df.head())
profile_df.to_csv("profile_analysis.csv")
profile_df.describe()


      Patient  Profile Word Count  Profile Sentence Count  Words per Sentence  \
0   Patient 1                  85                       5           14.166667   
1  Patient 10                  72                       5           12.000000   
2  Patient 11                  70                       4           14.000000   
3  Patient 12                  73                       5           12.166667   
4  Patient 13                  65                       4           13.000000   

   Reading Ease  Grade Level  
0     48.248235    10.752941  
1     17.144000    14.445444  
2     38.001071    12.306429  
3     40.199562    11.279342  
4     27.648942    13.439808  


Unnamed: 0,Profile Word Count,Profile Sentence Count,Words per Sentence,Reading Ease,Grade Level
count,40.0,40.0,40.0,40.0,40.0
mean,68.75,4.45,12.674286,39.646147,11.599578
std,6.554505,0.552384,1.180238,10.035901,1.426524
min,54.0,4.0,10.333333,17.144,9.157676
25%,64.0,4.0,11.958333,32.075313,10.542293
50%,69.5,4.0,12.6,40.785411,11.384047
75%,73.0,5.0,13.4,47.814291,12.583198
max,85.0,6.0,14.8,55.866354,15.193889
