In [1]:
import json

# 1. Load the JSON files
with open('release_evidences.json', 'r', encoding='utf-8') as f:
    evidences = json.load(f)

with open('release_conditions.json', 'r', encoding='utf-8') as f:
    conditions = json.load(f)

# 2. Print basic stats to ensure they loaded correctly
print(f"Total Evidences (Symptoms/Antecedents) loaded: {len(evidences)}")
print(f"Total Conditions (Pathologies) loaded: {len(conditions)}")
print("-" * 40)

# 3. Inspect a specific Binary Evidence (e.g., a cough)
sample_evidence_code = 'E_53' # E_53 is usually a common symptom
if sample_evidence_code in evidences:
    print(f"--- Structure of Evidence: {sample_evidence_code} ---")
    print(json.dumps(evidences[sample_evidence_code], indent=4))
else:
    print(f"Could not find {sample_evidence_code}. Let's try the first item in the dictionary.")
    first_key = list(evidences.keys())[0]
    print(json.dumps(evidences[first_key], indent=4))

Total Evidences (Symptoms/Antecedents) loaded: 223
Total Conditions (Pathologies) loaded: 49
----------------------------------------
--- Structure of Evidence: E_53 ---
{
    "name": "E_53",
    "code_question": "E_53",
    "question_fr": "Avez-vous de la douleur \u00e0 quelque part en lien avec votre raison de consultation?",
    "question_en": "Do you have pain somewhere, related to your reason for consulting?",
    "is_antecedent": false,
    "default_value": 0,
    "value_meaning": {},
    "possible-values": [],
    "data_type": "B"
}


In [2]:
# Search for a Categorical (C) or Multi-choice (M) evidence
complex_evidence = None
for code, details in evidences.items():
    if details.get('data_type') in ['C', 'M']:
        complex_evidence = details
        break

if complex_evidence:
    print(f"--- Structure of Complex Evidence: {complex_evidence['name']} ---")
    print(json.dumps(complex_evidence, indent=4))
else:
    print("No complex data types found.")

--- Structure of Complex Evidence: E_55 ---
{
    "name": "E_55",
    "code_question": "E_53",
    "question_fr": "Avez-vous de la douleur quelque part?",
    "question_en": "Do you feel pain somewhere?",
    "is_antecedent": false,
    "default_value": "V_123",
    "value_meaning": {
        "V_123": {
            "fr": "nulle part",
            "en": "nowhere"
        },
        "V_14": {
            "fr": "aile iliaque(D)",
            "en": "iliac wing(R)"
        },
        "V_15": {
            "fr": "aile iliaque(G)",
            "en": "iliac wing(L)"
        },
        "V_16": {
            "fr": "aine(D)",
            "en": "groin(R)"
        },
        "V_17": {
            "fr": "aine(G)",
            "en": "groin(L)"
        },
        "V_19": {
            "fr": "aisselle(G)",
            "en": "axilla(L)"
        },
        "V_18": {
            "fr": "aisselle(D)",
            "en": "axilla(R)"
        },
        "V_20": {
            "fr": "amygdale(D)",
            "en

In [3]:
import pandas as pd
import ast

# 1. Load the training patients dataset
# We use pd.read_csv even if the file doesn't have a .csv extension
df_train = pd.read_csv('release_train_patients')

# 2. Print the shape to see how many patients we have
print(f"Total Patient Records: {df_train.shape[0]}")
print("-" * 50)

# 3. Extract and print the very first patient row
first_patient = df_train.iloc[0]
print("--- RAW PATIENT DATA (ROW 0) ---")
print(f"AGE: {first_patient['AGE']}")
print(f"SEX: {first_patient['SEX']}")
print(f"PATHOLOGY (The Ground Truth Label): {first_patient['PATHOLOGY']}")
print(f"INITIAL_EVIDENCE (The primary complaint): {first_patient['INITIAL_EVIDENCE']}")

# 4. Extract the full list of evidence for this patient
# DDXPlus stores this column as a string that looks like a list, so we safely evaluate it
evidence_list = ast.literal_eval(first_patient['EVIDENCES'])
print(f"\nFULL EVIDENCES LIST ({len(evidence_list)} items):")
for item in evidence_list:
    print(f" - {item}")

Total Patient Records: 1025602
--------------------------------------------------
--- RAW PATIENT DATA (ROW 0) ---
AGE: 18
SEX: M
PATHOLOGY (The Ground Truth Label): URTI
INITIAL_EVIDENCE (The primary complaint): E_91

FULL EVIDENCES LIST (19 items):
 - E_48
 - E_50
 - E_53
 - E_54_@_V_161
 - E_54_@_V_183
 - E_55_@_V_89
 - E_55_@_V_108
 - E_55_@_V_167
 - E_56_@_4
 - E_57_@_V_123
 - E_58_@_3
 - E_59_@_3
 - E_77
 - E_79
 - E_91
 - E_97
 - E_201
 - E_204_@_V_10
 - E_222


In [4]:
import ast

def translate_row_to_narrative(row_data, evidences_dict):
    """
    Translates a single DDXPlus tabular row into a first-person narrative.
    """
    # 1. Demographics Hook
    age = row_data['AGE']
    sex_str = "male" if row_data['SEX'] == 'M' else "female"
    narrative = [f"I am an {age}-year-old {sex_str}."]
    
    # 2. Extract Data
    initial_ev = row_data['INITIAL_EVIDENCE']
    # Safely evaluate the string representation of the list
    all_evidences = ast.literal_eval(row_data['EVIDENCES']) 
    
    # Remove the initial evidence from the general list to avoid repetition
    if initial_ev in all_evidences:
        all_evidences.remove(initial_ev)
    
    # 3. Setup temporary storage for grouping
    binary_complaints = []
    complex_complaints = {} # Format: {'E_55': ['forehead', 'cheek', 'temple']}

    # 4. Process the Initial Evidence first (The main reason for visit)
    # Note: In a full pipeline, we map this to a custom string. For now, we fetch the English question.
    initial_question = evidences_dict.get(initial_ev, {}).get('question_en', 'I have a specific complaint.')
    narrative.append(f"I came in today because: {initial_question}")

    # 5. Process the remaining evidence list
    for ev_string in all_evidences:
        
        # Handle Complex/Multi-choice types (e.g., E_55_@_V_89)
        if '_@_' in ev_string:
            code, value = ev_string.split('_@_')
            
            # TRAP AVOIDANCE: Skip 'nowhere' or baseline values
            if value == 'V_123' or value == '0':
                continue
                
            # Fetch the human-readable meaning from the dictionary
            evidence_details = evidences_dict.get(code, {})
            value_dict = evidence_details.get('value_meaning', {})
            
            # Extract the English translation of the specific value (e.g., 'forehead')
            meaning_en = value_dict.get(value, {}).get('en', value)
            
            # Group it by the base code
            if code not in complex_complaints:
                complex_complaints[code] = []
            complex_complaints[code].append(meaning_en)
            
        # Handle Simple Binary types (e.g., E_53)
        else:
            question_en = evidences_dict.get(ev_string, {}).get('question_en', ev_string)
            binary_complaints.append(question_en)

    # 6. Assemble the Complex Symptoms (The Grouping Logic)
    for code, values in complex_complaints.items():
        # Example: E_55 usually means "pain location"
        base_question = evidences_dict.get(code, {}).get('question_en', 'I have this related to')
        joined_values = ", ".join(values)
        narrative.append(f"Regarding '{base_question}', my answer is: {joined_values}.")

    # 7. Assemble the Binary Symptoms
    if binary_complaints:
        narrative.append("Additionally, I am experiencing the following:")
        for complaint in binary_complaints:
            narrative.append(f"- {complaint}")

    # Join the final paragraph
    return " ".join(narrative)

# --- EXECUTE ON ROW 0 ---
first_row = df_train.iloc[0]
patient_story = translate_row_to_narrative(first_row, evidences)

print("--- GENERATED PATIENT NARRATIVE ---")
print(patient_story)
print("-" * 35)
print(f"Target Pathology to Predict: {first_row['PATHOLOGY']}")

--- GENERATED PATIENT NARRATIVE ---
I am an 18-year-old male. I came in today because: Do you have a fever (either felt or measured with a thermometer)? Regarding 'Characterize your pain:', my answer is: sensitive, heavy. Regarding 'Do you feel pain somewhere?', my answer is: forehead, cheek(R), temple(L). Regarding 'How intense is the pain?', my answer is: 4. Regarding 'How precisely is the pain located?', my answer is: 3. Regarding 'How fast did the pain appear?', my answer is: 3. Regarding 'Have you traveled out of the country in the last 4 weeks?', my answer is: N. Additionally, I am experiencing the following: - Do you live with 4 or more people? - Have you had significantly increased sweating? - Do you have pain somewhere, related to your reason for consulting? - Do you have a cough that produces colored or more abundant sputum than usual? - Do you smoke cigarettes? - Do you have a sore throat? - Do you have a cough? - Are you exposed to secondhand cigarette smoke on a daily basi

In [7]:
import ast

# --- THE NARRATIVE DICTIONARY ---
# We map the E_codes to conversational first-person stems.
NARRATIVE_STEMS = {
    # The Main Complaints
    'E_91': "I have been running a fever.",
    
    # Complex Groupings (Pain characteristics)
    'E_55': "The pain is specifically located in my",
    'E_54': "I would describe the pain as",
    'E_56': "On a scale of 1 to 10, the pain intensity is a",
    'E_57': "The pain radiates to my",
    'E_58': "On a scale of 1 to 10, the precision of the pain location is a",
    'E_59': "Regarding how fast the pain appeared, on a scale of 1 to 10, it was a",
    
    # Binary Symptoms & Antecedents (Background)
    'E_53': "I am definitely experiencing pain related to this.",
    'E_48': "I live in a household with 4 or more people.",
    'E_50': "I've noticed I am sweating significantly more than usual.",
    'E_77': "I am coughing up sputum that is colored or more abundant than normal.",
    'E_79': "I am a cigarette smoker.",
    'E_97': "My throat is really sore.",
    'E_201': "I have a cough.",
    'E_204': "I am exposed to secondhand smoke on a daily basis.",
    'E_222': "I have not traveled out of the country recently."
}

def generate_human_narrative(row_data, evidences_dict, stems_dict):
    age = row_data['AGE']
    sex_str = "male" if row_data['SEX'] == 'M' else "female"
    narrative = [f"I am an {age}-year-old {sex_str}."]
    
    initial_ev = row_data['INITIAL_EVIDENCE']
    all_evidences = ast.literal_eval(row_data['EVIDENCES']) 
    
    if initial_ev in all_evidences:
        all_evidences.remove(initial_ev)
    
    binary_complaints = []
    complex_complaints = {} 

    # 1. Process Initial Evidence smoothly
    stem = stems_dict.get(initial_ev, f"I have an issue regarding {initial_ev}.")
    narrative.append(f"I came into the clinic today because {stem}")

    for ev_string in all_evidences:
        if '_@_' in ev_string:
            code, value = ev_string.split('_@_')
            if value == 'V_123' or value == '0':
                continue
                
            meaning_en = evidences_dict.get(code, {}).get('value_meaning', {}).get(value, {}).get('en', value)
            
            # Handle N/Y travel edge cases cleanly
            if meaning_en == 'N': meaning_en = 'no'
            if meaning_en == 'Y': meaning_en = 'yes'
            
            if code not in complex_complaints:
                complex_complaints[code] = []
            complex_complaints[code].append(meaning_en)
            
        else:
            # Look up the smooth string, fallback to the raw question if not found
            fallback_question = evidences_dict.get(ev_string, {}).get('question_en', ev_string)
            binary_str = stems_dict.get(ev_string, f"I also noticed: {fallback_question}")
            binary_complaints.append(binary_str)

    # 2. Assemble Complex Symptoms smoothly
    # 2. Assemble Complex Symptoms smoothly (WITH FIXES)
    for code, values in complex_complaints.items():
        base_stem = stems_dict.get(code, f"Regarding {code}, it is")
        
        # FIX 1: The "Stray No" / "Yes" Handler
        # If the value is literally just 'yes' or 'no' (e.g., travel history)
        if len(values) == 1 and values[0].lower() in ['yes', 'y']:
            narrative.append(f"Yes, {base_stem.lower()}.")
            continue
        elif len(values) == 1 and values[0].lower() in ['no', 'n']:
            # We add 'not' to make it negative (e.g., "I am not exposed to...")
            narrative.append(f"No, I am not {base_stem.lower().replace('i am ', '')}.")
            continue

        # FIX 2: The "Double And" Grammatical List Handler
        if len(values) == 1:
            joined_values = values[0]
        elif len(values) == 2:
            joined_values = f"{values[0]} and {values[1]}"
        else:
            # Joins all but the last with commas, then adds ", and [last]"
            joined_values = ", ".join(values[:-1]) + f", and {values[-1]}"
            
        narrative.append(f"{base_stem} {joined_values}.")

    # 3. Assemble Binary Symptoms naturally
    if binary_complaints:
        narrative.append("To give you more context:")
        for complaint in binary_complaints:
            narrative.append(complaint)

    return " ".join(narrative)

# --- EXECUTE AND COMPARE ---
first_row = df_train.iloc[0] # Your 18-year-old URTI patient
polished_story = generate_human_narrative(first_row, evidences, NARRATIVE_STEMS)

print("--- POLISHED PATIENT NARRATIVE ---")
print(polished_story)

--- POLISHED PATIENT NARRATIVE ---
I am an 18-year-old male. I came into the clinic today because I have been running a fever. I would describe the pain as sensitive and heavy. The pain is specifically located in my forehead, cheek(R), and temple(L). On a scale of 1 to 10, the pain intensity is a 4. On a scale of 1 to 10, the precision of the pain location is a 3. Regarding how fast the pain appeared, on a scale of 1 to 10, it was a 3. No, I am not exposed to secondhand smoke on a daily basis.. To give you more context: I live in a household with 4 or more people. I've noticed I am sweating significantly more than usual. I am definitely experiencing pain related to this. I am coughing up sputum that is colored or more abundant than normal. I am a cigarette smoker. My throat is really sore. I have a cough. I have not traveled out of the country recently.


In [8]:
from collections import Counter
import ast

# Count all evidences across the entire dataset
all_codes = []
for index, row in df_train.iterrows():
    ev_list = ast.literal_eval(row['EVIDENCES'])
    for ev in ev_list:
        # We only care about the base code, not the complex values
        base_code = ev.split('_@_')[0] if '_@_' in ev else ev
        all_codes.append(base_code)

# Get the top 20 most common codes
top_codes = Counter(all_codes).most_common(20)

print("--- TOP 20 CODES TO ADD TO NARRATIVE_STEMS ---")
for code, count in top_codes:
    question = evidences.get(code, {}).get('question_en', 'Unknown')
    print(f"'{code}': # Appears {count} times -> Question: {question}")

--- TOP 20 CODES TO ADD TO NARRATIVE_STEMS ---
'E_55': # Appears 3203271 times -> Question: Do you feel pain somewhere?
'E_54': # Appears 1647421 times -> Question: Characterize your pain:
'E_57': # Appears 1355490 times -> Question: Does the pain radiate to another location?
'E_204': # Appears 1025602 times -> Question: Have you traveled out of the country in the last 4 weeks?
'E_56': # Appears 798938 times -> Question: How intense is the pain?
'E_58': # Appears 798938 times -> Question: How precisely is the pain located?
'E_59': # Appears 798938 times -> Question: How fast did the pain appear?
'E_53': # Appears 788079 times -> Question: Do you have pain somewhere, related to your reason for consulting?
'E_133': # Appears 618662 times -> Question: Where is the affected region located?
'E_66': # Appears 399695 times -> Question: Are you experiencing shortness of breath or difficulty breathing in a significant way?
'E_152': # Appears 352462 times -> Question: Where is the swelling locat

In [9]:
import pandas as pd

# 1. Create a list to hold our template rows
mapping_data = []

for code, details in evidences.items():
    raw_question = details.get('question_en', '')
    
    # 2. Smart Auto-Drafting Logic (to save you typing time)
    draft_stem = raw_question
    q_lower = raw_question.lower()
    
    if q_lower.startswith("do you have a"):
        draft_stem = "I have a" + raw_question[13:].replace("?", ".")
    elif q_lower.startswith("do you have"):
        draft_stem = "I have" + raw_question[11:].replace("?", ".")
    elif q_lower.startswith("are you"):
        draft_stem = "I am" + raw_question[7:].replace("?", ".")
    elif q_lower.startswith("do you"):
        draft_stem = "I" + raw_question[6:].replace("?", ".")
    elif q_lower.startswith("how intense"):
        draft_stem = "On a scale of 1 to 10, the intensity is"
    else:
        # Fallback for complex questions
        draft_stem = "Regarding the question '" + raw_question.replace("?", "") + "', my answer is"

    # 3. Append to our data list
    mapping_data.append({
        'E_Code': code,
        'Raw_Question': raw_question,
        'Narrative_Stem': draft_stem
    })

# 4. Export to a CSV file
template_df = pd.DataFrame(mapping_data)
template_file = 'stem_mapping_template.csv'
template_df.to_csv(template_file, index=False)

print(f"Success! {len(template_df)} codes exported to {template_file}.")
print("You can now open this file in VS Code or Excel to review the stems.")

Success! 223 codes exported to stem_mapping_template.csv.
You can now open this file in VS Code or Excel to review the stems.


In [10]:
import pandas as pd

# 1. Load the CSV you just finalized
mapping_df = pd.read_csv('stem_mapping_template.csv')

# 2. Convert it into a Python dictionary for instant lookup
# This completely replaces the manual NARRATIVE_STEMS we wrote earlier
NARRATIVE_STEMS = dict(zip(mapping_df['E_Code'], mapping_df['Narrative_Stem']))

print(f"Successfully loaded {len(NARRATIVE_STEMS)} narrative stems into memory!")
print("-" * 40)

# 3. Let's do a quick sanity check on Row 0 using the full dictionary
first_row = df_train.iloc[0] # Your 18-year-old male
test_story = generate_human_narrative(first_row, evidences, NARRATIVE_STEMS)

print("--- TEST WITH FULL 223 DICTIONARY ---")
print(test_story)

Successfully loaded 223 narrative stems into memory!
----------------------------------------
--- TEST WITH FULL 223 DICTIONARY ---
I am an 18-year-old male. I came into the clinic today because I have a fever (either felt or measured with a thermometer). Regarding the question 'Characterize your pain:', my answer is sensitive and heavy. I feel pain somewhere. forehead, cheek(R), and temple(L). On a scale of 1 to 10, the intensity is 4. Regarding the question 'How precisely is the pain located', my answer is 3. Regarding the question 'How fast did the pain appear', my answer is 3. No, I am not regarding the question 'have you traveled out of the country in the last 4 weeks', my answer is. To give you more context: I live with 4 or more people. Regarding the question 'Have you had significantly increased sweating', my answer is I have pain somewhere, related to your reason for consulting. I have a cough that produces colored or more abundant sputum than usual. I smoke cigarettes. I have

In [11]:
import pandas as pd

# 1. Reload your polished CSV
mapping_df = pd.read_csv('stem_mapping_template.csv')
NARRATIVE_STEMS = dict(zip(mapping_df['E_Code'], mapping_df['Narrative_Stem']))

print(f"Loaded {len(NARRATIVE_STEMS)} narrative stems into memory!")
print("-" * 40)

# 2. Test Row 0 again
first_row = df_train.iloc[0] # Your 18-year-old male
final_story = generate_human_narrative(first_row, evidences, NARRATIVE_STEMS)

print("--- FINAL POLISHED NARRATIVE (ROW 0) ---")
print(final_story)

Loaded 223 narrative stems into memory!
----------------------------------------
--- FINAL POLISHED NARRATIVE (ROW 0) ---
I am an 18-year-old male. I came into the clinic today because I have a fever (either felt or measured with a thermometer). I would describe the pain as sensitive and heavy. The pain is specifically located in my. forehead, cheek(R), and temple(L). On a scale of 1 to 10, the intensity is 4. On a scale of 1 to 10, the precision of the pain location is a 3. Regarding how fast the pain appeared, on a scale of 1 to 10, it was a 3. No, I am not traveling out of the country recently. To give you more context: I live with 4 or more people. Regarding the question 'Have you had significantly increased sweating', my answer is I have pain somewhere, related to your reason for consulting. I have a cough that produces colored or more abundant sputum than usual. I smoke cigarettes. I have a sore throat. I have a cough. I am exposed to secondhand cigarette smoke on a daily basis.


In [12]:
import pandas as pd
import os

# 1. Pipeline Configuration
input_file = 'release_train_patients'
output_file = 'translated_train_data.csv'
chunk_size = 50000

# 2. Initialize the Output File (Only saving Target and Text)
pd.DataFrame(columns=['PATHOLOGY', 'NARRATIVE']).to_csv(output_file, index=False)

print(f"Starting translation pipeline for {input_file}...")

# 3. Process in Chunks
for i, chunk in enumerate(pd.read_csv(input_file, chunksize=chunk_size)):
    print(f"Processing chunk {i+1} (Rows {i*chunk_size} to {(i+1)*chunk_size})...")
    
    # Apply the translation function
    chunk['NARRATIVE'] = chunk.apply(
        lambda row: generate_human_narrative(row, evidences, NARRATIVE_STEMS), 
        axis=1
    )
    
    # Keep only what Phase 2 needs: The target label and the narrative text
    processed_chunk = chunk[['PATHOLOGY', 'NARRATIVE']]
    
    # Append to the CSV on your hard drive
    processed_chunk.to_csv(output_file, mode='a', header=False, index=False)

print("Pipeline completed successfully! Phase 1 is officially done.")

Starting translation pipeline for release_train_patients...
Processing chunk 1 (Rows 0 to 50000)...
Processing chunk 2 (Rows 50000 to 100000)...
Processing chunk 3 (Rows 100000 to 150000)...
Processing chunk 4 (Rows 150000 to 200000)...
Processing chunk 5 (Rows 200000 to 250000)...
Processing chunk 6 (Rows 250000 to 300000)...
Processing chunk 7 (Rows 300000 to 350000)...
Processing chunk 8 (Rows 350000 to 400000)...
Processing chunk 9 (Rows 400000 to 450000)...
Processing chunk 10 (Rows 450000 to 500000)...
Processing chunk 11 (Rows 500000 to 550000)...
Processing chunk 12 (Rows 550000 to 600000)...
Processing chunk 13 (Rows 600000 to 650000)...
Processing chunk 14 (Rows 650000 to 700000)...
Processing chunk 15 (Rows 700000 to 750000)...
Processing chunk 16 (Rows 750000 to 800000)...
Processing chunk 17 (Rows 800000 to 850000)...
Processing chunk 18 (Rows 850000 to 900000)...
Processing chunk 19 (Rows 900000 to 950000)...
Processing chunk 20 (Rows 950000 to 1000000)...
Processing chun

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Load the massive dataset we just built
print("Loading the 1-Million row dataset...")
df_full = pd.read_csv('translated_train_data.csv')

# 2. Create a 10,000 row stratified sample
# We use train_test_split purely to mathematically sample the data while maintaining the disease ratios
print("Stratifying and sampling 10,000 rows...")
_, df_sample = train_test_split(
    df_full, 
    test_size=10000, 
    stratify=df_full['PATHOLOGY'], 
    random_state=42
)

# 3. Save the tiny sample to your hard drive
sample_filename = 'tiny_train_sample.csv'
df_sample.to_csv(sample_filename, index=False)

print("-" * 40)
print(f"Success! Tiny sample saved as {sample_filename}")
print(f"Total rows: {df_sample.shape[0]}")
print(f"Total unique pathologies represented: {df_sample['PATHOLOGY'].nunique()} (Should be 49)")

Loading the 1-Million row dataset...
Stratifying and sampling 10,000 rows...
----------------------------------------
Success! Tiny sample saved as tiny_train_sample.csv
Total rows: 10000
Total unique pathologies represented: 49 (Should be 49)
