# FRAG-MED: Dynamic Federated Hospital Splitting

**Dynamic Features:**
- **User Configurable:** Set the number of hospitals to create and how many of the most prevalent conditions drive their specialization.
- **Data-Driven Specialization:** Automatically assigns the most prevalent conditions in your dataset to hospitals.
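- **Affinity Matching:** Routes each patient, probabilistically and within per-hospital capacity targets, toward the hospital whose specialties best match the conditions in the patient's medical history.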

In [None]:
import json
import random
import os
import shutil
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from tqdm import tqdm
import warnings

# Install a blanket 'ignore' filter so no Python warnings are shown from here on.
warnings.filterwarnings('ignore')
# Seed both generators: the notebook draws from the stdlib `random` module
# (score jitter, shuffling) as well as from `np.random` (hospital choice).
random.seed(42)
np.random.seed(42)

print("‚úÖ Libraries loaded.")

## 1. User Configuration

In [None]:
# ---------------------------------------------------------
# DYNAMIC CONFIGURATION (edit these values before running the cells below)
# ---------------------------------------------------------

# Number of hospitals to create; passed to analyze_and_generate_profiles
# as `num_hospitals`, and one output sub-folder is made per hospital.
NUM_HOSPITALS = 10

# Number of most frequent condition codes that drive specialization.
# These codes are split across the hospitals (e.g. 20 conditions over 10
# hospitals gives 2 per hospital); a hospital that receives none is
# labelled "General Care".
TOP_CONDITIONS_LIMIT = 10

# Directories
SOURCE_DIR = '../data/raw_patients' # Directory scanned for patient files named <patient_id>.json
DEST_DIR = '../data/federated_hospitals' # Output root; deleted and recreated when files are distributed

# Echo the active settings so they are visible in the notebook output
print(f"‚öôÔ∏è Configuration:")
print(f"   Generating {NUM_HOSPITALS} hospitals")
print(f"   Using Top {TOP_CONDITIONS_LIMIT} conditions for specialization")
print(f"   Source: {SOURCE_DIR}")
print(f"   Destination: {DEST_DIR}")

## 2. Analyze Data & Generate Dynamic Profiles

In [None]:
def extract_conditions(json_data):
    """Extract ``(code, display_text)`` pairs from a parsed patient record.

    Walks ``json_data['encounters'][*]['conditions'][*]`` and collects one
    tuple for every condition dict that has a ``'code'`` key.

    Malformed pieces (a non-dict record, encounter or condition, or an
    ``encounters``/``conditions`` value that is not a list) are skipped
    individually, so one bad entry does not discard the valid entries
    that follow it.

    Args:
        json_data: Parsed JSON content of one patient file.

    Returns:
        A list of ``(code, display_text)`` tuples in document order. The
        display text falls back to ``'Unknown'`` when the condition has no
        ``'display_text'`` key. Empty when nothing usable is found.
    """
    if not isinstance(json_data, dict):
        return []

    encounters = json_data.get('encounters')
    if not isinstance(encounters, (list, tuple)):
        return []

    extracted = []
    for enc in encounters:
        if not isinstance(enc, dict):
            continue
        conditions = enc.get('conditions')
        if not isinstance(conditions, (list, tuple)):
            # Missing or null 'conditions' simply means nothing to collect here
            continue
        for cond in conditions:
            if isinstance(cond, dict) and 'code' in cond:
                extracted.append((cond['code'], cond.get('display_text', 'Unknown')))
    return extracted

def _hospital_label(index):
    """Return a letter label for a zero-based index: A..Z, then AA, AB, ..."""
    label = ""
    index += 1
    while index > 0:
        index, rem = divmod(index - 1, 26)
        label = chr(65 + rem) + label
    return label


def analyze_and_generate_profiles(source_dir, num_hospitals, top_n_conditions):
    """Scan patient files, rank conditions and build hospital profiles.

    Every ``*.json`` file in ``source_dir`` is loaded and passed through
    ``extract_conditions``. Condition occurrences are counted globally, the
    ``top_n_conditions`` most frequent codes are selected, and those codes
    are split into contiguous chunks, one per hospital (earlier hospitals
    receive the extra code when the split is uneven).

    Args:
        source_dir: Directory containing one JSON file per patient.
        num_hospitals: Number of hospital profiles to generate (>= 1).
        top_n_conditions: How many of the most frequent condition codes
            are distributed among the hospitals.

    Returns:
        A tuple ``(patient_data, hospital_profiles, condition_names)``:

        * ``patient_data`` maps patient id (file name without ``.json``) to
          ``{'filename': ..., 'weights': {code: weight}}``. A code's weight
          is its occurrence count, doubled when it occurs more than once.
        * ``hospital_profiles`` maps ``hospital_<LABEL>`` to
          ``{'size', 'specialization_name', 'affinities'}``.
        * ``condition_names`` maps condition code to its display text.

    Raises:
        ValueError: If ``num_hospitals`` is smaller than 1.
        FileNotFoundError: If ``source_dir`` does not exist.
    """
    if num_hospitals < 1:
        # Fail early with a clear message instead of a ZeroDivisionError below
        raise ValueError(f"num_hospitals must be at least 1, got {num_hospitals}")

    print("\n" + "="*80)
    print("STEP 1: ANALYZING & GENERATING PROFILES")
    print("="*80)

    # 1. Scan all files to find Top Conditions
    if not os.path.exists(source_dir):
        raise FileNotFoundError(f"‚ùå Source directory {source_dir} not found!")

    # os.listdir returns entries in arbitrary order; sorting keeps the patient
    # order (and therefore tie-breaking and the seeded random steps that
    # follow) identical across machines and runs.
    files = sorted(f for f in os.listdir(source_dir) if f.endswith('.json'))
    print(f"üìã Scanning {len(files)} files for conditions...")

    global_condition_counts = Counter()
    condition_names = {}
    patient_data = {}
    suffix_len = len('.json')

    for filename in tqdm(files, desc="Analyzing"):
        filepath = os.path.join(source_dir, filename)
        # Strip only the trailing extension so names that contain '.json'
        # elsewhere keep a unique id.
        p_id = filename[:-suffix_len]

        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        conditions = extract_conditions(data)

        # Count occurrences per patient and globally
        patient_weights = Counter()
        for code, name in conditions:
            global_condition_counts[code] += 1
            # Display names are sliced and split later on; guard against
            # null or non-text values coming from the data.
            condition_names[code] = name if isinstance(name, str) else 'Unknown'
            patient_weights[code] += 1  # Base weight

        # Boost weights for recurring conditions
        final_weights = {k: v * (2.0 if v > 1 else 1.0) for k, v in patient_weights.items()}

        patient_data[p_id] = {
            'filename': filename,
            'weights': final_weights
        }

    # 2. Select Top Conditions for Specialization
    top_conditions = [code for code, _ in global_condition_counts.most_common(top_n_conditions)]
    print(f"\n‚úÖ Identified top {len(top_conditions)} prevalent conditions.")

    # 3. Distribute Conditions to Hospitals
    # The ranked list is cut into contiguous chunks, one per hospital; the
    # first `remainder` hospitals take one extra condition.
    hospital_profiles = {}
    chunk_size, remainder = divmod(len(top_conditions), num_hospitals)

    start_idx = 0
    sizes = ['large', 'medium', 'small']

    for i in range(num_hospitals):
        current_chunk = chunk_size + (1 if i < remainder else 0)
        end_idx = start_idx + current_chunk

        assigned_conditions = top_conditions[start_idx:end_idx]
        start_idx = end_idx

        # hospital_A ... hospital_Z, then hospital_AA, hospital_AB, ...
        h_id = f"hospital_{_hospital_label(i)}"

        # Sizes are assigned round-robin: large, medium, small, large, ...
        size = sizes[i % len(sizes)]

        # Create affinities (high score for assigned conditions)
        affinities = {code: 3.0 for code in assigned_conditions}

        # Name the specialization after the first assigned condition,
        # dropping any parenthesised suffix from its display text.
        if assigned_conditions:
            primary_name = condition_names[assigned_conditions[0]].split('(')[0].strip()
            specialization = f"{primary_name} & General"
        else:
            specialization = "General Care"

        hospital_profiles[h_id] = {
            'size': size,
            'specialization_name': specialization,
            'affinities': affinities
        }

        print(f"   üè• {h_id} ({size.upper()}): {specialization}")
        print(f"      Focus: {[condition_names[c][:20] for c in assigned_conditions]}")

    return patient_data, hospital_profiles, condition_names

# Run the analysis pass: per-patient weights, hospital profiles, and the
# condition code -> display name lookup used by the later cells.
(
    patient_data,
    HOSPITAL_PROFILES,
    condition_names,
) = analyze_and_generate_profiles(
    SOURCE_DIR,
    NUM_HOSPITALS,
    TOP_CONDITIONS_LIMIT,
)

## 3. Assignment Logic

In [None]:
def calculate_affinity_score(patient_weights, hospital_profile):
    """Score how well one patient fits one hospital.

    Each condition weight is multiplied by the hospital's affinity for that
    code, or by a 0.1 baseline when the hospital lists no affinity for it.
    The accumulated total is then scaled by a factor drawn uniformly from
    [0.85, 1.15] (i.e. ±15% noise) using the stdlib ``random`` module.

    Args:
        patient_weights: Mapping of condition code to the patient's weight.
        hospital_profile: Hospital profile dict with an ``'affinities'``
            mapping of condition code to multiplier.

    Returns:
        The jittered affinity score as a float.
    """
    specialty_boosts = hospital_profile['affinities']
    baseline = 0.1  # multiplier for conditions outside the hospital's specialties

    total = 0.0
    for condition_code, condition_weight in patient_weights.items():
        total += condition_weight * specialty_boosts.get(condition_code, baseline)

    jitter = random.uniform(0.85, 1.15)
    return total * jitter

def get_capacities(total_patients, profiles):
    """Split ``total_patients`` into per-hospital target counts by size.

    Each hospital gets a share proportional to its size weight (large 0.4,
    medium 0.25, small 0.1), truncated to an integer. Whatever is left over
    after truncation is handed out one patient at a time, cycling through
    the hospitals in profile order, so the targets sum to ``total_patients``.

    Args:
        total_patients: Number of patients to distribute.
        profiles: Mapping of hospital id to a profile dict with a ``'size'``
            key of ``'large'``, ``'medium'`` or ``'small'``.

    Returns:
        Dict mapping hospital id to its integer target count.
    """
    size_weights = {'large': 0.4, 'medium': 0.25, 'small': 0.1}

    # Normalising constant so the shares add up to 1
    weight_sum = sum(size_weights[profile['size']] for profile in profiles.values())

    capacities = {
        hospital_id: int(total_patients * (size_weights[profile['size']] / weight_sum))
        for hospital_id, profile in profiles.items()
    }

    # Correct the truncation error, one unit per hospital in turn
    leftover = total_patients - sum(capacities.values())
    step = 1 if leftover > 0 else -1
    hospital_ids = list(capacities)
    for k in range(abs(leftover)):
        capacities[hospital_ids[k % len(hospital_ids)]] += step

    return capacities

## 4. Run Assignment & Distribution

In [None]:
print("\n" + "="*80)
print("STEP 2: ASSIGNING PATIENTS")
print("="*80)

# Bookkeeping: per-hospital targets, running counts, and the final mapping
total_patients = len(patient_data)
targets = get_capacities(total_patients, HOSPITAL_PROFILES)
hospital_counts = defaultdict(int)
assignments = {}

# 1. Score every (patient, hospital) pair up front
print("‚öôÔ∏è  Scoring patient affinities...")
patient_scores = {
    pid: {
        hid: calculate_affinity_score(record['weights'], hospital)
        for hid, hospital in HOSPITAL_PROFILES.items()
    }
    for pid, record in patient_data.items()
}

# 2. Place patients one at a time, in shuffled order
print("‚öôÔ∏è  Assigning to hospitals...")
p_ids = list(patient_data)
random.shuffle(p_ids)

for pid in tqdm(p_ids, desc="Assigning"):
    # Only hospitals still below their target are eligible
    candidates = [hid for hid in HOSPITAL_PROFILES if hospital_counts[hid] < targets[hid]]
    if not candidates:
        # Everything is at target: fall back to the hospital with the most headroom
        candidates = [max(targets, key=lambda hid: targets[hid] - hospital_counts[hid])]

    # Turn this patient's scores for the eligible hospitals into probabilities
    candidate_scores = [patient_scores[pid][hid] for hid in candidates]
    score_sum = sum(candidate_scores)

    if score_sum > 0:
        weights = [s / score_sum for s in candidate_scores]
    else:
        # No signal at all: pick uniformly
        weights = [1 / len(candidates)] * len(candidates)

    chosen = np.random.choice(candidates, p=weights)
    assignments[pid] = chosen
    hospital_counts[chosen] += 1

# 3. Copy each patient file into the folder of its assigned hospital
print("\n" + "="*80)
print("STEP 3: DISTRIBUTING FILES")
print("="*80)

# Wipe any previous output so the destination reflects only this run
if os.path.exists(DEST_DIR):
    shutil.rmtree(DEST_DIR)
os.makedirs(DEST_DIR)

# One sub-folder per hospital
for hospital_id in HOSPITAL_PROFILES:
    hospital_dir = os.path.join(DEST_DIR, hospital_id)
    os.makedirs(hospital_dir)

# Copy; a failure is reported and skipped rather than aborting the run
files_copied = 0
for patient_id, hospital_id in tqdm(assignments.items(), desc="Copying"):
    patient_file = patient_data[patient_id]['filename']
    source_path = os.path.join(SOURCE_DIR, patient_file)
    target_path = os.path.join(DEST_DIR, hospital_id, patient_file)
    try:
        shutil.copy2(source_path, target_path)
    except Exception as err:
        print(f"Error copying {patient_id}: {err}")
    else:
        files_copied += 1

print(f"\n‚úÖ Distributed {files_copied} files across {NUM_HOSPITALS} hospitals.")

## 5. Report

In [None]:
# Build one summary row per hospital, in alphabetical order of hospital id
summary = []
for hospital_id in sorted(HOSPITAL_PROFILES):
    profile = HOSPITAL_PROFILES[hospital_id]
    patient_total = hospital_counts[hospital_id]

    # For each condition code, count how many of this hospital's patients have it
    observed = Counter()
    for patient_id, assigned_to in assignments.items():
        if assigned_to == hospital_id:
            observed.update(patient_data[patient_id]['weights'].keys())

    # Display names (truncated to 25 chars) of the three most common codes
    top_3 = [condition_names.get(code, code)[:25] for code, _ in observed.most_common(3)]

    row = {
        'Hospital': hospital_id,
        'Size': profile['size'].upper(),
        'Patients': patient_total,
        'Specialization': profile['specialization_name'],
        'Top Observed Conditions': ", ".join(top_3),
    }
    summary.append(row)

df = pd.DataFrame(summary)

banner = "="*80
print("\n" + banner)
print("FINAL HOSPITAL DISTRIBUTION")
print(banner)
print(df.to_string(index=False))