In [16]:
import pandas as pd

# Read the patients, observations and conditions tables from the CSV export
# directory; each becomes a module-level DataFrame.
patients = pd.read_csv('output/csv/patients.csv')
observations = pd.read_csv('output/csv/observations.csv')
conditions = pd.read_csv('output/csv/conditions.csv')


In [None]:
import pandas as pd


# Read the observations table (the same file that the first cell loads).
observations = pd.read_csv('output/csv/observations.csv')
# DESCRIPTION values to keep from the observations table.
# The three "[#/volume]" names are spelled exactly as in the threshold table
# further down, so both cells select the same rows.
metrics_of_interest = [
    'Diastolic Blood Pressure',
    'Systolic Blood Pressure',
    'Heart rate',
    'Respiratory rate',
    'Hemoglobin [Mass/volume] in Blood',
    'Hematocrit [Volume Fraction] of Blood by Automated count',
    'Leukocytes [#/volume] in Blood by Automated count',
    'Erythrocytes [#/volume] in Blood by Automated count',
    'Platelets [#/volume] in Blood by Automated count',
    'Tobacco smoking status',
    'Glucose [Mass/volume] in Blood',
    'Cholesterol [Mass/volume] in Serum or Plasma',
    'Triglycerides',
    'Low Density Lipoprotein Cholesterol',
    'Cholesterol in HDL [Mass/volume] in Serum or Plasma',
    'Natriuretic peptide.B prohormone N-Terminal [Mass/volume] in Blood by Immunoassay',
    'Troponin I.cardiac [Mass/volume] in Serum or Plasma by High sensitivity method',
    'Mean blood pressure',
]


# Show the available columns, then keep an independent copy of the rows whose
# DESCRIPTION is one of the metrics of interest and preview it.
print("Columns in observations DataFrame:", observations.columns.tolist())
is_metric_of_interest = observations['DESCRIPTION'].isin(metrics_of_interest)
filtered_observations = observations.loc[is_metric_of_interest].copy()
preview_columns = ['PATIENT', 'DESCRIPTION', 'VALUE', 'DATE']
print(filtered_observations[preview_columns].head())


In [None]:
# Ordinal encoding of the smoking-status findings. Any VALUE that is not a
# key of this map becomes NaN and is dropped below.
smoking_status_map = {
    'Never smoked tobacco (finding)': 0,
    'Ex-smoker (finding)': 1,
    'Smokes tobacco daily (finding)': 2
}

# Take an explicit copy of the tobacco rows: assigning a column on a plain
# slice of filtered_observations raises SettingWithCopyWarning and is not
# guaranteed to write to the frame we keep.
is_tobacco_row = filtered_observations['DESCRIPTION'] == 'Tobacco smoking status'
tobacco_data = filtered_observations[is_tobacco_row].copy()
tobacco_data['VALUE'] = tobacco_data['VALUE'].map(smoking_status_map)
tobacco_data = tobacco_data.dropna(subset=['VALUE'])


if tobacco_data.empty:
    # Nothing could be encoded; leave filtered_observations untouched.
    print("No valid entries for 'Tobacco smoking status' after mapping.")
else:
    print(f"Entries found for 'Tobacco smoking status': {len(tobacco_data)}")
    print(tobacco_data[['PATIENT', 'VALUE', 'DATE']].head())

    # Swap the tobacco rows for their encoded versions instead of appending
    # them next to the originals, so each observation appears once and
    # re-running this cell does not accumulate duplicates.
    filtered_observations = pd.concat(
        [filtered_observations[~is_tobacco_row], tobacco_data],
        ignore_index=True,
    )


print("Combined filtered observations:")
print(filtered_observations[['PATIENT', 'DESCRIPTION', 'VALUE', 'DATE']].head())


In [21]:
# Convert VALUE to numbers (non-numeric entries become NaN) and drop the rows
# that could not be converted.
# The column is replaced by name rather than through `.loc[:, 'VALUE'] = ...`:
# on recent pandas the .loc form writes into the existing column in place,
# which can leave it as object dtype. Building a new frame also keeps the
# cell safe to re-run.
filtered_observations = (
    filtered_observations
    .assign(VALUE=pd.to_numeric(filtered_observations['VALUE'], errors='coerce'))
    .dropna(subset=['VALUE'])
)

In [22]:
def compute_probabilities(observation_data, metric_name, threshold):
    """
    Compute threshold-based state and transition probabilities for one metric.

    Rows of ``observation_data`` whose DESCRIPTION equals ``metric_name`` are
    selected. Each VALUE is classed as "below" (``<= threshold``) or "above"
    (``> threshold``).

    Returns a dict with:
      - 'prob_below' / 'prob_above': share of selected rows in each class.
      - 'transitions': [below->below, below->above, above->below, above->above],
        each the number of adjacent row pairs of that kind divided by
        (number of selected rows - 1).

    Adjacent means consecutive rows in the order they appear in
    ``observation_data``; rows are not grouped by patient or sorted by date here.

    When no row matches, a message is printed and every entry is 0.0. When
    exactly one row matches, a message is printed and the transitions are 0.0.
    """
    rows = observation_data[observation_data['DESCRIPTION'] == metric_name]
    n_rows = len(rows)
    if n_rows == 0:
        print(f"No data available for metric: {metric_name}")
        return {
            'prob_below': 0.0,
            'prob_above': 0.0,
            'transitions': [0.0] * 4,
        }

    values = rows['VALUE']
    is_below = values <= threshold
    is_above = values > threshold
    prob_below_threshold = int(is_below.sum()) / n_rows
    prob_above_threshold = int(is_above.sum()) / n_rows

    n_pairs = n_rows - 1
    if n_pairs == 0:
        print(f"Insufficient data for transitions in metric: {metric_name}")
        return {
            'prob_below': prob_below_threshold,
            'prob_above': prob_above_threshold,
            'transitions': [0.0] * 4,
        }

    # The last row has no successor: shift(-1) yields NaN there, and both
    # comparisons against NaN are False, so it never counts as a pair.
    following = values.shift(-1)
    next_below = following <= threshold
    next_above = following > threshold

    state_pairs = (
        (is_below, next_below),
        (is_below, next_above),
        (is_above, next_below),
        (is_above, next_above),
    )
    transitions = [
        int((current & upcoming).sum()) / n_pairs
        for current, upcoming in state_pairs
    ]

    return {
        'prob_below': prob_below_threshold,
        'prob_above': prob_above_threshold,
        'transitions': transitions,
    }


In [None]:
# List the distinct VALUE entries recorded for 'Tobacco smoking status'.
is_tobacco_status = observations['DESCRIPTION'].eq('Tobacco smoking status')
tobacco_data = observations.loc[is_tobacco_status]
print(tobacco_data['VALUE'].unique())

In [None]:
import pandas as pd

# This cell does not load anything itself: it reads `filtered_observations`,
# which earlier cells build from the Synthea observations output.
# Expected columns include: ['PATIENT', 'DATE', 'DESCRIPTION', 'VALUE']

# Quantile level that defines the threshold of each metric: the 0.90 quantile
# for some metrics and the 0.10 quantile for others.
metric_quantile_levels = {
    'Systolic Blood Pressure': 0.90,
    'Diastolic Blood Pressure': 0.90,
    'Heart rate': 0.90,
    'Respiratory rate': 0.90,
    'Hemoglobin [Mass/volume] in Blood': 0.10,
    'Hematocrit [Volume Fraction] of Blood by Automated count': 0.10,
    'Leukocytes [#/volume] in Blood by Automated count': 0.90,
    'Erythrocytes [#/volume] in Blood by Automated count': 0.10,
    'Platelets [#/volume] in Blood by Automated count': 0.90,
    'Glucose [Mass/volume] in Blood': 0.90,
    'Cholesterol [Mass/volume] in Serum or Plasma': 0.90,
    'Triglycerides': 0.90,
    'Low Density Lipoprotein Cholesterol': 0.90,
    'Cholesterol in HDL [Mass/volume] in Serum or Plasma': 0.10,
}

# Calculate the dynamic threshold for each metric: the chosen quantile of the
# VALUEs recorded under that DESCRIPTION in filtered_observations.
metric_thresholds = {
    metric_name: filtered_observations.loc[
        filtered_observations['DESCRIPTION'] == metric_name, 'VALUE'
    ].quantile(level)
    for metric_name, level in metric_quantile_levels.items()
}

# Display the computed thresholds
print("Dynamic Metric Thresholds:")
for metric, threshold in metric_thresholds.items():
    print(f"{metric}: {threshold}")


# Compute the state/transition probabilities of every metric at its threshold.
# The per-metric result uses its own name so that it does not overwrite the
# module-level `probabilities` name that later cells bind to a DataFrame.
dbns = {}
for metric, threshold in metric_thresholds.items():
    metric_probabilities = compute_probabilities(filtered_observations, metric, threshold)
    dbns[metric] = metric_probabilities
    print(f"Probabilities for {metric}: {metric_probabilities}")


In [25]:
# !pip3 install pgmpy

In [None]:
import json

import pandas as pd
from pgmpy.estimators import HillClimbSearch, BicScore
from pgmpy.models import BayesianNetwork


# Structure learning input: the observations table exactly as stored on disk.
# NOTE(review): the search therefore runs over the table's own columns, not
# over one column per metric - confirm this is the intended input.
data = pd.read_csv('output/csv/observations.csv')

# Call head() so the first rows are shown; printing `data.head` without the
# parentheses only shows the bound method.
print(data.head())

# Hill-climbing structure search scored with BIC.
hc = HillClimbSearch(data)
best_model = hc.estimate(scoring_method=BicScore(data))


print("Learned Metric Relationships:")
print(best_model.edges())


# Persist every learned edge as a {"parent": ..., "child": ...} record.
relationships = best_model.edges()
with open('metric_relationships.json', 'w') as f:
    json.dump([{"parent": parent, "child": child} for parent, child in relationships], f)


In [None]:
import pandas as pd

# Load the conditions and observations data from Synthea
conditions = pd.read_csv('output/csv/conditions.csv')
observations = pd.read_csv('output/csv/observations.csv')

# Metrics to analyze, each mapped to the quantile of its own readings above
# which a reading counts as abnormal.
metrics = {
    'Heart rate': 0.90,  # Top 10% for high heart rate
    'Systolic Blood Pressure': 0.90,
    'Diastolic Blood Pressure': 0.90,
    'Respiratory rate': 0.90,  # Top 10% for abnormal respiratory rate
    'Glucose [Mass/volume] in Blood': 0.90,
    'Cholesterol [Mass/volume] in Serum or Plasma': 0.90,
    'Triglycerides': 0.90,
    'Body mass index (BMI) [Ratio]': 0.90
}

# Convert the VALUE column to numeric, coercing errors to NaN
observations['VALUE'] = pd.to_numeric(observations['VALUE'], errors='coerce')

# Rows of the output table: [metric, condition, conditional_probability]
data_for_csv = []

for metric, quantile in metrics.items():
    is_metric = observations['DESCRIPTION'] == metric

    # Abnormal threshold: the requested quantile of this metric's numeric readings.
    metric_threshold = observations.loc[is_metric, 'VALUE'].dropna().quantile(quantile)

    # Every metric treats readings above its threshold as abnormal. Comparisons
    # against NaN are False, so missing readings (or a NaN threshold when the
    # metric has no data) never produce an abnormal reading.
    abnormal_readings = observations[is_metric & (observations['VALUE'] > metric_threshold)]

    # Patients with at least one abnormal reading of this metric.
    abnormal_patients = abnormal_readings['PATIENT'].unique()
    total_patients_with_abnormal_metric = len(abnormal_patients)
    if total_patients_with_abnormal_metric == 0:
        # No abnormal patients: nothing to condition on for this metric.
        continue

    # Conditions recorded for those patients.
    related_conditions = conditions[conditions['PATIENT'].isin(abnormal_patients)]

    # P(condition | abnormal metric) = patients with the condition / patients
    # with the abnormal metric. Each (patient, condition) pair is counted once
    # so that a condition recorded several times for one patient does not
    # inflate the estimate, and the denominator is the patient count rather
    # than the number of condition records.
    patients_per_condition = (
        related_conditions
        .drop_duplicates(subset=['PATIENT', 'DESCRIPTION'])['DESCRIPTION']
        .value_counts()
    )
    condition_probabilities = patients_per_condition / total_patients_with_abnormal_metric

    for condition, probability in condition_probabilities.items():
        data_for_csv.append([metric, condition, probability])

# Create a DataFrame from the data
df_conditional_probabilities = pd.DataFrame(
    data_for_csv, columns=['metric', 'condition', 'conditional_probability']
)

# Save the DataFrame to a CSV file
output_csv_path = 'output/conditional_probabilities_for_metrics.csv'
df_conditional_probabilities.to_csv(output_csv_path, index=False)

print(f"CSV file with conditional probabilities for each metric saved at: {output_csv_path}")


In [None]:
# Reload the observations table, preview its first rows and list every
# distinct DESCRIPTION it contains.
observations = pd.read_csv('output/csv/observations.csv')
# head() must be called; `observations.head` alone prints the bound method.
print(observations.head())
uniquemetrics = observations['DESCRIPTION'].unique()
print(uniquemetrics)

In [None]:
# Show one patient's observations whose DESCRIPTION contains
# 'Oxygen saturation in Arterial blood' (case-insensitive match).
patient_id = '8a2ab9dc-d34e-9f31-9a98-14bcf27330c7'
is_patient = observations['PATIENT'] == patient_id
# na=False makes rows with a missing DESCRIPTION count as "no match" instead of
# putting NaN into the boolean mask; the stroke search on conditions does the same.
is_oxygen_saturation = observations['DESCRIPTION'].str.contains(
    'Oxygen saturation in Arterial blood', case=False, na=False
)
data = observations[is_patient & is_oxygen_saturation]
print(data)

In [52]:
import pandas as pd

# Load conditions and observations data
conditions = pd.read_csv('output/csv/conditions.csv')
observations = pd.read_csv('output/csv/observations.csv')

# Search for stroke-related terms in the DESCRIPTION column of conditions.
# The terms are joined into one case-insensitive regular-expression alternation.
stroke_related_terms = ['stroke']
mask = conditions['DESCRIPTION'].str.contains('|'.join(stroke_related_terms), case=False, na=False)

# Filter stroke-related condition rows and the distinct patients they belong to
stroke_patients = conditions[mask]
stroke_patient_ids = stroke_patients['PATIENT'].unique()

# Build the set of patients that have the target observation once, instead of
# re-filtering the whole observations table for every stroke patient.
has_oxygen_saturation = observations['DESCRIPTION'] == 'Oxygen saturation in Arterial blood'
patients_with_oxygen_saturation = set(
    observations.loc[has_oxygen_saturation, 'PATIENT'].dropna()
)

# Print each stroke patient that has at least one such observation, in the
# order the patients first appear in the conditions table.
for patient_id in stroke_patient_ids:
    if patient_id in patients_with_oxygen_saturation:
        print(patient_id)


In [None]:
import pandas as pd
# Read back the saved conditional-probability table, preview it and list the
# distinct metrics it covers.
probabilities = pd.read_csv('output/conditional_probabilities_for_metrics.csv')
print(probabilities.head())  # Inspect the first few rows
metric_column = probabilities['metric']
uniquemetrics = metric_column.unique()
print(uniquemetrics)

In [7]:
def get_probabilities(metric):
    """
    Return {condition: conditional_probability} for one metric.

    Reads the module-level ``probabilities`` table. When the table has no row
    for ``metric`` a message is printed and an empty dict is returned.
    """
    rows_for_metric = probabilities.loc[probabilities['metric'] == metric]
    if rows_for_metric.empty:
        print(f"No probabilities found for metric: {metric}")
        return {}
    by_condition = rows_for_metric.set_index('condition')
    return by_condition['conditional_probability'].to_dict()

In [None]:
# Look up the condition probabilities for 'Heart rate' and report the result.
heart_rate_probs = get_probabilities('Heart rate')
if heart_rate_probs:
    print(f"Heart rate probabilities: {heart_rate_probs}")
else:
    print("No heart rate probabilities found")

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import pymc as pm
import time
import signal
import sys

# Load Synthea data
# observations: read by collect_blood_pressure_data below.
# probabilities: table with 'metric', 'condition' and 'conditional_probability'
# columns, read by get_probabilities below.
observations = pd.read_csv('output/csv/observations.csv')
probabilities = pd.read_csv('output/conditional_probabilities_for_metrics.csv')

# Function to get probabilities from the CSV
def get_probabilities(metric, threshold=0.05):
    """
    Return {condition: conditional_probability} for one metric, keeping only
    entries whose probability is strictly greater than ``threshold``.

    Reads the module-level ``probabilities`` table; the result is an empty
    dict when no row qualifies.
    """
    matches_metric = probabilities['metric'] == metric
    exceeds_threshold = probabilities['conditional_probability'] > threshold
    selected = probabilities[matches_metric & exceeds_threshold]
    return selected.set_index('condition')['conditional_probability'].to_dict()

# Function to collect blood pressure data
def collect_blood_pressure_data(patient_id):
    """
    Return one patient's blood-pressure observations with a numeric VALUE.

    Rows are taken from the module-level ``observations`` table where PATIENT
    equals ``patient_id`` and DESCRIPTION contains 'Blood Pressure'. The match
    is case-sensitive, so it does not select 'Mean blood pressure'. VALUE is
    converted with ``pd.to_numeric``; rows that cannot be converted are dropped.

    The result is an independent DataFrame; ``observations`` is not modified.
    """
    is_patient = observations['PATIENT'] == patient_id
    # na=False: a missing DESCRIPTION counts as "no match" instead of putting
    # NaN into the boolean mask.
    is_blood_pressure = observations['DESCRIPTION'].str.contains('Blood Pressure', na=False)

    # Copy before assigning: writing a column on a slice of `observations`
    # raises SettingWithCopyWarning and is not guaranteed to take effect.
    data = observations[is_patient & is_blood_pressure].copy()
    data['VALUE'] = pd.to_numeric(data['VALUE'], errors='coerce')
    return data.dropna(subset=['VALUE'])

# Function to build the Bayesian model
def build_blood_pressure_causal_model(bp_data, bp_probs):
    """
    Estimate a blood-pressure risk score from one patient's readings.

    Parameters
    ----------
    bp_data : DataFrame with 'DESCRIPTION' and numeric 'VALUE' columns; rows
        described as 'Systolic Blood Pressure' and 'Diastolic Blood Pressure'
        are used.
    bp_probs : passed through unchanged as the second return value.

    Returns
    -------
    (bp_risk, bp_probs). ``bp_risk`` is the share of sampled systolic values
    above 140 plus the share of sampled diastolic values above 90, so it lies
    between 0 and 2. The samples are drawn from two Normal variables whose mean
    and standard deviation are those of the patient's readings; no observed
    data is attached to the model.

    Raises
    ------
    ValueError
        If either series has no finite mean or no finite, positive standard
        deviation (no readings, a single reading, or identical readings).
        Such values cannot parameterise the Normal variables.
    """
    systolic_data = bp_data[bp_data['DESCRIPTION'] == 'Systolic Blood Pressure']['VALUE']
    diastolic_data = bp_data[bp_data['DESCRIPTION'] == 'Diastolic Blood Pressure']['VALUE']

    mean_systolic = systolic_data.mean()
    std_systolic = systolic_data.std()
    mean_diastolic = diastolic_data.mean()
    std_diastolic = diastolic_data.std()

    # Fail early with a readable message instead of handing NaN or zero
    # parameters to the sampler.
    summaries = (
        ('systolic', mean_systolic, std_systolic, len(systolic_data)),
        ('diastolic', mean_diastolic, std_diastolic, len(diastolic_data)),
    )
    for label, mean, std, count in summaries:
        if not (np.isfinite(mean) and np.isfinite(std) and std > 0):
            raise ValueError(
                f"Cannot build blood pressure model: {label} readings need at least "
                f"two distinct numeric values (got {count} reading(s), "
                f"mean={mean}, std={std})"
            )

    with pm.Model() as model:
        systolic_prior = pm.Normal('systolic_prior', mu=mean_systolic, sigma=std_systolic)
        diastolic_prior = pm.Normal('diastolic_prior', mu=mean_diastolic, sigma=std_diastolic)
        trace = pm.sample(1000, chains=2, tune=500, return_inferencedata=False)

        # Sum of the two exceedance frequencies (each between 0 and 1).
        bp_risk = np.mean(trace['systolic_prior'] > 140) + np.mean(trace['diastolic_prior'] > 90)

    return bp_risk, bp_probs


# Run the blood-pressure model for one patient.
patient_id = '8a2ab9dc-d34e-9f31-9a98-14bcf27330c7'
chain = {}

bp_data = collect_blood_pressure_data(patient_id)

# The metric must be spelled exactly as in the probabilities table; the
# previous 'bSystolic Blood Pressure' matched no row, so the lookup was
# always empty.
bp_probs = get_probabilities('Systolic Blood Pressure')
bp_risk, cause_probabilities = build_blood_pressure_causal_model(bp_data, bp_probs)
# bp_risk above 0.5 is labelled "high", anything else "normal".
bp_status = "high" if bp_risk > 0.5 else "normal"



In [None]:
import pandas as pd
import json

# Load data (same as used in distributed devices)
# observations: filtered by PATIENT and DESCRIPTION in compute_central_chain.
# probabilities: table with 'metric', 'condition' and 'conditional_probability'
# columns, read by compute_central_chain.
observations = pd.read_csv('output/csv/observations.csv')
probabilities = pd.read_csv('output/conditional_probabilities_for_metrics.csv')

def compute_central_chain(patient_id):
    """
    Generate a causal chain centrally using all metrics.

    Starting from "Heart rate", each step looks up the rows of the module-level
    ``probabilities`` table for the current metric, takes their 'condition'
    entries that have not been visited yet as candidates, and moves to the
    candidate with the highest 'conditional_probability'.

    The walk stops when the patient has no observation for the current metric
    (checked against the module-level ``observations`` table), when the
    probabilities table has no row for it, or when no unvisited candidate is
    left.

    Returns a list of dicts, one per step, with keys "metric", "relevance"
    (candidate -> probability) and "next_metric".
    """
    chain = []
    visited_metrics = set()
    current_metric = "Heart rate"  # Start point (can be dynamic)

    # The patient filter does not change between steps, so apply it once.
    patient_descriptions = observations.loc[
        observations['PATIENT'] == patient_id, 'DESCRIPTION'
    ]

    while current_metric:
        print(f"Processing {current_metric} in central chain...")

        # Mark the current metric as visited before scoring candidates, so it
        # can never be selected as its own successor.
        visited_metrics.add(current_metric)

        # The chain only continues through metrics the patient has data for.
        if not (patient_descriptions == current_metric).any():
            print(f"No data for {current_metric}. Skipping.")
            break

        # Get probabilities for the current metric
        likelihoods = probabilities[probabilities['metric'] == current_metric]
        if likelihoods.empty:
            print(f"No probabilities available for {current_metric}.")
            break

        # Candidate successors and their scores. If a candidate appears in
        # several rows, the last row wins.
        relevance_scores = {
            candidate: float(score)
            for candidate, score in zip(
                likelihoods['condition'], likelihoods['conditional_probability']
            )
            if candidate not in visited_metrics
        }

        if not relevance_scores:
            print(f"No relevant metrics found for {current_metric}. Ending chain.")
            break

        # Determine the next metric based on relevance scores
        next_metric = max(relevance_scores, key=relevance_scores.get)

        # Add to chain
        chain.append({
            "metric": current_metric,
            "relevance": relevance_scores,
            "next_metric": next_metric
        })

        current_metric = next_metric

    print("Central chain generation complete.")
    return chain

# Build the central chain for one test patient.
patient_id = '86def1b6-28c8-d5e4-39e7-d18bc696eb17'
central_chain = compute_central_chain(patient_id)

# Serialise the chain with a two-space indent and write it out for comparison.
central_chain_json = json.dumps(central_chain, indent=2)
with open('central_chain.json', 'w') as f:
    f.write(central_chain_json)


In [None]:
# Preview the observations described as 'Medication review due (situation)'.
is_medication_review = observations['DESCRIPTION'].eq('Medication review due (situation)')
medication_data = observations.loc[is_medication_review]
print(medication_data.head())