In [2]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

import os

# Location of the csv file, relative to the working directory
path = os.path.join("dataset.csv")

# Read the csv into a dataframe named dataset, discarding the
# 'seqn' and 'Marital' columns, which are not needed
dataset = pd.read_csv(path).drop(columns=['seqn', 'Marital'])

In [None]:
# Encoding the categorical variables and filling in the missing values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)
print(type(dataset['Sex'][0]))

# One fill is sufficient: fillna(2) replaces every missing value in every
# column, so chaining further fillna(4) / fillna(5) calls after it can never
# change anything (no NaN is left for them to fill).
# NOTE(review): the constant 2 is written into *all* columns with missing
# values, numeric measurements included - confirm this is the intended
# imputation rather than a per-column fill.
dataset = dataset.fillna(2)

In [None]:
# Prior: share of all rows whose MetabolicSyndrome label equals 1
positive_rows = dataset.loc[dataset['MetabolicSyndrome'] == 1]
prior = len(positive_rows) / len(dataset)
print(prior)

In [None]:
def safe_division(numerator, denominator):
    """
    Safely perform division, returning 0 when the division cannot be done.

    Parameters
    ----------
    numerator, denominator
        Operands of the division.

    Returns
    -------
    ``numerator / denominator``, or 0 if ``denominator`` equals 0 or the
    operands cannot be divided (arithmetic, type or value errors).
    """
    try:
        if denominator != 0:
            return numerator / denominator
        return 0
    except (TypeError, ValueError, ArithmeticError):
        # Only failures of the division itself are mapped to 0. A bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit and
        # hide unrelated programming errors.
        return 0

def calculate_likelihoods(dataset):
    """
    Calculate likelihoods for all variables based on clinical thresholds
    with error handling for zero cases.

    Every value is the fraction of metabolic-syndrome rows
    (``MetabolicSyndrome == 1``) that satisfy a condition. Sex-specific
    measures (age, waist, HDL, uric acid) are fractions of the male
    (``Sex == 0``) or female (``Sex == 1``) metabolic-syndrome rows; all other
    measures are fractions of every metabolic-syndrome row.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs the columns MetabolicSyndrome, Sex, Age, Race, BMI, WaistCirc,
        BloodGlucose, HDL, Triglycerides, Albuminuria, UrAlbCr and UricAcid.
        Sex and Race are compared against their integer codes.

    Returns
    -------
    dict
        Measure name -> likelihood. A likelihood is 0 when its denominator
        group is empty (see ``safe_division``).
    """
    has_ms = dataset['MetabolicSyndrome'] == 1
    ms_male = has_ms & (dataset['Sex'] == 0)
    ms_female = has_ms & (dataset['Sex'] == 1)

    def count(mask):
        """Number of dataset rows selected by a boolean mask."""
        return len(dataset[mask])

    total_ms = count(has_ms)
    male_ms = count(ms_male)
    female_ms = count(ms_female)

    likelihoods = {}

    # Age (different thresholds by sex)
    likelihoods['age_male'] = safe_division(
        count(ms_male & (dataset['Age'] >= 45)), male_ms)
    likelihoods['age_female'] = safe_division(
        count(ms_female & (dataset['Age'] >= 55)), female_ms)

    # Sex (share of each sex among metabolic-syndrome rows)
    likelihoods['sex_male'] = safe_division(male_ms, total_ms)
    likelihoods['sex_female'] = safe_division(female_ms, total_ms)

    # Race. Codes follow the encoding
    # {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5};
    # MexAmerican is 3 and Hispanic is 4 (not 4 and 5). 'Other' (5) is not reported.
    race_codes = {
        'race_white': 0,
        'race_asian': 1,
        'race_black': 2,
        'race_MexAmerican': 3,
        'race_Hispanic': 4,
    }
    for key, code in race_codes.items():
        likelihoods[key] = safe_division(
            count(has_ms & (dataset['Race'] == code)), total_ms)

    # BMI
    likelihoods['bmi'] = safe_division(
        count(has_ms & (dataset['BMI'] >= 30)), total_ms)

    # Waist Circumference (gender-specific)
    likelihoods['waist_male'] = safe_division(
        count(ms_male & (dataset['WaistCirc'] >= 102)), male_ms)
    likelihoods['waist_female'] = safe_division(
        count(ms_female & (dataset['WaistCirc'] >= 88)), female_ms)

    # Blood Glucose
    likelihoods['glucose'] = safe_division(
        count(has_ms & (dataset['BloodGlucose'] >= 100)), total_ms)

    # HDL (gender-specific)
    likelihoods['hdl_male'] = safe_division(
        count(ms_male & (dataset['HDL'] < 40)), male_ms)
    likelihoods['hdl_female'] = safe_division(
        count(ms_female & (dataset['HDL'] < 50)), female_ms)

    # Triglycerides
    likelihoods['triglycerides'] = safe_division(
        count(has_ms & (dataset['Triglycerides'] >= 150)), total_ms)

    # Albuminuria
    likelihoods['albuminuria'] = safe_division(
        count(has_ms & (dataset['Albuminuria'] == 1)), total_ms)

    # UrAlbCr
    likelihoods['uralbcr'] = safe_division(
        count(has_ms & (dataset['UrAlbCr'] >= 30)), total_ms)

    # Uric Acid (gender-specific)
    likelihoods['uric_acid_male'] = safe_division(
        count(ms_male & (dataset['UricAcid'] > 7.0)), male_ms)
    likelihoods['uric_acid_female'] = safe_division(
        count(ms_female & (dataset['UricAcid'] > 6.0)), female_ms)

    return likelihoods

def format_results(likelihoods):
    """
    Turn a {measure: likelihood} mapping into a table for display.

    The returned DataFrame has the columns 'Measure', 'Likelihood' (rounded
    to 3 decimals) and 'Percentage' (the rounded likelihood as a percent
    string with one decimal), ordered from highest to lowest likelihood.
    """
    table = pd.DataFrame(list(likelihoods.items()), columns=['Measure', 'Likelihood'])
    rounded = table['Likelihood'].round(3)
    as_percent = (rounded * 100).round(1).astype(str) + '%'
    table = table.assign(Likelihood=rounded, Percentage=as_percent)
    return table.sort_values(by='Likelihood', ascending=False)

# Compute the likelihood of each measure on the full dataset, then print them
# as a table sorted from highest to lowest likelihood.
likelihoods = calculate_likelihoods(dataset)
results_df = format_results(likelihoods)
print(results_df)

In [None]:
# Calculate likelihoods using the existing function
likelihoods = calculate_likelihoods(dataset)

# Row masks reused below
has_ms = dataset['MetabolicSyndrome'] == 1
is_male = dataset['Sex'] == 0
is_female = dataset['Sex'] == 1
everyone = pd.Series(True, index=dataset.index)

# Calculate prior (overall prevalence of MetabolicSyndrome == 1)
prior = len(dataset[has_ms]) / len(dataset)

# feature -> (population its likelihood was conditioned on, rows showing the feature).
# calculate_likelihoods divides the sex-specific measures by the male / female
# metabolic-syndrome counts, so their prior and P(Feature) must be taken within
# the same sex. Mixing a sex-conditional likelihood with a whole-dataset prior
# and evidence inflates the posterior (it can even exceed 1).
feature_conditions = {
    'age_male': (is_male, dataset['Age'] >= 45),
    'age_female': (is_female, dataset['Age'] >= 55),
    'waist_male': (is_male, dataset['WaistCirc'] >= 102),
    'waist_female': (is_female, dataset['WaistCirc'] >= 88),
    'hdl_male': (is_male, dataset['HDL'] < 40),
    'hdl_female': (is_female, dataset['HDL'] < 50),
    'uric_acid_male': (is_male, dataset['UricAcid'] > 7.0),
    'uric_acid_female': (is_female, dataset['UricAcid'] > 6.0),
    'bmi': (everyone, dataset['BMI'] >= 30),
    'glucose': (everyone, dataset['BloodGlucose'] >= 100),
    'triglycerides': (everyone, dataset['Triglycerides'] >= 150),
    'albuminuria': (everyone, dataset['Albuminuria'] == 1),
    'uralbcr': (everyone, dataset['UrAlbCr'] >= 30),
}

# Calculate posteriors using Bayes' Theorem within each feature's population
# P(MetSyn|Feature) = P(Feature|MetSyn) * P(MetSyn) / P(Feature)
posteriors = {}

for feature, likelihood in likelihoods.items():
    if feature not in feature_conditions:
        # sex_* and race_* likelihoods have no threshold condition; skipped
        continue

    population, has_feature = feature_conditions[feature]
    population_size = len(dataset[population])

    # P(MetSyn) and P(Feature) restricted to the population the likelihood
    # refers to; for whole-dataset features these equal the overall values.
    feature_prior = safe_division(len(dataset[population & has_ms]), population_size)
    p_feature = safe_division(len(dataset[population & has_feature]), population_size)

    # Apply Bayes' Theorem; a feature nobody exhibits gets posterior 0
    if p_feature > 0:
        posteriors[feature] = (likelihood * feature_prior) / p_feature
    else:
        posteriors[feature] = 0

# Format results
results = pd.DataFrame(posteriors.items(), columns=['Parameter', 'Posterior'])
results['Posterior'] = results['Posterior'].round(3)
results['Percentage'] = (results['Posterior'] * 100).round(1).astype(str) + '%'
results = results.sort_values('Posterior', ascending=False)

print(f"Prior probability of Metabolic Syndrome: {prior:.3f} ({prior*100:.1f}%)\n")
print("Posterior probabilities by parameter:")
print(results)