In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')

In [2]:
# Encoding the categorical variables and filling in the missing values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)
print(type(dataset['Sex'][0]))
dataset.iloc[:, 2] = dataset.iloc[:, 2].fillna(dataset.iloc[:, 2].mean())
dataset.iloc[:, 4] = dataset.iloc[:, 4].fillna(dataset.iloc[:, 4].mean())
dataset.iloc[:, 5] = dataset.iloc[:, 5].fillna(dataset.iloc[:, 5].mean())


<class 'numpy.int64'>


  dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
  dataset['Race'] = dataset['Race'].replace(race_mapping)


In [3]:
prior = len(dataset[dataset['MetabolicSyndrome'] == 1]) / len(dataset)
print(prior)

0.3423573511037068


In [4]:
def safe_division(numerator, denominator):
    """
    Safely perform division, returning 0 if denominator is 0
    """
    try:
        return numerator / denominator if denominator != 0 else 0
    except:
        return 0

def calculate_likelihoods(dataset):
    """
    Calculate likelihoods for all variables based on clinical thresholds
    with error handling for zero cases
    """
    likelihoods = {}
    
    
    # Age (different thresholds by sex)
    male_ms = len(dataset[(dataset['MetabolicSyndrome']==1) & (dataset['Sex']==0)])
    female_ms = len(dataset[(dataset['MetabolicSyndrome']==1) & (dataset['Sex']==1)])
    total_ms = len(dataset[dataset['MetabolicSyndrome']==1])
    
    likelihoods['age_male'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['Sex']==0) &
            (dataset['Age'] >= 45)]),
        male_ms
    )
    
    likelihoods['age_female'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['Sex']==1) &
            (dataset['Age'] >= 55)]),
        female_ms
    )
    
    # Sex (overall prevalence by sex)
    likelihoods['sex_male'] = safe_division(male_ms, total_ms)
    likelihoods['sex_female'] = safe_division(female_ms, total_ms)
    
    # race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}
    likelihoods['race_white'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) &
            (dataset['Race'] == 0)
                    ]), total_ms)
    likelihoods['race_asian'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) &
            (dataset['Race'] == 1)
                    ]), total_ms)
    likelihoods['race_black'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) &
            (dataset['Race'] == 2)
                    ]), total_ms) 
    likelihoods['race_MexAmerican'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) &
            (dataset['Race'] == 4)
                    ]), total_ms)
        
    likelihoods['race_Hispanic'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) &
            (dataset['Race'] == 5)
                    ]), total_ms)

    
    
    # BMI
    likelihoods['bmi'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['BMI'] >= 30)]),
        total_ms
    )
    
    # Waist Circumference (gender-specific)
    likelihoods['waist_male'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['Sex']==0) &
            (dataset['WaistCirc'] >= 102)]),
        male_ms
    )
    
    likelihoods['waist_female'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['Sex']==1) &
            (dataset['WaistCirc'] >= 88)]),
        female_ms
    )
    
    # Blood Glucose
    likelihoods['glucose'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['BloodGlucose'] >= 100)]),
        total_ms
    )
    
    # HDL (gender-specific)
    likelihoods['hdl_male'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['Sex']==0) &
            (dataset['HDL'] < 40)]),
        male_ms
    )
    
    likelihoods['hdl_female'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['Sex']==1) &
            (dataset['HDL'] < 50)]),
        female_ms
    )
    
    # Triglycerides
    likelihoods['triglycerides'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['Triglycerides'] >= 150)]),
        total_ms
    )
    
    # Albuminuria
    likelihoods['albuminuria'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['Albuminuria'] == 1)]),
        total_ms
    )
    
    # UrAlbCr
    likelihoods['uralbcr'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['UrAlbCr'] >= 30)]),
        total_ms
    )
    
    # Uric Acid (gender-specific)
    likelihoods['uric_acid_male'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['Sex']==0) &
            (dataset['UricAcid'] > 7.0)]),
        male_ms
    )
    
    likelihoods['uric_acid_female'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome']==1) & 
            (dataset['Sex']==1) &
            (dataset['UricAcid'] > 6.0)]),
        female_ms
    )
    
    return likelihoods

def format_results(likelihoods):
    """
    Format the likelihoods into a readable DataFrame
    """
    results = pd.DataFrame(likelihoods.items(), columns=['Measure', 'Likelihood'])
    results['Likelihood'] = results['Likelihood'].round(3)
    results['Percentage'] = (results['Likelihood'] * 100).round(1).astype(str) + '%'
    return results.sort_values('Likelihood', ascending=False)

likelihoods = calculate_likelihoods(dataset)
results_df = format_results(likelihoods)
print(results_df)

             Measure  Likelihood Percentage
11      waist_female       0.933      93.3%
12           glucose       0.855      85.5%
10        waist_male       0.768      76.8%
0           age_male       0.673      67.3%
9                bmi       0.624      62.4%
14        hdl_female       0.610      61.0%
1         age_female       0.551      55.1%
15     triglycerides       0.547      54.7%
2           sex_male       0.510      51.0%
3         sex_female       0.490      49.0%
13          hdl_male       0.468      46.8%
4         race_white       0.410      41.0%
19  uric_acid_female       0.320      32.0%
18    uric_acid_male       0.291      29.1%
6         race_black       0.220      22.0%
17           uralbcr       0.203      20.3%
16       albuminuria       0.159      15.9%
7   race_MexAmerican       0.125      12.5%
5         race_asian       0.099       9.9%
8      race_Hispanic       0.021       2.1%


In [None]:
# Calculate likelihoods using your existing function
likelihoods = calculate_likelihoods(dataset)

# Calculate prior
prior = len(dataset[dataset['MetabolicSyndrome'] == 1]) / len(dataset)

# Calculate posteriors using Bayes' Theorem
# P(MetSyn|Feature) = P(Feature|MetSyn) * P(MetSyn) / P(Feature)
posteriors = {}

for feature, likelihood in likelihoods.items():
    # For each feature, need to calculate P(Feature) in total population
    if feature.startswith('age_male'):
        total_positive = len(dataset[(dataset['Sex']==0) & (dataset['Age'] >= 45)])
    elif feature.startswith('age_female'):
        total_positive = len(dataset[(dataset['Sex']==1) & (dataset['Age'] >= 55)])
    elif feature.startswith('waist_male'):
        total_positive = len(dataset[(dataset['Sex']==0) & (dataset['WaistCirc'] >= 102)])
    elif feature.startswith('waist_female'):
        total_positive = len(dataset[(dataset['Sex']==1) & (dataset['WaistCirc'] >= 88)])
    elif feature.startswith('hdl_male'):
        total_positive = len(dataset[(dataset['Sex']==0) & (dataset['HDL'] < 40)])
    elif feature.startswith('hdl_female'):
        
        total_positive = len(dataset[(dataset['Sex']==1) & (dataset['HDL'] < 50)])
    elif feature == 'race_white':
        total_positive = len(dataset[dataset['Race'] == 0])
    elif feature == 'race_asian':
        total_positive = len(dataset[dataset['Race'] == 1])

    elif feature.startswith('uric_acid_male'):
        total_positive = len(dataset[(dataset['Sex']==0) & (dataset['UricAcid'] > 7.0)])
    elif feature.startswith('uric_acid_female'):
        total_positive = len(dataset[(dataset['Sex']==1) & (dataset['UricAcid'] > 6.0)])
    elif feature == 'bmi':
        total_positive = len(dataset[dataset['BMI'] >= 30])
    elif feature == 'glucose':
        total_positive = len(dataset[dataset['BloodGlucose'] >= 100])
    elif feature == 'triglycerides':
        total_positive = len(dataset[dataset['Triglycerides'] >= 150])
    elif feature == 'albuminuria':
        total_positive = len(dataset[dataset['Albuminuria'] == 1])
    elif feature == 'uralbcr':
        total_positive = len(dataset[dataset['UrAlbCr'] >= 30])
    
    else:
        continue

    # Calculate P(Feature) - total probability of feature in population
    p_feature = total_positive / len(dataset)
    
    # Apply Bayes' Theorem
    # P(MetSyn|Feature) = P(Feature|MetSyn) * P(MetSyn) / P(Feature)
    if p_feature > 0:
        posterior = (likelihood * prior) / p_feature
        posteriors[feature] = posterior
    else:
        posteriors[feature] = 0

# Format results
results = pd.DataFrame(posteriors.items(), columns=['Parameter', 'Posterior'])
results['Posterior'] = results['Posterior'].round(3)
results['Percentage'] = (results['Posterior'] * 100).round(1).astype(str) + '%'
results = results.sort_values('Posterior', ascending=False)

print(f"Prior probability of Metabolic Syndrome: {prior:.3f} ({prior*100:.1f}%)\n")
print("Posterior probabilities by parameter:")
print(results)

Prior probability of Metabolic Syndrome: 0.342 (34.2%)

Posterior probabilities by parameter:
           Parameter  Posterior Percentage
7         hdl_female      1.378     137.8%
6           hdl_male      1.354     135.4%
3         waist_male      1.319     131.9%
12  uric_acid_female      1.235     123.5%
4       waist_female      0.995      99.5%
1         age_female      0.993      99.3%
11    uric_acid_male      0.981      98.1%
0           age_male      0.833      83.3%
8      triglycerides      0.749      74.9%
2                bmi      0.599      59.9%
5            glucose      0.587      58.7%
10           uralbcr      0.535      53.5%
9        albuminuria      0.516      51.6%


In [1]:
import pandas as pd
import numpy as np

def safe_division(numerator, denominator):
    """
    Safely perform division, returning 0 if denominator is 0
    """
    try:
        return numerator / denominator if denominator != 0 else 0
    except:
        return 0

def calculate_likelihoods(dataset):
    """
    Calculate likelihoods for all variables based on clinical thresholds
    with error handling for zero cases
    """
    # Total number of metabolic syndrome cases
    total_cases = len(dataset[dataset['MetabolicSyndrome'] == 1])
    total_samples = len(dataset)
    
    # Separate male and female subsets for metabolic syndrome
    male_ms_cases = len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Sex'] == 0)])
    female_ms_cases = len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Sex'] == 1)])
    
    likelihoods = {}
    
    # Age Likelihoods
    likelihoods['age_male_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Sex'] == 0) & (dataset['Age'] >= 45)]),
        total_cases
    )
    
    likelihoods['age_female_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Sex'] == 1) & (dataset['Age'] >= 55)]),
        total_cases
    )
    
    # Race Likelihoods
    race_mapping = {0: 'White', 1: 'Asian', 2: 'Black', 3: 'MexAmerican', 4: 'Hispanic', 5: 'Other'}
    
    for race_code, race_name in race_mapping.items():
        likelihoods[f'race_{race_name.lower()}'] = safe_division(
            len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Race'] == race_code)]),
            total_cases
        )
    
    # BMI Likelihood
    likelihoods['bmi_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['BMI'] >= 30)]),
        total_cases
    )
    
    # Waist Circumference Likelihoods
    likelihoods['waist_male_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Sex'] == 0) & (dataset['WaistCirc'] >= 102)]),
        male_ms_cases
    )
    
    likelihoods['waist_female_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Sex'] == 1) & (dataset['WaistCirc'] >= 88)]),
        female_ms_cases
    )
    
    # Blood Glucose Likelihood
    likelihoods['glucose_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['BloodGlucose'] >= 100)]),
        total_cases
    )
    
    # HDL Likelihoods
    likelihoods['hdl_male_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Sex'] == 0) & (dataset['HDL'] < 40)]),
        male_ms_cases
    )
    
    likelihoods['hdl_female_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Sex'] == 1) & (dataset['HDL'] < 50)]),
        female_ms_cases
    )
    
    # Triglycerides Likelihood
    likelihoods['triglycerides_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Triglycerides'] >= 150)]),
        total_cases
    )
    
    # Albuminuria Likelihood
    likelihoods['albuminuria_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Albuminuria'] == 1)]),
        total_cases
    )
    
    # UrAlbCr Likelihood
    likelihoods['uralbcr_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['UrAlbCr'] >= 30)]),
        total_cases
    )
    
    # Uric Acid Likelihoods
    likelihoods['uric_acid_male_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Sex'] == 0) & (dataset['UricAcid'] > 7.0)]),
        male_ms_cases
    )
    
    likelihoods['uric_acid_female_risk'] = safe_division(
        len(dataset[(dataset['MetabolicSyndrome'] == 1) & (dataset['Sex'] == 1) & (dataset['UricAcid'] > 6.0)]),
        female_ms_cases
    )
    
    return likelihoods

def calculate_posteriors(dataset, likelihoods):
    """
    Calculate posterior probabilities using Bayes' Theorem
    """
    # Prior probability of Metabolic Syndrome
    prior = len(dataset[dataset['MetabolicSyndrome'] == 1]) / len(dataset)
    total_samples = len(dataset)
    
    posteriors = {}
    
    # Probability of feature in the entire population
    def calculate_feature_prob(condition):
        return len(dataset[condition]) / total_samples
    
    # Calculate posterior for each likelihood
    feature_conditions = {
        'age_male_risk': (dataset['Sex'] == 0) & (dataset['Age'] >= 45),
        'age_female_risk': (dataset['Sex'] == 1) & (dataset['Age'] >= 55),
        'bmi_risk': dataset['BMI'] >= 30,
        'waist_male_risk': (dataset['Sex'] == 0) & (dataset['WaistCirc'] >= 102),
        'waist_female_risk': (dataset['Sex'] == 1) & (dataset['WaistCirc'] >= 88),
        'glucose_risk': dataset['BloodGlucose'] >= 100,
        'hdl_male_risk': (dataset['Sex'] == 0) & (dataset['HDL'] < 40),
        'hdl_female_risk': (dataset['Sex'] == 1) & (dataset['HDL'] < 50),
        'triglycerides_risk': dataset['Triglycerides'] >= 150,
        'albuminuria_risk': dataset['Albuminuria'] == 1,
        'uralbcr_risk': dataset['UrAlbCr'] >= 30,
        'uric_acid_male_risk': (dataset['Sex'] == 0) & (dataset['UricAcid'] > 7.0),
        'uric_acid_female_risk': (dataset['Sex'] == 1) & (dataset['UricAcid'] > 6.0)
    }
    
    race_mapping = {0: 'White', 1: 'Asian', 2: 'Black', 3: 'MexAmerican', 4: 'Hispanic', 5: 'Other'}
    for race_code, race_name in race_mapping.items():
        feature_conditions[f'race_{race_name.lower()}'] = dataset['Race'] == race_code
    
    # Bayes' Theorem: P(A|B) = P(B|A) * P(A) / P(B)
    for feature, condition in feature_conditions.items():
        # Probability of the feature in the entire population
        p_feature = calculate_feature_prob(condition)
        
        # Likelihood of the feature given Metabolic Syndrome
        likelihood = likelihoods.get(feature, 0)
        
        # Calculate posterior probability
        if p_feature > 0:
            posterior = (likelihood * prior) / p_feature
            posteriors[feature] = min(1.0, posterior)  # Ensure posterior doesn't exceed 1
        else:
            posteriors[feature] = 0
    
    return prior, posteriors

# Main execution
def analyze_metabolic_syndrome(dataset):
    # Calculate likelihoods
    likelihoods = calculate_likelihoods(dataset)
    
    # Format and print likelihoods
    likelihood_df = pd.DataFrame(likelihoods.items(), columns=['Measure', 'Likelihood'])
    likelihood_df['Likelihood'] = likelihood_df['Likelihood'].round(3)
    likelihood_df['Percentage'] = (likelihood_df['Likelihood'] * 100).round(1).astype(str) + '%'
    print("Likelihood Analysis:")
    print(likelihood_df.sort_values('Likelihood', ascending=False))
    print("\n")
    
    # Calculate posteriors
    prior, posteriors = calculate_posteriors(dataset, likelihoods)
    
    # Format and print posteriors
    posterior_df = pd.DataFrame(posteriors.items(), columns=['Parameter', 'Posterior'])
    posterior_df['Posterior'] = posterior_df['Posterior'].round(3)
    posterior_df['Percentage'] = (posterior_df['Posterior'] * 100).round(1).astype(str) + '%'
    posterior_df = posterior_df.sort_values('Posterior', ascending=False)
    
    print(f"Prior probability of Metabolic Syndrome: {prior:.3f} ({prior*100:.1f}%)\n")
    print("Posterior Probabilities by Parameter:")
    print(posterior_df)
    
    return likelihood_df, posterior_df

# Prepare the dataset (assuming the preprocessing steps from the original code)
# You would load the dataset here as in the original code
# dataset = pd.read_csv('path_to_dataset.csv')
# Preprocessing steps (encoding, filling missing values, etc.)

# Call the analysis function
# analyze_metabolic_syndrome(dataset)

import os

# Defining the path for the csv file
path = os.path.join("dataset.csv")

# Storing the dataframe in a variable named dataset
dataset = pd.read_csv(path)

# Dropping the unnecessary columns
dataset = dataset.drop('seqn', axis='columns')
dataset = dataset.drop('Marital', axis='columns')
# Encoding the categorical variables and filling in the missing values
sex_mapping = {'Male': 0, 'Female': 1}
race_mapping = {'White': 0, 'Asian': 1, 'Black': 2, 'MexAmerican': 3, 'Hispanic': 4, 'Other': 5}

dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
dataset['Race'] = dataset['Race'].replace(race_mapping)
print(type(dataset['Sex'][0]))
dataset = dataset.fillna(2)
dataset = dataset.fillna(4)
dataset = dataset.fillna(5)
prior = len(dataset[dataset['MetabolicSyndrome'] == 1]) / len(dataset)
print(prior)
analyze_metabolic_syndrome(dataset)

<class 'numpy.int64'>
0.3423573511037068
Likelihood Analysis:
                  Measure  Likelihood Percentage
10      waist_female_risk       0.933      93.3%
11           glucose_risk       0.855      85.5%
9         waist_male_risk       0.768      76.8%
8                bmi_risk       0.624      62.4%
13        hdl_female_risk       0.610      61.0%
14     triglycerides_risk       0.547      54.7%
12          hdl_male_risk       0.468      46.8%
2              race_white       0.410      41.0%
0           age_male_risk       0.343      34.3%
18  uric_acid_female_risk       0.320      32.0%
17    uric_acid_male_risk       0.291      29.1%
1         age_female_risk       0.270      27.0%
4              race_black       0.220      22.0%
16           uralbcr_risk       0.203      20.3%
15       albuminuria_risk       0.159      15.9%
6           race_hispanic       0.125      12.5%
5        race_mexamerican       0.125      12.5%
3              race_asian       0.099       9.9%
7      

  dataset['Sex'] = dataset['Sex'].replace(sex_mapping)
  dataset['Race'] = dataset['Race'].replace(race_mapping)


(                  Measure  Likelihood Percentage
 0           age_male_risk       0.343      34.3%
 1         age_female_risk       0.270      27.0%
 2              race_white       0.410      41.0%
 3              race_asian       0.099       9.9%
 4              race_black       0.220      22.0%
 5        race_mexamerican       0.125      12.5%
 6           race_hispanic       0.125      12.5%
 7              race_other       0.021       2.1%
 8                bmi_risk       0.624      62.4%
 9         waist_male_risk       0.768      76.8%
 10      waist_female_risk       0.933      93.3%
 11           glucose_risk       0.855      85.5%
 12          hdl_male_risk       0.468      46.8%
 13        hdl_female_risk       0.610      61.0%
 14     triglycerides_risk       0.547      54.7%
 15       albuminuria_risk       0.159      15.9%
 16           uralbcr_risk       0.203      20.3%
 17    uric_acid_male_risk       0.291      29.1%
 18  uric_acid_female_risk       0.320      32.0%,