In [94]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Read the health-indicators CSV into the working DataFrame
data = pd.read_csv("diabetes_012_health_indicators_BRFSS2015.csv")

# Quick look at the raw table: first rows, column index, then summary statistics
for overview in (data.head(), data.columns, data.describe()):
    print(overview)

# Report the table dimensions (columns first, then rows)
row_total, column_total = data.shape
print("Colnum : ", column_total)
print("Rownum : ", row_total)

In [96]:
# Remove exact duplicate rows and report how many were dropped.
# Rebinding `data` (instead of inplace=True) keeps this cell safe to re-run.
print("Number of rows before removing duplicates: ", len(data))
data = data.drop_duplicates()
print("Number of rows after removing duplicates: ", len(data))

# Collapse the Diabetes_012 target into a binary 'Diabetes' column:
# values 1 and 2 become 1, every other value becomes 0.
# The guard makes the cell idempotent: on a second run Diabetes_012 has
# already been replaced, so the conversion is skipped instead of raising KeyError.
if 'Diabetes_012' in data.columns:
    data = data.assign(
        Diabetes=np.where(data['Diabetes_012'].isin([1, 2]), 1, 0)
    ).drop(columns=['Diabetes_012'])
print("Diabetes values:")
print(data['Diabetes'].value_counts(dropna=False))

Number of rows before removing duplicates:  253680
Number of rows after removing duplicates:  229781
Diabetes values:
0    190055
1     39726
Name: Diabetes, dtype: int64


In [None]:
# Column-level sanity checks, printed in this order:
#   1. number of distinct values per column
#   2. class counts of the binary 'Diabetes' target
#   3. dtype of every column
#   4. number of missing values per column
column_checks = (
    data.nunique(),
    data['Diabetes'].value_counts(dropna=False),
    data.dtypes,
    data.isnull().sum(),
)
for check in column_checks:
    print(check)

In [98]:
# Re-code the multi-level survey columns as integer codes.
# The same encoder object is refitted for each column, so after the loop it
# only remembers the classes of the last column in the list.
categorical_cols = ['Education', 'Income', 'GenHlth']
label_encoder = LabelEncoder()
for col in categorical_cols:
    encoded_codes = label_encoder.fit_transform(data[col])
    data[col] = encoded_codes

# TODO: a correlation matrix for the non-binary columns was planned here
# but is not implemented.


1. Joint P.

Aka je pravdepodobnost, ze ten, ktory ma vacsi príjem, ma lepsiu kvalitu vzdelavania a ma lepsi zdravotny stav?
$P(I,E,Gh) = P(I)\,P(E \mid I)\,\sum_{Ph} P(Ph)\,P(Gh \mid E,Ph)$, kde $P(Ph) = \sum_{Dw,Pa} P(Dw)\,P(Pa)\,P(Ph \mid Dw,Pa)$ - parent child


2. Marginal p.

Aka je pravdepodobnost, ze zdravotny stav cloveka je dobry?
$P(E) = \sum_{I} P(E \mid I)\,P(I)$ - parent child
$P(Ph) = \sum_{Dw,Pa} P(Dw)\,P(Pa)\,P(Ph \mid Dw,Pa)$
$P(Gh) = \sum_{E,Ph} P(E)\,P(Ph)\,P(Gh \mid E,Ph)$

$P(Gh) = \sum_{I,Dw,Pa,E,Ph} P(I,Dw,Pa,E,Ph,Gh)$

3. Joint

Aka je pravdepodobnost, ze mlady s vysokym cholesterolom ma nizky krvny tlak?


4. Conditional
Aka je pravdepodobnost, ze clovek ma diabetes, za predpokladu, ze ma nizky krvny tlak, vysoky cholesterol a je mlady?

5. conditional
Aka je pravdepodobnost, ze clovek ma dobry fyzicky stav, za predpokladu, ze sa casto prechadza, ale nie je fyzicky aktivny?


In [None]:

# Cut points used to collapse each multi-level column into two groups.
# With right=False below, every interval is closed on the left, so for
# example [0, 15, 31] yields the groups [0, 15) -> 0 and [15, 31) -> 1.
physhlth_bin = [0, 15, 31]
age_bin = [0, 6, 14]
income_bin = [0, 4, 8]
education_bin = [0, 3, 6]
GenHlth_bin = [0, 2, 5]
labels = [0, 1]

# Work on a separate copy so `data` keeps its un-binned values
data_bayes = data.copy()

# Binarise each column with pd.cut (two-way binning, not one-hot encoding).
# Values outside a column's cut points become NaN.
column_bins = {
    'PhysHlth': physhlth_bin,
    'Age': age_bin,
    'Income': income_bin,
    'Education': education_bin,
    'GenHlth': GenHlth_bin,
}
for column_name, edges in column_bins.items():
    data_bayes[column_name] = pd.cut(
        data_bayes[column_name], bins=edges, labels=labels, right=False
    )

# Check how the GenHlth rows split between the two groups (NaN included)
print(data_bayes['GenHlth'].value_counts(dropna=False))

# Columns whose stand-alone distributions are tabulated in a later cell
variables_parent_leaf = ['Age', 'Income', 'DiffWalk', 'PhysActivity']
# NOTE(review): vars_mid is not referenced in the visible part of the notebook
vars_mid = ["Education", 'PhysHlth', 'HighBP']
# Filled by the following cells with one probability table per column name
parent_distributions = {}
print(data_bayes.nunique())



In [None]:
# Print the binned copy of the data for a visual check
print(data_bayes)

In [None]:

# Tabulate the relative frequency of every value for each column listed in
# variables_parent_leaf and keep the table in parent_distributions.
for root_name in variables_parent_leaf:
    # Relative frequencies, ordered by value
    shares = data_bayes[root_name].value_counts(normalize=True).sort_index()

    # Two-column table: the value and its share of the rows
    marginal = pd.DataFrame({'Value': shares.index, 'Probability': shares.values})
    parent_distributions[root_name] = marginal

    # Show the table followed by a sanity check that the shares add up
    print(
        f"P({root_name})",
        marginal,
        f"Sum of probabilities: {shares.sum()}\n",
        sep="\n",
    )

# Dump everything collected so far
print(parent_distributions)

EDUCATION CPT

In [None]:
# Table with one row per (Income, Education) combination of the binned data
education = pd.DataFrame(index=list(range(4)), columns=['Income', 'Education'])

total_rows = len(data_bayes)
level_pairs = [(inc, edu) for inc in (0, 1) for edu in (0, 1)]
for row_id, (income_level, education_level) in enumerate(level_pairs):
    matching = data_bayes.query(
        f"Income == {income_level} and Education == {education_level}"
    )

    education.at[row_id, 'Income'] = income_level
    education.at[row_id, 'Education'] = education_level
    # NOTE(review): this is the share of ALL rows in the combination (a joint
    # relative frequency), not a frequency conditioned on Income.
    education.at[row_id, 'Probability'] = len(matching) / total_rows

parent_distributions['Education'] = education

print(parent_distributions['Education'])
print(education['Probability'].sum())



HIGH BP CPT

In [None]:
# Table with one row per (HighBP, HighChol, Age) combination of the binned data
highp = pd.DataFrame(index=list(range(8)), columns=['HighBP', 'HighChol', 'Age'])

total_rows = len(data_bayes)
level_triples = [
    (bp, chol, age) for bp in (0, 1) for chol in (0, 1) for age in (0, 1)
]
for row_id, (highbp_val, highchol_val, age_val) in enumerate(level_triples):
    # Rows of the data that match this exact combination
    matching = data_bayes.query(
        f"HighBP == {highbp_val} and HighChol == {highchol_val} and Age == {age_val}"
    )

    highp.at[row_id, 'HighBP'] = highbp_val
    highp.at[row_id, 'HighChol'] = highchol_val
    highp.at[row_id, 'Age'] = age_val
    # NOTE(review): share of ALL rows in the combination (a joint relative
    # frequency), not a frequency conditioned on HighChol and Age.
    highp.at[row_id, 'Probability'] = len(matching) / total_rows

parent_distributions['HighBP'] = highp
print(highp['Probability'].sum())

print(parent_distributions['HighBP'])


 PHYSHLTH CPT

In [None]:
# Table with one row per (DiffWalk, PhysActivity, PhysHlth) combination
physhlth = pd.DataFrame(index=list(range(8)), columns=['DiffWalk', 'PhysActivity', 'PhysHlth'])

total_rows = len(data_bayes)
for row_id in range(8):
    # Read the row number as three binary digits: DiffWalk, PhysActivity, PhysHlth
    walk_val, remainder = divmod(row_id, 4)
    activity_val, phhealth_val = divmod(remainder, 2)

    # Rows of the data that match this exact combination
    matching = data_bayes.query(
        f"DiffWalk == {walk_val} and PhysActivity == {activity_val} and PhysHlth == {phhealth_val}"
    )

    physhlth.at[row_id, 'DiffWalk'] = walk_val
    physhlth.at[row_id, 'PhysActivity'] = activity_val
    physhlth.at[row_id, 'PhysHlth'] = phhealth_val
    # NOTE(review): share of ALL rows in the combination (a joint relative
    # frequency), not a frequency conditioned on DiffWalk and PhysActivity.
    physhlth.at[row_id, 'Probability'] = len(matching) / total_rows

parent_distributions['PhysHlth'] = physhlth

print(parent_distributions['PhysHlth'])
print(physhlth['Probability'].sum())



GENHLTH

In [None]:
# Table with one row per (PhysHlth, Education, GenHlth) combination
genhlth = pd.DataFrame(index=list(range(8)), columns=['PhysHlth', 'Education', 'GenHlth'])

total_rows = len(data_bayes)
row_id = 0
# Nested 0/1 loops visit the combinations in the same order as the row numbers
for PhysHlth_val in (0, 1):
    for Education_val in (0, 1):
        for GenHlth_val in (0, 1):
            # Rows of the data that match this exact combination
            matching = data_bayes.query(
                f"PhysHlth == {PhysHlth_val} and Education == {Education_val} and GenHlth == {GenHlth_val}"
            )

            genhlth.at[row_id, 'PhysHlth'] = PhysHlth_val
            genhlth.at[row_id, 'Education'] = Education_val
            genhlth.at[row_id, 'GenHlth'] = GenHlth_val
            # NOTE(review): share of ALL rows in the combination (a joint
            # relative frequency), not conditioned on PhysHlth and Education.
            genhlth.at[row_id, 'Probability'] = len(matching) / total_rows
            row_id += 1

# Show the finished table, store it, then check that the shares add up
print(genhlth)
parent_distributions['GenHlth'] = genhlth
print(genhlth['Probability'].sum())

DIABETES

In [None]:
# Table with one row per (HighBP, GenHlth, Diabetes) combination of the binned data
diabetes = pd.DataFrame(index=[0, 1, 2, 3, 4, 5, 6, 7], columns=['HighBP', 'GenHlth', 'Diabetes'])

total_rows = len(data_bayes)
for i in range(8):
    # Read the row number as three binary digits: HighBP, GenHlth, Diabetes
    highbp_val = i // 4
    genhlth_val = (i % 4) // 2
    diabetes_val = i % 2

    # Filter on THIS row's values. The query used to interpolate PhysHlth_val,
    # Education_val and GenHlth_val, which are not assigned in this loop, so the
    # count could not vary from row to row.
    query_str = f"HighBP == {highbp_val} and GenHlth == {genhlth_val} and Diabetes == {diabetes_val}"
    filtered_rows = data_bayes.query(query_str)

    count = len(filtered_rows)

    diabetes.at[i, 'HighBP'] = highbp_val
    diabetes.at[i, 'GenHlth'] = genhlth_val
    diabetes.at[i, 'Diabetes'] = diabetes_val
    # NOTE(review): share of ALL rows in the combination (a joint relative
    # frequency), not a frequency conditioned on HighBP and GenHlth.
    diabetes.at[i, 'Probability'] = count / total_rows

# Show the finished table, store it, then check that the shares add up
print(diabetes)
parent_distributions['Diabetes'] = diabetes
print(diabetes['Probability'].sum())

In [107]:
# Print every stored probability table under its column name
for node_name in parent_distributions:
    table = parent_distributions[node_name]
    print(f"{node_name}: \n {table} \n ")

Age: 
   Value  Probability
0     0     0.212763
1     1     0.787237 
 
Income: 
   Value  Probability
0     0     0.249925
1     1     0.750075 
 
DiffWalk: 
    Value  Probability
0    0.0     0.814493
1    1.0     0.185507 
 
PhysActivity: 
    Value  Probability
0    0.0     0.266645
1    1.0     0.733355 
 
Education: 
   Income Education  Probability
0      0         0     0.040247
1      0         1     0.209678
2      1         0     0.019292
3      1         1     0.730783 
 
HighBP: 
   HighBP HighChol Age  Probability
0      0        0   0     0.144176
1      0        0   1     0.230646
2      0        1   0     0.028588
3      0        1   1     0.142148
4      1        0   0     0.024184
5      1        0   1     0.159234
6      1        1   0     0.015815
7      1        1   1     0.255208 
 
PhysHlth: 
   DiffWalk PhysActivity PhysHlth  Probability
0        0            0        0     0.158808
1        0            0        1     0.017817
2        0            1        