In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
# Load the dataset
data = pd.read_csv("diabetes_012_health_indicators_BRFSS2015.csv")

# Check the first few rows and column names
print(data.head())
print(data.columns)

# Summary statistics of the dataset
print(data.describe())
print("Počet stĺpcov : ", len(data.columns))
print("Počet riadkov : ", len(data))

   Diabetes_012  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0           0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1           0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2           0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3           0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4           0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0        4.0   
1     

In [4]:
# Remove duplicates
print("Number of rows before removing duplicates: ", len(data))
data.drop_duplicates(inplace=True)
print("Number of rows after removing duplicates: ", len(data))

# Encode Diabetes_012 into binary values (0 and 1)
data['Diabetes'] = np.where(data['Diabetes_012'].isin([1, 2]), 1, 0)
data.drop(columns=['Diabetes_012'], inplace=True)
print("Diabetes values:")
print(data['Diabetes'].value_counts(dropna=False))

Number of rows before removing duplicates:  253680
Number of rows after removing duplicates:  229781
Diabetes values:
Diabetes
0    190055
1     39726
Name: count, dtype: int64


In [5]:
# Unique values in each column
print(f"Počet unikátnych hodnôt pre stĺpce:\n{data.nunique()}\n")

# Unique values for Diabetes_012
print(f"Počty hodnôt pre diabetes\n{data['Diabetes'].value_counts(dropna=False)}\n")

# Check data types
print(f"Dátové typy premenných:\n{data.dtypes}\n")

# Check for missing values
print(data.isnull().sum())

Počet unikátnych hodnôt pre stĺpce:
HighBP                   2
HighChol                 2
CholCheck                2
BMI                     84
Smoker                   2
Stroke                   2
HeartDiseaseorAttack     2
PhysActivity             2
Fruits                   2
Veggies                  2
HvyAlcoholConsump        2
AnyHealthcare            2
NoDocbcCost              2
GenHlth                  5
MentHlth                31
PhysHlth                31
DiffWalk                 2
Sex                      2
Age                     13
Education                6
Income                   8
Diabetes                 2
dtype: int64

Počty hodnôt pre diabetes
Diabetes
0    190055
1     39726
Name: count, dtype: int64

Dátové typy premenných:
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity          

# MiniProjekt 3 - Bayesovsé siete

Vedci sa v roku XXXX rozhodli spraviť experimentálny prieskum na základe ich domnienok:
- Že s pribúdajúcim vekom sa u ľudí zvyšuje riziko vysokého krvného tlaku **(HighBP)**. 
- Že vysoký cholesterol má priamy vplyv na vyšší krvný tlak **(HighBP)**.
- Okrem toho sa domnievali, že ľudia s vysokým cholesterolom sa cítia menej všeobecne zdraví **(GenHlth)**.
- Že úroveň príjmu sa odzrkadluje na úrovni vzdelania
    - A úroveň vzdelania vplýva na všeobecné zdravie **(GenHlth)**.

Problémy s Pohyblivosťou(DiffWalk) a nizkou Fyzickou Aktivitu(PhysActivity): 
- Vedci sa domnievali, že ľudia s problémami s chôdzou **(DiffWalk)** majú horšie fyzické zdravie **(PhysHlth)**.
- Druhou domnienkou bolo, že ľudia s nižšou fyzickou aktivitou **(PhysActivity**) majú nižšie fyzické zdravie **(PhysHlth)**.
    - A úroveň fyzického zdravia **(PhysHlth)** sa odrzrkadľuje na všeobecnom zdraví **(GenHlth)**. 

Zároveň prišli k domniekám, že: 
- Vyšší krvný tlak **(HighBP)** má vplyv na diabetes **(Diabetes)**.
- Všeobecné zdravie má vplyv na diabetes **(GenHlth)**.


Tieto predpoklady možno vizualizovať nasledujúcim stromom. 

![Bayesovský strom](images/Bayesian.drawio.png "Bayesov strom")



Unitkátne hodnoty pre nami sledované premenné

In [6]:
columns_in_tree = ['Age', 'HighBP', 'HighChol', 'Income', 'DiffWalk', 'PhysActivity',
                   'PhysHlth', 'Education', 'GenHlth', 'Diabetes']

for col in columns_in_tree:
    unique_vals = data[col].unique()  # Retrieve unique values
    print(f"Unique values for {col}: {sorted(unique_vals)}")

Unique values for Age: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
Unique values for HighBP: [0.0, 1.0]
Unique values for HighChol: [0.0, 1.0]
Unique values for Income: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
Unique values for DiffWalk: [0.0, 1.0]
Unique values for PhysActivity: [0.0, 1.0]
Unique values for PhysHlth: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0]
Unique values for Education: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
Unique values for GenHlth: [1.0, 2.0, 3.0, 4.0, 5.0]
Unique values for Diabetes: [0, 1]


Vytvorenie binánrych stĺpcov

In [7]:
# Define bit ranges for each column
age_bin = [1.0,7.0,14.0]
income_bin = [1.0, 4.5, 9.0]
physhlth_bin = [0.0, 15.5, 31.0]
education_bin = [1.0, 3.5, 7.0]
gen_hlth_bin = [1.0, 3.5, 6.0]


labels = [0, 1]

# Make a copy of the DataFrame 'data'
data_bayes = data.copy()

# Apply binning with `pd.cut` for columns requiring manual binning
data_bayes['Age'] = pd.cut(data_bayes['Age'], bins=age_bin, labels=labels, right=False)
data_bayes['Income'] = pd.cut(data_bayes['Income'], bins=income_bin, labels=labels, right=False)
data_bayes['PhysHlth'] = pd.cut(data_bayes['PhysHlth'], bins=physhlth_bin, labels=labels, right=False)
data_bayes['Education'] = pd.cut(data_bayes['Education'], bins=education_bin, labels=labels, right=False)
data_bayes['GenHlth'] = pd.cut(data_bayes['GenHlth'], bins=gen_hlth_bin, labels=labels, right=False)

binary_columns = ['HighBP', 'HighChol', 'DiffWalk', 'PhysActivity', 'Diabetes']
for col in binary_columns:
    data_bayes[col] = data_bayes[col].astype(int)

print(f"Dáta v strome po zmene(počty unikátnych hodnôt):\n{data_bayes[columns_in_tree].nunique()}")

parent_nodes  = ['Age',  'Income',  'DiffWalk', 'PhysActivity', 'HighChol']
mid_modes = ["Education", 'PhysHlth','HighBP']
parent_distributions = {}

for col in columns_in_tree:
    unique_vals = data_bayes[col].unique()  # Retrieve unique values
    print(f"Unique values for {col}: {sorted(unique_vals)}")


Dáta v strome po zmene(počty unikátnych hodnôt):
Age             2
HighBP          2
HighChol        2
Income          2
DiffWalk        2
PhysActivity    2
PhysHlth        2
Education       2
GenHlth         2
Diabetes        2
dtype: int64
Unique values for Age: [0, 1]
Unique values for HighBP: [0, 1]
Unique values for HighChol: [0, 1]
Unique values for Income: [0, 1]
Unique values for DiffWalk: [0, 1]
Unique values for PhysActivity: [0, 1]
Unique values for PhysHlth: [0, 1]
Unique values for Education: [0, 1]
Unique values for GenHlth: [0, 1]
Unique values for Diabetes: [0, 1]


Vyrátanie pravdepodobností pre parent nodes

In [8]:
parent_distributions = {}

# For each parent node, calculate marginal probabilities
parent_nodes = ['Age', 'Income', 'DiffWalk', 'PhysActivity', 'HighChol']
for node in parent_nodes:
    counts = data_bayes[node].value_counts(normalize=True, dropna=False).round(3)
    parent_distributions[node] = counts.to_dict()
    print(f"Rozloženie pravdepodobností pre {node}: {parent_distributions[node]}")

Rozloženie pravdepodobností pre Age: {1: 0.712, 0: 0.288}
Rozloženie pravdepodobností pre Income: {1: 0.75, 0: 0.25}
Rozloženie pravdepodobností pre DiffWalk: {0: 0.814, 1: 0.186}
Rozloženie pravdepodobností pre PhysActivity: {1: 0.733, 0: 0.267}
Rozloženie pravdepodobností pre HighChol: {0: 0.558, 1: 0.442}


In [9]:
occurrences = data_bayes.groupby(['Education', 'Income']).size().reset_index(name='count')
occurrences['P(Education|Income)'] = occurrences['count'] / occurrences.groupby('Income')['count'].transform('sum')
print(occurrences)

  Education Income   count  P(Education|Income)
0         0      0    9248             0.161036
1         0      1    4433             0.025720
2         1      0   48180             0.838964
3         1      1  167920             0.974280


Na ukážku (len raz:D) aj s formulou  $$ P(Education|Income) = {P(Education,Income) \over P(Income)} $$

In [10]:
grouped_joint = data_bayes.groupby(['DiffWalk', 'PhysActivity', 'PhysHlth']).size().reset_index(name='count')
grouped_marginal = data_bayes.groupby(['DiffWalk', 'PhysActivity']).size().reset_index(name='total')

grouped = pd.merge(grouped_joint, grouped_marginal, on=['DiffWalk', 'PhysActivity'])
grouped['P(PhysHlth|DiffWalk,PhysActivity)'] = (grouped['count'] / grouped['total']).round(6)

# Print in the desired format
print("DiffWalk PhysActivity PhysHlth count P(PhysHlth|DiffWalk,PhysActivity)")
for _, row in grouped.iterrows():
    print(f"{row['DiffWalk']}         {row['PhysActivity']}      {row['PhysHlth']}    {row['count']}     {row['P(PhysHlth|DiffWalk,PhysActivity)']:.6f}")

DiffWalk PhysActivity PhysHlth count P(PhysHlth|DiffWalk,PhysActivity)
0.0         0.0      0.0    37200.0     0.916595
0.0         0.0      1.0    3385.0     0.083405
0.0         1.0      0.0    140431.0     0.958116
0.0         1.0      1.0    6139.0     0.041884
1.0         0.0      0.0    11156.0     0.539328
1.0         0.0      1.0    9529.0     0.460672
1.0         1.0      0.0    14851.0     0.676861
1.0         1.0      1.0    7090.0     0.323139


In [11]:
P_diffwalk = data_bayes['DiffWalk'].value_counts(normalize=True)
P_physactivity = data_bayes['PhysActivity'].value_counts(normalize=True)

# Conditional probability of PhysHlth given DiffWalk and PhysActivity
conditional_physhealth = data_bayes.groupby(['DiffWalk', 'PhysActivity'])['PhysHlth'].value_counts(normalize=True).unstack()

# Compute the marginal probability of PhysHlth
marginal_physhealth = 0
for diffwalk_val in P_diffwalk.index:
    for physactivity_val in P_physactivity.index:
        if (diffwalk_val, physactivity_val) in conditional_physhealth.index:
            P_physhlth_given_parents = conditional_physhealth.loc[(diffwalk_val, physactivity_val)]
            marginal_physhealth += P_physhlth_given_parents * P_diffwalk[diffwalk_val] * P_physactivity[physactivity_val]

print(f"Marginálna pravdepodobnosť \n{marginal_physhealth}")

Marginálna pravdepodobnosť 
PhysHlth
0    0.89012
1    0.10988
Name: (0, 1), dtype: float64


In [12]:
# P(Education, Income)
joint_prob = data_bayes.groupby(['Education', 'Income']).size().reset_index(name='count')

#P(Income)
marginal_prob_y = data_bayes.groupby('Income').size().reset_index(name='count')
marginal_prob_y['P(Income)'] = marginal_prob_y['count'] / data_bayes.shape[0]

#P(Education|Income)
conditional_prob = pd.merge(joint_prob, marginal_prob_y[['Income', 'P(Income)']], on='Income')
joint_prob['P(Education|Income)'] = joint_prob['count'] / joint_prob.groupby('Income')['count'].transform('sum')

# Display
print(joint_prob)

  Education Income   count  P(Education|Income)
0         0      0    9248             0.161036
1         0      1    4433             0.025720
2         1      0   48180             0.838964
3         1      1  167920             0.974280


In [13]:
grouped_joint = data_bayes.groupby(['Education', 'HighChol', 'GenHlth', 'PhysHlth']).size().reset_index(name='count')
grouped_marginal = data_bayes.groupby(['Education', 'HighChol', 'PhysHlth']).size().reset_index(name='total')

grouped = pd.merge(grouped_joint, grouped_marginal, on=['Education', 'HighChol', 'PhysHlth'])
grouped['P(GenHlth|Education,HighChol,PhysHlth)'] = (grouped['count'] / grouped['total']).round(6)

# Print in the desired format
print("Education HighChol GenHlth PhysHlth count P(GenHlth|Education,HighChol,PhysHlth)")
for _, row in grouped.iterrows():
    print(f"{row['Education']}         {row['HighChol']}      {row['GenHlth']}      {row['PhysHlth']}    {row['count']}     {row['P(GenHlth|Education,HighChol,PhysHlth)']:.6f}")

Education HighChol GenHlth PhysHlth count P(GenHlth|Education,HighChol,PhysHlth)
0.0         0.0      0.0      0.0    4082.0     0.730102
0.0         0.0      1.0      0.0    1509.0     0.269898
0.0         0.0      0.0      1.0    242.0     0.219203
0.0         0.0      1.0      1.0    862.0     0.780797
0.0         1.0      0.0      0.0    2903.0     0.573942
0.0         1.0      1.0      0.0    2155.0     0.426058
0.0         1.0      0.0      1.0    276.0     0.143154
0.0         1.0      1.0      1.0    1652.0     0.856846
1.0         0.0      0.0      0.0    102339.0     0.917946
1.0         0.0      1.0      0.0    9148.0     0.082054
1.0         0.0      0.0      1.0    3550.0     0.351799
1.0         0.0      1.0      1.0    6541.0     0.648201
1.0         1.0      0.0      0.0    69402.0     0.851537
1.0         1.0      1.0      0.0    12100.0     0.148463
1.0         1.0      0.0      1.0    3363.0     0.258295
1.0         1.0      1.0      1.0    9657.0     0.741705


# 1. Join Probability

- Aká je pravdepodobnost, že človek:
    - má väčší príjem **a** 
    - má lepšiu kvlitu vzdelania **a**
    - lepší zdravotný stav  ? 

Podľa parent child:

## P(Income<sup>+</sup>, Education<sup>+</sup>, GenHlth<sup>+</sup>)= P(Income<sup>+</sup>) × P(Education<sup>+</sup> ∣ Income<sup>+</sup>) × ∑<sub>HighChol, PhysHlth</sub> P(GenHlth<sup>+</sup> ∣ Education<sup>+</sup>, HighChol, PhysHlth) × P(HighChol) × P(PhysHlth)



![Otázka1](images/otazka1.jpg "Otázka 1 - papier")

### Výsledok je 12.78%

# 2. Conditional P

- Aka je pravdepodobnosť, ze človek má dobré fyzické zdravie za predpokladu že nemá problém s prechádzaním sa ale nie je fyzicky aktívny?

In [46]:
# Group data by 'DiffWalk', 'PhysActivity', and 'PhysHlth' and count occurrences
grouped_joint = data_bayes.groupby(['DiffWalk', 'PhysActivity', 'PhysHlth']).size().reset_index(name='count')

# Calculate the total number of observations across all groups
total_count = grouped_joint['count'].sum()

# Add a new column to calculate probabilities by dividing counts by the total count
grouped_joint['Probability'] = grouped_joint['count'] / total_count

# Display the grouped data with the new probabilities
print(grouped_joint)

   DiffWalk  PhysActivity PhysHlth   count  Probability
0         0             0        0   37200     0.161893
1         0             0        1    3385     0.014731
2         0             1        0  140431     0.611151
3         0             1        1    6139     0.026717
4         1             0        0   11156     0.048551
5         1             0        1    9529     0.041470
6         1             1        0   14851     0.064631
7         1             1        1    7090     0.030855


In [47]:
grouped_joint = data_bayes.groupby(['DiffWalk', 'PhysActivity']).size().reset_index(name='count')

# Calculate the total number of observations across all groups
total_count = grouped_joint['count'].sum()

# Add a new column to calculate probabilities by dividing counts by the total count
grouped_joint['Probability'] = grouped_joint['count'] / total_count
print(grouped_joint)

   DiffWalk  PhysActivity   count  Probability
0         0             0   40585     0.176625
1         0             1  146570     0.637868
2         1             0   20685     0.090020
3         1             1   21941     0.095487


### P(PhysicalHlth<sup>+</sup> | DiffWalk<sup>-</sup>, PhysActivity<sup>-</sup>)
![Otázka1](images/otazka2.jpg "Otázka 1 - papier")

### Výsledok je cca 8.3% , zaokrúhlili sme deliteľa na 2 desatinné ... cca 8.3% - 8.45%



# 3. Marginal P.

- Aka je pravdepoobnost, že zdravotný stav cloveka je dobry ? 


### P(GenHlth<sup>+</sup>)



# 4. Join Porbability

- Aká je pravdepodobnost, ze človek:
    - je mladý **a** 
    - má vysoký cholesterol **a**
    - má nizky krvný tlak?


Podľa parent child:
## P(Age<sup>-</sup>, HighChol<sup>+</sup>, GenHlth<sup>+</sup>) = P(Income<sup>+</sup>)×P(Education<sup>+</sup> ∣ Income<sup>+</sup>)×P(GenHlth<sup>+</sup> ∣ Education<sup>+</sup>)





In [None]:
#TODO

# 5. Conditional P

- Aká je pravdepodobnosť, že človek má diabetes, za predpokladu že má nízky krvný tlak, vysoký cholesterol a je mladý?

In [None]:
#TODO

In [None]:
#TODO