In [None]:
# Fetch the diabetes health indicators dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured in this runtime — confirm.
!kaggle datasets download -d alexteboul/diabetes-health-indicators-dataset

Dataset URL: https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset
License(s): CC0-1.0
Downloading diabetes-health-indicators-dataset.zip to /content
 83% 5.00M/6.03M [00:00<00:00, 41.4MB/s]
100% 6.03M/6.03M [00:00<00:00, 47.2MB/s]


In [None]:
# Instalasi paket
!pip install semopy

Collecting semopy
  Downloading semopy-2.3.11.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numdifftools (from semopy)
  Downloading numdifftools-0.9.41-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.2/100.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: semopy
  Building wheel for semopy (setup.py) ... [?25l[?25hdone
  Created wheel for semopy: filename=semopy-2.3.11-py3-none-any.whl size=1659681 sha256=878f40b667c10b4ac62cc337d98c43552e29b1f1439f7c67a46a3e4c2bfa055a
  Stored in directory: /root/.cache/pip/wheels/53/ec/0d/0b294c02d8c4e9e80afea58839f2c1b4706770594bc99ec045
Successfully built semopy
Installing collected packages: numdifftools, semopy
Successfully installed numdifftools-0.9.41 semopy-2.3.11


In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile

In [None]:
# Unpack the downloaded Kaggle archive into the working directory.
zip_path = '/content/diabetes-health-indicators-dataset.zip'

with ZipFile(zip_path) as archive:  # read mode is the default
    archive.extractall(path='/content/')

In [None]:
# Never truncate the column list when a frame is printed or displayed.
pd.set_option('display.max_columns', None)

# Load the extracted CSV, then cast every column to integer.
file_path = '/content/diabetes_binary_health_indicators_BRFSS2015.csv'
df = pd.read_csv(file_path)
df = df.astype(int)

In [None]:
# Report how many missing entries each column contains.
print("Jumlah missing values per kolom:", df.isna().sum(), sep="\n")

Jumlah missing values per kolom:
Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


In [None]:
# Fill missing values with each column's mean as an example (another, more
# suitable imputation method could be used instead). The missing-value count
# printed earlier reported zero for every column.
df = df.fillna(value=df.mean())

In [None]:
# Inspect the pairwise correlation matrix to check for collinearity.
correlations = df.corr()
print("\nMatriks Korelasi:")
print(correlations)


Matriks Korelasi:
                      Diabetes_binary    HighBP  HighChol  CholCheck  \
Diabetes_binary              1.000000  0.263129  0.200276   0.064761   
HighBP                       0.263129  1.000000  0.298199   0.098508   
HighChol                     0.200276  0.298199  1.000000   0.085642   
CholCheck                    0.064761  0.098508  0.085642   1.000000   
BMI                          0.216843  0.213748  0.106722   0.034495   
Smoker                       0.060789  0.096991  0.091299  -0.009929   
Stroke                       0.105816  0.129575  0.092620   0.024158   
HeartDiseaseorAttack         0.177282  0.209361  0.180765   0.044206   
PhysActivity                -0.118133 -0.125267 -0.078046   0.004190   
Fruits                      -0.040779 -0.040555 -0.040859   0.023849   
Veggies                     -0.056584 -0.061266 -0.039874   0.006121   
HvyAlcoholConsump           -0.057056 -0.003972 -0.011543  -0.023730   
AnyHealthcare                0.016255  0.0384

In [None]:
# Full structural equation model, written in semopy's model syntax.
#   `=~` lines define a latent factor by the observed columns that measure it.
#   `~~` lines declare a covariance between two latent factors.
# Note: Demographics is defined as a factor but appears in none of the `~~`
# lines, so no covariance with the other factors is declared explicitly here.
model_desc= """
  LifestyleFactors =~ Smoker + PhysActivity + Fruits + Veggies + HvyAlcoholConsump
  HealthStatus =~ Diabetes_binary + HighBP + HighChol + Stroke + HeartDiseaseorAttack + BMI + DiffWalk + GenHlth
  HealthcareAccess =~ CholCheck + AnyHealthcare + NoDocbcCost
  MentalPhysicalHealth =~ MentHlth + PhysHlth
  Demographics =~ Sex + Age + Education + Income

  LifestyleFactors ~~ HealthStatus
  LifestyleFactors ~~ HealthcareAccess
  LifestyleFactors ~~ MentalPhysicalHealth
  HealthStatus ~~ HealthcareAccess
  HealthStatus ~~ MentalPhysicalHealth
  HealthcareAccess ~~ MentalPhysicalHealth
"""

In [None]:
from semopy import Model, calc_stats

# Build the model object from its textual description and fit it to the data.
model = Model(model_desc)
model.fit(df)

# Retrieve the table of parameter estimates and print it under a heading.
results = model.inspect()
print("Hasil Estimasi Model:", results, sep="\n")


Hasil Estimasi Model:
                    lval  op                  rval   Estimate  Std. Err  \
0                 Smoker   ~      LifestyleFactors   1.000000         -   
1           PhysActivity   ~      LifestyleFactors  -1.763458  0.021465   
2                 Fruits   ~      LifestyleFactors  -1.122163  0.015866   
3                Veggies   ~      LifestyleFactors  -1.034517   0.01392   
4      HvyAlcoholConsump   ~      LifestyleFactors  -0.049205  0.005007   
5        Diabetes_binary   ~          HealthStatus   1.000000         -   
6                 HighBP   ~          HealthStatus   1.501264  0.011114   
7               HighChol   ~          HealthStatus   1.079186  0.009825   
8                 Stroke   ~          HealthStatus   0.382744  0.003798   
9   HeartDiseaseorAttack   ~          HealthStatus   0.760189  0.006158   
10                   BMI   ~          HealthStatus  14.863227  0.132207   
11              DiffWalk   ~          HealthStatus   1.788420  0.010864   
12 

In [None]:
from semopy import calc_stats

# Compute the fit indices of the fitted model (the printed table includes
# chi2, CFI, GFI, AGFI, NFI, TLI, RMSEA, AIC, BIC and LogLik).
stats = calc_stats(model)
print("\nIndeks Kesesuaian Model:", stats, sep="\n")


Indeks Kesesuaian Model:
       DoF  DoF Baseline          chi2  chi2 p-value  chi2 Baseline       CFI  \
Value  199           231  189983.44216           0.0  720841.842823  0.736634   

            GFI      AGFI       NFI       TLI     RMSEA        AIC  \
Value  0.736442  0.694061  0.736442  0.694284  0.061314  106.50218   

              BIC   LogLik  
Value  670.468941  0.74891  


In [None]:
def _fit_and_score(description, data):
    """Fit one model description and compute its fit indices.

    Args:
        description: Model specification string in semopy syntax.
        data: DataFrame holding the observed indicator columns.

    Returns:
        A ``(fitted_model, fit_statistics)`` tuple, where the statistics are
        whatever ``calc_stats`` returns for the fitted model.
    """
    fitted = Model(description)
    fitted.fit(data)
    return fitted, calc_stats(fitted)


# 1. LifestyleFactors: specify, fit and report the fit indices.
model_reviced_LifestyleFactors = """
 LifestyleFactors =~ Smoker + PhysActivity + Fruits + Veggies + HvyAlcoholConsump
"""
model_LifestyleFactors, fit_stats_LifestyleFactors = _fit_and_score(
    model_reviced_LifestyleFactors, df
)
print("\nIndeks Kesesuaian Model untuk LifestyleFactors:")
print(fit_stats_LifestyleFactors)

# 2. HealthStatus: specify, fit and report the fit indices.
model_reviced_HealthStatus = """
HealthStat =~ HighBP + HighChol + Stroke + HeartDiseaseorAttack
"""
model_HealthStatus, fit_stats_HealthStatus = _fit_and_score(
    model_reviced_HealthStatus, df
)
print("\nIndeks Kesesuaian Model untuk HealthStatus:")
print(fit_stats_HealthStatus)

# 3. HealthcareAccess: specify, fit and report the fit indices.
model_reviced_HealthcareAccess = """
HealthcareAccess =~ CholCheck + AnyHealthcare + NoDocbcCost + BMI
"""
model_HealthcareAccess, fit_stats_HealthcareAccess = _fit_and_score(
    model_reviced_HealthcareAccess, df
)
print("\nIndeks Kesesuaian Model untuk HealthcareAccess:")
print(fit_stats_HealthcareAccess)

# 4. MentalPhysicalHealth: specify, fit and report the fit indices.
model_reviced_MentalPhysicalHealth = """
MentalPhysicalHealth =~ MentHlth + PhysHlth + GenHlth + DiffWalk
"""
model_MentalPhysicalHealth, fit_stats_MentalPhysicalHealth = _fit_and_score(
    model_reviced_MentalPhysicalHealth, df
)
print("\nIndeks Kesesuaian Model untuk MentalPhysicalHealth:")
print(fit_stats_MentalPhysicalHealth)

# 5. Demographics: specify, fit and report the fit indices.
model_reviced_Demographics = """
Demographics =~ Sex + Age + Education + Income
"""
model_Demographics, fit_stats_Demographics = _fit_and_score(
    model_reviced_Demographics, df
)
print("\nIndeks Kesesuaian Model untuk Demographics:")
print(fit_stats_Demographics)


Indeks Kesesuaian Model untuk LifestyleFactors:
       DoF  DoF Baseline         chi2  chi2 p-value  chi2 Baseline       CFI  \
Value    5            10  4283.699569           0.0   32266.981244  0.867356   

            GFI      AGFI       NFI       TLI    RMSEA        AIC         BIC  \
Value  0.867242  0.734484  0.867242  0.734712  0.05808  19.966228  124.404517   

         LogLik  
Value  0.016886  

Indeks Kesesuaian Model untuk HealthStatus:
       DoF  DoF Baseline         chi2  chi2 p-value  chi2 Baseline       CFI  \
Value    2             6  4067.594794           0.0    52197.25333  0.922102   

            GFI      AGFI       NFI       TLI     RMSEA        AIC        BIC  \
Value  0.922073  0.766218  0.922073  0.766306  0.089517  15.967931  99.518563   

         LogLik  
Value  0.016034  

Indeks Kesesuaian Model untuk HealthcareAccess:
       DoF  DoF Baseline        chi2  chi2 p-value  chi2 Baseline       CFI  \
Value    2             6  1337.66614           0.0    1914