In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw diabetes health-indicators CSV (path is relative to the notebook's directory).
df = pd.read_csv("../data/raw/diabetes_012_health_indicators_BRFSS2015.csv")

# Collapse the Diabetes_012 label into a binary target: 0 stays 0, any other value
# becomes 1. The vectorized comparison replaces a per-row Python lambda and produces
# the same int64 0/1 column.
df['Diabetes_binary'] = df['Diabetes_012'].ne(0).astype('int64')

# Features are every column except the original label and the derived binary label,
# so the target cannot leak into the inputs.
X = df.drop(columns=['Diabetes_012', 'Diabetes_binary'])
y = df['Diabetes_binary']

# Hold out 20% for testing. Stratifying on y keeps the class ratio the same in both
# parts; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Feature Engineering

Bu notebook, veri setinin modele daha uygun hale getirilmesi amacıyla uygulanan tüm ön işleme (preprocessing) adımlarını içerir.


In [3]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Fit a 50-tree random forest on the training split; n_jobs=-1 lets scikit-learn
# use all available cores, and the fixed random_state keeps the result reproducible.
rf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1).fit(X_train, y_train)

# NOTE(review): feature_importances_ is impurity-based per the scikit-learn docs and
# can favor features with many distinct values — treat the ranking as a rough guide.
importances = rf.feature_importances_

# One row per feature (keeping its original column position as the index),
# ordered from most to least important.
feature_importance = (
    pd.Series(importances, index=X.columns, name='importance')
    .rename_axis('feature')
    .reset_index()
    .sort_values(by='importance', ascending=False)
)

# Show the ten highest-ranked features.
feature_importance.head(10)


Unnamed: 0,feature,importance
3,BMI,0.183368
18,Age,0.121902
20,Income,0.098068
15,PhysHlth,0.081636
13,GenHlth,0.070805
19,Education,0.070708
14,MentHlth,0.064107
0,HighBP,0.045867
8,Fruits,0.03302
4,Smoker,0.032707


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Baseline classifier: logistic regression with class_weight="balanced" so the
# minority class is not ignored; max_iter is raised to 500 iterations.
balanced_model = LogisticRegression(class_weight="balanced", max_iter=500).fit(X_train, y_train)

# Predict on the held-out split and print per-class precision / recall / F1.
y_pred = balanced_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.94      0.72      0.82     42741
           1       0.34      0.76      0.47      7995

    accuracy                           0.73     50736
   macro avg       0.64      0.74      0.65     50736
weighted avg       0.85      0.73      0.76     50736



In [None]:
# Uygulanan Feature Engineering Adımları

- Sürekli değişkenlerin **StandardScaler** ile ölçeklenmesi
- Kategorik değişkenlerin encode edilmesi
- Gerekiyorsa BMI kategorisi gibi türetilmiş değişkenlerin denenmesi
- Pipeline yapısı içerisinde tüm işlemlerin tekrar edilebilir hale getirilmesi

Bu adımlar, modelin stabilitesini ve yorumlanabilirliğini artırmayı hedefler.
