In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import  StandardScaler 
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.linear_model import LogisticRegression

# Load the raw survey table; the path is relative to the notebook's working directory.
diabetes = pd.read_csv(
    filepath_or_buffer="diabetes_binary_health_indicators_BRFSS2015.csv",
)
def drop_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive. Rows whose value is NaN never
    satisfy the comparison and are therefore dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column used to detect outliers.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index labels preserved) inside the fences.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative.
    KeyError
        If ``column`` is not a column of ``df``.
    """
    if multiplier < 0:
        raise ValueError(f"multiplier must be non-negative, got {multiplier!r}")

    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = multiplier * (q3 - q1)

    # Series.between is inclusive on both ends by default, matching `>=` / `<=`.
    return df[values.between(q1 - spread, q3 + spread)]

# Filter BMI outliers first, then GenHlth outliers on the already-filtered frame.
# NOTE(review): the IQR fences are computed before the train/test split, so the
# held-out rows influence (and are filtered by) them — confirm this is intended.
diabetes_cleaned = drop_outliers_iqr(drop_outliers_iqr(diabetes, 'BMI'), 'GenHlth')

# 80/20 hold-out split; features and target are split together so rows stay aligned.
x_train, x_test, y_train, y_test = train_test_split(
    diabetes_cleaned.drop(columns=['Diabetes_binary']),
    diabetes_cleaned['Diabetes_binary'],
    test_size=0.20,
    random_state=42,
)

# Keep the 10 features with the highest f_classif score against the target.
# The selector is fitted on the training split only and then reused on the test split.
selector = SelectKBest(f_classif, k=10)
X_train_selected = selector.fit_transform(x_train, y_train)
X_test_selected = selector.transform(x_test)
X_new = X_train_selected  # alias kept in case another cell still refers to X_new

selected_features = x_train.columns[selector.get_support()]
print("Best features:", selected_features)

# Standardise the selected columns. Scaling happens after selection so the scaler
# is fitted once, on exactly the training columns the models will see.
scaler = StandardScaler()
x_train_scaler = scaler.fit_transform(X_train_selected)
x_test_scaler = scaler.transform(X_test_selected)



# Bagging ensemble of 100 logistic-regression base learners, built with
# imbalanced-learn's BalancedBaggingClassifier and fitted on the scaled training split.
model = BalancedBaggingClassifier(
    estimator=LogisticRegression(),
    n_estimators=100,
    random_state=42,
    replacement=False,
    sampling_strategy='auto',
).fit(x_train_scaler, y_train)

# Predict on the training split first, then on the held-out split.
y_train_pred, y_test_pred = (
    model.predict(split) for split in (x_train_scaler, x_test_scaler)
)

# Accuracy expressed as a percentage for both splits.
train_accuracy = 100 * accuracy_score(y_train, y_train_pred)
test_accuracy = 100 * accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.2f}%")
print(f"Test Accuracy: {test_accuracy:.2f}%")
# Per-class precision/recall/F1 on the held-out split.
print(
    classification_report(
        y_test,
        y_test_pred,
        target_names=['No Diabetes', 'Diabetes'],
    )
)

Best features: Index(['HighBP', 'HighChol', 'BMI', 'HeartDiseaseorAttack', 'GenHlth',
       'PhysHlth', 'DiffWalk', 'Age', 'Education', 'Income'],
      dtype='object')
Training Accuracy: 72.23%
Test Accuracy: 72.24%
              precision    recall  f1-score   support

 No Diabetes       0.96      0.72      0.82     40903
    Diabetes       0.27      0.77      0.40      5689

    accuracy                           0.72     46592
   macro avg       0.62      0.74      0.61     46592
weighted avg       0.87      0.72      0.77     46592



In [2]:
from xgboost import XGBClassifier

# Gradient-boosted trees on the same scaled, selected features.
# This rebinds `model`, replacing the classifier fitted in the previous cell.
# NOTE(review): scale_pos_weight=6 is hard-coded; the test support printed by this
# notebook is roughly 7.2 negatives per positive — confirm 6 is a deliberate choice.
model = XGBClassifier(
    n_estimators=100,
    random_state=42,
    scale_pos_weight=6,
).fit(x_train_scaler, y_train)

# Predict on the training split first, then on the held-out split.
y_train_pred, y_test_pred = (
    model.predict(split) for split in (x_train_scaler, x_test_scaler)
)

# Accuracy expressed as a percentage for both splits.
train_accuracy = 100 * accuracy_score(y_train, y_train_pred)
test_accuracy = 100 * accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.2f}%")
print(f"Test Accuracy: {test_accuracy:.2f}%")
# Per-class precision/recall/F1 on the held-out split.
print(
    classification_report(
        y_test,
        y_test_pred,
        target_names=['No Diabetes', 'Diabetes'],
    )
)

Training Accuracy: 75.77%
Test Accuracy: 74.54%
              precision    recall  f1-score   support

 No Diabetes       0.95      0.75      0.84     40903
    Diabetes       0.29      0.73      0.41      5689

    accuracy                           0.75     46592
   macro avg       0.62      0.74      0.62     46592
weighted avg       0.87      0.75      0.79     46592

