In [1]:
import pandas as pd
import numpy as np
import pyreadstat
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
path = r"C:\Users\018464615\Downloads\SEM1\CMPE-255\Project\cleanedDataSet.csv"

In [15]:
df = pd.read_csv(path)

print(df.head())
print(df.shape)

   hysterectomy AgeGroup  EverGotHIVTest    BMI  NumChildren  arthritis  \
0           0.0    55–64             1.0  26.54          0.0        1.0   
1           0.0    35–44             1.0  25.85          0.0        1.0   
2           1.0    55–64             0.0  18.89          0.0        0.0   
3           1.0    55–64             0.0  28.80          0.0        1.0   
4           0.0    35–44             1.0  26.63          2.0        0.0   

                     EducationLevel  asthma  mental_health_status  \
0                  College Graduate     0.0                   1.0   
1                  College Graduate     0.0                   1.0   
2  Some College or Technical School     0.0                   1.0   
3  Some College or Technical School     0.0                   1.0   
4  Some College or Technical School     0.0                   1.0   

   heartDisease_CHD_or_MI  ...  EmotionalSupport  ReceivedFoodStamps  \
0                     0.0  ...               2.0              

In [16]:
for col in df.columns:
    print(f"Column: {col}")

Column: hysterectomy
Column: AgeGroup
Column: EverGotHIVTest
Column: BMI
Column: NumChildren
Column: arthritis
Column: EducationLevel
Column: asthma
Column: mental_health_status
Column: heartDisease_CHD_or_MI
Column: physical_health_status
Column: BingeDrinker
Column: OverweightOrObese
Column: HeavyDrinker
Column: GoodHealth
Column: BiologicalSex
Column: PhysicalActivityLast30Days
Column: HeightInches
Column: WeightKg
Column: CancerType
Column: NumAdultsHousehold
Column: DepressiveDisorder
Column: COPD
Column: KidneyDisease
Column: Cancer_MelanomaOrOther
Column: SkinCancerNonMelanoma
Column: Stroke
Column: HasDiabetes
Column: EmploymentStatus
Column: BlindOrDifficultySeeing
Column: DeafOrDifficultyHearing
Column: DifficultyConcentrating
Column: DifficultyDoingErrandsAlone
Column: DifficultyDressingBathing
Column: DifficultyWalkingClimbing
Column: HealthRating
Column: MentalHealthBadDays
Column: PhysicalHealthBadDays
Column: EmotionalSupport
Column: ReceivedFoodStamps
Column: LifeSatisf

In [17]:
X = df.drop(columns=["HasDiabetes"])
y = df["HasDiabetes"]

In [18]:
X = X.copy()

# Convert categorical features to string so CatBoost is happy
for col in cat_features:
    if col in X.columns:
        X[col] = X[col].astype(str)
    else:
        print(f"Warning: {col} not found in X")


In [19]:
cat_features = [
    "AgeGroup",
    "NumChildren",
    "EducationLevel",
    "mental_health_status",
    "physical_health_status",
    "BiologicalSex",
    "CancerType",
    "NumAdultsHousehold",
    "EmploymentStatus",
    "HealthRating",
    "EmotionalSupport",
    "LifeSatisfaction",
    "FoodInsecurity",
    "Race",
    "MARITAL_GROUPED",
    "IncomeCategoryLabel",
    "SmokingGroup",
]

#Setting datatype to category
for col in cat_features:
    if col in X.columns:
        X[col] = X[col].astype("category")
    else:
        print(f"Warning: {col} not found in X")


In [9]:
from sklearn.model_selection import train_test_split

In [20]:
# Step 1: train vs temp (valid + test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,          # 70% train, 30% temp
    random_state=42,
    stratify=y              # keeps class ratio
)

# Step 2: split temp into validation and test
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,          # 15% valid, 15% test
    random_state=42,
    stratify=y_temp         # keeps class ratio
)

print("Train size:", len(X_train))
print("Valid size:", len(X_valid))
print("Test size:", len(X_test))

print("\nClass proportions:")
print("Train:\n", y_train.value_counts(normalize=True))
print("Valid:\n", y_valid.value_counts(normalize=True))
print("Test:\n", y_test.value_counts(normalize=True))


Train size: 67991
Valid size: 14570
Test size: 14570

Class proportions:
Train:
 HasDiabetes
0.0    0.869203
1.0    0.130797
Name: proportion, dtype: float64
Valid:
 HasDiabetes
0.0    0.869252
1.0    0.130748
Name: proportion, dtype: float64
Test:
 HasDiabetes
0.0    0.869183
1.0    0.130817
Name: proportion, dtype: float64


In [12]:
from collections import Counter

In [21]:
class_counts = Counter(y_train)
neg = class_counts[0]
pos = class_counts[1]

pos_weight = neg / pos
print("neg/pos:",neg,"/",pos)
print("Train class counts:", class_counts)
print("Positive class weight:", pos_weight)


neg/pos: 59098 / 8893
Train class counts: Counter({0.0: 59098, 1.0: 8893})
Positive class weight: 6.645451478691105


In [22]:
# got indices of categorical columns (since CatBoost needs positions, not names)
cat_feature_indices = [X_train.columns.get_loc(c) for c in cat_features if c in X_train.columns]

train_pool = Pool(X_train, y_train, cat_features=cat_feature_indices)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_feature_indices)

model = CatBoostClassifier(
    iterations=1500,
    learning_rate=0.03,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    class_weights=[1.0, pos_weight],  # 0 -> weight 1.0, 1 -> higher weight
    random_seed=42,
    verbose=100
)

model.fit(train_pool, eval_set=valid_pool)

0:	test: 0.7855701	best: 0.7855701 (0)	total: 349ms	remaining: 8m 42s
100:	test: 0.8144955	best: 0.8144955 (100)	total: 16.4s	remaining: 3m 46s
200:	test: 0.8169683	best: 0.8169683 (200)	total: 32.2s	remaining: 3m 28s
300:	test: 0.8177524	best: 0.8177524 (300)	total: 47.8s	remaining: 3m 10s
400:	test: 0.8184346	best: 0.8184346 (400)	total: 1m 3s	remaining: 2m 53s
500:	test: 0.8188448	best: 0.8189304 (484)	total: 1m 19s	remaining: 2m 38s
600:	test: 0.8189985	best: 0.8190356 (572)	total: 1m 36s	remaining: 2m 24s
700:	test: 0.8189044	best: 0.8190356 (572)	total: 1m 56s	remaining: 2m 13s
800:	test: 0.8188609	best: 0.8190356 (572)	total: 2m 18s	remaining: 2m
900:	test: 0.8187235	best: 0.8190356 (572)	total: 2m 39s	remaining: 1m 46s
1000:	test: 0.8187267	best: 0.8190356 (572)	total: 3m	remaining: 1m 30s
1100:	test: 0.8185225	best: 0.8190356 (572)	total: 3m 24s	remaining: 1m 14s
1200:	test: 0.8183740	best: 0.8190356 (572)	total: 3m 47s	remaining: 56.7s
1300:	test: 0.8181675	best: 0.8190356 (5

<catboost.core.CatBoostClassifier at 0x2a196817e00>

In [44]:
The initial CatBoost model trained with all 48 available features produced strong predictive performance; however, using such a large feature set introduces unnecessary complexity, increases the risk of overfitting, and makes the model less interpretable. 
So reducing the feature set to the most informative predictors is both beneficial and more practical for real-world deployment.

SyntaxError: invalid syntax (905546357.py, line 1)

In [None]:
#This is with all 48 features, which is not a good idea, we need to feature engineer/selection

In [23]:
importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns

feat_imp = (
    pd.DataFrame({"feature": feature_names, "importance": importances})
      .sort_values("importance", ascending=False)
)

print(feat_imp.head(20))

                       feature  importance
1                     AgeGroup   23.604867
34                HealthRating   21.949335
3                          BMI    9.677601
18                    WeightKg    4.735888
43                        Race    3.166247
23               KidneyDisease    3.157471
27            EmploymentStatus    2.659958
9       heartDisease_CHD_or_MI    2.495058
11                BingeDrinker    1.977750
45         IncomeCategoryLabel    1.785558
15               BiologicalSex    1.694016
37            EmotionalSupport    1.562674
12           OverweightOrObese    1.459239
16  PhysicalActivityLast30Days    1.357768
13                HeavyDrinker    1.348616
46                SmokingGroup    1.292466
17                HeightInches    1.286102
20          NumAdultsHousehold    1.179252
6               EducationLevel    1.103268
40              FoodInsecurity    0.996941


In [24]:

TARGET_COL = "HasDiabetes"
top20 = feat_imp["feature"].head(20).tolist()
print("Top 20 features:\n", top20)
# Target + Feature Selection
y = df[TARGET_COL]
X = df[top20].copy()

Top 20 features:
 ['AgeGroup', 'HealthRating', 'BMI', 'WeightKg', 'Race', 'KidneyDisease', 'EmploymentStatus', 'heartDisease_CHD_or_MI', 'BingeDrinker', 'IncomeCategoryLabel', 'BiologicalSex', 'EmotionalSupport', 'OverweightOrObese', 'PhysicalActivityLast30Days', 'HeavyDrinker', 'SmokingGroup', 'HeightInches', 'NumAdultsHousehold', 'EducationLevel', 'FoodInsecurity']


In [25]:
cat_features = [
    "AgeGroup",
    "HealthRating",
    "Race",
    "EmploymentStatus",
    "BiologicalSex",
    "EmotionalSupport",
    "OverweightOrObese",
    "PhysicalActivityLast30Days",
    "IncomeCategoryLabel",
    "SmokingGroup",
    "EducationLevel",
    "FoodInsecurity"
]

for c in cat_features:
    X[c] = X[c].astype(str)

# Train / Valid / Test Split
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp
)

print("Train:", X_train.shape, "Valid:", X_valid.shape, "Test:", X_test.shape)

# Class Imbalance Handling
class_counts = Counter(y_train)
neg = class_counts[0]
pos = class_counts[1]
pos_weight = neg / pos
print("Class weight:", pos_weight)

cat_feature_indices = [X_train.columns.get_loc(c) for c in cat_features]

train_pool = Pool(X_train, y_train, cat_features=cat_feature_indices)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_feature_indices)
test_pool  = Pool(X_test,  y_test,  cat_features=cat_feature_indices)

# Train CatBoost
model = CatBoostClassifier(
    iterations=800,
    learning_rate=0.03,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    class_weights=[1, pos_weight],
    random_seed=42,
    verbose=100
)

model.fit(train_pool, eval_set=valid_pool)

# Evaluate
val_proba = model.predict_proba(X_valid)[:, 1]
val_pred  = model.predict(X_valid)
print("\nValidation AUC:", roc_auc_score(y_valid, val_proba))
print(classification_report(y_valid, val_pred))

test_proba = model.predict_proba(X_test)[:, 1]
test_pred  = model.predict(X_test)
print("\nTest AUC:", roc_auc_score(y_test, test_proba))
print(classification_report(y_test, test_pred))


Train: (67991, 20) Valid: (14570, 20) Test: (14570, 20)
Class weight: 6.645451478691105
0:	test: 0.7899622	best: 0.7899622 (0)	total: 155ms	remaining: 2m 3s
100:	test: 0.8139393	best: 0.8139393 (100)	total: 13.3s	remaining: 1m 32s
200:	test: 0.8157924	best: 0.8157924 (200)	total: 26.1s	remaining: 1m 17s
300:	test: 0.8164773	best: 0.8164773 (300)	total: 38.9s	remaining: 1m 4s
400:	test: 0.8170087	best: 0.8170087 (400)	total: 52.1s	remaining: 51.8s
500:	test: 0.8172468	best: 0.8172890 (487)	total: 1m 5s	remaining: 39.3s
600:	test: 0.8172017	best: 0.8173144 (555)	total: 1m 19s	remaining: 26.4s
700:	test: 0.8170662	best: 0.8173144 (555)	total: 1m 33s	remaining: 13.2s
799:	test: 0.8167523	best: 0.8173144 (555)	total: 1m 46s	remaining: 0us

bestTest = 0.8173143793
bestIteration = 555

Shrink model to first 556 iterations.

Validation AUC: 0.8173143793267453
              precision    recall  f1-score   support

         0.0       0.95      0.70      0.81     12665
         1.0       0.28    

In [26]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

In [28]:
base_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    class_weights=[1, pos_weight],
    random_seed=42,
    verbose=False
)

In [29]:
param_dist = {
    "depth": [4, 5, 6, 7, 8],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
    "iterations": [400, 600, 800, 1000],
    "border_count": [64, 128, 254],
    "bagging_temperature": [0, 0.25, 0.5, 1.0],
}

In [30]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [31]:
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=15,              # you can bump to 30+ if you have time
    scoring="roc_auc",
    cv=cv,
    refit=True,
    verbose=2,
    n_jobs=-1               # use all cores
)

search.fit(
    X_train, y_train,
    cat_features=cat_feature_indices,
    eval_set=(X_valid, y_valid)
)

print("Best params:", search.best_params_)
print("Best CV AUC:", search.best_score_)
best_model = search.best_estimator_


Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best params: {'learning_rate': 0.03, 'l2_leaf_reg': 5, 'iterations': 1000, 'depth': 4, 'border_count': 128, 'bagging_temperature': 0}
Best CV AUC: 0.8194275816856537


In [36]:
def evaluate_thresholds(y_true, y_proba):
    thresholds = np.arange(0.01, 1.00, 0.01)
    results = []

    for th in thresholds:
        preds = (y_proba >= th).astype(int)

        precision = precision_score(y_true, preds, zero_division=0)
        recall = recall_score(y_true, preds, zero_division=0)
        f1 = f1_score(y_true, preds, zero_division=0)

        results.append([th, precision, recall, f1])

    results_df = pd.DataFrame(results, columns=["threshold", "precision", "recall", "f1"])
    return results_df

In [35]:
from sklearn.metrics import precision_score, recall_score, f1_score


In [37]:
th_results = evaluate_thresholds(y_valid, val_proba)

In [38]:
best_f1_row = th_results.loc[th_results["f1"].idxmax()]
print("Best threshold for F1:", best_f1_row)

Best threshold for F1: threshold    0.640000
precision    0.348458
recall       0.616798
f1           0.445329
Name: 63, dtype: float64


In [39]:
best_recall_row = th_results.loc[th_results["recall"].idxmax()]
print("Best threshold for Recall:", best_recall_row)

Best threshold for Recall: threshold    0.010000
precision    0.130928
recall       1.000000
f1           0.231541
Name: 0, dtype: float64


In [40]:
best_precision_row = th_results.loc[th_results["precision"].idxmax()]
print("Best threshold for Precision:", best_precision_row)

Best threshold for Precision: threshold    0.960000
precision    1.000000
recall       0.001050
f1           0.002098
Name: 95, dtype: float64


In [41]:
chosen_th = best_f1_row["threshold"]
val_preds_opt = (val_proba >= chosen_th).astype(int)

print(classification_report(y_valid, val_preds_opt))

              precision    recall  f1-score   support

         0.0       0.93      0.83      0.88     12665
         1.0       0.35      0.62      0.45      1905

    accuracy                           0.80     14570
   macro avg       0.64      0.72      0.66     14570
weighted avg       0.86      0.80      0.82     14570



In [43]:
test_proba = best_model.predict_proba(X_test)[:, 1]
test_preds = (test_proba >= chosen_th).astype(int)

print("Test performance with optimized threshold:")
print(classification_report(y_test, test_preds))

Test performance with optimized threshold:
              precision    recall  f1-score   support

         0.0       0.93      0.82      0.87     12664
         1.0       0.34      0.60      0.43      1906

    accuracy                           0.79     14570
   macro avg       0.63      0.71      0.65     14570
weighted avg       0.85      0.79      0.82     14570



In [None]:
# overall performance almost same
# precision for diabetes yes increased