In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the raw PCOS workbook.
pcos_df = pd.read_excel("../data/tabular/PCOS.xlsx")

# Every later cell addresses columns by lower-case, underscore-separated
# names without parentheses ("pcos_y/n", "sl._no", "unnamed:_44"), while the
# workbook itself carries headers such as "PCOS (Y/N)". Normalise the headers
# here so the notebook runs top to bottom on a fresh kernel.
# NOTE(review): this rule is reconstructed from the column names used in the
# cells below -- confirm it matches the original preprocessing.
pcos_df.columns = (
    pcos_df.columns.astype(str)
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)

# The target is renamed from "pcos_y/n" to "pcos" only in a later cell, so
# accept either spelling here instead of raising KeyError on "pcos".
target_col = "pcos" if "pcos" in pcos_df.columns else "pcos_y/n"

# Target
y = pcos_df[target_col]

# Features
X = pcos_df.drop(columns=[target_col])

print("X shape:", X.shape)
print("y shape:", y.shape)


In [None]:
# Give the label column the short name that the rest of the notebook uses.
target_rename = {"pcos_y/n": "pcos"}
pcos_df.rename(columns=target_rename, inplace=True)


In [None]:
# Remove three columns from the working frame in place so they never reach
# the feature matrix built in later cells.
# NOTE(review): presumably identifier columns plus an empty spill-over
# column -- confirm against the workbook.
unused_columns = ["sl._no", "patient_file_no.", "unnamed:_44"]
pcos_df.drop(columns=unused_columns, inplace=True)


In [None]:
# Show the column labels currently present in the working frame.
remaining_columns = pcos_df.columns
print(remaining_columns)


In [None]:
# Separate the label vector from the feature matrix and report both shapes.
target_name = "pcos"
y = pcos_df[target_name]
X = pcos_df.drop(columns=[target_name])

for label, frame in (("X shape:", X), ("y shape:", y)):
    print(label, frame.shape)


In [None]:
# Names of the columns that the modelling cells pass to CatBoost as
# categorical features (order matters: indices are derived from it later).
categorical_cols = [
    "blood_group", "cycler/i", "pregnanty/n",
    "weight_gainy/n", "hair_growthy/n", "skin_darkening_y/n",
    "hair_lossy/n", "pimplesy/n",
    "fast_food_y/n", "reg.exercisey/n",
]


In [None]:
# Translate each categorical column name into its positional index in X,
# then display the resulting list.
cat_features = list(map(X.columns.get_loc, categorical_cols))
cat_features


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on y, with a fixed seed.
# NOTE(review): X is cleaned in later cells and the split is repeated there,
# so these frames hold the pre-cleaning values.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(X_train.shape, X_test.shape)


In [None]:
# Coerce one column to numbers: render every value as text, delete every
# character that is not a digit, dot or minus sign, then parse. Anything that
# still fails to parse becomes NaN (errors="coerce").
hcg_col = "ii____beta-hcgmiu/ml"
hcg_text = X[hcg_col].astype(str).str.replace(r"[^\d.-]", "", regex=True)
X[hcg_col] = pd.to_numeric(hcg_text, errors="coerce")



In [None]:
# Display the dtype of the column cleaned in the previous cell.
X.dtypes["ii____beta-hcgmiu/ml"]


In [None]:
import pandas as pd
import numpy as np

# Columns that the modelling cells pass to CatBoost as categorical features.
categorical_cols = [
    "blood_group",
    "cycler/i",
    "pregnanty/n",
    "weight_gainy/n",
    "hair_growthy/n",
    "skin_darkening_y/n",
    "hair_lossy/n",
    "pimplesy/n",
    "fast_food_y/n",
    "reg.exercisey/n"
]

# Every remaining column is treated as numeric.
numeric_cols = [col for col in X.columns if col not in categorical_cols]

# Clean numeric columns.
for col in numeric_cols:
    # Columns that already hold numbers are left untouched. Sending them
    # through str() would render very small or very large floats in scientific
    # notation ("1e-05"), and the regex below deletes the "e"/"+" characters,
    # turning such values into NaN or into a different number.
    if pd.api.types.is_numeric_dtype(X[col]):
        continue

    # Text columns: trim whitespace, keep only digits, dots and minus signs,
    # then parse; values that still cannot be parsed become NaN.
    cleaned = (
        X[col]
        .astype(str)
        .str.strip()
        .str.replace(r"[^\d.-]", "", regex=True)
    )
    X[col] = pd.to_numeric(cleaned, errors="coerce")


In [None]:
# Display the columns whose dtype still compares equal to "object".
column_dtypes = X.dtypes
column_dtypes[column_dtypes == "object"]


In [None]:
# Store every categorical column with pandas' nullable integer dtype, then
# display the resulting dtypes for those columns.
# NOTE(review): a later cell additionally converts these columns to str
# before fitting -- confirm whether this Int64-only variant is still needed.
nullable_int = "Int64"
for cat_col in categorical_cols:
    X[cat_col] = X[cat_col].astype(nullable_int)

X[categorical_cols].dtypes



In [None]:
from catboost import CatBoostClassifier

# Classifier configuration: log-loss objective, AUC as the reported metric,
# fixed seed, and a progress line every 100 iterations.
catboost_params = {
    "iterations": 500,
    "learning_rate": 0.05,
    "depth": 6,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "random_seed": 42,
    "verbose": 100,
}
model = CatBoostClassifier(**catboost_params)


In [None]:
# Positional indices of the categorical columns within X.
cat_features = []
for cat_name in categorical_cols:
    cat_features.append(X.columns.get_loc(cat_name))

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split; the test part is reserved for the final
# evaluation only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# CatBoost keeps the best iteration measured on eval_set by default
# (use_best_model is on whenever an eval set is supplied). Passing the test
# set there would select the model on the very data used to score it, so a
# validation fold is carved out of the training part instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

model.fit(
    X_fit,
    y_fit,
    cat_features=cat_features,
    eval_set=(X_val, y_val)
)


In [None]:
# Recreate X and y cleanly
y = pcos_df["pcos"]
X = pcos_df.drop(columns=["pcos"])


In [None]:
# Column names treated as categorical by the cells that follow; kept as a
# list because it is used for column selection and index lookup.
categorical_cols = list((
    "blood_group",
    "cycler/i",
    "pregnanty/n",
    "weight_gainy/n",
    "hair_growthy/n",
    "skin_darkening_y/n",
    "hair_lossy/n",
    "pimplesy/n",
    "fast_food_y/n",
    "reg.exercisey/n",
))


In [None]:
# Convert each categorical column to nullable integers first and then to
# text, so the values are stored as strings.
for cat_col in categorical_cols:
    as_nullable_int = X[cat_col].astype("Int64")
    X[cat_col] = as_nullable_int.astype(str)


In [None]:
# Every column that is not listed as categorical is treated as numeric.
numeric_cols = [col for col in X.columns if col not in categorical_cols]

for col in numeric_cols:
    # Columns that already hold numbers are left untouched. Sending them
    # through str() would render very small or very large floats in scientific
    # notation ("1e-05"), and the regex below deletes the "e"/"+" characters,
    # turning such values into NaN or into a different number.
    if pd.api.types.is_numeric_dtype(X[col]):
        continue

    # Text columns: trim whitespace, keep only digits, dots and minus signs,
    # then parse; values that still cannot be parsed become NaN.
    cleaned = (
        X[col]
        .astype(str)
        .str.strip()
        .str.replace(r"[^\d.-]", "", regex=True)
    )
    X[col] = pd.to_numeric(cleaned, errors="coerce")


In [None]:
# Spot-check dtypes: all categorical columns, then the first few numeric ones.
categorical_dtypes = X[categorical_cols].dtypes
numeric_dtypes_preview = X[numeric_cols].dtypes.head()
print(categorical_dtypes)
print(numeric_dtypes_preview)


In [None]:
# Display the columns whose dtype compares equal to "object".
dtype_per_column = X.dtypes
dtype_per_column[dtype_per_column == "object"]


In [None]:
# Map the categorical column names to their positions in the rebuilt X.
cat_features = list(map(X.columns.get_loc, categorical_cols))


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the cleaned data with a fixed seed.
holdout_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = holdout_parts


In [None]:
from catboost import CatBoostClassifier

# Fresh, unfitted classifier: log-loss objective, AUC as the reported metric,
# fixed seed, and a progress line every 100 iterations.
model_settings = dict(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=100,
)
model = CatBoostClassifier(**model_settings)


In [None]:
# CatBoost keeps the best iteration measured on eval_set by default
# (use_best_model is on whenever an eval set is supplied). Passing the test
# set there would select the model on the very data that the evaluation cells
# score it on, so a validation fold is carved out of the training part and
# the test set stays untouched until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

model.fit(
    X_fit,
    y_fit,
    cat_features=cat_features,
    eval_set=(X_val, y_val)
)


In [None]:
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix
)

# Predicted probabilities, taken from column 1 of predict_proba.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Hard labels at the default 0.5 cut-off.
default_cutoff = 0.5
y_pred = (y_pred_proba >= default_cutoff).astype(int)

# Compute the three summaries first, then print them in a fixed order.
auc_value = roc_auc_score(y_test, y_pred_proba)
report_text = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("ROC-AUC:", auc_value)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", matrix)


In [None]:
# Re-score the same probabilities with a lower decision cut-off.
threshold = 0.35
meets_threshold = y_pred_proba >= threshold
y_pred_035 = meets_threshold.astype(int)

print("Threshold:", threshold)
for summary in (
    classification_report(y_test, y_pred_035),
    confusion_matrix(y_test, y_pred_035),
):
    print(summary)


In [None]:
import shap

# Wrap the fitted model in a tree explainer and compute SHAP values for the
# training rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)


In [None]:
# Bar-style SHAP summary for the training rows, rendered immediately.
shap.summary_plot(shap_values, X_train, plot_type="bar", show=True)


In [None]:
# Default-style SHAP summary for the training rows, rendered immediately.
shap.summary_plot(shap_values, X_train, show=True)


In [None]:
# Persist the fitted model to the models directory.
model_path = "../models/catboost_pcos_model.cbm"
model.save_model(model_path)


In [None]:
import pandas as pd

# Re-read the workbook and list its columns minus the raw target header.
# NOTE(review): the training cells work on renamed columns ("pcos") and also
# drop "sl._no", "patient_file_no." and "unnamed:_44", so this list is not
# the exact training feature set -- confirm the intent. X is overwritten here.
df = pd.read_excel("../data/tabular/PCOS.xlsx")

# This must match your training step
raw_target = "PCOS (Y/N)"
X = df.drop(columns=[raw_target])

print(X.columns.tolist())