In [2]:
import pandas as pd

# Read the two Excel workbooks; `df1` and `df2` keep the raw per-file frames.
df1, df2 = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "../data/tabular/PCOS.xlsx",
        "../data/tabular/PCOS_infertility.xlsx",
    )
)

# Stack the rows of both frames under a fresh 0..n-1 index. Columns present in
# only one workbook are NaN for the rows that came from the other workbook.
# NOTE(review): if both files describe the same patients, a key-based merge may
# be intended rather than row stacking — confirm against the data.
frames_to_stack = [df1, df2]
df = pd.concat(frames_to_stack, ignore_index=True)

In [3]:
def normalize_target(x):
    """Map a raw PCOS label to a binary integer target.

    Args:
        x: Raw label value of any type (text, int, float, missing, ...).

    Returns:
        1 when the value, after trimming and upper-casing its text form, is
        "Y", "YES" or "1", or when it parses as a number equal to 1
        (for example the float 1.0 or the text "1.0"). 0 for everything else,
        including missing values and unrecognised text.
    """
    token = str(x).strip().upper()
    if token in ("Y", "YES", "1"):
        return 1

    # An integer label column is stored as float as soon as it contains a
    # missing value, so a positive label can arrive as 1.0 ("1.0" as text).
    # Compare numerically so those are not silently labelled 0.
    try:
        return 1 if float(token) == 1.0 else 0
    except ValueError:
        # Not numeric text (e.g. "N", "NO", "NONE"): treated as negative.
        return 0

# Derive the binary label column by passing every raw "PCOS (Y/N)" value
# through normalize_target.
raw_labels = df["PCOS (Y/N)"]
df["target"] = raw_labels.map(normalize_target)


In [4]:
# Remove the raw label column now that `target` carries the encoded label.
# errors="ignore" keeps this cell re-runnable once the column is already gone.
df = df.drop(columns="PCOS (Y/N)", errors="ignore")


In [5]:
# Columns handled as categorical features; every other non-target column is
# treated as numeric further down.
CATEGORICAL_COLS = [
    "Blood Group",
    "Cycle(R/I)",
    "Pregnant(Y/N)",
    "Weight gain(Y/N)",
    "hair growth(Y/N)",
    "Skin darkening (Y/N)",
    "Hair loss(Y/N)",
    "Pimples(Y/N)",
    "Fast food (Y/N)",
    "Reg.Exercise(Y/N)",
]

# Numeric columns are defined by exclusion: everything in the frame that is
# neither a declared categorical column nor the label, in frame column order.
# NOTE(review): this also sweeps in any identifier or free-text columns the
# workbooks may contain — confirm that is intended.
_not_numeric = [*CATEGORICAL_COLS, "target"]
NUMERIC_COLS = [name for name in df.columns if name not in _not_numeric]

# Coerce every column in NUMERIC_COLS to a numeric dtype, then impute the gaps.
for col in NUMERIC_COLS:
    raw_values = df[col]

    # Parse first. Values that are already numbers (or cleanly numeric text)
    # must not go through the character filter below: str() renders very large
    # or very small floats in exponent form ("1e+16", "1e-05"), and stripping
    # the "e"/"+" from that text silently changes the value ("116") or turns
    # it into something unparseable ("1-05").
    parsed = pd.to_numeric(raw_values, errors="coerce")

    # Infinite values are treated as missing so they are imputed below instead
    # of distorting the column.
    parsed = parsed.replace([float("inf"), float("-inf")], float("nan"))

    # Only entries that are present but did not parse get the text clean-up:
    # trim, drop every character except digits, "." and "-", then retry.
    # Anything still unparseable stays NaN.
    needs_cleanup = parsed.isna() & raw_values.notna()
    if needs_cleanup.any():
        cleaned_text = (
            raw_values[needs_cleanup]
            .astype(str)
            .str.strip()
            .str.replace(r"[^\d\.-]", "", regex=True)
        )
        # Positional assignment (to_numpy) so the result does not depend on
        # index alignment.
        parsed.loc[needs_cleanup] = pd.to_numeric(
            cleaned_text, errors="coerce"
        ).to_numpy()

    df[col] = parsed

# Fill remaining gaps with each column's median. A column with no valid value
# has a NaN median and therefore stays NaN.
df[NUMERIC_COLS] = df[NUMERIC_COLS].fillna(df[NUMERIC_COLS].median())


In [6]:
# Canonicalise every categorical column to upper-cased, trimmed text.
# Whole-cell values "YES"/"NO" are collapsed to "Y"/"N" (Series.replace matches
# entire values, not substrings). Because values are stringified first, missing
# entries become text as well (a NaN ends up as "NAN").
yes_no_aliases = {"YES": "Y", "NO": "N"}

for col in CATEGORICAL_COLS:
    as_text = df[col].astype(str)
    normalised = as_text.str.upper().str.strip()
    df[col] = normalised.replace(yes_no_aliases)


In [7]:
from pathlib import Path

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is the `target` column.
X = df.drop(columns=["target"])
y = df["target"]

# Seeded 80/20 split, stratified on the label so both parts keep the same
# class balance.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Make sure the output directory exists before the training run starts, so the
# save step cannot fail on a missing directory once training has finished.
model_path = Path("../models/catboost_pcos_tabular_clean.cbm")
model_path.parent.mkdir(parents=True, exist_ok=True)

model = CatBoostClassifier(
    iterations=1200,
    depth=6,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="AUC",
    cat_features=CATEGORICAL_COLS,
    random_seed=42,
    verbose=100  # print one progress line every 100 iterations
)

# The validation split is passed as the eval set, so the AUC in the log is
# measured on the same rows that are used to pick the best iteration.
# NOTE(review): a separate held-out test split would give an unbiased
# estimate — confirm whether one exists elsewhere.
model.fit(X_train, y_train, eval_set=(X_val, y_val))
model.save_model(str(model_path))


0:	test: 0.6859444	best: 0.6859444 (0)	total: 172ms	remaining: 3m 26s
100:	test: 0.8655701	best: 0.8698148 (41)	total: 4.12s	remaining: 44.8s
200:	test: 0.8797993	best: 0.8797993 (200)	total: 7.98s	remaining: 39.6s
300:	test: 0.8856840	best: 0.8856840 (300)	total: 12.1s	remaining: 36.1s
400:	test: 0.8834652	best: 0.8872275 (330)	total: 16.3s	remaining: 32.4s
500:	test: 0.8794135	best: 0.8872275 (330)	total: 20.4s	remaining: 28.4s
600:	test: 0.8766159	best: 0.8872275 (330)	total: 24.5s	remaining: 24.4s
700:	test: 0.8771947	best: 0.8872275 (330)	total: 28.7s	remaining: 20.4s
800:	test: 0.8758441	best: 0.8872275 (330)	total: 32.8s	remaining: 16.3s
900:	test: 0.8772911	best: 0.8872275 (330)	total: 36.7s	remaining: 12.2s
1000:	test: 0.8763265	best: 0.8872275 (330)	total: 40.5s	remaining: 8.05s
1100:	test: 0.8766159	best: 0.8872275 (330)	total: 44.3s	remaining: 3.98s
1199:	test: 0.8743971	best: 0.8872275 (330)	total: 48.2s	remaining: 0us

bestTest = 0.8872274744
bestIteration = 330

Shrink m