In [None]:
# Core
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    roc_curve
)

# Explainability
import shap

# Persistence
import joblib

# Display
from IPython.display import display

# Notebook-wide plot styling: seaborn "whitegrid" theme first, then the
# default figure size (set afterwards so the theme call cannot override it).
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (8, 5)})

print("Environment ready.")


In [None]:
# Read the urinalysis CSV (path is relative to the working directory)
# and show its first rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="urinalysis_cleaned.csv")

print("Dataset loaded.")
display(df.head(n=5))


In [None]:
# Markers that define the label; names missing from the dataset are skipped.
clinical_cols = [
    marker
    for marker in (
        "Protein", "Glucose", "Ketones", "Leukocytes",
        "Blood", "Nitrite", "Bacteria", "Crystals",
    )
    if marker in df.columns
]

# A row is labelled abnormal (1) when at least one available marker is
# strictly greater than zero, otherwise 0. NaN compares as False here, so a
# missing reading never triggers the flag on its own.
df["abnormal"] = df[clinical_cols].gt(0).any(axis=1).astype(int)

# Class balance of the derived label.
display(df["abnormal"].value_counts())


In [None]:
# Candidate predictor columns; any name absent from the dataset is skipped.
feature_cols = [
    "pH",
    "Specific Gravity",
    "Protein",
    "Glucose",
    "Ketones",
    "Leukocytes",
    "Blood",
    "Nitrite",
    "Bacteria",
    "Crystals"
]

feature_cols = [c for c in feature_cols if c in df.columns]

# Target-leakage check: `abnormal` is computed directly from `clinical_cols`
# (any marker > 0), so every one of those columns that is also a feature lets
# a model reconstruct the label exactly. Metrics obtained that way measure how
# well the rule was re-learned, not predictive skill. The feature set is left
# unchanged here; the overlap is only surfaced so it cannot go unnoticed.
# NOTE(review): decide whether these columns should be dropped from the
# features or the label should come from an independent source.
leaky_cols = [c for c in feature_cols if c in clinical_cols]
if leaky_cols:
    print(
        "WARNING: features that also define the 'abnormal' label "
        "(target leakage):",
        leaky_cols,
    )

X = df[feature_cols]
y = df["abnormal"]

print("Features used:", feature_cols)


In [None]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Report (rows, columns) of each partition.
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


In [None]:
# Fit the scaler on the training rows only, so no test-set statistics leak
# into the transform, then apply that fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)


In [None]:
# Logistic regression trained on the standardised training features
# (`fit` returns the estimator itself, so it can be chained).
log_reg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Hard class predictions, plus the second column of `predict_proba`
# (the probability of the class listed second in `log_reg.classes_`).
y_pred_lr = log_reg.predict(X_test_scaled)
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]


In [None]:
# Random forest trained on the unscaled features. `class_weight="balanced"`
# reweights classes by inverse frequency; the seed fixes the tree sampling.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=6, class_weight="balanced", random_state=42
).fit(X_train, y_train)

# Hard class predictions, plus the second column of `predict_proba`
# (the probability of the class listed second in `rf.classes_`).
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]


In [None]:
def evaluate_model(y_true, y_pred, y_prob):
    """Score one classifier's test-set output.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels; used for accuracy, precision, recall and F1.
    y_prob : array-like
        Predicted scores/probabilities; used only for ROC-AUC.

    Returns
    -------
    dict
        Metric name -> value, with keys in the order "Accuracy",
        "Precision", "Recall", "F1", "ROC-AUC".
    """
    # Metrics that are computed from hard label predictions.
    label_metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    scores = {name: metric(y_true, y_pred) for name, metric in label_metrics}

    # ROC-AUC is the only metric that consumes the continuous scores.
    scores["ROC-AUC"] = roc_auc_score(y_true, y_prob)
    return scores

# One row per model, one column per metric.
results = pd.DataFrame(
    [
        evaluate_model(y_test, y_pred_lr, y_prob_lr),
        evaluate_model(y_test, y_pred_rf, y_prob_rf),
    ],
    index=["Logistic Regression", "Random Forest"],
)

display(results)


In [None]:
# Side-by-side confusion matrices, one panel per model.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

model_predictions = {
    "Logistic Regression": y_pred_lr,
    "Random Forest": y_pred_rf,
}

for ax, (model_name, preds) in zip(axes, model_predictions.items()):
    sns.heatmap(
        confusion_matrix(y_test, preds),
        annot=True, fmt="d", ax=ax
    )
    ax.set_title(model_name)
    # confusion_matrix puts true classes on rows and predicted classes on
    # columns; label the axes so each panel can be read on its own.
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")

# Keep the added axis labels from colliding between the two panels.
fig.tight_layout()
plt.show()


In [None]:
# ROC points for each model; the third return value (thresholds) is unused.
fpr_lr, tpr_lr, _ = roc_curve(y_true=y_test, y_score=y_prob_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_true=y_test, y_score=y_prob_rf)

# Draw both curves on the current axes.
ax = plt.gca()
ax.plot(fpr_lr, tpr_lr, label="Logistic Regression")
ax.plot(fpr_rf, tpr_rf, label="Random Forest")
ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal reference line
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curves",
)
ax.legend()
plt.show()


In [None]:
# Pair each feature name with the forest's importance score, highest first.
importance_df = pd.DataFrame(
    {"Feature": feature_cols, "Importance": rf.feature_importances_}
)
importance_df = importance_df.sort_values(by="Importance", ascending=False)

display(importance_df)

# Horizontal bar chart in the same (descending) order as the table.
sns.barplot(data=importance_df, x="Importance", y="Feature")
plt.title("Random Forest Feature Importance")
plt.show()


In [None]:
# SHAP attributions for the random forest on the held-out rows.
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

# The layout of `shap_values` depends on the installed SHAP version:
#   * older releases return a list with one (n_samples, n_features) array
#     per class;
#   * newer releases return a single (n_samples, n_features, n_classes) array.
# Indexing `[1]` is only correct for the list form; on the 3-D array it would
# select the second *sample* instead of the positive class. Handle both.
if isinstance(shap_values, list):
    shap_values_positive = shap_values[1]
else:
    shap_array = np.asarray(shap_values)
    shap_values_positive = (
        shap_array[:, :, 1] if shap_array.ndim == 3 else shap_array
    )

# Mean |SHAP| per feature for the positive ("abnormal" == 1) class.
shap.summary_plot(shap_values_positive, X_test, plot_type="bar")


In [None]:
# Persist the fitted estimators. `joblib` is imported in the notebook's
# top import cell so every dependency is declared in one place.
joblib.dump(rf, "random_forest_urinalysis.pkl")
joblib.dump(log_reg, "logistic_regression_urinalysis.pkl")

# The logistic regression was trained on standardised inputs, so the fitted
# scaler must be saved with it; without it the pickled model cannot be
# applied to raw feature values later.
joblib.dump(scaler, "scaler_urinalysis.pkl")

# Metric table, one row per model.
results.to_csv("model_performance_summary.csv")

print("Models and results saved.")
