In [None]:
# Core
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Display
from IPython.display import display

# Notebook-wide plotting defaults: seaborn "whitegrid" style and an
# 8x5 default figure size.
sns.set(style="whitegrid")
plt.rc("figure", figsize=(8, 5))

print("Environment ready.")


In [None]:
# Read the cleaned urinalysis table from the working directory and
# preview its first rows.
df = pd.read_csv("urinalysis_cleaned.csv")
print("Dataset loaded.")

display(df.head(n=5))


In [None]:
# Candidate markers for the risk score. Only those that exist as columns
# of `df` are kept, in the order listed here.
risk_features = [
    marker
    for marker in (
        "Leukocytes",
        "Nitrite",
        "Bacteria",
        "Protein",
        "Blood",
        "Ketones",
    )
    if marker in df.columns
]

print("Risk features used:", risk_features)


In [None]:
# Points added to a sample's risk score for each marker with a positive
# value, grouped by weight tier (3, 2, 1).
risk_weights = {
    **dict.fromkeys(("Leukocytes", "Nitrite", "Bacteria"), 3),
    **dict.fromkeys(("Protein", "Blood"), 2),
    "Ketones": 1,
}


In [None]:
def compute_risk_score(row, weights=None):
    """Sum the weights of all risk features that are positive in ``row``.

    Parameters
    ----------
    row : pandas.Series or dict
        One sample, keyed by feature name. Features that are absent
        from ``row`` contribute nothing.
    weights : dict, optional
        Mapping of feature name to the weight added when that feature's
        value is greater than zero. Defaults to the notebook-level
        ``risk_weights``.

    Returns
    -------
    The sum of the weights of every feature whose value is > 0
    (``0`` when none are).
    """
    if weights is None:
        weights = risk_weights

    score = 0
    for feature, weight in weights.items():
        if feature not in row:
            continue
        value = row[feature]
        # Missing values never count as a positive finding. This also
        # avoids `pd.NA > 0`, which evaluates to pd.NA and raises
        # TypeError when used as an `if` condition.
        if pd.isna(value):
            continue
        if value > 0:
            score += weight
    return score


In [None]:
# Score every sample row by row, then summarise the resulting column.
df["risk_score"] = df.apply(compute_risk_score, axis="columns")

display(df["risk_score"].to_frame().describe())


In [None]:
# Histogram (15 bins) of the risk score with a KDE overlay; the title
# and axis labels are set on the Axes that histplot returns.
sns.histplot(df["risk_score"], bins=15, kde=True).set(
    title="Distribution of Clinical Risk Scores",
    xlabel="Risk Score",
    ylabel="Frequency",
)
plt.show()


In [None]:
def risk_category(score, low_max=2, moderate_max=6):
    """Map a numeric risk score to a tier label.

    Parameters
    ----------
    score : number
        The risk score to classify.
    low_max : number, default 2
        Highest score (inclusive) still labelled ``"Low"``.
    moderate_max : number, default 6
        Highest score (inclusive) still labelled ``"Moderate"``.

    Returns
    -------
    str or None
        ``"Low"`` if ``score <= low_max``, ``"Moderate"`` if
        ``score <= moderate_max``, otherwise ``"High"``. ``None`` is
        returned for a missing score (None / NaN / pd.NA).

    Raises
    ------
    ValueError
        If ``low_max`` is greater than ``moderate_max``.
    """
    if low_max > moderate_max:
        raise ValueError("low_max must not exceed moderate_max")

    # A NaN score fails both `<=` comparisons below and would otherwise
    # fall through to "High"; report it as missing instead.
    if pd.isna(score):
        return None

    if score <= low_max:
        return "Low"
    if score <= moderate_max:
        return "Moderate"
    return "High"

# Label every sample with its risk tier, then show the size of each tier.
df["risk_category"] = df["risk_score"].map(risk_category)

display(df["risk_category"].value_counts())


In [None]:
# Bar chart of sample counts per tier, in Low -> Moderate -> High order;
# the title and axis labels are set on the Axes that countplot returns.
sns.countplot(
    x="risk_category",
    data=df,
    order=["Low", "Moderate", "High"],
).set(
    title="Risk Category Distribution",
    xlabel="Risk Tier",
    ylabel="Number of Samples",
)
plt.show()


In [None]:
# Clinical markers used for the "abnormal" flag; only those present as
# columns of `df` are kept.
clinical_cols = [
    col
    for col in (
        "Protein", "Glucose", "Ketones",
        "Leukocytes", "Blood",
        "Nitrite", "Bacteria", "Crystals",
    )
    if col in df.columns
]

# A sample is abnormal (1) when at least one clinical marker is > 0.
# Note: every feature in risk_weights is also in this list, so any sample
# with a non-zero risk score is flagged abnormal by construction.
df["abnormal"] = df[clinical_cols].gt(0).any(axis="columns").astype(int)

# Row-normalised table: share of normal/abnormal samples within each tier.
risk_vs_abnormal = pd.crosstab(
    index=df["risk_category"],
    columns=df["abnormal"],
    normalize="index",
)

display(risk_vs_abnormal)


In [None]:
# Optional comparison of the rule-based tiers with a model's predicted
# probability. Only a missing predictions file is treated as "optional
# input absent"; any other failure is reported for what it is rather
# than being hidden behind a bare `except`.
try:
    preds = pd.read_csv("model_predictions.csv")
except FileNotFoundError:
    print("Optional model predictions not found.")
else:
    if "predicted_probability" not in preds.columns:
        print(
            "model_predictions.csv has no 'predicted_probability' column; "
            "skipping comparison plot."
        )
    else:
        # The join below aligns the two frames on their index, so a
        # differing row count means some samples get no prediction.
        if len(preds) != len(df):
            print(
                f"Warning: model_predictions.csv has {len(preds)} rows but the "
                f"dataset has {len(df)}; rows are matched by index."
            )

        # rsuffix keeps the join from raising when the predictions file
        # repeats column names that already exist in df.
        merged = df.join(preds, rsuffix="_pred")

        sns.boxplot(
            x="risk_category",
            y="predicted_probability",
            data=merged,
            order=["Low", "Moderate", "High"],  # same tier order as the count plot
        )
        plt.title("Risk Category vs Model Predicted Probability")
        plt.show()


In [None]:
# Mean of each risk feature within every risk tier, rounded to 2 decimals.
risk_summary = (
    df.groupby("risk_category")[risk_features]
    .mean()
    .round(decimals=2)
)

display(risk_summary)


In [None]:
# Write the frame (including the derived risk_score, risk_category and
# abnormal columns) to CSV without the index column.
df.to_csv(path_or_buf="urinalysis_risk_stratified.csv", index=False)

print("Risk-stratified dataset saved.")
