In [None]:
# Core
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistics
from scipy.stats import spearmanr, chi2_contingency

# Display
from IPython.display import display

# Notebook-wide plot defaults: apply the seaborn "whitegrid" style first,
# then set the default figure size used by cells that do not pass their own.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})
print("Environment ready.")


In [None]:
# Read the cleaned urinalysis table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="urinalysis_cleaned.csv")
print("Dataset loaded.")

# Preview the first five rows as a quick check that the load worked.
display(df.head(5))


In [None]:
# Candidate columns are filtered against df.columns, so the lists below only
# ever contain names that are actually present in the loaded table.

# Numerical variables
numerical_cols = [
    name
    for name in ("pH", "Specific Gravity")
    if name in df.columns
]

# Ordinal / Binary clinical variables
clinical_cols = [
    name
    for name in (
        "Protein",
        "Glucose",
        "Ketones",
        "Leukocytes",
        "Blood",
        "Nitrite",
        "Bacteria",
        "Crystals",
    )
    if name in df.columns
]

print("Numerical columns:", numerical_cols)
print("Clinical categorical/ordinal columns:", clinical_cols)


In [None]:
# Restrict to the selected variables and drop every row that has a missing
# value in any of them, so all pairwise coefficients share the same rows.
selected_cols = [*numerical_cols, *clinical_cols]
corr_data = df.loc[:, selected_cols].dropna()

# Full pairwise Spearman rank-correlation matrix over the retained rows.
spearman_corr = corr_data.corr(method="spearman")
display(spearman_corr)


In [None]:
# Annotated heatmap of the Spearman matrix; the diverging palette is centred
# on zero so positive and negative coefficients get opposite colours.
fig, ax = plt.subplots(figsize=(12, 10))
heatmap_options = {
    "annot": True,
    "fmt": ".2f",
    "cmap": "coolwarm",
    "center": 0,
}
sns.heatmap(spearman_corr, ax=ax, **heatmap_options)
ax.set_title("Spearman Correlation Matrix")
plt.show()


In [None]:
results = []

# Visit each unordered pair of clinical columns exactly once.
for position, col1 in enumerate(clinical_cols):
    for col2 in clinical_cols[position + 1:]:
        # Pairwise deletion: only rows where both variables are present.
        pair = df[[col1, col2]].dropna()
        # Pairs with 10 or fewer complete rows are skipped.
        if len(pair) <= 10:
            continue
        rho, p_val = spearmanr(pair[col1], pair[col2])
        results.append([col1, col2, rho, p_val])

spearman_columns = ["Variable 1", "Variable 2", "Spearman r", "p-value"]
corr_results = pd.DataFrame(results, columns=spearman_columns)

# Show the ten pairs with the smallest p-values.
by_p_value = corr_results.sort_values("p-value")
display(by_p_value.head(10))


In [None]:
chi_results = []

# Chi-square test of independence for each unordered pair of clinical columns.
for position, col1 in enumerate(clinical_cols):
    for col2 in clinical_cols[position + 1:]:
        table = pd.crosstab(df[col1], df[col2])

        # The test needs at least two rows and two columns in the table.
        if min(table.shape) < 2:
            continue

        statistic, p_val, _dof, _expected = chi2_contingency(table)
        chi_results.append([col1, col2, statistic, p_val])

chi_columns = ["Variable 1", "Variable 2", "Chi-square", "p-value"]
chi_df = pd.DataFrame(chi_results, columns=chi_columns)

# Show the ten pairs with the smallest p-values.
by_p_value = chi_df.sort_values("p-value")
display(by_p_value.head(10))


In [None]:
def cramers_v(confusion_matrix):
    """Compute Cramér's V for a two-way contingency table.

    V = sqrt(chi2 / (n * (min(r, k) - 1))), where chi2 is the Pearson
    chi-square statistic, n the total count, and r, k the table dimensions.

    Parameters
    ----------
    confusion_matrix : pandas.DataFrame or 2-D array-like
        Observed counts, e.g. the output of ``pd.crosstab``.

    Returns
    -------
    float
        Cramér's V, or NaN when the table has fewer than two rows or fewer
        than two columns (the denominator would be zero).

    Raises
    ------
    ValueError
        Propagated from ``chi2_contingency`` when it rejects the table
        (for example when an expected frequency is zero).
    """
    observed = np.asarray(confusion_matrix)
    r, k = observed.shape

    # With a single row or column the statistic is undefined; return NaN
    # explicitly instead of relying on a 0/0 division.
    if min(r, k) < 2:
        return np.nan

    # correction=False: scipy applies Yates' continuity correction by default
    # to 2x2 tables, which shrinks chi2 and caps V below 1 even for perfectly
    # associated variables. Cramér's V uses the uncorrected statistic.
    chi2 = chi2_contingency(observed, correction=False)[0]
    n = observed.sum()
    return np.sqrt(chi2 / (n * (min(r, k) - 1)))

cramer_results = []

# Every ordered pair (col1, col2) with col1 != col2 is evaluated, so each
# unordered pair contributes two rows, one per orientation of the crosstab.
for col1 in clinical_cols:
    for col2 in clinical_cols:
        if col1 == col2:
            continue
        table = pd.crosstab(df[col1], df[col2])
        # Skip tables that do not have at least two rows and two columns.
        if min(table.shape) < 2:
            continue
        cramer_results.append([col1, col2, cramers_v(table)])

cramer_columns = ["Variable 1", "Variable 2", "Cramér's V"]
cramer_df = pd.DataFrame(cramer_results, columns=cramer_columns)

# Show the ten rows with the largest association strength.
strongest_first = cramer_df.sort_values("Cramér's V", ascending=False)
display(strongest_first.head(10))


In [None]:
# Count plot of Leukocytes values, split by Bacteria; drawn only when both
# columns exist in the loaded table.
if all(name in df.columns for name in ("Leukocytes", "Bacteria")):
    ax = sns.countplot(data=df, x="Leukocytes", hue="Bacteria")
    ax.set_title("Leukocytes vs Bacteria")
    plt.show()


In [None]:
# Box plot of Specific Gravity for each Protein value; drawn only when both
# columns exist in the loaded table.
if all(name in df.columns for name in ("Protein", "Specific Gravity")):
    ax = sns.boxplot(data=df, x="Protein", y="Specific Gravity")
    ax.set_title("Protein Level vs Specific Gravity")
    plt.show()


In [None]:
# Keep pairs whose Spearman p-value is below 0.05 and whose coefficient
# exceeds 0.3 in absolute value, ordered from most positive to most negative.
is_significant = corr_results["p-value"] < 0.05
is_strong = corr_results["Spearman r"].abs() > 0.3

summary = corr_results.loc[is_significant & is_strong]
summary = summary.sort_values("Spearman r", ascending=False)

display(summary)


In [None]:
# The correlation matrix is written with its index (the row labels are the
# variable names); the two long-format result tables are written without one.
spearman_corr.to_csv("spearman_correlation_matrix.csv")

long_format_exports = (
    (chi_df, "chi_square_results.csv"),
    (cramer_df, "cramers_v_results.csv"),
)
for frame, path in long_format_exports:
    frame.to_csv(path, index=False)

print("Correlation and association results exported.")
