In [None]:
# Core
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Time series
from statsmodels.tsa.seasonal import seasonal_decompose

# Display
from IPython.display import display

# Notebook-wide plotting defaults: seaborn "whitegrid" theme and a 12x6 figure.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

print("Environment ready.")


In [None]:
# Load the urinalysis CSV; the path is relative to the current working directory.
csv_path = "urinalysis_cleaned.csv"
df = pd.read_csv(csv_path)

print("Dataset loaded.")
# Show only the first rows rather than the whole frame.
display(df.head())


In [None]:
# Find the date column by name: every column whose label contains "date"
# (case-insensitive) is a candidate. str() keeps this working when a column
# label is not a string (e.g. an integer header).
date_cols = [c for c in df.columns if "date" in str(c).lower()]

if not date_cols:
    raise ValueError("No date column found. Please verify dataset.")

# The first candidate wins; make that choice visible when there were others.
date_col = date_cols[0]
if len(date_cols) > 1:
    print("Multiple date-like columns found; ignoring:", list(date_cols[1:]))

# errors="coerce" converts values that cannot be parsed into NaT instead of
# raising, so count them explicitly rather than letting them pass unnoticed.
df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

n_bad_dates = int(df[date_col].isna().sum())
if len(df) and n_bad_dates == len(df):
    # Nothing usable to build a time axis from; fail here with a clear message.
    raise ValueError(
        "Column {!r} contains no parseable dates. Please verify dataset.".format(date_col)
    )
if n_bad_dates:
    print("Warning:", n_bad_dates, "of", len(df), "rows have a missing or unparseable date.")

print("Using date column:", date_col)


In [None]:
# Order the records chronologically before building the time-indexed frame.
df = df.sort_values(date_col)

# Dates that failed to parse are NaT after coercion. A NaT label cannot be
# assigned to any month or year, so exclude those rows from the time-indexed
# frame explicitly and report how many were removed. `df` itself keeps all rows.
n_missing_dates = int(df[date_col].isna().sum())
if n_missing_dates:
    print("Dropping", n_missing_dates, "rows with a missing or unparseable date.")

df_ts = df.dropna(subset=[date_col]).set_index(date_col)

print("Time range:")
print(df_ts.index.min(), "to", df_ts.index.max())


In [None]:
# Monthly mean of each indicator. All listed columns must exist in df_ts.
# NOTE(review): pandas >= 2.2 deprecates the "M" alias in favour of "ME";
# kept as "M" here for compatibility with older pandas — confirm target version.
indicator_cols = [
    "pH",
    "Specific Gravity",
    "Leukocytes",
    "Nitrite",
    "Protein",
    "Bacteria",
]
monthly_means_spec = dict.fromkeys(indicator_cols, "mean")

monthly = df_ts.resample("M").agg(monthly_means_spec)

display(monthly.head())


In [None]:
# Rows per calendar month, plotted as the monthly test volume.
monthly_counts = df_ts.resample("M").size()

ax = monthly_counts.plot(title="Monthly Urinalysis Test Volume")
ax.set_ylabel("Number of Tests")
plt.show()


In [None]:
# Monthly means of the two columns selected below, drawn on one set of axes.
physchem_cols = ["pH", "Specific Gravity"]
monthly[physchem_cols].plot(title="Monthly Trends: pH and Specific Gravity")
plt.show()


In [None]:
# Monthly means of the three indicators the plot title groups as UTI-related.
uti_cols = ["Leukocytes", "Nitrite", "Bacteria"]
monthly[uti_cols].plot(title="Monthly Trends: UTI-Related Indicators")
plt.show()


In [None]:
# Smooth every monthly series with a 3-month rolling mean. With the default
# settings the first two months lack a full window and come out as NaN.
smoothing_window = 3
rolling = monthly.rolling(smoothing_window).mean()

rolling.plot(title="3-Month Rolling Average (Key Indicators)")
plt.show()


In [None]:
# Additive decomposition of monthly test volume with a 12-month period.
# It is only attempted with at least 24 monthly points (two full periods).
seasonal_period = 12
has_enough_history = monthly_counts.shape[0] >= 24

if not has_enough_history:
    print("Not enough data points for seasonal decomposition.")
else:
    decomposition = seasonal_decompose(
        monthly_counts, model="additive", period=seasonal_period
    )
    decomposition.plot()
    plt.show()


In [None]:
# Define abnormal urinalysis (any positive clinical marker)
candidate_markers = [
    "Protein", "Glucose", "Ketones",
    "Leukocytes", "Blood", "Nitrite",
    "Bacteria", "Crystals"
]

# Only markers actually present in the dataset take part in the definition.
clinical_cols = [c for c in candidate_markers if c in df_ts.columns]

# With no marker columns at all, every row would be flagged normal and the
# rate would be a meaningless flat zero. Fail loudly instead, in the same way
# the missing-date-column case does.
if not clinical_cols:
    raise ValueError("No clinical marker columns found. Please verify dataset.")

# A row is abnormal (1) when at least one marker is strictly positive.
# Missing values compare as False under "> 0", so they count as not positive.
df_ts["Abnormal"] = (df_ts[clinical_cols] > 0).any(axis=1).astype(int)

# The mean of a 0/1 flag per month is the share of abnormal results that month.
abnormal_rate = df_ts.resample("M")["Abnormal"].mean()

abnormal_rate.plot(
    title="Monthly Proportion of Abnormal Urinalysis Results"
)
plt.ylabel("Abnormal Rate")
plt.show()


In [None]:
# Split the monthly abnormal rate at the cutoff date: months labelled before it
# form the "pre" segment, months on or after it form the "post" segment.
covid_cutoff = "2020-03-01"

is_pre = abnormal_rate.index < covid_cutoff
is_post = abnormal_rate.index >= covid_cutoff

pre_covid = abnormal_rate[is_pre]
post_covid = abnormal_rate[is_post]

# Draw each segment as its own line so the legend distinguishes the periods.
for segment_label, segment in (("Pre-COVID", pre_covid), ("Post-COVID", post_covid)):
    plt.plot(segment, label=segment_label)

plt.legend()
plt.title("Abnormal Urinalysis Rate: Pre vs Post COVID")
plt.show()


In [None]:
# Per-year mean of the key indicators and of the 0/1 "Abnormal" flag.
# The "Abnormal" column must already have been added to df_ts.
yearly_cols = [
    "pH",
    "Specific Gravity",
    "Leukocytes",
    "Nitrite",
    "Bacteria",
    "Abnormal",
]
by_year = df_ts.groupby(df_ts.index.year)
yearly_summary = by_year.agg(dict.fromkeys(yearly_cols, "mean"))

display(yearly_summary)


In [None]:
# Write both monthly series to CSV files in the current working directory.
outputs = (
    ("monthly_urinalysis_trends.csv", monthly),
    ("monthly_abnormal_rate.csv", abnormal_rate),
)
for out_path, table in outputs:
    table.to_csv(out_path)

print("Time series outputs saved.")
