In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from matplotlib.dates import AutoDateLocator
from matplotlib.dates import DayLocator, DateFormatter

In [None]:
# Load the dataset
df = pd.read_csv("../data/plant_health_data.csv")

# Convert timestamp to datetime first, so the missing-value filter and the
# sort below work on real datetimes instead of the raw values from the CSV
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Drop rows without a timestamp and put the readings in chronological order
df = df.dropna(subset=["Timestamp"]).sort_values("Timestamp")
plant_ids = sorted(df["Plant_ID"].unique())

# To get the feature columns: everything that is not an identifier or the label
non_feature = {"Timestamp", "Plant_ID", "Plant_Health_Status"}
feature_columns = [c for c in df.columns if c not in non_feature]

# Display basic info about the dataset
print(f"Dataset shape: {df.shape}")
print(f"Number of unique plants: {df['Plant_ID'].nunique()}")
print(f"Plant IDs: {plant_ids}")
print(f"Date range: {df['Timestamp'].min()} to {df['Timestamp'].max()}")
print("Number of records per plant:")
print(df['Plant_ID'].value_counts().sort_index())

# Summary stats about the features, showing mean, std and different percentile values
summary = (df[feature_columns]
           .describe(percentiles=[0.25, 0.5, 0.75])
           .T[['count','mean','std','min','25%','50%','75%','max']]
           .round(2))
print("\nPer-feature summary stats (overall):")
print(summary)

df.head(10)

# Line chart of feature values over time
Since we have 10 plants, 11 features, and 120 timestamps per feature, this generates many plots. But it's valuable for noticing trends between the plants such as:
- Consistent high / low readings across multiple plants for extended periods of time
- Similar high / low trends between plants across similar timeframes
- Sudden deviations in values for a single plant (jumping from the high end to the low end of values suddenly)
- Identifying whether any features fluctuate more than others

In [None]:
# TODO: I think this is a good plot to show the key distributions of feature values for each plant, but there are 10 plants and 12 features
# How can we prioritize this to showcase this data and not bloat the entire notebook?
def plot_line_chart(df_feat, feature, plant_id):
    """Show a line chart of one feature over time for a single plant.

    Args:
        df_feat: DataFrame holding "Plant_ID", "Timestamp" and the
            ``feature`` column.
        feature: Name of the column plotted on the y-axis.
        plant_id: Value of "Plant_ID" whose rows are plotted.
    """
    # Keep only the rows that belong to the requested plant
    plant_rows = df_feat.loc[df_feat["Plant_ID"] == plant_id]

    figure, axes = plt.subplots(figsize=(15, 5))
    axes.plot(
        plant_rows["Timestamp"],
        plant_rows[feature],
        marker="o",
        linewidth=1,
    )
    axes.set(
        title=f"{feature} — Plant {plant_id}",
        xlabel="Date",
        ylabel=feature,
    )
    axes.grid(True, alpha=0.3)

    # Slant the date labels so they do not run into each other
    figure.autofmt_xdate(rotation=45)
    plt.tight_layout()
    plt.show()

# Draw one line chart per (feature, plant) pair, printing the feature name
# ahead of that feature's group of charts
for feature in feature_columns:
    print(f"\n{feature}")
    for pid in plant_ids:
        plot_line_chart(df, feature, pid)

# Box Plots for each timestamp per feature
These plots show a box plot for each timestamp of each feature. The values in each box plot show the range in values of each plant at each timestamp, meaning there are 10 plant values in each box. It also shows a mean trend line across the plots, showing the average values throughout the 30-day period. This is helpful for:
- Identifying common trends at certain times between all plants (i.e., certain times where the values are higher / lower across all plants)
- The mean line is helpful to quickly identify these interesting timeframes
- Example of an interesting point: in the ambient temperature box plot, just before 2024-10-12, there is a big dip across all plants (the highest whisker is lower than at any other point)

In [None]:
# This shows off a boxplot per timestamp for each feature, with a mean trend line running through the plot
# The boxplots show off the distribution of values of each plant at each timestamp for each feature
# Cons: This doesnt show us each individual plant value
def plot_feature_timestamp_boxplot(df, feature, showfliers=False):
    """Draw one box per timestamp for ``feature`` plus a mean trend line.

    Each box summarises the ``feature`` values of all rows sharing a
    timestamp (after flooring to whole seconds). Rows whose timestamp cannot
    be parsed are left out, and missing feature values are ignored by both
    the boxes and the mean line.

    Args:
        df: DataFrame with a "Timestamp" column and the ``feature`` column.
        feature: Name of the column whose values are plotted.
        showfliers: Whether outlier points beyond the whiskers are drawn.
    """
    # Floor the timestamps to whole seconds so they line up between plants
    data = df[["Timestamp", feature]].copy()
    data["Timestamp"] = pd.to_datetime(data["Timestamp"], errors="coerce").dt.floor("s")

    # One array of values per timestamp, in chronological order. A single
    # groupby pass replaces one boolean-mask scan per timestamp and skips
    # NaT keys, which would otherwise become boxes at NaN positions.
    stamps = []
    box_data = []
    for stamp, values in data.groupby("Timestamp", sort=True)[feature]:
        stamps.append(stamp)
        # NaNs would turn the whole box into NaN statistics, so drop them here
        box_data.append(values.dropna().to_numpy())
    ts = pd.DatetimeIndex(stamps)

    # Convert timestamps to Matplotlib date numbers for positioning
    positions = mdates.date2num(ts)

    # Box width is half the typical gap between timestamps; with fewer than
    # two timestamps there is no gap to measure, so fall back to half a day
    if len(positions) > 1:
        width = float(np.median(np.diff(positions))) * 0.5
    else:
        width = 0.5

    fig, ax = plt.subplots(figsize=(15, 5))

    # Building the boxes
    bp = ax.boxplot(
        box_data,
        positions=positions,
        widths=width,
        manage_ticks=False,
        showfliers=showfliers,
        patch_artist=True,
    )

    # Styling each box to give a more readable view than the default values
    for box in bp["boxes"]:
        box.set(facecolor="#cfe2ff", edgecolor="#2554a3", alpha=0.7, linewidth=1.0)
    for med in bp["medians"]:
        med.set(color="#000000", linewidth=1.6)
    for whisk in bp["whiskers"]:
        whisk.set(color="#2554a3", linewidth=1.0)
    for cap in bp["caps"]:
        cap.set(color="#2554a3", linewidth=1.0)

    # Mean trend line across the boxes (NaN where a timestamp has no values)
    means = [np.mean(values) if len(values) else np.nan for values in box_data]
    ax.plot(ts, means, linewidth=2, alpha=0.9)

    # Axis formatting: treat x as dates and label every third day
    ax.xaxis_date()
    ax.xaxis.set_major_locator(mdates.DayLocator(interval=3))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.grid(True, alpha=0.3)
    ax.set_title(f"{feature} — box per timestamp (distribution across 10 plants)")
    ax.set_xlabel("Date")
    ax.set_ylabel(feature)
    plt.tight_layout()
    plt.show()
    
# Draw the per-timestamp box plot for every feature column
for f in feature_columns:
    plot_feature_timestamp_boxplot(df, f)

# Daily box plots per day
This is another idea I had for viewing the box plots more cleanly. Instead of one box per timestamp per feature, the values are grouped by day, so each box now holds roughly 10 plants * 4 readings per day = 40 values. This limits us to just 30 boxes per plot, making it simpler to look at, but harder to capture smaller trends that may happen per timestamp. I also added the mean trend line again to still capture the big-picture trends.

In [None]:
def plot_daily_boxes_all_plants(df, feature, showfliers=False, tick_every=1):
    """Draw one box per calendar day for ``feature`` plus a daily mean line.

    Every row is bucketed by the calendar day of its timestamp and each box
    summarises all ``feature`` values of that day. Rows whose timestamp
    cannot be parsed are left out, and missing feature values are ignored by
    both the boxes and the mean line.

    Args:
        df: DataFrame with a "Timestamp" column and the ``feature`` column.
        feature: Name of the column whose values are plotted.
        showfliers: Whether outlier points beyond the whiskers are drawn.
        tick_every: Interval, in days, between x-axis tick labels.
    """
    data = df[["Timestamp", feature]].copy()
    data["Timestamp"] = pd.to_datetime(data["Timestamp"], errors="coerce").dt.floor("s")

    # Bucket to days
    data["Day"] = data["Timestamp"].dt.normalize()

    # Values for each day in chronological order. A single groupby pass
    # replaces one boolean-mask scan per day and skips NaT keys.
    days = []
    box_data = []
    for day, values in data.groupby("Day", sort=True)[feature]:
        days.append(day)
        # NaNs would turn the whole box into NaN statistics, so drop them here
        box_data.append(values.dropna().to_numpy())
    day_index = pd.DatetimeIndex(days).to_numpy()

    # x positions in Matplotlib date units, which are days: boxes are always
    # half a day wide. Taking the width from the first two days instead would
    # make every box overlap its neighbours when those two are not consecutive.
    positions = mdates.date2num(day_index)
    width = 0.5

    fig_w = max(14.0, len(day_index) * 0.5)
    fig, ax = plt.subplots(figsize=(fig_w, 5))

    bp = ax.boxplot(
        box_data,
        positions=positions,
        widths=width,
        manage_ticks=False,
        showfliers=showfliers,
        patch_artist=True,
    )

    # Again styling to be more readable
    for box in bp["boxes"]:
        box.set(facecolor="#cfe2ff", edgecolor="#2554a3", alpha=0.7, linewidth=1.0)
    for med in bp["medians"]:
        med.set(color="#000", linewidth=1.6)
    for whisk in bp["whiskers"]:
        whisk.set(color="#2554a3", linewidth=1.0)
    for cap in bp["caps"]:
        cap.set(color="#2554a3", linewidth=1.0)

    # Mean line across days (NaN where a day has no values)
    means = [np.mean(values) if len(values) else np.nan for values in box_data]
    ax.plot(day_index, means, linewidth=2.0, alpha=0.95)

    ax.xaxis.set_major_locator(DayLocator(interval=tick_every))
    ax.xaxis.set_major_formatter(DateFormatter("%Y-%m-%d"))
    fig.autofmt_xdate(rotation=45)

    ax.grid(True, alpha=0.3)
    ax.set_title(f"{feature} — daily boxes across all plants + mean")
    ax.set_xlabel("Date")
    ax.set_ylabel(feature)
    plt.tight_layout()
    plt.show()

# Draw the daily box plot for every feature column, with a tick on each day
for f in feature_columns:
    plot_daily_boxes_all_plants(df, f, tick_every=1)

# Scatter plot per day
This groups all the values of each plant by day. It shows the range of values in each day, and we can notice trends where the values across all plants are more similar. I also added a mean trend line again.

This doesn't show the individual plant values. I tried doing this with different coloured dots, but they all blend together and I did not notice any real trends with this.

In [None]:
def plot_feature_day_strip_with_centered_bands(df, feature):
    """Scatter every ``feature`` value against its day, with a daily mean line.

    Points are spread horizontally around their day by a seeded random
    jitter, and every other day gets a light background band centred on it.

    Args:
        df: DataFrame with a "Timestamp" column and the ``feature`` column.
        feature: Name of the column whose values are plotted.
    """
    frame = df.copy()
    frame["Timestamp"] = pd.to_datetime(frame["Timestamp"], errors="coerce").dt.floor("s")

    # Every reading is placed at the midnight of its calendar day
    frame["Day"] = frame["Timestamp"].dt.normalize()
    unique_days = np.sort(frame["Day"].unique())
    day_centers = mdates.date2num(unique_days)

    # Geometry in date units: spacing between days, jitter range, band size
    day_spacing = 1.0
    jitter_frac = 0.25  # How wide each jitter should be relative to day spacing
    max_offset = day_spacing * jitter_frac
    band_half = 0.5 * day_spacing

    # Fixed seed so the jitter is the same on every run
    offsets = np.random.default_rng(7).uniform(-max_offset, max_offset, size=len(frame))
    xs = mdates.date2num(frame["Day"].to_numpy()) + offsets
    ys = frame[feature].to_numpy()

    figure, axes = plt.subplots(figsize=(max(12, len(unique_days) * 0.5), 5))

    # Shade every other day (the 1st, 3rd, ... in the data) with a band
    # centred on that day
    for center in day_centers[::2]:
        axes.axvspan(
            center - band_half,
            center + band_half,
            color="#8aa6ff",
            alpha=0.08,
            zorder=0,
        )

    # Individual readings
    axes.scatter(xs, ys, s=18, alpha=0.75, zorder=1)

    # Mean of the feature per day, drawn above the points
    daily_mean = frame.groupby("Day")[feature].mean()
    axes.plot(daily_mean.index, daily_mean.values, linewidth=2.0, alpha=0.9, zorder=2)

    # One labelled tick per day
    axes.xaxis.set_major_locator(DayLocator(interval=1))
    axes.xaxis.set_major_formatter(DateFormatter("%Y-%m-%d"))
    figure.autofmt_xdate(rotation=45)

    # Extend the x-range by half a band on each side so the outer bands are
    # not clipped
    axes.set_xlim(day_centers[0] - band_half, day_centers[-1] + band_half)

    axes.grid(True, axis="y", alpha=0.3)
    axes.set_title(f"{feature} — per-day strip with centered alternating bands")
    axes.set_xlabel("Day")
    axes.set_ylabel(feature)
    plt.tight_layout()
    plt.show()

# Draw the per-day strip plot for every feature column
for f in feature_columns:
    plot_feature_day_strip_with_centered_bands(df, f)

# Stacked histogram per feature
I figured it may be helpful to look strictly at the distribution of values and ignore specific timestamps. We can do different variations of these; I just showed a stacked histogram here to specifically look at the distributions of values across all plants.

In [None]:
def plot_feature_histogram(df, feature):
    """Show a stacked histogram of ``feature``, coloured by plant.

    Values that cannot be converted to numbers are dropped before plotting.

    Args:
        df: DataFrame with a "Plant_ID" column and the ``feature`` column.
        feature: Name of the column whose distribution is plotted.
    """
    # Work on a copy with the feature coerced to numeric and only the rows
    # that have a usable value
    plot_frame = df.copy()
    plot_frame[feature] = pd.to_numeric(plot_frame[feature], errors="coerce")
    plot_frame = plot_frame[plot_frame[feature].notna()]

    fig, ax = plt.subplots(figsize=(10, 5))
    histogram_options = dict(
        x=feature,
        hue="Plant_ID",
        multiple="stack",
        bins=20,
        kde=True,
        palette="tab10",
        alpha=0.7,
    )
    sns.histplot(data=plot_frame, ax=ax, **histogram_options)

    ax.set_title(f"{feature} — Distribution Across All Plants")
    ax.set_xlabel(feature)
    ax.set_ylabel("Count")
    plt.tight_layout()
    plt.show()

# Print each feature name, then draw its stacked histogram
for f in feature_columns:
    print(f"\n{f}")
    plot_feature_histogram(df, f)
