In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import sys
import subprocess
import warnings

# Install necessary libraries specifically for Colab
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "tsfresh", "prophet", "plotly", "scipy", "kaleido"])

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from prophet import Prophet
from scipy.stats import kurtosis, skew
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute

In [2]:
df = pd.read_csv("/content/cleaned_health_dataset.csv")


In [3]:
df

Unnamed: 0,User_ID,Full Name,Date,Age,Gender,Height (cm),Weight (kg),Steps_Taken,Calories_Burned,Hours_Slept,Water_Intake (Liters),Active_Minutes,Heart_Rate (bpm),Workout_Type,Stress_Level (1-10),Mood
0,1,Sara Martinez,2023-01-01,19,Female,191,80,11405,1862,7.021201,3.440508,80.0,84.0,Yoga,2,Happy
1,1,Sara Martinez,2023-01-02,19,Female,191,80,7010,1806,6.500790,3.103590,47.0,72.0,Cardio,9,Neutral
2,1,Sara Martinez,2023-01-03,19,Female,191,80,18942,2274,8.167817,3.814843,46.0,85.0,Yoga,6,Sad
3,1,Sara Martinez,2023-01-04,19,Female,191,80,13778,2656,7.453552,3.508636,105.0,77.0,Yoga,2,Sad
4,1,Sara Martinez,2023-01-05,19,Female,191,80,10247,3571,5.132654,2.739198,53.0,61.0,Yoga,8,Sad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36495,100,Sara Jones,2023-12-27,54,Male,151,107,10905,3922,6.500790,2.781033,61.0,96.0,Yoga,3,Sad
36496,100,Sara Jones,2023-12-28,54,Male,151,107,2245,3151,7.652822,2.739198,93.0,74.0,Yoga,10,Happy
36497,100,Sara Jones,2023-12-29,54,Male,151,107,11892,2906,7.173268,3.237390,43.0,67.0,Cardio,9,Happy
36498,100,Sara Jones,2023-12-30,54,Male,151,107,12967,1863,4.011179,3.650197,85.0,61.0,Yoga,4,Happy


In [4]:
# Standardize column names for easier coding
column_mapping = {
    'User_ID': 'person_id',
    'Date': 'timestamp',
    'Heart_Rate (bpm)': 'heart_rate',
    'Steps_Taken': 'daily_steps',
    'Hours_Slept': 'sleep_duration',
    'Active_Minutes': 'active_minutes',
    'Stress_Level (1-10)': 'stress_level',
    'Gender': 'gender',
    'Mood': 'mood',
    'Workout_Type': 'workout_type'
}
df = df.rename(columns=column_mapping)
# Clean any remaining formatting issues
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Convert Date
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(['person_id', 'timestamp']).reset_index(drop=True)

# Encode Categoricals (for Clustering)
le = LabelEncoder()
df_encoded = df.copy()
for col in ['gender', 'mood', 'workout_type']:
    if col in df_encoded.columns:
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))

print(f"    Data Ready: {len(df)} rows, {df['person_id'].nunique()} users.")

    Data Ready: 36500 rows, 100 users.


In [5]:
print(" Running Milestone 2 Clustering (Behavioral Baselines)...")

# Aggregate user stats
user_stats = df_encoded.groupby('person_id').agg({
    'heart_rate': ['mean', 'max'],
    'daily_steps': ['mean'],
    'sleep_duration': ['mean'],
    'active_minutes': ['mean']
})
user_stats.columns = ['_'.join(col) for col in user_stats.columns]
user_stats = user_stats.reset_index()

# K-Means Clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(user_stats.drop(columns=['person_id']))
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
user_stats['cluster'] = kmeans.fit_predict(X_scaled)

# Create a mapping dictionary: {User_ID: Cluster_ID}
cluster_map = dict(zip(user_stats['person_id'], user_stats['cluster']))

print("    Clustering Complete. User Personas identified.")

 Running Milestone 2 Clustering (Behavioral Baselines)...
    Clustering Complete. User Personas identified.


In [12]:
print("Initializing Anomaly Detection (Prophet + Domain Rules)...")

def analyze_user_health(target_user):
    """
    Runs the full detection pipeline for a single user.
    Returns: Dataframe with 'is_anomaly' flags and reasons.
    """
    # 1. Isolate User Data
    user_df = df[df['person_id'] == target_user].copy()
    user_cluster = cluster_map.get(target_user, -1)

    # 2. Prophet Modeling (Time-Series Filter)
    # We model Heart Rate trends
    prophet_input = user_df[['timestamp', 'heart_rate']].rename(columns={'timestamp': 'ds', 'heart_rate': 'y'})

    # Interval width 0.99 means we assume 99% of data is normal, flagging top 1% as outliers
    model = Prophet(interval_width=0.99, daily_seasonality=True)
    model.fit(prophet_input)
    forecast = model.predict(prophet_input)

    # Merge forecast back to original data
    results = pd.merge(user_df, forecast[['ds', 'yhat_lower', 'yhat_upper']], left_on='timestamp', right_on='ds')

    # 3. Detection Logic (The "Triple Filter")
    def label_anomaly(row):
        reasons = []
        is_anomaly = False

        # --- Filter A: Statistical (Prophet) ---
        if row['heart_rate'] > row['yhat_upper']:
            is_anomaly = True
            reasons.append("Trend Spike (Statistical)")
        elif row['heart_rate'] < row['yhat_lower']:
            is_anomaly = True
            reasons.append("Trend Drop (Statistical)")

        # --- Filter B: Medical (Domain Limits) ---
        # Rule 1: High HR (>100) while sedentary (Active Minutes < 15) -> Potential Tachycardia
        if row['heart_rate'] > 100:
            is_anomaly = True
            reasons.append("High HR at Rest (Medical)")

        # Rule 2: Critical Sleep (< 4 hours)
        if row['sleep_duration'] < 5:
            # We don't flag the whole row as an anomaly just for sleep,
            # but we note it. If combined with HR spike, it's critical.
            reasons.append("Critical Sleep Deprivation")
            if is_anomaly: # If HR is already weird, this makes it worse
                 reasons.append("Compound Risk")

        # --- Filter C: Behavioral (Cluster Context) ---
        # (Optional refinement: Check if user deviates from their cluster norm)

        if is_anomaly or len(reasons) > 0:
            return pd.Series([1, " + ".join(reasons)])
        else:
            return pd.Series([0, "Normal"])

    results[['is_anomaly', 'anomaly_reason']] = results.apply(label_anomaly, axis=1)
    return results


Initializing Anomaly Detection (Prophet + Domain Rules)...


In [13]:
# Select a user to visualize (User 1 is usually a good candidate)
target_id = df['person_id'].unique()[0]
print(f"    Analyzing User ID: {target_id}...")
analyzed_df = analyze_user_health(target_id)

anomaly_count = analyzed_df['is_anomaly'].sum()
print(f"    Analysis Complete. Found {anomaly_count} anomalies for User {target_id}.")

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.


    Analyzing User ID: 1...
    Analysis Complete. Found 64 anomalies for User 1.


In [14]:
print("Generating Visualization 1: Heart Rate Anomalies...")

# --- CHART 1: Interactive Heart Rate Anomaly Detection ---
fig_hr = go.Figure()

# 1. Confidence Band (The "Safe Zone")
fig_hr.add_trace(go.Scatter(
    x=analyzed_df['timestamp'], y=analyzed_df['yhat_upper'],
    mode='lines', line=dict(width=0), showlegend=False, hoverinfo='skip'
))
fig_hr.add_trace(go.Scatter(
    x=analyzed_df['timestamp'], y=analyzed_df['yhat_lower'],
    mode='lines', line=dict(width=0),
    fill='tonexty', fillcolor='rgba(0, 100, 255, 0.1)',
    name='Expected Range (99%)'
))

# 2. Actual Heart Rate Line
fig_hr.add_trace(go.Scatter(
    x=analyzed_df['timestamp'], y=analyzed_df['heart_rate'],
    mode='lines', name='Actual Heart Rate',
    line=dict(color='royalblue', width=1.5)
))

# 3. Anomaly Markers (Red X)
anomalies = analyzed_df[analyzed_df['is_anomaly'] == 1]
fig_hr.add_trace(go.Scatter(
    x=anomalies['timestamp'], y=anomalies['heart_rate'],
    mode='markers', name='Anomaly Detected',
    marker=dict(color='red', size=10, symbol='x', line=dict(width=2)),
    # Tooltip annotations
    text=anomalies['anomaly_reason'],
    hovertemplate="<b>Date:</b> %{x}<br><b>HR:</b> %{y} bpm<br><b>Reason:</b> %{text}<extra></extra>"
))

fig_hr.update_layout(
    title=f"<b>Heart Rate Anomaly Detection</b> | User {target_id}",
    xaxis_title="Date",
    yaxis_title="Heart Rate (BPM)",
    template="plotly_white",
    hovermode="x unified",
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
)
fig_hr.show()




Generating Visualization 1: Heart Rate Anomalies...


In [15]:
print("Generating Visualization 2: Sleep Patterns...")

# --- CHART 2: Sleep Pattern Visualization ---
# Color Logic for Sleep Bars
def get_sleep_color(row):
    # If explicitly flagged as anomaly in our logic
    if "Sleep" in str(row['anomaly_reason']):
        return "Critical (<4h)", "red"
    elif row['sleep_duration'] < 6:
        return "Low (<6h)", "orange"
    return "Normal", "green"

analyzed_df[['sleep_status', 'bar_color']] = analyzed_df.apply(lambda x: pd.Series(get_sleep_color(x)), axis=1)

fig_sleep = px.bar(
    analyzed_df,
    x='timestamp',
    y='sleep_duration',
    color='sleep_status',
    color_discrete_map={
        "Normal": "#2ca02c",      # Green
        "Low (<6h)": "#ff7f0e",   # Orange
        "Critical (<4h)": "#d62728" # Red
    },
    title=f"<b>Sleep Quality & Anomalies</b> | User {target_id}"
)

# Add Reference Line for 8 Hours
fig_sleep.add_hline(y=8, line_dash="dot", annotation_text="Ideal (8h)", annotation_position="top right")

fig_sleep.update_layout(
    xaxis_title="Date",
    yaxis_title="Sleep Duration (Hours)",
    template="plotly_white",
    legend_title="Sleep Status"
)
fig_sleep.show()

print("\n>>> TASKS COMPLETED. Please screenshot the two charts above.")

Generating Visualization 2: Sleep Patterns...



>>> TASKS COMPLETED. Please screenshot the two charts above.


In [16]:
# =============================================================================
# EXTRA TASK: ANOMALY ROOT CAUSE ANALYSIS
# =============================================================================
print("Generating High Heart Rate Activity Report...")

# 1. Filter the analyzed dataframe for anomalies
# We specifically look for "High" or "Spike" in the reason text
high_hr_anomalies = analyzed_df[
    (analyzed_df['is_anomaly'] == 1) &
    (analyzed_df['anomaly_reason'].str.contains("High|Spike|Critical", case=False))
].copy()

# 2. Select relevant columns for the report
report = high_hr_anomalies[['timestamp', 'heart_rate', 'workout_type', 'daily_steps', 'anomaly_reason']]

# 3. Rename for cleaner output
report.columns = ['Date', 'Heart Rate (BPM)', 'Activity Type', 'Daily Steps', 'Alert Reason']

# 4. Sort by highest Heart Rate to see the most critical events first
report = report.sort_values(by='Heart Rate (BPM)', ascending=False)

# Display the data table
print(f"\nFound {len(report)} High Heart Rate Anomalies for User {target_id}:")
display(report.head(10)) # Shows top 10 rows in Colab

# --- VISUALIZATION: Which Activity Causes the Most Anomalies? ---
if not report.empty:
    import plotly.express as px

    # Count anomalies per activity type
    activity_counts = report['Activity Type'].value_counts().reset_index()
    activity_counts.columns = ['Activity', 'Count']

    fig_root_cause = px.bar(
        activity_counts,
        x='Activity',
        y='Count',
        color='Count',
        color_continuous_scale='Reds',
        title=f"<b>Activities Linked to Heart Rate Spikes</b> | User {target_id}",
        text='Count'
    )
    fig_root_cause.update_layout(template="plotly_white")
    fig_root_cause.show()
else:
    print("Good news! No High Heart Rate anomalies detected for this user.")

Generating High Heart Rate Activity Report...

Found 64 High Heart Rate Anomalies for User 1:


Unnamed: 0,Date,Heart Rate (BPM),Activity Type,Daily Steps,Alert Reason
148,2023-05-29,99.0,Yoga,14207,Critical Sleep Deprivation
244,2023-09-02,99.0,Yoga,7702,Critical Sleep Deprivation
174,2023-06-24,98.0,Strength,10179,Critical Sleep Deprivation
308,2023-11-05,98.0,Cardio,10497,Critical Sleep Deprivation
352,2023-12-19,97.0,Yoga,4597,Critical Sleep Deprivation
245,2023-09-03,96.0,Yoga,6454,Critical Sleep Deprivation
312,2023-11-09,95.0,Yoga,12304,Critical Sleep Deprivation
20,2023-01-21,92.0,Yoga,14052,Critical Sleep Deprivation
133,2023-05-14,92.0,Yoga,5496,Critical Sleep Deprivation
155,2023-06-05,92.0,Yoga,5587,Critical Sleep Deprivation
