In [49]:
import polars as pl
from scipy import stats
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import matplotlib.dates as mdates

## Mann-Whitney U Test on Stress/Wellbeing Data

This notebook performs statistical analysis on stress/wellbeing measurements comparing treatment and control groups.

In [50]:
# Read the cleaned stress/wellbeing measurements; date-like columns are parsed on load
stress_csv = "../data/processed/cleaned_stress_data.csv"
df = pl.read_csv(stress_csv, try_parse_dates=True)
df

date,time,wellbeing_level,scale_1_5,datetime,emotion_score,experiment_group,measurement_order
date,time,str,i64,datetime[μs],i64,str,i64
2025-06-09,03:00:00,"""Neutral""",3,2025-06-08 18:00:00,4,"""control""",2
2025-06-08,23:59:00,"""Pleasant""",4,2025-06-08 14:59:00,6,"""control""",1
2025-06-08,05:50:00,"""Pleasant""",4,2025-06-07 20:50:00,6,"""control""",2
2025-06-07,23:44:00,"""Very Pleasant""",5,2025-06-07 14:44:00,7,"""control""",1
2025-06-07,03:03:00,"""Slightly Pleasant""",4,2025-06-06 18:03:00,5,"""control""",2
…,…,…,…,…,…,…,…
2025-05-14,14:01:00,"""Neutral""",3,2025-05-14 14:01:00,4,"""treatment""",1
2025-05-13,18:00:00,"""Slightly Pleasant""",4,2025-05-13 18:00:00,5,"""treatment""",2
2025-05-13,14:00:00,"""Unpleasant""",2,2025-05-13 14:00:00,2,"""treatment""",1
2025-05-12,18:01:00,"""Pleasant""",4,2025-05-12 18:01:00,6,"""treatment""",2


In [51]:
# Quick sanity check: dimensions, schema, and category frequencies
print("Data shape:", df.shape)
print("\nColumns:", df.columns)

# Rows per experiment group (printed in whatever order group_by yields)
print("\nExperiment groups:")
group_sizes = df.group_by("experiment_group").len()
print(group_sizes)

# Frequency tables, each ordered by the column being counted
for heading, counted_col in (
    ("\nWellbeing levels:", "wellbeing_level"),
    ("\nEmotion score range:", "emotion_score"),
):
    print(heading)
    print(df.group_by(counted_col).len().sort(counted_col))

Data shape: (50, 8)

Columns: ['date', 'time', 'wellbeing_level', 'scale_1_5', 'datetime', 'emotion_score', 'experiment_group', 'measurement_order']

Experiment groups:
shape: (2, 2)
┌──────────────────┬─────┐
│ experiment_group ┆ len │
│ ---              ┆ --- │
│ str              ┆ u32 │
╞══════════════════╪═════╡
│ treatment        ┆ 25  │
│ control          ┆ 25  │
└──────────────────┴─────┘

Wellbeing levels:
shape: (6, 2)
┌─────────────────────┬─────┐
│ wellbeing_level     ┆ len │
│ ---                 ┆ --- │
│ str                 ┆ u32 │
╞═════════════════════╪═════╡
│ Neutral             ┆ 11  │
│ Pleasant            ┆ 12  │
│ Slightly Pleasant   ┆ 17  │
│ Slightly Unpleasant ┆ 5   │
│ Unpleasant          ┆ 3   │
│ Very Pleasant       ┆ 2   │
└─────────────────────┴─────┘

Emotion score range:
shape: (6, 2)
┌───────────────┬─────┐
│ emotion_score ┆ len │
│ ---           ┆ --- │
│ i64           ┆ u32 │
╞═══════════════╪═════╡
│ 2             ┆ 3   │
│ 3             ┆ 5   │
│ 4 

In [52]:
# Split the frame into the two experiment arms
group_col = pl.col("experiment_group")
treatment_data = df.filter(group_col == "treatment")
control_data = df.filter(group_col == "control")

In [53]:
# Pull the emotion scores of each arm out as NumPy arrays for scipy
emotion_treatment, emotion_control = (
    arm["emotion_score"].to_numpy() for arm in (treatment_data, control_data)
)

In [54]:
# Perform Mann-Whitney U tests
print("=== Mann-Whitney U Test Results ===\n")

# Two-sided comparison of emotion scores between treatment and control
emotion_stat, emotion_p = mannwhitneyu(
    emotion_treatment,
    emotion_control,
    alternative='two-sided',
)

# Assemble the report once and emit it in a single print call
emotion_summary = "\n".join([
    "Emotion Score:",
    f"  Treatment median: {np.median(emotion_treatment):.2f}",
    f"  Control median: {np.median(emotion_control):.2f}",
    f"  U-statistic: {emotion_stat:.2f}",
    f"  p-value: {emotion_p:.6f}",
    f"  Significant (α=0.05): {'Yes' if emotion_p < 0.05 else 'No'}",
])
print(emotion_summary)

=== Mann-Whitney U Test Results ===

Emotion Score:
  Treatment median: 5.00
  Control median: 5.00
  U-statistic: 306.00
  p-value: 0.904186
  Significant (α=0.05): No


In [55]:
def _typst_median_cell(value):
    """Format a median with 3 significant digits; non-numeric input is passed through as text."""
    try:
        return f"{float(value):.3g}"
    except (ValueError, TypeError):
        return str(value)


def _typst_u_stat_cell(value):
    """Format a U statistic: whole numbers without decimals, otherwise one decimal place."""
    try:
        u_stat = float(value)
    except (ValueError, TypeError):
        return str(value)
    return f"{int(u_stat)}" if u_stat.is_integer() else f"{u_stat:.1f}"


def _typst_p_value_cell(value):
    """Format a p-value to 3 decimals ("< 0.001" below that); non-numeric input is passed through."""
    try:
        p_val = float(value)
    except (ValueError, TypeError):
        # Already-formatted strings such as "< 0.000001" or "N/A" end up here.
        return str(value)
    return "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"


def _typst_significance_cell(result_text):
    """Wrap the significance verdict in Typst markup: red/bold, gray, or plain."""
    lowered = result_text.lower()
    if 'significant' in lowered and 'not' not in lowered:
        return f"[#text(fill: red.darken(20%))[*{result_text}*]]"
    if 'not significant' in lowered:
        return f"[#text(fill: gray.darken(30%))[{result_text}]]"
    return f"[{result_text}]"


def create_stress_mannwhitney_typst_table(
    results_list,
    caption="Mann-Whitney U Test Results for Stress Variables by Treatment Group",
):
    """Build a Typst ``#figure(table(...))`` snippet from Mann-Whitney U results.

    Parameters
    ----------
    results_list : iterable of dict
        One dict per table row with the keys ``variable``, ``treatment_n``,
        ``control_n``, ``treatment_median``, ``control_median``,
        ``u_statistic``, ``p_value`` and ``result``. Numeric fields may be
        numbers or strings; values that cannot be parsed as floats (for
        example ``"N/A"`` or ``"< 0.000001"``) are written out verbatim.
    caption : str, optional
        Text placed in the figure caption. Defaults to the original
        hard-coded caption.

    Returns
    -------
    str
        Typst source for the complete figure.
    """
    # Static preamble: column layout, styling callbacks and the header row.
    header = """#figure(
  table(
    columns: (1.3fr, 0.7fr, 0.7fr, 1fr, 1fr, 0.9fr, 0.8fr, 1.1fr),
    align: (col, row) => {
      if row == 0 { center + horizon }
      else if col == 0 { left + horizon }
      else if col == 1 or col == 2 { center + horizon }
      else if col == 3 or col == 4 or col == 5 { right + horizon }
      else if col == 6 { right + horizon }
      else { center + horizon }
    },
    inset: (x: 6pt, y: 8pt),
    stroke: (x, y) => {
      if y == 0 {
        (bottom: 2pt + black, rest: 0.75pt + gray.lighten(30%))
      }
      else {
        (bottom: 0.4pt + gray.lighten(70%), rest: 0.75pt + gray.lighten(30%))
      }
    },
    fill: (col, row) => {
      if row == 0 { blue.lighten(90%) }
      else if calc.odd(row) { gray.lighten(97%) }
      else { white }
    },
    table.header(
      [*Variable*],
      [*Treat. N*],
      [*Ctrl. N*],
      [*Treatment Median*],
      [*Control Median*],
      [*U-stat*],
      [*p-value*],
      [*Significance*]
    ),"""

    parts = [header]
    for result in results_list:
        # The p-value is emitted as plain content. A leading "#" would make
        # Typst evaluate the cell as code, which fails for "< 0.001" / "N/A"
        # and would re-render numbers without their fixed 3-decimal format.
        parts.append(f"""
    [*{result['variable']}*],
    [{result['treatment_n']}],
    [{result['control_n']}],
    [{_typst_median_cell(result['treatment_median'])}],
    [{_typst_median_cell(result['control_median'])}],
    [{_typst_u_stat_cell(result['u_statistic'])}],
    [{_typst_p_value_cell(result['p_value'])}],
    {_typst_significance_cell(result['result'])},""")

    parts.append(f"""
  ),
  caption: [{caption}],
  kind: table
)""")

    return "".join(parts)

def stress_mannwhitney_test_result(treatment_data, control_data, variable_name, alpha=0.05):
    """Run a two-sided Mann-Whitney U test and package the result as a table row.

    Parameters
    ----------
    treatment_data, control_data : array-like of numbers
        Observations for each group. Any array-like is accepted (NumPy array,
        list, ...); values are converted to float and NaN entries are dropped
        before testing.
    variable_name : str
        Column-style name such as ``"emotion_score"``; underscores become
        spaces and the result is title-cased for display.
    alpha : float, optional
        Significance threshold compared against the p-value. Defaults to 0.05.

    Returns
    -------
    dict
        Keys ``variable``, ``treatment_n``, ``control_n``,
        ``treatment_median``, ``control_median``, ``u_statistic``,
        ``p_value`` and ``result``. Statistics are pre-formatted strings;
        when either group is empty after NaN removal they are ``'N/A'`` and
        ``result`` is ``'Insufficient data'``.
    """
    # Coerce to float arrays so lists (and None entries, which become NaN)
    # can be masked the same way as NumPy input.
    treatment_values = np.asarray(treatment_data, dtype=float)
    control_values = np.asarray(control_data, dtype=float)

    # Remove any NaN values
    treatment_clean = treatment_values[~np.isnan(treatment_values)]
    control_clean = control_values[~np.isnan(control_values)]

    row = {
        'variable': variable_name.replace('_', ' ').title(),
        'treatment_n': len(treatment_clean),
        'control_n': len(control_clean),
    }

    if len(treatment_clean) == 0 or len(control_clean) == 0:
        # The test is undefined without observations in both groups.
        row.update({
            'treatment_median': 'N/A',
            'control_median': 'N/A',
            'u_statistic': 'N/A',
            'p_value': 'N/A',
            'result': 'Insufficient data',
        })
        return row

    u_stat, p_value = mannwhitneyu(treatment_clean, control_clean, alternative='two-sided')

    row.update({
        'treatment_median': f"{np.median(treatment_clean):.2f}",
        'control_median': f"{np.median(control_clean):.2f}",
        'u_statistic': f"{u_stat:.2f}",
        # Avoid printing 0.000000 for very small p-values.
        'p_value': f"{p_value:.6f}" if p_value >= 0.000001 else "< 0.000001",
        'result': 'Significant' if p_value < alpha else 'Not significant',
    })
    return row

In [56]:
# Run the test for each analysed variable (currently only the emotion score),
# then render the collected rows as a Typst table and print the markup
emotion_row = stress_mannwhitney_test_result(
    treatment_data=emotion_treatment,
    control_data=emotion_control,
    variable_name="emotion_score",
)
stress_results = [emotion_row]

stress_typst_table = create_stress_mannwhitney_typst_table(results_list=stress_results)
print(stress_typst_table)

#figure(
  table(
    columns: (1.3fr, 0.7fr, 0.7fr, 1fr, 1fr, 0.9fr, 0.8fr, 1.1fr),
    align: (col, row) => {
      if row == 0 { center + horizon }
      else if col == 0 { left + horizon }
      else if col == 1 or col == 2 { center + horizon }
      else if col == 3 or col == 4 or col == 5 { right + horizon }
      else if col == 6 { right + horizon }
      else { center + horizon }
    },
    inset: (x: 6pt, y: 8pt),
    stroke: (x, y) => {
      if y == 0 {
        (bottom: 2pt + black, rest: 0.75pt + gray.lighten(30%))
      }
      else {
        (bottom: 0.4pt + gray.lighten(70%), rest: 0.75pt + gray.lighten(30%))
      }
    },
    fill: (col, row) => {
      if row == 0 { blue.lighten(90%) }
      else if calc.odd(row) { gray.lighten(97%) }
      else { white }
    },
    table.header(
      [*Variable*],
      [*Treat. N*],
      [*Ctrl. N*],
      [*Treatment Median*],
      [*Control Median*],
      [*U-stat*],
      [*p-value*],
      [*Significance*]
    ),
    [*Emoti

In [57]:
# Create descriptive statistics typst table
def create_descriptive_stats_typst(df):
    """Create a Typst ``#table(...)`` with descriptive statistics per experiment group.

    Parameters
    ----------
    df : polars.DataFrame
        Must contain the columns ``experiment_group``, ``emotion_score`` and
        ``scale_1_5``.

    Returns
    -------
    str
        Typst source with one row per experiment group (sorted by group name)
        followed by a bold ``Total`` row computed over the whole frame. Each
        row holds N, then mean / median / standard deviation of
        ``emotion_score`` and ``scale_1_5``. The last column is left empty for
        group rows and repeats the overall row count in the total row.
    """

    def fmt(value, spec):
        # A statistic can come back as None (e.g. on an empty frame; presumably
        # also for the std of a single-row group — verify against the installed
        # polars version). Formatting None with a float spec raises TypeError,
        # so show a placeholder instead.
        return "N/A" if value is None else format(value, spec)

    # Calculate descriptive stats by group
    desc_stats = df.group_by("experiment_group").agg([
        pl.col("emotion_score").count().alias("n"),
        pl.col("emotion_score").mean().alias("mean_emo"),
        pl.col("emotion_score").median().alias("med_emo"),
        pl.col("emotion_score").std().alias("sd_emo"),
        pl.col("scale_1_5").mean().alias("mean_scale"),
        pl.col("scale_1_5").median().alias("med_scale"),
        pl.col("scale_1_5").std().alias("sd_scale")
    ]).sort("experiment_group")

    typst_table = """#table(
  columns: (auto, auto, auto, auto, auto, auto, auto, auto, auto),
  inset: 6pt,
  align: center + horizon,
  stroke: 0.5pt,
  table.header(
    [*Group*], [*N*], [*Emo Mean*], [*Emo Med*], [*Emo SD*], [*Scale Mean*], [*Scale Med*], [*Scale SD*], [*Total*]
  ),"""

    total_n = df.height

    for row in desc_stats.iter_rows(named=True):
        group = row['experiment_group'].title()
        n = row['n']
        mean_emo = fmt(row['mean_emo'], ".2f")
        med_emo = fmt(row['med_emo'], ".1f")
        sd_emo = fmt(row['sd_emo'], ".2f")
        mean_scale = fmt(row['mean_scale'], ".2f")
        med_scale = fmt(row['med_scale'], ".1f")
        sd_scale = fmt(row['sd_scale'], ".2f")

        typst_table += f"\n  [{group}], [{n}], [{mean_emo}], [{med_emo}], [{sd_emo}], [{mean_scale}], [{med_scale}], [{sd_scale}], [],"

    # Add total row
    total_mean_emo = fmt(df["emotion_score"].mean(), ".2f")
    total_med_emo = fmt(df["emotion_score"].median(), ".1f")
    total_sd_emo = fmt(df["emotion_score"].std(), ".2f")
    total_mean_scale = fmt(df["scale_1_5"].mean(), ".2f")
    total_med_scale = fmt(df["scale_1_5"].median(), ".1f")
    total_sd_scale = fmt(df["scale_1_5"].std(), ".2f")

    typst_table += f"\n  [*Total*], [*{total_n}*], [*{total_mean_emo}*], [*{total_med_emo}*], [*{total_sd_emo}*], [*{total_mean_scale}*], [*{total_med_scale}*], [*{total_sd_scale}*], [*{total_n}*],"

    typst_table += "\n)"
    return typst_table

# Build the descriptive-statistics Typst markup for the full dataset and print it
desc_table = create_descriptive_stats_typst(df=df)
print(desc_table)

#table(
  columns: (auto, auto, auto, auto, auto, auto, auto, auto, auto),
  inset: 6pt,
  align: center + horizon,
  stroke: 0.5pt,
  table.header(
    [*Group*], [*N*], [*Emo Mean*], [*Emo Med*], [*Emo SD*], [*Scale Mean*], [*Scale Med*], [*Scale SD*], [*Total*]
  ),
  [Control], [25], [4.80], [5.0], [1.22], [3.48], [4.0], [0.87], [],
  [Treatment], [25], [4.64], [5.0], [1.25], [3.52], [4.0], [0.77], [],
  [*Total*], [*50*], [*4.72*], [*5.0*], [*1.23*], [*3.50*], [*4.0*], [*0.81*], [*50*],
)
