In [69]:
import polars as pl
import datetime as dt

In [70]:
# Read the transcribed readings (columns: date, time, wellbeing_level, scale_1_5)
# from a path relative to the working directory, then display the frame.
raw_csv_path = "../data/raw/stress_transcribed.csv"
df = pl.read_csv(raw_csv_path)
df

date,time,wellbeing_level,scale_1_5
str,str,str,i64
"""2025-06-09""","""03:00""","""Neutral""",3
"""2025-06-08""","""23:59""","""Pleasant""",4
"""2025-06-08""","""05:50""","""Pleasant""",4
"""2025-06-07""","""23:44""","""Very Pleasant""",5
"""2025-06-07""","""03:03""","""Slightly Pleasant""",4
…,…,…,…
"""2025-05-14""","""14:01""","""Neutral""",3
"""2025-05-13""","""18:00""","""Slightly Pleasant""",4
"""2025-05-13""","""14:00""","""Unpleasant""",2
"""2025-05-12""","""18:01""","""Pleasant""",4


In [71]:
# Week numbers of each experiment arm; the cleaning cell matches them against
# `dt.week()` of every reading.
TREATMENT_WEEKS = [20, 22]
CONTROL_WEEKS = [21, 23]

# Experiment window as Polars datetime expressions in UTC.
# NOTE(review): neither bound is referenced by the cells visible here — confirm
# whether they are still needed.
experiment_start_date = pl.datetime(2025, 5, 11, 16, time_zone="UTC")
experiment_end_date = pl.datetime(2025, 6, 8, 16, time_zone="UTC")

# Earlier plain-string bounds, kept for reference only (never executed):
#   experiment:      "2025-05-12" .. "2025-06-06"  (maybe add monday 7th)
#   additional week: "2025-06-30" .. "2025-07-06"  (maybe add monday 7th)

# Wellbeing label -> score, listed from the highest score (7) to the lowest (1).
emotion_map = dict(
    zip(
        (
            "Very Pleasant",
            "Pleasant",
            "Slightly Pleasant",
            "Neutral",
            "Slightly Unpleasant",
            "Unpleasant",
            "Very Unpleasant",
        ),
        range(7, 0, -1),
    )
)

In [72]:
# Readings whose raw timestamp is strictly after SHIFT_CUTOFF are moved back by
# READING_SHIFT before any week / time-of-day logic is applied.
# NOTE(review): presumably this compensates for a time-zone change at the start
# of June — confirm.
SHIFT_CUTOFF = pl.datetime(2025, 5, 31, 23, 59)
READING_SHIFT = pl.duration(hours=9)
# Readings taken before this time of day count as measurement 1, the rest as 2.
SECOND_MEASUREMENT_START = pl.time(17, 30)

df_clean = (
    df.with_columns(
        # Merge the separate date and time strings into one naive timestamp.
        pl.concat_str([pl.col("date"), pl.col("time")], separator=" ")
        .str.to_datetime(format="%Y-%m-%d %H:%M")
        .alias("datetime"),
        # Map each label to its score. `replace_strict` is the dtype-changing
        # mapping API and raises if a non-null label is missing from
        # `emotion_map`, so a typo in the transcription cannot slip through.
        pl.col("wellbeing_level")
        .replace_strict(emotion_map, return_dtype=pl.Int64)
        .alias("emotion_score"),
    )
    .with_columns(
        # Overwrite `datetime` in place; the alias makes that explicit instead
        # of relying on the output name inferred from the `then` branch.
        pl.when(pl.col("datetime") > SHIFT_CUTOFF)
        .then(pl.col("datetime") - READING_SHIFT)
        .otherwise(pl.col("datetime"))
        .alias("datetime")
    )
    .with_columns(
        # Assign the experiment arm from the week number of the (shifted)
        # timestamp; anything outside the listed weeks is flagged explicitly.
        pl.when(pl.col("datetime").dt.week().is_in(TREATMENT_WEEKS))
        .then(pl.lit("treatment"))
        .when(pl.col("datetime").dt.week().is_in(CONTROL_WEEKS))
        .then(pl.lit("control"))
        .otherwise(pl.lit("outside_experiment"))
        .alias("experiment_group"),
        # 1 = reading taken before SECOND_MEASUREMENT_START, 2 = at or after it.
        pl.when(pl.col("datetime").dt.time() < SECOND_MEASUREMENT_START)
        .then(pl.lit(1))
        .otherwise(pl.lit(2))
        .alias("measurement_order"),
    )
)

df_clean

date,time,wellbeing_level,scale_1_5,datetime,emotion_score,experiment_group,measurement_order
str,str,str,i64,datetime[μs],i64,str,i32
"""2025-06-09""","""03:00""","""Neutral""",3,2025-06-08 18:00:00,4,"""control""",2
"""2025-06-08""","""23:59""","""Pleasant""",4,2025-06-08 14:59:00,6,"""control""",1
"""2025-06-08""","""05:50""","""Pleasant""",4,2025-06-07 20:50:00,6,"""control""",2
"""2025-06-07""","""23:44""","""Very Pleasant""",5,2025-06-07 14:44:00,7,"""control""",1
"""2025-06-07""","""03:03""","""Slightly Pleasant""",4,2025-06-06 18:03:00,5,"""control""",2
…,…,…,…,…,…,…,…
"""2025-05-14""","""14:01""","""Neutral""",3,2025-05-14 14:01:00,4,"""treatment""",1
"""2025-05-13""","""18:00""","""Slightly Pleasant""",4,2025-05-13 18:00:00,5,"""treatment""",2
"""2025-05-13""","""14:00""","""Unpleasant""",2,2025-05-13 14:00:00,2,"""treatment""",1
"""2025-05-12""","""18:01""","""Pleasant""",4,2025-05-12 18:01:00,6,"""treatment""",2


In [73]:
# Write the cleaned frame to the processed-data folder as CSV.
processed_csv_path = "../data/processed/cleaned_stress_data.csv"
df_clean.write_csv(processed_csv_path)