In [3]:
import pandas as pd
import numpy as np
from math import log
from collections import Counter


# ---------------------------------------------------------------------------
# Input file
# ---------------------------------------------------------------------------
FILE_PATH = "ATUS_2010_Cluster_joined_main_dataset.csv"

# ---------------------------------------------------------------------------
# Column headers, spelled exactly as they appear in the CSV
# (matching is case- and space-sensitive).
# ---------------------------------------------------------------------------
CASE_ID_COL = "Case ID"
GENDER_COL = "Gender"
ACTIVITY_COL = "Activity"
START_COL = "Start Timestamp"
END_COL = "End Timestamp"

# Prefix of the per-slot columns in the wide (one column per slot) layout.
SLOT_PREFIX = "slot_"

# ---------------------------------------------------------------------------
# Diary grid: the 24-hour diary window opens at DIARY_START_HOUR and is cut
# into bins of BIN_MINUTES minutes each.
# ---------------------------------------------------------------------------
BIN_MINUTES = 10
DIARY_START_HOUR = 4
N_SLOTS = int(24 * 60 // BIN_MINUTES)  # 144 bins per day at 10 minutes each


In [4]:
# === Cell 2: Load & detect schema (Updated) ===
df = pd.read_csv(FILE_PATH)

# Clean headers & trim whitespace in key columns.
df.columns = df.columns.str.strip()
for c in [GENDER_COL, ACTIVITY_COL]:
    if c in df.columns:
        # Strip only the cells that actually hold a value. A blanket
        # astype(str) would turn every missing cell into the literal text
        # "nan"; that text passes pd.notna(), so a missing activity would be
        # counted as a genuine label instead of being treated as a gap.
        df[c] = df[c].where(df[c].isna(), df[c].astype(str).str.strip())
# Normalize gender labels
def _norm_gender(x):
    x = str(x).strip().lower()
    if x in ["m", "male", "man"]:
        return "Male"
    if x in ["f", "female", "woman", "women"]:
        return "Female"
    return str(x) if x else "Unknown"

# Case ID and Gender are needed by both layouts, so check for them up front
# and fail with the same descriptive message used for the other required
# columns rather than with a bare KeyError.
for col in [CASE_ID_COL, GENDER_COL]:
    if col not in df.columns:
        raise ValueError(f"Missing required column: {col}")

df[GENDER_COL] = df[GENDER_COL].apply(_norm_gender)

# Detect wide vs long. Only "<prefix><integer>" names count as slot columns,
# so an unrelated column that merely shares the prefix can neither inflate the
# count nor break the integer sort below.
slot_cols = [
    c for c in df.columns
    if isinstance(c, str)
    and c.startswith(SLOT_PREFIX)
    and c[len(SLOT_PREFIX):].isdecimal()
]
is_wide = len(slot_cols) >= N_SLOTS

print("Detected format:", "WIDE (144 slots)" if is_wide else "LONG (events with times)")

if is_wide:
    # Order slots numerically (slot_2 before slot_10), not lexicographically.
    slot_cols = sorted(slot_cols, key=lambda x: int(x[len(SLOT_PREFIX):]))
else:
    for col in [CASE_ID_COL, ACTIVITY_COL, START_COL, END_COL]:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")
    # NOTE(review): no explicit format= is passed, so parsing relies on pandas'
    # inference; if the timestamp layout is fixed, pass it explicitly.
    df[START_COL] = pd.to_datetime(df[START_COL], errors="coerce")
    df[END_COL]   = pd.to_datetime(df[END_COL], errors="coerce")
    # errors="coerce" turns unparseable values into NaT; refuse to continue
    # with any of them present.
    if df[[START_COL, END_COL]].isna().any().any():
        raise ValueError("Found NaT in start/end times. Please fix/clean time columns.")


Detected format: LONG (events with times)


  df[START_COL] = pd.to_datetime(df[START_COL], errors="coerce")
  df[END_COL]   = pd.to_datetime(df[END_COL], errors="coerce")


In [5]:
# === Cell 3: Helpers ===

def longest_run_length(labels):
    """Return the length of the longest streak of identical consecutive labels.

    Missing entries (anything ``pd.isna`` reports as missing) end the current
    streak and are never counted themselves. An empty or all-missing input
    yields 0.
    """
    best = 0
    streak_label, streak_len = None, 0
    for item in labels:
        if pd.isna(item):
            # A gap breaks the streak; start over after it.
            streak_label, streak_len = None, 0
            continue
        if item == streak_label:
            streak_len += 1
        else:
            streak_label, streak_len = item, 1
        if streak_len > best:
            best = streak_len
    return best

def shannon_entropy(labels):
    """Shannon entropy (natural log) of the non-missing labels.

    Parameters
    ----------
    labels : iterable
        Label sequence; entries for which ``pd.notna`` is false are ignored.

    Returns
    -------
    tuple[float, float]
        ``(H, H_norm)`` where ``H = -sum(p * log(p))`` over the observed
        label frequencies and ``H_norm = H / log(k)`` with ``k`` the number
        of distinct observed labels. Both are ``0.0`` when there are no
        labels or only one distinct label.
    """
    observed = [x for x in labels if pd.notna(x)]
    if not observed:
        return 0.0, 0.0

    counts = Counter(observed)
    k = len(counts)
    if k == 1:
        # With a single label p == 1, and -(1 * log(1)) evaluates to -0.0,
        # which would be written out as "-0.0"; return a clean zero instead.
        return 0.0, 0.0

    total = len(observed)
    H = -sum((v / total) * log(v / total) for v in counts.values())
    return H, H / log(k)

def compute_complexity(sequence_activities, bin_minutes=10):
    """Summarise one slot-by-slot activity sequence as a dict of metrics.

    Keys of the returned dict:
      - "Transitions": adjacent slot pairs, both non-missing, whose labels differ
      - "Unique": number of distinct non-missing labels
      - "Max run (min)": longest run of one label, in slots, times ``bin_minutes``
      - "Entropy" / "Entropy (norm)": the pair returned by ``shannon_entropy``
    """
    slots = list(sequence_activities)

    # Walk neighbouring pairs; a pair touching a missing slot never counts.
    n_transitions = 0
    for prev, curr in zip(slots, slots[1:]):
        if pd.notna(prev) and pd.notna(curr) and curr != prev:
            n_transitions += 1

    distinct_labels = set()
    for label in slots:
        if pd.notna(label):
            distinct_labels.add(label)

    longest_minutes = longest_run_length(slots) * bin_minutes
    entropy, entropy_norm = shannon_entropy(slots)

    result = {}
    result["Transitions"] = n_transitions
    result["Unique"] = len(distinct_labels)
    result["Max run (min)"] = longest_minutes
    result["Entropy"] = entropy
    result["Entropy (norm)"] = entropy_norm
    return result

def build_10min_sequence_from_long(df_case, diary_start_hour=4, bin_minutes=10):
    """Turn one case's event rows into a fixed-length list of slot labels.

    The diary window is the 24 hours starting at ``diary_start_hour`` on the
    date of the earliest start timestamp (or on the previous date when that
    earliest start falls before ``diary_start_hour``). The window is cut into
    ``(24 * 60) // bin_minutes`` slots; each slot takes the activity of the
    first event, in ``df_case`` row order, whose clipped ``[start, end)``
    interval contains the slot's start time, or NaN if no event does.

    Parameters
    ----------
    df_case : pandas.DataFrame
        Rows of a single case; reads the START_COL, END_COL and ACTIVITY_COL
        columns.
    diary_start_hour : int
        Hour of day at which the diary window opens.
    bin_minutes : int
        Width of one slot in minutes.

    Returns
    -------
    list
        One label (or NaN) per slot.
    """
    # Size the grid from the argument rather than from a module-level constant
    # so that a non-default bin width still spans exactly 24 hours.
    n_slots = int((24 * 60) // bin_minutes)

    first_ts = df_case[START_COL].min()
    if pd.isna(first_ts):
        return [np.nan] * n_slots

    diary_start = first_ts.normalize() + pd.Timedelta(hours=diary_start_hour)
    if first_ts < diary_start:
        # Earliest event precedes today's opening hour: it belongs to the
        # diary day that opened yesterday.
        diary_start -= pd.Timedelta(days=1)
    diary_end = diary_start + pd.Timedelta(hours=24)

    # Clip every event to the window and keep only those with positive length.
    # NOTE(review): an event whose end is not later than its start is dropped
    # here; if the raw data holds clock times without dates, events that cross
    # midnight would be lost — confirm against the source timestamps.
    episodes = []
    for start, end, activity in zip(
        df_case[START_COL], df_case[END_COL], df_case[ACTIVITY_COL]
    ):
        s = max(start, diary_start)
        e = min(end, diary_end)
        if pd.isna(s) or pd.isna(e) or e <= s:
            continue
        episodes.append((s, e, activity))

    if not episodes:
        return [np.nan] * n_slots

    seq = []
    for k in range(n_slots):
        t = diary_start + pd.Timedelta(minutes=k * bin_minutes)
        # First matching episode wins; NaN marks an uncovered slot.
        label = next((act for (s, e, act) in episodes if s <= t < e), np.nan)
        seq.append(label)
    return seq


In [6]:
# === Cell 4: Build sequences per case (long format only) ===
if is_wide:
    # Wide input already carries one column per slot: keep ids, gender, slots.
    wide_df = df[[CASE_ID_COL, GENDER_COL] + slot_cols].copy()
else:
    # One binned label sequence per case, cases kept in first-appearance order;
    # each case's events are sorted by start time before binning.
    sequences = {
        case_id: build_10min_sequence_from_long(
            events.sort_values(START_COL),
            diary_start_hour=DIARY_START_HOUR,
            bin_minutes=BIN_MINUTES,
        )
        for case_id, events in df.groupby(CASE_ID_COL, sort=False)
    }

    # Rows = cases, columns = slot_0 .. slot_{N_SLOTS-1}; then move the case id
    # out of the index into a regular column.
    seq_df = pd.DataFrame.from_dict(sequences, orient="index")
    seq_df.columns = [f"{SLOT_PREFIX}{i}" for i in range(N_SLOTS)]
    seq_df.index.name = CASE_ID_COL
    seq_df = seq_df.reset_index()

    # Gender of a case = gender on its first row in the raw table.
    gender_map = df[[CASE_ID_COL, GENDER_COL]].drop_duplicates(subset=[CASE_ID_COL])
    wide_df = seq_df.merge(gender_map, on=CASE_ID_COL, how="left")

print(wide_df.shape)
wide_df.head()


(649, 146)


Unnamed: 0,Case ID,slot_0,slot_1,slot_2,slot_3,slot_4,slot_5,slot_6,slot_7,slot_8,...,slot_135,slot_136,slot_137,slot_138,slot_139,slot_140,slot_141,slot_142,slot_143,Gender
0,'20100101100520,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,...,,,,,,,,,,Female
1,'20100101100658,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,...,,,,,,,,,,Male
2,'20100101100920,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,...,,,,,,,,,,Female
3,'20100101101236,,,,,,,,,,...,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Male
4,'20100101101423,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,Sleeping,...,,,,,,,,,,Female


In [7]:
# === Cell 5: Per-case metrics and Gender averages ===
def compute_metrics_row(row, slot_columns):
    """Compute the complexity metrics for one row of the wide table.

    Reads ``row[c]`` for every name in ``slot_columns`` (in the given order)
    and passes the resulting list to ``compute_complexity`` with the
    notebook-wide ``BIN_MINUTES``.
    """
    slot_values = []
    for column in slot_columns:
        slot_values.append(row[column])
    return compute_complexity(slot_values, bin_minutes=BIN_MINUTES)

# Slot column names for whichever layout was loaded.
if is_wide:
    slot_cols_use = slot_cols
else:
    slot_cols_use = [f"{SLOT_PREFIX}{i}" for i in range(N_SLOTS)]

# One row of metrics per case, placed next to the case id and gender.
metrics = wide_df.apply(
    lambda r: pd.Series(compute_metrics_row(r, slot_cols_use)),
    axis=1,
)
case_info = wide_df[[CASE_ID_COL, GENDER_COL]].reset_index(drop=True)
metrics_df = pd.concat([case_info, metrics], axis=1)

# Mean of every metric within each gender group; dropna=False keeps a group
# for missing gender values if any exist.
metric_names = ["Transitions", "Unique", "Max run (min)", "Entropy", "Entropy (norm)"]
agg = (
    metrics_df
    .groupby(GENDER_COL, dropna=False)
    .agg(dict.fromkeys(metric_names, "mean"))
    .rename_axis("Subgroup")
    .reset_index()
)

# Presentation rounding: minutes to one decimal, everything else to three.
agg_rounded = agg.round({
    "Transitions": 3,
    "Unique": 3,
    "Entropy": 3,
    "Entropy (norm)": 3,
    "Max run (min)": 1,
})

print("Male vs Female — Complexity metrics (averages):")
display(agg_rounded)

OUT_PATH = "complexity_by_gender.csv"
agg_rounded.to_csv(OUT_PATH, index=False)
print(f"Saved: {OUT_PATH}")


Male vs Female — Complexity metrics (averages):


Unnamed: 0,Subgroup,Transitions,Unique,Max run (min),Entropy,Entropy (norm)
0,Female,13.599,6.177,269.1,1.351,0.713
1,Male,11.412,5.654,280.2,1.233,0.673


Saved: complexity_by_gender.csv


In [8]:
# Ensure DayType column exists and is trimmed
DAYTYPE_COL = "DayType"
if DAYTYPE_COL not in df.columns:
    raise ValueError("Missing 'DayType' column in the raw data.")

# Per-case DayType mapping: the DayType on each case's first row in the raw df.
daytype_map = (df[[CASE_ID_COL, DAYTYPE_COL]]
               .drop_duplicates(subset=[CASE_ID_COL])
               .copy())

# Normalise spelling only where a value is present. A blanket astype(str)
# would turn a missing DayType into the text label "Nan", which would then be
# averaged as if it were a real day type and make dropna=False below pointless.
raw_daytype = daytype_map[DAYTYPE_COL]
daytype_map[DAYTYPE_COL] = raw_daytype.where(
    raw_daytype.isna(),
    raw_daytype.astype(str).str.strip().str.title(),
)

# Attach DayType to the per-case metrics table.
metrics_daytype = metrics_df.merge(daytype_map, on=CASE_ID_COL, how="left")

# Aggregate by DayType; dropna=False keeps cases without a DayType visible as
# their own (missing) subgroup.
agg_day = (metrics_daytype
           .groupby(DAYTYPE_COL, dropna=False)
           .agg({
               "Transitions": "mean",
               "Unique": "mean",
               "Max run (min)": "mean",
               "Entropy": "mean",
               "Entropy (norm)": "mean"
           })
           .rename_axis("Subgroup")
           .reset_index())

# Presentation rounding, shared by the summary and the per-case export:
# minutes to one decimal, every other metric to three.
daytype_decimals = {
    "Transitions": 3,
    "Unique": 3,
    "Entropy": 3,
    "Entropy (norm)": 3,
    "Max run (min)": 1,
}
agg_day_rounded = agg_day.round(daytype_decimals)

print("Weekday vs Weekend — Complexity metrics (averages):")
display(agg_day_rounded)

# Save
agg_day_rounded.to_csv("complexity_by_daytype.csv", index=False)
print("Saved: complexity_by_daytype.csv")

# (Optional) save per-case with DayType for robustness checks
per_case_daytype = metrics_daytype.round(daytype_decimals)

per_case_daytype.to_csv("complexity_per_case_daytype.csv", index=False)
print("Saved: complexity_per_case_daytype.csv")


Weekday vs Weekend — Complexity metrics (averages):


Unnamed: 0,Subgroup,Transitions,Unique,Max run (min),Entropy,Entropy (norm)
0,Weekday,13.279,6.179,292.0,1.306,0.685
1,Weekend,12.208,5.772,256.5,1.301,0.708


Saved: complexity_by_daytype.csv
Saved: complexity_per_case_daytype.csv
