In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import sys, numpy as np, pandas as pd, matplotlib
print("Python:", sys.version.split()[0], "| pandas:", pd.__version__, "| matplotlib:", matplotlib.__version__)
print("OK: inline plotting is ON")


Python: 3.13.6 | pandas: 2.3.2 | matplotlib: 3.10.6
OK: inline plotting is ON


In [2]:
# Files (adjust paths if needed)
INPUT_SUBSET   = "ATUS_2010_PM_Masters_SouthMidwest_AllDays.csv"  # your subset
BIN            = 10
LINKAGE        = "complete"   # we’ll use COMPLETE linkage here
N_CLUSTERS     = 5

import os
assert os.path.exists(INPUT_SUBSET), f"Missing file: {INPUT_SUBSET}"
print("Parameters OK")


Parameters OK


In [3]:
df = pd.read_csv(INPUT_SUBSET, dtype={"case_id": str}, encoding="latin1")

# normalize strings
for c in ["case_id","activity_code","activity_name","start_time","stop_time",
          "sex_label","region_label","education_label","weekday_label","event_n"]:
    if c in df.columns:
        df[c] = df[c].astype(str).str.strip()

# minute offsets for convenience (not strictly needed later)
df["start_min"] = pd.to_timedelta(df["start_time"]).dt.total_seconds().div(60).astype(int)
df["stop_min"]  = pd.to_timedelta(df["stop_time"]).dt.total_seconds().div(60).astype(int)

df = df.sort_values(["case_id","start_min","event_n"], na_position="last").reset_index(drop=True)

print("Rows:", len(df), "| Unique cases:", df["case_id"].nunique())
display(df.head(3))


Rows: 14049 | Unique cases: 649


Unnamed: 0,case_id,event_n,activity_code,activity_name,start_time,stop_time,sex_label,PEEDUCA,education_label,GEREG,region_label,TUDIARYDAY,weekday_label,is_weekend,start_min,stop_min
0,'20100101100520,1,1,Sleeping,04:00:00,07:00:00,Female,44,Masters degree,3,South,1,Sunday,True,240,420
1,'20100101100520,2,11,Eating and drinking,07:00:00,09:00:00,Female,44,Masters degree,3,South,1,Sunday,True,420,540
2,'20100101100520,3,18,Travel related to personal care,09:00:00,09:15:00,Female,44,Masters degree,3,South,1,Sunday,True,540,555


In [4]:
SLOTS = (24*60)//BIN
bin_starts = np.arange(0, 24*60, BIN)

rows, ids = [], []
for cid, g in df.groupby("case_id", sort=False):
    starts = g["start_min"].to_numpy()
    stops  = g["stop_min"].to_numpy()
    codes  = g["activity_code"].astype(str).to_numpy()

    idx = np.searchsorted(starts, bin_starts, side="right") - 1
    valid = (idx >= 0) & (stops[idx] > bin_starts)
    row = np.where(valid, codes[idx], "")
    rows.append(row); ids.append(cid)

seq = pd.DataFrame(rows, index=ids, columns=[f"S{k:03d}" for k in range(SLOTS)]).reset_index(names="case_id")
seq.to_csv("SSA_Subset_binned_10min.csv", index=False)
print("Saved -> SSA_Subset_binned_10min.csv | shape:", seq.shape)


Saved -> SSA_Subset_binned_10min.csv | shape: (649, 145)


In [5]:
!pip install -q python-Levenshtein scipy tqdm

import numpy as np, pandas as pd
import Levenshtein as L
from tqdm import tqdm
from scipy.cluster.hierarchy import linkage, fcluster

# encode to chars for fast Levenshtein
X = seq.drop(columns=["case_id"]).astype(str).to_numpy()
states = pd.unique(X.ravel())
states = states[states != ""]
base = 0x3700
state2char = {s: chr(base+i) for i, s in enumerate(sorted(states))}
state2char[""] = " "

sequences = ["".join(state2char[v] for v in row) for row in X]

# condensed distances
m = len(sequences)
n_pairs = m*(m-1)//2
dist_condensed = np.empty(n_pairs, dtype=np.float32)
idx = 0
for i in tqdm(range(m-1), desc="Levenshtein distances"):
    si = sequences[i]
    for j in range(i+1, m):
        dist_condensed[idx] = L.distance(si, sequences[j])
        idx += 1

print("Pairs:", n_pairs)


Levenshtein distances: 100%|█████████████████████████████████████████████████████████| 648/648 [00:10<00:00, 60.20it/s]

Pairs: 210276





In [6]:
from scipy.cluster.hierarchy import linkage as _linkage, fcluster as _fcluster

Z_alt = _linkage(dist_condensed, method="complete")
labels_alt = _fcluster(Z_alt, N_CLUSTERS, criterion="maxclust")
membership_alt = pd.DataFrame({"case_id": seq["case_id"].astype(str), "cluster": labels_alt})

print("Cluster sizes (COMPLETE):")
display(membership_alt.value_counts("cluster").sort_index().to_frame("n"))

# (optional) save for reuse
membership_alt.to_csv("SSA_Subset_clusters_OM_complete_k5.csv", index=False)


Cluster sizes (COMPLETE):


Unnamed: 0_level_0,n
cluster,Unnamed: 1_level_1
1,84
2,29
3,16
4,519
5,1


In [7]:
# Join clusters
base2 = df.merge(membership_alt, on="case_id", how="inner")

# naive minutes = stop_min - start_min (no cross-midnight split; no overlap handling)
mins = (pd.to_timedelta(base2["stop_time"]).dt.total_seconds()/60
      - pd.to_timedelta(base2["start_time"]).dt.total_seconds()/60)
base2["minutes_naive"] = mins.clip(lower=0)

# person x activity minutes, then cluster average (NOTE: averages only over people who DID that activity)
pp = base2.groupby(["cluster","case_id","activity_name"], as_index=False)["minutes_naive"].sum()

# pivot to wide (missing activities drop out here!)
avg_cluster_minutes = pp.groupby(["cluster","activity_name"], as_index=False)["minutes_naive"].mean()
naive_hours_wide = (avg_cluster_minutes
                    .pivot(index="cluster", columns="activity_name", values="minutes_naive")
                    .fillna(0) / 60.0).round(2)

print("Naïve 'exact HOURS' (will tend to overcount due to overlaps and conditional averaging):")
display(naive_hours_wide)


Naïve 'exact HOURS' (will tend to overcount due to overlaps and conditional averaging):


activity_name,"Attending meetings, conferences, & training",Attending religious services,Civic obligations & participation,Doing aerobics,Eating and drinking,Insufficient detail in verbatim,Interior cleaning,Physical care for hh children,Physical care for nonhh children,Security procedures rel. to consumer purchases,Sleeping,Socializing and communicating with others,"Taking class for degree, certification, or licensure",Travel related to personal care,Using household services,Using paid childcare services,Waiting associated with telephone calls,"Work, main job"
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,2.43,1.93,0.75,1.81,1.63,0.72,2.96,2.21,0.7,0.96,11.83,4.35,5.0,1.58,1.26,1.1,0.71,2.77
2,0.48,2.17,0.0,1.07,1.26,0.56,1.13,1.05,0.15,0.63,9.83,2.07,1.29,1.77,0.0,0.25,0.86,8.32
3,5.34,2.67,0.0,1.06,1.15,0.72,1.55,1.52,0.42,1.31,4.96,2.64,0.0,3.49,0.0,0.42,1.03,2.53
4,1.73,1.74,0.62,1.42,1.25,1.68,2.43,1.94,0.83,1.01,4.1,3.92,3.59,1.49,0.65,0.8,0.64,6.38
5,0.0,0.0,0.0,0.0,2.0,1.5,4.5,0.0,0.0,0.0,0.5,6.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
naive_totals = naive_hours_wide.sum(axis=1).round(2)
print("Total hours per cluster (naïve, likely > 24):")
display(naive_totals.to_frame("total_hours"))


Total hours per cluster (naïve, likely > 24):


Unnamed: 0_level_0,total_hours
cluster,Unnamed: 1_level_1
1,44.7
2,32.89
3,30.81
4,36.22
5,14.5


In [9]:
# convert HH:MM:SS to minutes [0,1440)
def to_min(s):
    td = pd.to_timedelta(s)
    return (td.dt.total_seconds()/60).astype(int) % (24*60)

norm_rows = []
cols = list(df.columns) + ["seg_start","seg_stop"]

for _, g in df.groupby("case_id", sort=False):
    g = g.sort_values(["start_time","stop_time","event_n"] if "event_n" in g.columns else ["start_time","stop_time"])
    g = g.copy()
    g["s"] = to_min(g["start_time"])
    g["e"] = to_min(g["stop_time"])
    for _, r in g.iterrows():
        s, e = int(r["s"]), int(r["e"])
        if e >= s:
            norm_rows.append(list(r.drop(labels=["s","e"]).values) + [s, e])
        else:
            # split across midnight
            norm_rows.append(list(r.drop(labels=["s","e"]).values) + [s, 1440])
            norm_rows.append(list(r.drop(labels=["s","e"]).values) + [0, e])

norm = pd.DataFrame(norm_rows, columns=cols)

# simple overlap count per case: any seg_start < next seg_stop after sorting
def count_overlaps(g):
    g = g.sort_values(["seg_start","seg_stop"]).reset_index(drop=True)
    overlaps = (g["seg_start"].shift(-1) < g["seg_stop"]).fillna(False)
    return int(overlaps.sum())

ovl = norm.groupby("case_id").apply(count_overlaps).rename("overlap_count").reset_index()

share_overlapping_cases = (ovl["overlap_count"] > 0).mean()
print("Share of cases with at least one overlap:", round(share_overlapping_cases, 3))
display(ovl.head(10))


Share of cases with at least one overlap: 0.966


  ovl = norm.groupby("case_id").apply(count_overlaps).rename("overlap_count").reset_index()


Unnamed: 0,case_id,overlap_count
0,'20100101100520,1
1,'20100101100658,1
2,'20100101100920,1
3,'20100101101236,1
4,'20100101101423,1
5,'20100101101742,1
6,'20100101101777,1
7,'20100101101864,0
8,'20100101101981,1
9,'20100101102311,1


In [10]:
# Minute timeline per person (first-write-wins), then cluster average hours
case_ids = norm["case_id"].unique().tolist()
activity_minutes = []

for cid, g in norm.groupby("case_id", sort=False):
    timeline = np.full(1440, "", dtype=object)
    g = g.sort_values(["seg_start","seg_stop","event_n"], na_position="last").reset_index(drop=True)
    for _, r in g.iterrows():
        a = str(r["activity_name"]); s = int(r["seg_start"]); e = int(r["seg_stop"])
        if e > s:
            blank = (timeline[s:e] == "")
            if blank.any():
                idx = np.where(blank)[0] + s
                timeline[idx] = a
    vals, counts = np.unique(timeline[timeline != ""], return_counts=True)
    activity_minutes.append(dict(zip(vals, counts)))

pp_minutes = pd.DataFrame(activity_minutes, index=case_ids).fillna(0).astype(int)
pp_minutes.insert(0, "case_id", case_ids)

pp_minutes_cl = pp_minutes.merge(membership_alt, on="case_id", how="left")
exact_minutes = pp_minutes_cl.drop(columns=["case_id"]).groupby("cluster").mean()
exact_hours = (exact_minutes / 60.0).round(2)

print("Exact HOURS per cluster (minute timeline, sums ≈ 24):")
display(exact_hours)
print("Totals:")
display(exact_hours.sum(axis=1).round(2).to_frame("total_hours"))


Exact HOURS per cluster (minute timeline, sums ≈ 24):


Unnamed: 0_level_0,Doing aerobics,Eating and drinking,Interior cleaning,Security procedures rel. to consumer purchases,Sleeping,Socializing and communicating with others,Travel related to personal care,"Work, main job",Waiting associated with telephone calls,Physical care for hh children,"Attending meetings, conferences, & training",Attending religious services,Physical care for nonhh children,Using paid childcare services,"Taking class for degree, certification, or licensure",Insufficient detail in verbatim,Using household services,Civic obligations & participation
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0.47,1.48,2.81,0.53,9.09,5.53,1.5,0.53,0.15,0.53,0.32,0.34,0.12,0.12,0.18,0.24,0.06,0.01
2,0.22,1.19,0.85,0.15,7.62,2.29,1.67,8.81,0.09,0.33,0.07,0.22,0.01,0.03,0.34,0.11,0.0,0.0
3,0.4,1.02,1.29,0.76,9.12,2.11,3.31,0.65,0.32,0.55,3.67,0.5,0.05,0.08,0.0,0.18,0.0,0.0
4,0.32,1.17,1.99,0.48,9.84,3.74,1.32,3.28,0.1,0.71,0.22,0.21,0.09,0.06,0.08,0.37,0.01,0.0
5,0.0,1.0,15.5,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0


Totals:


Unnamed: 0_level_0,total_hours
cluster,Unnamed: 1_level_1
1,24.01
2,24.0
3,24.01
4,23.99
5,24.0
