In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import sys, numpy as np, pandas as pd, matplotlib
print("Python:", sys.version.split()[0], "| pandas:", pd.__version__, "| matplotlib:", matplotlib.__version__)
print("OK: inline plotting is ON")


Python: 3.13.6 | pandas: 2.3.2 | matplotlib: 3.10.6
OK: inline plotting is ON


In [2]:
# === Parameters ===
INPUT_SUBSET   = "ATUS_2010_PM_Masters_SouthMidwest_AllDays.csv"  # <-- your subset file
BIN            = 10       # minutes per slot (144 slots/day)
N_CLUSTERS     = 5        # requested number of clusters
SAMPLE_FOR_OM  = None     # set to an int (e.g., 500) to sample; None = use all cases 
RANDOM_SEED    = 42
LINKAGE        = "average"  # 'average' or 'complete' recommended for OM/Hamming dissimilarities

import os
assert os.path.exists(INPUT_SUBSET), f"Input not found: {INPUT_SUBSET}"
print("Parameters OK")


Parameters OK


In [3]:
df = pd.read_csv(INPUT_SUBSET, dtype={"case_id": str}, encoding="latin1")

for c in ["case_id","activity_code","activity_name","start_time","stop_time"]:
    df[c] = df[c].astype(str).str.strip()

# Convert to minutes from midnight
df["start_min"] = pd.to_timedelta(df["start_time"]).dt.total_seconds().div(60).astype(int)
df["stop_min"]  = pd.to_timedelta(df["stop_time"]).dt.total_seconds().div(60).astype(int)

# Sort by case/time
df = df.sort_values(["case_id","start_min","event_n"]).reset_index(drop=True)

print("Rows:", len(df), "| Unique cases:", df["case_id"].nunique())
print("First 3 case_ids (repr):")
for x in df["case_id"].astype(str).head(3):
    print(repr(x))

display(df.head(3))


Rows: 14049 | Unique cases: 649
First 3 case_ids (repr):
"'20100101100520"
"'20100101100520"
"'20100101100520"


Unnamed: 0,case_id,event_n,activity_code,activity_name,start_time,stop_time,sex_label,PEEDUCA,education_label,GEREG,region_label,TUDIARYDAY,weekday_label,is_weekend,start_min,stop_min
0,'20100101100520,1,1,Sleeping,04:00:00,07:00:00,Female,44,Masters degree,3,South,1,Sunday,True,240,420
1,'20100101100520,2,11,Eating and drinking,07:00:00,09:00:00,Female,44,Masters degree,3,South,1,Sunday,True,420,540
2,'20100101100520,3,18,Travel related to personal care,09:00:00,09:15:00,Female,44,Masters degree,3,South,1,Sunday,True,540,555


In [4]:
try:
    from tqdm.notebook import tqdm
except Exception:
    tqdm = lambda x, **k: x  # no progress bar fallback

SLOTS = (24*60)//BIN
bin_starts = np.arange(0, 24*60, BIN)

rows, ids = [], []
for cid, g in tqdm(df.groupby("case_id", sort=False), total=df["case_id"].nunique(), desc="Binning cases"):
    starts = g["start_min"].to_numpy()
    stops  = g["stop_min"].to_numpy()
    codes  = g["activity_code"].astype(str).to_numpy()

    idx = np.searchsorted(starts, bin_starts, side="right") - 1
    valid = (idx >= 0) & (stops[idx] > bin_starts)
    row = np.where(valid, codes[idx], "")
    rows.append(row)
    ids.append(cid)

seq = pd.DataFrame(rows, index=ids, columns=[f"S{k:03d}" for k in range(SLOTS)]).reset_index(names="case_id")
seq.to_csv("SSA_Subset_binned_10min.csv", index=False)
print("Saved -> SSA_Subset_binned_10min.csv | shape:", seq.shape)

# Codebook: activity_code -> activity_name
codebook = (df[["activity_code","activity_name"]]
            .drop_duplicates()
            .sort_values("activity_code"))
codebook.to_csv("SSA_Subset_codebook.csv", index=False)
print("Saved -> SSA_Subset_codebook.csv")


Binning cases:   0%|          | 0/649 [00:00<?, ?it/s]

Saved -> SSA_Subset_binned_10min.csv | shape: (649, 145)
Saved -> SSA_Subset_codebook.csv


In [5]:
# Install once if needed (run cell, restart kernel only if required):
!pip install -q python-Levenshtein scipy tqdm

import numpy as np, pandas as pd, time, sys
from tqdm import tqdm
import Levenshtein as L
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster

print("OK: OM dependencies loaded")


OK: OM dependencies loaded


In [6]:

seq = pd.read_csv("SSA_Subset_binned_10min.csv", dtype={"case_id": str}).fillna("")

# Optional sampling for OM (set SAMPLE_FOR_OM in Cell 2)
rng = np.random.default_rng(RANDOM_SEED)
all_ids = seq["case_id"].tolist()
n_all   = len(all_ids)

if SAMPLE_FOR_OM is not None:
    n_take  = min(int(SAMPLE_FOR_OM), n_all)
    keep    = set(rng.choice(all_ids, size=n_take, replace=False))
    seq_om  = seq[seq["case_id"].isin(keep)].reset_index(drop=True)
else:
    seq_om  = seq.reset_index(drop=True)

case_ids = seq_om["case_id"].astype(str).tolist()
X = seq_om.drop(columns=["case_id"]).astype(str).to_numpy()

print(f"[SAMPLE] Using {len(case_ids)} / {n_all} cases; sequence length = {X.shape[1]}")

# Map activity codes to single chars (fast Levenshtein)
states = pd.unique(X.ravel())
states = states[states != ""]
base = 0x3700  # start of an unused Unicode block
state2char = {s: chr(base+i) for i, s in enumerate(sorted(states))}
state2char[""] = " "

sequences = ["".join([state2char[v] for v in row]) for row in X]
print("[ENCODE] Example sequence (first 60 chars):", sequences[0][:60], "...")


[SAMPLE] Using 649 / 649 cases; sequence length = 144
[ENCODE] Example sequence (first 60 chars):                         㜀㜀㜀㜀㜀㜀㜀㜀㜀㜀㜀㜀㜀㜀㜀㜀㜀㜀㜄㜄㜄㜄㜄㜄㜄㜄㜄㜄㜄㜄㜐㜐㜈㜈㜈㜈 ...


In [7]:
m = len(sequences)
n_pairs = m*(m-1)//2
dist_condensed = np.empty(n_pairs, dtype=np.float32)

idx = 0
t0 = time.time()
for i in tqdm(range(m-1), desc="Levenshtein distances"):
    s_i = sequences[i]
    for j in range(i+1, m):
        dist_condensed[idx] = L.distance(s_i, sequences[j])  # unit costs: sub=1, ins=1, del=1
        idx += 1
t1 = time.time()
print(f"[OM] computed {n_pairs} pairwise distances in {t1-t0:.1f}s")


Levenshtein distances: 100%|█████████████████████████████████████████████████████████| 648/648 [00:10<00:00, 60.78it/s]

[OM] computed 210276 pairwise distances in 10.7s





In [8]:
# NOTE: 'average'/'complete' are safer for dissimilarities than 'ward'
LINKAGE = LINKAGE  # from Cell 2
K       = N_CLUSTERS

Z = linkage(dist_condensed, method=LINKAGE)
labels = fcluster(Z, K, criterion="maxclust")

membership = pd.DataFrame({"case_id": case_ids, "cluster": labels})
out_mem = f"SSA_Subset_clusters_OM_{LINKAGE}_k{K}_n{len(case_ids)}.csv"
membership.to_csv(out_mem, index=False)
print("Saved ->", out_mem)

print("\nCluster sizes:")
display(membership.value_counts("cluster").sort_index().to_frame("n"))


Saved -> SSA_Subset_clusters_OM_average_k5_n649.csv

Cluster sizes:


Unnamed: 0_level_0,n
cluster,Unnamed: 1_level_1
1,561
2,85
3,1
4,1
5,1


In [9]:
# Join sequences with clusters
joined = seq_om.merge(membership, on="case_id", how="left")

def top_states_share(df_cluster, topn=12):
    vals = df_cluster.drop(columns=["case_id","cluster"]).to_numpy().ravel()
    vals = vals[vals != ""]
    s = pd.Series(vals).value_counts(normalize=True).head(topn)
    return (s*100).round(1)

summary_codes = joined.groupby("cluster").apply(top_states_share).unstack().fillna(0.0)
summary_codes = summary_codes.rename_axis(index="cluster", columns="state_code")
display(summary_codes)

# Map to names
codebook = pd.read_csv("SSA_Subset_codebook.csv")
code2name = dict(zip(codebook["activity_code"].astype(str), codebook["activity_name"]))
summary_names = summary_codes.rename(columns=code2name)

print("\nTop activity NAMES by time share per cluster:")
display(summary_names)

out_codes  = f"SSA_Subset_cluster_top_CODES_OM_{LINKAGE}_k{K}_n{len(case_ids)}.csv"
out_names  = f"SSA_Subset_cluster_top_NAMES_OM_{LINKAGE}_k{K}_n{len(case_ids)}.csv"
summary_codes.to_csv(out_codes)
summary_names.to_csv(out_names)
print("Saved ->", out_codes)
print("Saved ->", out_names)


  summary_codes = joined.groupby("cluster").apply(top_states_share).unstack().fillna(0.0)


state_code,1.0,2.0,3.0,5.0,7.0,8.0,11.0,12.0,13.0,14.0,15.0,16.0,18.0,50.0
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,23.0,10.7,3.8,18.4,2.5,0.0,6.5,19.1,1.7,1.1,1.6,0.0,7.8,1.8
2,37.3,11.8,2.6,2.9,2.4,0.0,7.0,20.0,2.2,2.0,1.8,0.0,6.9,0.8
3,21.1,4.7,7.0,18.0,3.1,0.8,8.6,0.0,2.3,0.0,19.5,3.1,11.7,0.0
4,21.4,5.7,0.0,0.0,10.7,0.0,3.6,0.0,0.0,0.0,0.0,0.0,58.6,0.0
5,4.3,39.1,0.0,0.0,0.0,0.0,17.4,26.1,0.0,0.0,0.0,0.0,0.0,13.0



Top activity NAMES by time share per cluster:


state_code,1.0,2.0,3.0,5.0,7.0,8.0,11.0,12.0,13.0,14.0,15.0,16.0,18.0,50.0
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,23.0,10.7,3.8,18.4,2.5,0.0,6.5,19.1,1.7,1.1,1.6,0.0,7.8,1.8
2,37.3,11.8,2.6,2.9,2.4,0.0,7.0,20.0,2.2,2.0,1.8,0.0,6.9,0.8
3,21.1,4.7,7.0,18.0,3.1,0.8,8.6,0.0,2.3,0.0,19.5,3.1,11.7,0.0
4,21.4,5.7,0.0,0.0,10.7,0.0,3.6,0.0,0.0,0.0,0.0,0.0,58.6,0.0
5,4.3,39.1,0.0,0.0,0.0,0.0,17.4,26.1,0.0,0.0,0.0,0.0,0.0,13.0


Saved -> SSA_Subset_cluster_top_CODES_OM_average_k5_n649.csv
Saved -> SSA_Subset_cluster_top_NAMES_OM_average_k5_n649.csv


In [10]:
print("summary_codes column sample & dtype:")
print(list(summary_codes.columns)[:10])
print(type(list(summary_codes.columns)[0]))

codebook = pd.read_csv("SSA_Subset_codebook.csv")
print("\ncodebook activity_code sample & dtype:")
print(codebook["activity_code"].head().tolist())
print(codebook["activity_code"].dtype)


summary_codes column sample & dtype:
[1.0, 2.0, 3.0, 5.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]
<class 'float'>

codebook activity_code sample & dtype:
[1, 10, 11, 12, 13]
int64


In [11]:
# Ensure we have the latest codebook
codebook = pd.read_csv("SSA_Subset_codebook.csv").copy()

# Build BOTH string- and numeric-keyed maps
cb_str_map = dict(zip(codebook["activity_code"].astype(str), codebook["activity_name"]))
cb_num_map = dict(zip(pd.to_numeric(codebook["activity_code"], errors="coerce"),
                      codebook["activity_name"]))

# 1) Try mapping by STRING keys
summary_names_fixed = summary_codes.rename(columns=cb_str_map)

# 2) If any numeric-looking codes remain, convert cols to numeric and map again
#    (this covers columns like 1.0, 2.0 that didn't match "1", "2" above)
unmapped = [c for c in summary_names_fixed.columns if isinstance(c, (int, float))]
if len(unmapped) > 0:
    tmp = summary_names_fixed.copy()
    # convert all columns to numeric where possible
    tmp.columns = pd.to_numeric(tmp.columns, errors="coerce")
    # rename using numeric map
    tmp = tmp.rename(columns=cb_num_map)
    summary_names_fixed = tmp

print("Top activity NAMES by time share per cluster (fixed):")
display(summary_names_fixed)

# Save the fixed version
out_names_fixed = f"SSA_Subset_cluster_top_NAMES_OM_{LINKAGE}_k{K}_n{len(case_ids)}_FIXED.csv"
summary_names_fixed.to_csv(out_names_fixed)
print("Saved ->", out_names_fixed)

# Optional: show any columns that still didn’t map (should be none)
still_num = [c for c in summary_names_fixed.columns if isinstance(c, (int, float))]
if still_num:
    print("Warning: These columns did not map to names:", still_num)


Top activity NAMES by time share per cluster (fixed):


state_code,Sleeping,Interior cleaning,Physical care for hh children,"Work, main job",Security procedures rel. to consumer purchases,Using paid childcare services,Eating and drinking,Socializing and communicating with others,Doing aerobics,Attending religious services,"Attending meetings, conferences, & training",Waiting associated with telephone calls,Travel related to personal care,Insufficient detail in verbatim
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,23.0,10.7,3.8,18.4,2.5,0.0,6.5,19.1,1.7,1.1,1.6,0.0,7.8,1.8
2,37.3,11.8,2.6,2.9,2.4,0.0,7.0,20.0,2.2,2.0,1.8,0.0,6.9,0.8
3,21.1,4.7,7.0,18.0,3.1,0.8,8.6,0.0,2.3,0.0,19.5,3.1,11.7,0.0
4,21.4,5.7,0.0,0.0,10.7,0.0,3.6,0.0,0.0,0.0,0.0,0.0,58.6,0.0
5,4.3,39.1,0.0,0.0,0.0,0.0,17.4,26.1,0.0,0.0,0.0,0.0,0.0,13.0


Saved -> SSA_Subset_cluster_top_NAMES_OM_average_k5_n649_FIXED.csv


In [12]:

singletons = membership[membership["cluster"].isin([3,4,5])]["case_id"].tolist()
print("Singleton case_ids:", singletons)


Singleton case_ids: ["'20100201100839", "'20100605100746", "'20101009100653"]


In [13]:
from scipy.cluster.hierarchy import linkage, fcluster

LINKAGE_ALT = "complete"  # try 'complete' instead of 'average'
K_ALT = 5

Z_alt = linkage(dist_condensed, method=LINKAGE_ALT)
labels_alt = fcluster(Z_alt, K_ALT, criterion="maxclust")

membership_alt = pd.DataFrame({"case_id": case_ids, "cluster": labels_alt})
print("Cluster sizes with LINKAGE =", LINKAGE_ALT)
display(membership_alt.value_counts("cluster").sort_index().to_frame("n"))

# Save (alt)
out_mem_alt = f"SSA_Subset_clusters_OM_{LINKAGE_ALT}_k{K_ALT}_n{len(case_ids)}.csv"
membership_alt.to_csv(out_mem_alt, index=False)
print("Saved ->", out_mem_alt)


Cluster sizes with LINKAGE = complete


Unnamed: 0_level_0,n
cluster,Unnamed: 1_level_1
1,82
2,291
3,37
4,4
5,235


Saved -> SSA_Subset_clusters_OM_complete_k5_n649.csv


In [14]:
# Summaries for COMPLETE-linkage clusters (membership_alt)

import pandas as pd
import numpy as np

# Use the sequences from earlier; if not in memory, load the binned sequences.
if 'seq_om' not in globals():
    seq_om = pd.read_csv("SSA_Subset_binned_10min.csv", dtype={"case_id": str}).fillna("")

# Quick cluster size check
print("Cluster sizes (COMPLETE):")
display(membership_alt.value_counts("cluster").sort_index().to_frame("n"))

# Join sequences with clusters
joined_alt = seq_om.merge(membership_alt, on="case_id", how="inner")

def top_states_share(df_cluster, topn=12):
    vals = df_cluster.drop(columns=["case_id","cluster"]).to_numpy().ravel()
    vals = vals[vals != ""]
    s = pd.Series(vals).value_counts(normalize=True).head(topn)
    return (s*100).round(1)

# Top activity codes (percentage of time per cluster)
summary_codes_alt = (
    joined_alt.groupby("cluster").apply(top_states_share).unstack().fillna(0.0)
).rename_axis(index="cluster", columns="state_code")

display(summary_codes_alt)

# Map codes -> names robustly (handles '1' vs 1.0)
codebook = pd.read_csv("SSA_Subset_codebook.csv")
cb_str_map = dict(zip(codebook["activity_code"].astype(str), codebook["activity_name"]))
cb_num_map = dict(zip(pd.to_numeric(codebook["activity_code"], errors="coerce"),
                      codebook["activity_name"]))

summary_names_alt = summary_codes_alt.rename(columns=cb_str_map)
# If any numeric-looking columns remain, convert and map again
unmapped = [c for c in summary_names_alt.columns if isinstance(c, (int, float))]
if unmapped:
    tmp = summary_names_alt.copy()
    tmp.columns = pd.to_numeric(tmp.columns, errors="coerce")
    tmp = tmp.rename(columns=cb_num_map)
    summary_names_alt = tmp

print("\nTop activity NAMES by time share per cluster (COMPLETE):")
display(summary_names_alt)

# Save outputs
k_alt = membership_alt["cluster"].nunique()
n_alt = len(membership_alt)
out_codes_alt = f"SSA_Subset_cluster_top_CODES_OM_complete_k{k_alt}_n{n_alt}.csv"
out_names_alt = f"SSA_Subset_cluster_top_NAMES_OM_complete_k{k_alt}_n{n_alt}.csv"

summary_codes_alt.to_csv(out_codes_alt)
summary_names_alt.to_csv(out_names_alt)
print("Saved ->", out_codes_alt)
print("Saved ->", out_names_alt)


Cluster sizes (COMPLETE):


Unnamed: 0_level_0,n
cluster,Unnamed: 1_level_1
1,82
2,291
3,37
4,4
5,235


  joined_alt.groupby("cluster").apply(top_states_share).unstack().fillna(0.0)


state_code,1.0,2.0,3.0,5.0,6.0,7.0,11.0,12.0,13.0,14.0,15.0,18.0,50.0
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,38.0,12.1,2.6,2.1,0.8,2.6,7.2,19.3,2.3,1.6,1.6,7.1,0.0
2,26.5,11.4,4.6,2.9,0.0,3.2,6.7,25.3,2.2,1.5,2.6,8.1,2.5
3,29.6,4.2,1.8,40.9,0.0,0.7,4.3,6.2,0.9,1.1,0.8,7.7,0.7
4,22.8,9.2,0.5,28.6,0.0,1.0,16.4,14.8,0.0,0.3,0.0,4.1,2.3
5,17.2,11.0,3.2,33.6,0.0,2.0,6.5,13.9,1.3,0.7,0.6,7.6,1.1



Top activity NAMES by time share per cluster (COMPLETE):


state_code,Sleeping,Interior cleaning,Physical care for hh children,"Work, main job","Taking class for degree, certification, or licensure",Security procedures rel. to consumer purchases,Eating and drinking,Socializing and communicating with others,Doing aerobics,Attending religious services,"Attending meetings, conferences, & training",Travel related to personal care,Insufficient detail in verbatim
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,38.0,12.1,2.6,2.1,0.8,2.6,7.2,19.3,2.3,1.6,1.6,7.1,0.0
2,26.5,11.4,4.6,2.9,0.0,3.2,6.7,25.3,2.2,1.5,2.6,8.1,2.5
3,29.6,4.2,1.8,40.9,0.0,0.7,4.3,6.2,0.9,1.1,0.8,7.7,0.7
4,22.8,9.2,0.5,28.6,0.0,1.0,16.4,14.8,0.0,0.3,0.0,4.1,2.3
5,17.2,11.0,3.2,33.6,0.0,2.0,6.5,13.9,1.3,0.7,0.6,7.6,1.1


Saved -> SSA_Subset_cluster_top_CODES_OM_complete_k5_n649.csv
Saved -> SSA_Subset_cluster_top_NAMES_OM_complete_k5_n649.csv


In [15]:
# Convert percent-of-day to hours for each activity column in summary_names_alt
def pct_to_hours(df):
    out = df.copy()
    for c in out.columns:
        if c != "cluster":
            out[c] = (out[c].astype(float) * 24.0 / 100.0).round(2)
    return out

summary_hours_alt = pct_to_hours(summary_names_alt)
print("Top activity HOURS (~) per cluster (COMPLETE linkage):")
display(summary_hours_alt)


Top activity HOURS (~) per cluster (COMPLETE linkage):


state_code,Sleeping,Interior cleaning,Physical care for hh children,"Work, main job","Taking class for degree, certification, or licensure",Security procedures rel. to consumer purchases,Eating and drinking,Socializing and communicating with others,Doing aerobics,Attending religious services,"Attending meetings, conferences, & training",Travel related to personal care,Insufficient detail in verbatim
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,9.12,2.9,0.62,0.5,0.19,0.62,1.73,4.63,0.55,0.38,0.38,1.7,0.0
2,6.36,2.74,1.1,0.7,0.0,0.77,1.61,6.07,0.53,0.36,0.62,1.94,0.6
3,7.1,1.01,0.43,9.82,0.0,0.17,1.03,1.49,0.22,0.26,0.19,1.85,0.17
4,5.47,2.21,0.12,6.86,0.0,0.24,3.94,3.55,0.0,0.07,0.0,0.98,0.55
5,4.13,2.64,0.77,8.06,0.0,0.48,1.56,3.34,0.31,0.17,0.14,1.82,0.26


In [16]:
import pandas as pd
def pct_and_hours(df):
    pct = summary_names_alt.copy()
    hrs = (pct.astype(float) * 24.0 / 100.0)
    pretty = pct.copy()
    for c in pct.columns:
        pretty[c] = pct[c].map(lambda v: f"{float(v):.1f}% (~{float(v)*24/100:.1f} h)")
    return pretty

summary_pretty_alt = pct_and_hours(summary_names_alt)
print("Top activities with percent and ~hours per cluster:")
display(summary_prety_alt if 'summary_prety_alt' in globals() else summary_pretty_alt)


Top activities with percent and ~hours per cluster:


state_code,Sleeping,Interior cleaning,Physical care for hh children,"Work, main job","Taking class for degree, certification, or licensure",Security procedures rel. to consumer purchases,Eating and drinking,Socializing and communicating with others,Doing aerobics,Attending religious services,"Attending meetings, conferences, & training",Travel related to personal care,Insufficient detail in verbatim
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,38.0% (~9.1 h),12.1% (~2.9 h),2.6% (~0.6 h),2.1% (~0.5 h),0.8% (~0.2 h),2.6% (~0.6 h),7.2% (~1.7 h),19.3% (~4.6 h),2.3% (~0.6 h),1.6% (~0.4 h),1.6% (~0.4 h),7.1% (~1.7 h),0.0% (~0.0 h)
2,26.5% (~6.4 h),11.4% (~2.7 h),4.6% (~1.1 h),2.9% (~0.7 h),0.0% (~0.0 h),3.2% (~0.8 h),6.7% (~1.6 h),25.3% (~6.1 h),2.2% (~0.5 h),1.5% (~0.4 h),2.6% (~0.6 h),8.1% (~1.9 h),2.5% (~0.6 h)
3,29.6% (~7.1 h),4.2% (~1.0 h),1.8% (~0.4 h),40.9% (~9.8 h),0.0% (~0.0 h),0.7% (~0.2 h),4.3% (~1.0 h),6.2% (~1.5 h),0.9% (~0.2 h),1.1% (~0.3 h),0.8% (~0.2 h),7.7% (~1.8 h),0.7% (~0.2 h)
4,22.8% (~5.5 h),9.2% (~2.2 h),0.5% (~0.1 h),28.6% (~6.9 h),0.0% (~0.0 h),1.0% (~0.2 h),16.4% (~3.9 h),14.8% (~3.6 h),0.0% (~0.0 h),0.3% (~0.1 h),0.0% (~0.0 h),4.1% (~1.0 h),2.3% (~0.6 h)
5,17.2% (~4.1 h),11.0% (~2.6 h),3.2% (~0.8 h),33.6% (~8.1 h),0.0% (~0.0 h),2.0% (~0.5 h),6.5% (~1.6 h),13.9% (~3.3 h),1.3% (~0.3 h),0.7% (~0.2 h),0.6% (~0.1 h),7.6% (~1.8 h),1.1% (~0.3 h)


In [1]:
import pandas as pd
from datetime import datetime, timedelta
import re

# --- paths ---
INPUT_EVENTS   = "ATUS_2010_PM_Masters_SouthMidwest_AllDays.csv"     # your subset
MEMBERSHIP_CSV = "SSA_Subset_clusters_OM_complete_k5_n649.csv"       # <-- put the *exact* file saved by OM notebook
CLUSTER_TAG    = "OM_complete_k5"                                    # label to remember scheme

# --- load ---
ev = pd.read_csv(INPUT_EVENTS, dtype={"case_id": str}, encoding="latin1")
mem = pd.read_csv(MEMBERSHIP_CSV, dtype={"case_id": str})

# --- normalize ids (strip stray leading apostrophes etc.) ---
def norm_id(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"^'+", "", s)     # remove any leading apostrophes
    return s

for c in ["case_id"]:
    ev[c]  = ev[c].astype(str).map(norm_id)
    mem[c] = mem[c].astype(str).map(norm_id)

# --- sanity: coverage ---
n_ev_cases  = ev["case_id"].nunique()
n_mem_cases = mem["case_id"].nunique()
print("Events cases:", n_ev_cases, "| Membership cases:", n_mem_cases)

# who is missing a cluster?
miss_ids = sorted(set(ev["case_id"]) - set(mem["case_id"]))
print("Cases missing a cluster label:", len(miss_ids))
if miss_ids[:5]:
    print("Sample:", miss_ids[:5])

# If you *sampled* in the OM notebook, some cases may be missing.
# Ideally, re-run OM on *all 649* cases; otherwise you can drop unlabelled traces for cluster-based PM slices:
# ev = ev[ev["case_id"].isin(mem["case_id"])]


Events cases: 649 | Membership cases: 649
Cases missing a cluster label: 0


In [2]:
ev_cl = ev.merge(mem[["case_id","cluster"]], on="case_id", how="left")
# Optional: make a composite attribute that encodes scheme + id (nice in PM tools)
ev_cl["Cluster"] = ev_cl["cluster"].map(lambda x: f"{CLUSTER_TAG}_{int(x)}" if pd.notna(x) else "UNCL")
ev_cl.head(3)


Unnamed: 0,case_id,event_n,activity_code,activity_name,start_time,stop_time,sex_label,PEEDUCA,education_label,GEREG,region_label,TUDIARYDAY,weekday_label,is_weekend,cluster,Cluster
0,20100101100520,1,1,Sleeping,04:00:00,07:00:00,Female,44,Masters degree,3,South,1,Sunday,True,2,OM_complete_k5_2
1,20100101100520,2,11,Eating and drinking,07:00:00,09:00:00,Female,44,Masters degree,3,South,1,Sunday,True,2,OM_complete_k5_2
2,20100101100520,3,18,Travel related to personal care,09:00:00,09:15:00,Female,44,Masters degree,3,South,1,Sunday,True,2,OM_complete_k5_2


In [3]:
BASE_DATE = datetime(2010,1,1)

def make_dt(hms):
    t = datetime.strptime(hms, "%H:%M:%S").time()
    return datetime.combine(BASE_DATE.date(), t)

def start_end_datetimes(row):
    s = make_dt(row["start_time"])
    e = make_dt(row["stop_time"])
    if e < s: e += timedelta(days=1)  # cross-midnight
    return s, e

rows = []
for _, r in ev_cl.iterrows():
    s,e = start_end_datetimes(r)
    rows.append({
        "Case ID": r["case_id"],
        "Activity": r["activity_name"],
        "Start Timestamp": s.strftime("%Y-%m-%d %H:%M:%S"),
        "End Timestamp":   e.strftime("%Y-%m-%d %H:%M:%S"),
        "Cluster":         r["Cluster"],     # <- case attribute
    })
disc = pd.DataFrame(rows)
disc.to_csv("log_intervals_disco_with_cluster.csv", index=False)

cel = disc.rename(columns={
    "Case ID":"CASE_ID",
    "Activity":"ACTIVITY",
    "Start Timestamp":"START_TS",
    "End Timestamp":"END_TS"
})
cel.to_csv("log_intervals_celonis_with_cluster.csv", index=False)

print("Saved:",
      "log_intervals_disco_with_cluster.csv,",
      "log_intervals_celonis_with_cluster.csv")


Saved: log_intervals_disco_with_cluster.csv, log_intervals_celonis_with_cluster.csv


In [4]:
import pandas as pd, re

MEMBERSHIP_CSV = "SSA_Subset_clusters_OM_complete_k5_n649.csv"  # <-- use the exact file you saved
mem = pd.read_csv(MEMBERSHIP_CSV, dtype={"case_id": str})

def norm_id(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"^'+", "", s)  # drop leading apostrophes
    return s

mem["case_id"] = mem["case_id"].astype(str).map(norm_id)
print("Loaded membership rows:", len(mem), "| distinct cases:", mem["case_id"].nunique())
display(mem.head())


Loaded membership rows: 649 | distinct cases: 649


Unnamed: 0,case_id,cluster
0,20100101100520,2
1,20100101100658,2
2,20100101100920,2
3,20100101101236,1
4,20100101101423,2


In [7]:

expected = {1:82, 2:291, 3:37, 4:4, 5:235}

sizes = mem["cluster"].value_counts().sort_index().to_dict()
print("Observed COMPLETE cluster sizes:", sizes)

# Hard assert (will raise if mismatch)
assert sizes == expected, f"Cluster sizes differ! expected={expected} observed={sizes}"
print("✅ Sizes match the COMPLETE solution exactly.")


Observed COMPLETE cluster sizes: {1: 82, 2: 291, 3: 37, 4: 4, 5: 235}
✅ Sizes match the COMPLETE solution exactly.


In [8]:
import pandas as pd
from datetime import datetime, timedelta
import re

# --- inputs ---
INPUT_EVENTS    = "ATUS_2010_PM_Masters_SouthMidwest_AllDays.csv"
MEMBERSHIP_CSV  = "SSA_Subset_clusters_OM_complete_k5_n649.csv"  # your OM COMPLETE k=5 labels
CLUSTER_TAG     = "OM_complete_k5"  # label the scheme in the exported files

# --- helpers ---
def norm_id(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"^'+", "", s)    # drop any leading apostrophes
    return s

def to_cid(s: str) -> str:
    # Make case IDs explicitly non-numeric to avoid scientific notation everywhere
    return "CID_" + norm_id(s)

def hms(s: str) -> str:
    # Ensure HH:MM:SS formatting (pads if needed)
    t = datetime.strptime(s.strip(), "%H:%M:%S").time()
    return t.strftime("%H:%M:%S")

BASE_DATE = datetime(2010, 1, 1)
def make_dt(hms_str):
    t = datetime.strptime(hms_str, "%H:%M:%S").time()
    return datetime.combine(BASE_DATE.date(), t)

def start_end_datetimes(row):
    s = make_dt(row["start_time"])
    e = make_dt(row["stop_time"])
    if e < s:
        e += timedelta(days=1)  # cross-midnight
    return s, e

# --- load events + clusters ---
ev  = pd.read_csv(INPUT_EVENTS, dtype={"case_id": str}, encoding="latin1")
mem = pd.read_csv(MEMBERSHIP_CSV, dtype={"case_id": str})

for c in ["case_id","activity_name","start_time","stop_time"]:
    ev[c] = ev[c].astype(str).str.strip()
mem["case_id"] = mem["case_id"].astype(str).str.strip()

# merge cluster labels from OM notebook
ev = ev.merge(mem[["case_id","cluster"]], on="case_id", how="left")

# create robust case-id text
ev["CaseID_text"] = ev["case_id"].map(to_cid)

# make a friendly cluster tag (string) to guarantee text semantics in tools
ev["Cluster"] = ev["cluster"].map(lambda x: f"{CLUSTER_TAG}_{int(x)}" if pd.notna(x) else "UNCL")

# --- (A) Disco: time-only intervals ---
disc_rows = []
for _, r in ev.iterrows():
    disc_rows.append({
        "Case ID":        r["CaseID_text"],                 # text — no scientific notation
        "Activity":       r["activity_name"],
        "Start Timestamp": hms(r["start_time"]),            # HH:MM:SS only
        "End Timestamp":   hms(r["stop_time"]),             # HH:MM:SS only (no cross-midnight correction here by request)
        "Cluster":         r["Cluster"],                    # case attribute
    })
pd.DataFrame(disc_rows).to_csv("log_intervals_disco_with_cluster_TIMEONLY.csv", index=False)

# --- (B) Celonis: time-only intervals ---
cel_rows = []
for _, r in ev.iterrows():
    cel_rows.append({
        "CASE_ID":  r["CaseID_text"],
        "ACTIVITY": r["activity_name"],
        "START_TS": hms(r["start_time"]),                  # HH:MM:SS
        "END_TS":   hms(r["stop_time"]),                   # HH:MM:SS
        "CLUSTER":  r["Cluster"],
    })
pd.DataFrame(cel_rows).to_csv("log_intervals_celonis_with_cluster_TIMEONLY.csv", index=False)

# --- (C) ProM: full timestamps + lifecycle (kept for correctness & import) ---
prom_rows = []
for _, r in ev.iterrows():
    s, e = start_end_datetimes(r)  # full date+time to handle midnight
    prom_rows.append({
        "case_id":              r["CaseID_text"],          # text case id
        "activity":             r["activity_name"],
        "time:timestamp":       s.strftime("%Y-%m-%d %H:%M:%S"),
        "lifecycle:transition": "start",
        "cluster":              r["Cluster"],
    })
    prom_rows.append({
        "case_id":              r["CaseID_text"],
        "activity":             r["activity_name"],
        "time:timestamp":       e.strftime("%Y-%m-%d %H:%M:%S"),
        "lifecycle:transition": "complete",
        "cluster":              r["Cluster"],
    })
pd.DataFrame(prom_rows).to_csv("log_prom_lifecycle_with_cluster.csv", index=False)

print("Saved:")
print(" - log_intervals_disco_with_cluster_TIMEONLY.csv")
print(" - log_intervals_celonis_with_cluster_TIMEONLY.csv")
print(" - log_prom_lifecycle_with_cluster.csv")


Saved:
 - log_intervals_disco_with_cluster_TIMEONLY.csv
 - log_intervals_celonis_with_cluster_TIMEONLY.csv
 - log_prom_lifecycle_with_cluster.csv
