In [None]:

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

## TODO:
- Get locations separately for Bing vs. home
- Remove house, home
- Return >1 activity per response (maybe look at probabilities)
- Differentiate egocentric vs. other activity
- Check that the activities are not flickering back and forth
- Maybe add an "Other" option to scan through what is being missed
- Compare the unconstrained results to the SAYCam list

In [None]:
# Relative directory (resolved against the notebook's working directory) that is
# created up front; exist_ok=True makes re-running this cell a no-op.
# NOTE(review): no visible cell writes into output_dir — confirm its intended use.
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Directory containing the activity/location CSV files to analyse.
csv_dir = '/ccn2/dataset/babyview/outputs_20250312/activities/videollama3_constrained'
# os.listdir returns entries in arbitrary order; sorting makes the order in which
# files are later read (and hence the row order of the combined frame) reproducible.
csv_files = sorted(f for f in os.listdir(csv_dir) if f.endswith('.csv'))
print(f'Found {len(csv_files)} CSV files in {csv_dir}')

In [None]:
# Read every non-Bing CSV, keep those that carry both label columns, concatenate
# them into a single DataFrame, then extract the Location and Activity series.
dfs = []
non_bing_counts = 0
skipped_missing_cols = []  # non-Bing files lacking 'Location' and/or 'Activity'
for csv_file in tqdm(csv_files, desc="Processing CSV files"):
    # Skip Bing files (file names starting with '01' or '02')
    if csv_file.startswith(('01', '02')):
        continue
    non_bing_counts += 1
    file_path = os.path.join(csv_dir, csv_file)
    df = pd.read_csv(file_path)
    # Only include files with both columns
    if 'Location' in df.columns and 'Activity' in df.columns:
        dfs.append(df)
    else:
        skipped_missing_cols.append(csv_file)

# pd.concat on an empty list fails with an opaque "No objects to concatenate";
# raise the same exception type with a message that says what is actually wrong.
if not dfs:
    raise ValueError(
        f"No non-Bing CSV with both 'Location' and 'Activity' columns found in {csv_dir}"
    )

big_df = pd.concat(dfs, ignore_index=True)
print('Non-Bing CSV files:', non_bing_counts)
if skipped_missing_cols:
    # non_bing_counts includes these files, so report the discrepancy explicitly
    print(f'Skipped {len(skipped_missing_cols)} non-Bing files missing Location/Activity columns.')
print(f'Combined DataFrame has {len(big_df)} rows.')

# Extract and convert to strings.
# Note: astype(str) turns missing values into the literal string 'nan', so they
# show up as their own category in any value_counts() taken on these series.
location_series = big_df['Location'].astype(str)
activity_series = big_df['Activity'].astype(str)

## Separately: Locations, Activities

In [None]:
# Bar chart of how many rows carry each distinct location label.
location_counts = location_series.value_counts()

fig_loc, ax_loc = plt.subplots(figsize=(10, 5))
ax_loc.bar(location_counts.index, location_counts.values, color='skyblue')
# Tilt the category labels so long names do not overlap
plt.setp(ax_loc.get_xticklabels(), rotation=45)
ax_loc.set_xlabel("Location")
ax_loc.set_ylabel("Count")
ax_loc.set_title("Counts of Each Unique Location")
fig_loc.tight_layout()
plt.show()

In [None]:
# Bar chart of how many rows carry each distinct activity label.
activity_counts = activity_series.value_counts()
plt.figure(figsize=(10, 5))
plt.bar(activity_counts.index, activity_counts.values, color='skyblue')
plt.xticks(rotation=45)
# The x-axis shows activity labels here (it was mislabelled "Location",
# copied from the location chart).
plt.xlabel("Activity")
plt.ylabel("Count")
plt.title("Counts of Each Unique Activity")
plt.tight_layout()
plt.show()

## Locations + Activities

In [None]:
# Build the Location × Activity contingency table and show it as a count heatmap.
df = big_df.copy()
df = df.dropna(subset=["Location", "Activity"])
for col in ["Location", "Activity"]:
    # Missing labels were dropped above, so after astype(str) there is nothing
    # left to fill: the former trailing .fillna("Unknown") could never fire.
    df[col] = df[col].astype(str).str.strip()

# contingency table: rows = locations, columns = activities, cells = row counts
ct = pd.crosstab(df["Location"], df["Activity"])

# (optional) keep top-N by totals so the plot is readable
top_n = 20
top_rows = ct.sum(1).nlargest(top_n).index
top_cols = ct.sum(0).nlargest(top_n).index
ct = ct.loc[top_rows, top_cols]

# Figure grows with the number of columns/rows, with a minimum of 8 × 6 inches
fig, ax = plt.subplots(figsize=(max(8, 0.55*len(ct.columns)), max(6, 0.45*len(ct))))
im = ax.imshow(ct.values, aspect="auto")
ax.set_xticks(np.arange(ct.shape[1])); ax.set_xticklabels(ct.columns, rotation=45, ha="right")
ax.set_yticks(np.arange(ct.shape[0])); ax.set_yticklabels(ct.index)
ax.set_xlabel("Activity"); ax.set_ylabel("Location"); ax.set_title("Location × Activity (counts)")
plt.colorbar(im, ax=ax, label="Count")
plt.tight_layout()

### 2) Row-normalized (“distribution of activities within each location”)

In [None]:
# Divide each location's row by its total so the row sums to 1. A zero total is
# swapped for 1 so an all-zero row stays all-zero instead of becoming NaN.
row_totals = ct.sum(axis=1)
row_share = ct.div(row_totals.replace(0, 1), axis=0)

n_locations, n_activities = row_share.shape
fig, ax = plt.subplots(
    figsize=(max(8, 0.55 * n_activities), max(6, 0.45 * n_locations))
)
# Shares are bounded, so pin the colour scale to the full 0–1 range
im = ax.imshow(row_share.values, aspect="auto", vmin=0, vmax=1)

ax.set_xticks(np.arange(n_activities))
ax.set_xticklabels(row_share.columns, rotation=45, ha="right")
ax.set_yticks(np.arange(n_locations))
ax.set_yticklabels(row_share.index)

ax.set_xlabel("Activity")
ax.set_ylabel("Location")
ax.set_title("Activity share within each Location")
fig.colorbar(im, ax=ax, label="Share (0–1)")
fig.tight_layout()


### 3) “Surprise” map (which pairs are over/under-represented given totals)

This controls for marginals and highlights associations, not just frequency.

In [None]:
# Pearson residuals of the Location × Activity table: (observed − expected) / √expected,
# where "expected" is the count implied by the row and column totals alone.
obs = ct.values
row_tot = obs.sum(axis=1, keepdims=True)
col_tot = obs.sum(axis=0, keepdims=True)
grand = obs.sum()
# Outer product of the marginals over the grand total; max(…, 1) avoids 0/0 on an
# all-zero table (every product is 0 in that case, so the result is simply 0).
expected = row_tot @ col_tot / max(grand, 1)
# Floor the denominator so cells with zero expectation yield 0 rather than NaN
std_resid = (obs - expected) / np.sqrt(np.maximum(expected, 1e-9))

# Residuals are signed, so use colour limits symmetric about 0 with a diverging
# colormap: over- and under-representation get distinct hues and 0 sits at the
# neutral midpoint.
resid_lim = float(np.abs(std_resid).max()) if std_resid.size else 0.0
if not np.isfinite(resid_lim) or resid_lim == 0:
    resid_lim = 1.0

fig, ax = plt.subplots(figsize=(max(8, 0.55*ct.shape[1]), max(6, 0.45*ct.shape[0])))
im = ax.imshow(std_resid, aspect="auto", cmap="RdBu_r", vmin=-resid_lim, vmax=resid_lim)
ax.set_xticks(np.arange(ct.shape[1])); ax.set_xticklabels(ct.columns, rotation=45, ha="right")
ax.set_yticks(np.arange(ct.shape[0])); ax.set_yticklabels(ct.index)
ax.set_xlabel("Activity"); ax.set_ylabel("Location"); ax.set_title("Standardized residuals (obs − exp) / √exp")
plt.colorbar(im, ax=ax, label="Std residual (±)")
plt.tight_layout()


### 4) Alternative view (stacked bars)

Good when you have many activities but want per-location distributions.

In [None]:
# One stacked bar per location; segment heights are the activity shares of that row.
bar_ax = row_share.plot.bar(
    stacked=True,
    figsize=(max(8, 0.7 * row_share.shape[0]), 6),
)
bar_ax.set_ylabel("Share")
bar_ax.set_title("Activities within each Location")
# Anchor the legend just outside the right edge so it does not cover the bars
bar_ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
bar_ax.figure.tight_layout()


## IMU + (Locations, Activities)

In [None]:
# Load every IMU CSV from the clips directory into one long DataFrame.
overall_imu_dir = '/ccn2/dataset/babyview/outputs_20250312/imu/10s_clips/'
# os.listdir order is arbitrary; sort so the concatenated row order is reproducible.
imu_csv_files = sorted(f for f in os.listdir(overall_imu_dir) if f.endswith('.csv'))

# pd.concat on an empty list fails with an opaque "No objects to concatenate";
# raise the same exception type with a message naming the directory.
if not imu_csv_files:
    raise ValueError(f"No IMU CSV files found in {overall_imu_dir}")

dfs_imu = []
for f in tqdm(imu_csv_files, desc="Processing IMU CSV files"):
    df_imu = pd.read_csv(os.path.join(overall_imu_dir, f))
    dfs_imu.append(df_imu)
imu_df = pd.concat(dfs_imu, ignore_index=True)
print(f"Concatenated {len(imu_csv_files)} files into a DataFrame with {len(imu_df)} rows.")

In [None]:
# Join the activity/location rows (big_df) with the IMU rows (imu_df) on video_id.
# how="inner" keeps only video_ids present in BOTH tables, so unmatched rows are
# dropped from either side. Clashing column names from imu_df get an "_imu" suffix.

# A join on video_id alone pairs every big_df row of a video with every imu_df row
# of that video. If the key repeats on both sides the result is a per-video
# Cartesian product, which silently inflates row counts — surface that up front.
# NOTE(review): if both tables are per-clip, a finer key (clip/time index) is
# probably needed — confirm against the CSV schemas.
if big_df["video_id"].duplicated().any() and imu_df["video_id"].duplicated().any():
    print('WARNING: video_id repeats in both tables; the merge below is many-to-many.')

merged_df = big_df.merge(imu_df, on="video_id", how="inner", suffixes=("", "_imu"))
print('Merged DataFrame has', len(merged_df), 'rows after inner join on video_id.')

# Drops a row if ANY column is NaN, including columns unrelated to the IMU analysis.
merged_df = merged_df.dropna(how="any")
print('Merged DataFrame has', len(merged_df), 'rows after dropping rows with any NaN values.')

In [None]:
# Derive gravity-relative motion features from the raw accelerometer, gravity and
# gyroscope channels of merged_df. Results are written as new columns on `df`.
df = merged_df.copy()

acc_cols  = ["ACCL_X (m/s²)","ACCL_Y (m/s²)","ACCL_Z (m/s²)"]
grav_cols = ["GRAV_X (m/s²)","GRAV_Y (m/s²)","GRAV_Z (m/s²)"]

# Non-numeric entries become NaN instead of raising
df[acc_cols+grav_cols] = df[acc_cols+grav_cols].apply(pd.to_numeric, errors="coerce")

A = df[acc_cols].to_numpy()
G = df[grav_cols].to_numpy()

# Detect GRAV_* scale from the median vector length (≈1 for unit vectors, ≈9.81 for m/s²).
g_norm = np.nanmedian(np.linalg.norm(G, axis=1))
# A NaN or zero median would otherwise fall into the fallback branch below and
# turn every derived feature into NaN/inf without any error.
if not np.isfinite(g_norm) or g_norm <= 0:
    raise ValueError(f"Cannot infer GRAV_* scale: median gravity norm is {g_norm!r}")
if 0.5 < g_norm < 2:        # GRAV_* is ~unit vector: scale to m/s²
    grav_scale = 9.81
elif 8 < g_norm < 11:       # already in m/s²
    grav_scale = 1.0
else:                        # fallback: rescale so ||G|| ≈ 9.81
    grav_scale = 9.81 / g_norm

G_mps2 = G * grav_scale

# Linear (gravity-removed) acceleration per axis
LIN = A - G_mps2
df["LIN_X (m/s²)"], df["LIN_Y (m/s²)"], df["LIN_Z (m/s²)"] = LIN.T

# Magnitudes and decomposition relative to gravity
lin_norm = np.linalg.norm(LIN, axis=1)
# Unit gravity direction per row; the epsilon avoids division by zero for null vectors
g_hat = G_mps2 / (np.linalg.norm(G_mps2, axis=1, keepdims=True) + 1e-9)

df["lin_norm"] = lin_norm                                   # orientation-invariant intensity
df["lin_parallel_g"]  = (LIN * g_hat).sum(1)               # up/down component
df["lin_perp_g_norm"] = np.linalg.norm(LIN - df["lin_parallel_g"].to_numpy()[:,None]*g_hat, axis=1)  # horizontal
# Tilt: angle between the gravity direction and the device Y axis.
# Assumes gravity lies along the Y axis when the device is upright — TODO confirm
# against the sensor's axis convention.
# Because of the abs(), the sign of g_hat's Y component is discarded: the result
# lies in [0°, 90°] and upright and fully inverted both map to 0°.
df["tilt_deg"] = np.degrees(np.arccos(np.clip(np.abs(-g_hat[:,1]), 0, 1)))

# --- Gyroscope features (rotational motion) ---
gyro_cols = ["GYRO_X (rad/s)", "GYRO_Y (rad/s)", "GYRO_Z (rad/s)"]
df[gyro_cols] = df[gyro_cols].apply(pd.to_numeric, errors="coerce")
W = df[gyro_cols].to_numpy()

# Total rotational speed
df["gyro_norm"] = np.linalg.norm(W, axis=1)  # rad/s

# Decompose relative to gravity direction g_hat:
#   parallel  ~ yaw rate (rotation around vertical/gravity axis)
#   perpendicular magnitude ~ pitch/roll rate combined
gyro_parallel = (W * g_hat).sum(axis=1)                       # signed yaw rate (rad/s)
gyro_perp     = W - gyro_parallel[:, None] * g_hat
df["gyro_parallel_g"]   = gyro_parallel                       # rad/s
df["gyro_perp_g_norm"]  = np.linalg.norm(gyro_perp, axis=1)   # rad/s



In [None]:
def violin_by_signed(data: pd.DataFrame, cat: str, value: str, *,
                     top:int=20, lower_q:float=0.005, upper_q:float=0.995,
                     order_by:str="abs_median", save:bool=False,
                     out_dir:str="."):
    """
    Make a single violin plot of a signed IMU component grouped by a category.

    Parameters
    ----------
    data : DataFrame containing at least the columns `cat` and `value`.
    cat : name of the grouping column (one violin per category).
    value : name of the numeric column whose distribution is plotted.
    top : keep only the `top` most frequent categories.
    lower_q, upper_q : quantiles (computed over all kept rows together) to which
        `value` is clipped before plotting.
    order_by : 'abs_median' (magnitude-focused) or 'median' (signed); any other
        string is treated as 'median'. Violins are sorted in descending order.
    save : if True, also write the figure as "<value>_by_<cat>.png" at 150 dpi,
        with "/" and spaces in `value` replaced to keep the file name valid.
    out_dir : directory the PNG is written to when `save` is True; created if
        missing. Defaults to the current working directory.

    Returns
    -------
    (fig, ax) : the matplotlib Figure and Axes that were drawn on.
    """
    tmp = data.dropna(subset=[cat, value])

    # Keep top categories by frequency so the figure stays readable
    keep = tmp[cat].value_counts().nlargest(top).index
    # Copy AFTER filtering: the clip below assigns a column, and assigning into a
    # boolean-filtered slice triggers pandas' SettingWithCopyWarning.
    tmp = tmp[tmp[cat].isin(keep)].copy()

    # Winsorize tails symmetrically (preserve sign, just trim extremes)
    q_lo = tmp[value].quantile(lower_q)
    q_hi = tmp[value].quantile(upper_q)
    tmp[value] = tmp[value].clip(q_lo, q_hi)

    # One grouped view serves both the ordering and the summary overlay
    g = tmp.groupby(cat)[value]

    # Choose ordering
    if order_by == "abs_median":
        ranking = g.apply(lambda s: s.abs().median())
    else:  # 'median'
        ranking = g.median()
    order = ranking.sort_values(ascending=False).index.tolist()

    # Prepare data arrays, one per category, in plotting order
    data_arrays = [tmp.loc[tmp[cat] == c, value].to_numpy() for c in order]

    # Plot
    fig, ax = plt.subplots(figsize=(max(8, 0.55*len(order)), 5))
    _ = ax.violinplot(data_arrays, showextrema=False)

    # Overlay robust summary: median (•) and IQR (┃)
    med = g.median().reindex(order).to_numpy()
    q1  = g.quantile(0.25).reindex(order).to_numpy()
    q3  = g.quantile(0.75).reindex(order).to_numpy()
    x   = np.arange(1, len(order)+1)   # positions 1..N match the violins drawn above
    ax.scatter(x, med, s=18, zorder=3)
    ax.vlines(x, q1, q3, linewidth=3, alpha=0.9, zorder=2)

    ax.axhline(0, linestyle="--", linewidth=1, alpha=0.7)  # zero reference
    ax.set_xticks(x); ax.set_xticklabels(order, rotation=45, ha="right")
    ax.set_ylabel(value)
    ax.set_title(f"{value} by {cat} (per row)")
    plt.tight_layout()

    if save:
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        safe_value = value.replace("/", "_per_").replace(" ", "_")
        # Save through the figure object so the right figure is written even if
        # another one has become "current" in the meantime.
        fig.savefig(os.path.join(out_dir, f"{safe_value}_by_{cat}.png"), dpi=150)

    return fig, ax

In [None]:
# Signed per-axis channels: ordered by absolute median so strong effects of either
# sign rank first.
imu_signed_cols = [
    "LIN_X (m/s²)", "LIN_Y (m/s²)", "LIN_Z (m/s²)",
    "GYRO_X (rad/s)", "GYRO_Y (rad/s)", "GYRO_Z (rad/s)",
]
# Derived summary features: ordered by plain median.
imu_summary_cols = ["gyro_norm", "lin_norm", "lin_parallel_g", "lin_perp_g_norm", "tilt_deg"]

# For every feature draw two figures: grouped by Activity, then by Location.
for ordering, columns in (("median", imu_summary_cols), ("abs_median", imu_signed_cols)):
    for col in columns:
        for grouping in ("Activity", "Location"):
            violin_by_signed(df, grouping, col, top=20, order_by=ordering)
