# Multi-Sensor Time Alignment

**Objective**: Align disparate sensor streams onto a common hourly time grid for Tier 1 participants.

**Input**: Cleaned sensor data (`data/processed/cleaned/`)
**Output**: Aligned hourly feature matrix (`data/processed/aligned/`)

**Steps**:
1. Define the study period (Spring 2013: 2013-03-27 to 2013-06-05)
2. Load cleaned data for each participant
3. Resample each sensor to hourly resolution
4. Merge into single dataframe
5. Visualize data coverage

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
from datetime import datetime

# Add src to path
sys.path.append(str(Path('../../').resolve()))

from src.data.alignment import align_participant

# Use seaborn's whitegrid style for the plots in this notebook
sns.set_style('whitegrid')

# Input folder (cleaned per-sensor CSVs) and output folder (aligned CSVs);
# the output folder is created up front so later saves cannot fail on it.
CLEANED_PATH = Path('../../data/processed/cleaned')
ALIGNED_PATH = Path('../../data/processed/aligned')
ALIGNED_PATH.mkdir(parents=True, exist_ok=True)

# Study window (Spring 2013) handed to align_participant
START_DATE = datetime(year=2013, month=3, day=27)
END_DATE = datetime(year=2013, month=6, day=5)

# Keep only the participants whose quality tier is "Tier 1: Excellent"
tiers_df = pd.read_csv('../../data/processed/participant_tiers.csv')
is_tier1 = tiers_df['quality_tier'] == 'Tier 1: Excellent'
tier1_participants = tiers_df.loc[is_tier1, 'participant'].tolist()

print(f"Processing {len(tier1_participants)} Tier 1 participants")

## Alignment Loop

In [None]:
# Sensor streams to look for; each has its own sub-folder under CLEANED_PATH
sensors = [
    'activity', 'conversation', 'gps', 'bluetooth', 'wifi',
    'dark', 'phonelock', 'phonecharge', 'audio', 'wifi_location',
]

for pid in tier1_participants:
    print(f"Aligning {pid}...")

    # Gather whichever cleaned sensor files exist for this participant.
    # A file that exists but cannot be read is reported and skipped so the
    # remaining sensors can still be aligned.
    sensor_dfs = {}
    for sensor in sensors:
        csv_path = CLEANED_PATH / sensor / f"{sensor}_{pid}.csv"
        if not csv_path.exists():
            continue
        try:
            sensor_dfs[sensor] = pd.read_csv(csv_path)
        except Exception as err:
            print(f"  Error loading {sensor}: {err}")

    # Nothing to align: move on without writing an output file
    if not sensor_dfs:
        print(f"  No data found for {pid}")
        continue

    # Align the loaded sensors over the study window
    aligned_df = align_participant(pid, sensor_dfs, START_DATE, END_DATE)

    # Write one aligned CSV per participant (the index is saved as column 0)
    aligned_df.to_csv(ALIGNED_PATH / f"aligned_{pid}.csv")
    print(f"  Saved {len(aligned_df)} rows, {len(aligned_df.columns)} columns")

## Validation & Visualization

In [None]:
# Load one aligned file to inspect.
# The alignment loop skips participants that have no cleaned sensor data, so
# the first Tier 1 participant is not guaranteed to have an aligned file (and
# the Tier 1 list itself may be empty). Use the first participant whose
# output actually exists, and fail with an explicit message otherwise.
example_pid = next(
    (
        candidate
        for candidate in tier1_participants
        if (ALIGNED_PATH / f"aligned_{candidate}.csv").exists()
    ),
    None,
)
if example_pid is None:
    raise FileNotFoundError(
        f"No aligned participant files found in {ALIGNED_PATH}; "
        "run the alignment loop first"
    )

# Column 0 is the index written by to_csv; parse_dates converts it back to
# datetimes where possible.
df_aligned = pd.read_csv(ALIGNED_PATH / f"aligned_{example_pid}.csv", index_col=0, parse_dates=True)

print("Columns:", df_aligned.columns.tolist())
print("\nHead:")
display(df_aligned.head())

# Visualize Missing Data (Heatmap)
# The null mask is transposed so each feature is a row and time runs along x;
# cells where isnull() is True mark missing values.
plt.figure(figsize=(15, 8))
sns.heatmap(df_aligned.isnull().T, cbar=False, cmap='viridis')
plt.title(f'Missing Data Heatmap - {example_pid}')
plt.xlabel('Time')
plt.tight_layout()
plt.show()

In [None]:
# Plot Activity vs Conversation
# NOTE(review): these column names are assumed to be what align_participant
# emits for the activity and conversation sensors - confirm against
# src.data.alignment. A participant without one of those sensor files may
# lack the column, so plot what is present instead of raising KeyError.
series_to_plot = [
    ('activity_active_minutes', 'Active Min'),
    ('conversation_minutes', 'Conversation Min'),
]

plt.figure(figsize=(15, 5))
plotted_any = False
for column, label in series_to_plot:
    if column not in df_aligned.columns:
        print(f"Column '{column}' not found for {example_pid}; skipping")
        continue
    plt.plot(df_aligned.index, df_aligned[column], label=label, alpha=0.7)
    plotted_any = True

# legend() warns when there are no labelled artists, so only call it if
# at least one series was drawn
if plotted_any:
    plt.legend()
plt.title(f'Activity vs Conversation - {example_pid}')
plt.ylabel('Minutes per Hour')
plt.tight_layout()
plt.show()