# Data Cleaning Pipeline Application

**Objective**: Apply the cleaning pipeline to all 10 sensors for Tier 1 participants.

**Steps**:
1. Load Tier 1 participants (from Phase 2)
2. For each sensor:
   - Validate timestamps
   - Validate values
   - Detect outliers
   - Handle missing data (currently a no-op: gaps in the raw sensor data are left unfilled)
3. Save cleaned data to `data/processed/cleaned/`
4. Generate cleaning report

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import os

# Add src to path
sys.path.append(str(Path('../../').resolve()))

from src.data.cleaning import (
    validate_timestamps, 
    validate_values, 
    detect_outliers, 
    handle_missing_data
)

# Use seaborn's grid theme for the figures drawn later in the notebook.
sns.set_style('whitegrid')

# Input and output locations, relative to the notebook's working directory.
RAW_PATH = Path('../../data/raw/dataset/sensing')
PROCESSED_PATH = Path('../../data/processed/cleaned')
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

# Keep only the participants whose quality tier is "Tier 1: Excellent".
tiers_df = pd.read_csv('../../data/processed/participant_tiers.csv')
is_tier1 = tiers_df['quality_tier'] == 'Tier 1: Excellent'
tier1_participants = tiers_df.loc[is_tier1, 'participant'].tolist()

print(f"Loaded {len(tier1_participants)} Tier 1 participants")

## Cleaning Pipeline Function

In [None]:
def clean_sensor_data(sensor_name, participant_id):
    """Run the cleaning pipeline for one sensor/participant pair.

    Loads the participant's raw CSV from ``RAW_PATH / sensor_name``, then
    applies timestamp validation, value validation and outlier removal
    (``detect_outliers`` with ``method='iqr'``). Missing-data handling is
    deliberately not applied: gaps in the raw sensor data are left as-is.

    Args:
        sensor_name: Name of the sensor sub-directory under ``RAW_PATH``.
        participant_id: Participant identifier, matched as a substring of
            the CSV file name.

    Returns:
        ``(df, report)``, where ``df`` is the cleaned DataFrame and ``report``
        is a dict of row and issue counts, or ``(None, None)`` when no CSV
        matches the participant. If the file has neither a ``timestamp`` nor
        a ``start_timestamp`` column, the data is returned unmodified and
        ``report['time_error']`` is set.
    """
    # 1. Load data. Glob order is filesystem-dependent, so sort the matches
    #    to make the chosen file deterministic when several match.
    files = sorted((RAW_PATH / sensor_name).glob(f"*{participant_id}*.csv"))
    if not files:
        return None, None

    df = pd.read_csv(files[0])
    original_len = len(df)
    report = {'sensor': sensor_name, 'participant': participant_id, 'original_rows': original_len}

    # 2. Validate timestamps ('timestamp' takes precedence over 'start_timestamp').
    time_col = next(
        (col for col in ('timestamp', 'start_timestamp') if col in df.columns),
        None,
    )
    if time_col is None:
        # Nothing was cleaned, but still emit the count keys so every report
        # row has the same schema when the reports are combined.
        report['time_error'] = 'No timestamp column'
        report['outliers_removed'] = 0
        report['final_rows'] = original_len
        report['rows_removed'] = 0
        return df, report

    df, time_issues = validate_timestamps(df, time_col)
    report.update(time_issues)

    # 3. Validate values; each issue count is stored under an 'invalid_' prefix.
    df, value_issues = validate_values(df, sensor_name)
    for issue_name, issue_count in value_issues.items():
        report[f'invalid_{issue_name}'] = issue_count

    # 4. Outlier detection, only for sensors with a designated numeric column.
    outlier_cols_by_sensor = {
        'conversation': ['duration'],
        'bluetooth': ['class_id'],  # Example
    }
    outliers_removed = 0
    for col in outlier_cols_by_sensor.get(sensor_name, []):
        if col not in df.columns:
            continue
        # NOTE(review): detect_outliers presumably returns index labels, since
        # the result is passed straight to DataFrame.drop — confirm.
        outlier_idx = detect_outliers(df, col, method='iqr')
        outliers_removed += len(outlier_idx)
        df = df.drop(outlier_idx)

    report['outliers_removed'] = outliers_removed

    # 5. Missing data: raw sensor gaps are not filled unless resampling, so
    #    the cleaned raw data is returned as-is.
    report['final_rows'] = len(df)
    report['rows_removed'] = original_len - len(df)

    return df, report

## Run Pipeline on All Sensors

In [None]:
sensors = [
    'activity', 'conversation', 'gps', 'bluetooth', 'wifi',
    'dark', 'phonelock', 'phonecharge', 'audio', 'wifi_location',
]
all_reports = []

for sensor in sensors:
    print(f"Processing {sensor}...")

    # Each sensor gets its own sub-directory under the cleaned-data root.
    sensor_out_path = PROCESSED_PATH / sensor
    sensor_out_path.mkdir(exist_ok=True)

    for pid in tier1_participants:
        df_clean, report = clean_sensor_data(sensor, pid)
        if df_clean is None:
            # No raw file for this sensor/participant: nothing to save or report.
            continue

        # Persist the cleaned frame and keep its report for the summary.
        out_file = sensor_out_path / f"{sensor}_{pid}.csv"
        df_clean.to_csv(out_file, index=False)
        all_reports.append(report)

# One row per cleaned sensor/participant file.
reports_df = pd.DataFrame(all_reports)
reports_df.to_csv('../../data/processed/cleaning_report.csv', index=False)
print("\nCleaning Complete!")

## Cleaning Summary Report

In [None]:
# Summarize issues by sensor
summary_cols = ['original_rows', 'rows_removed', 'future_timestamps', 'duplicates']

if reports_df.empty:
    # No file was cleaned, so the report frame has no 'sensor' column to
    # group by; produce an empty summary instead of raising KeyError.
    summary = pd.DataFrame(columns=summary_cols + ['pct_removed'])
    print("No cleaning reports available; nothing to summarize.")
else:
    # A report only carries the keys its code path produced (e.g. files
    # without a timestamp column never get 'rows_removed'). reindex adds any
    # absent column as NaN, which sum() treats as 0, instead of failing.
    per_report = reports_df.reindex(columns=['sensor'] + summary_cols)
    summary = per_report.groupby('sensor')[summary_cols].sum()
    summary['pct_removed'] = (summary['rows_removed'] / summary['original_rows']) * 100

    print("Cleaning Summary by Sensor:")
    print(summary.round(2))

    plt.figure(figsize=(12, 6))
    sns.barplot(x=summary.index, y=summary['pct_removed'])
    plt.title('Percentage of Data Removed per Sensor')
    plt.ylabel('% Removed')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()