# Participant Quality Analysis

Analyze data quality and coverage for each participant to define inclusion tiers.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json

# Use seaborn's white-grid theme for the figures drawn in this notebook.
sns.set_style('whitegrid')

# Root directory of the raw sensing data; each sensor type is looked up as a
# sub-directory of this path.
SENSING_PATH = Path('../../data/raw/dataset/sensing')

# The manifest supplies the participant identifiers and sensor types to
# analyse. Decode it explicitly as UTF-8 (the JSON interchange encoding)
# instead of relying on the platform's default text encoding.
with open('../../data/raw/sensing_manifest.json', encoding='utf-8') as f:
    manifest = json.load(f)

participants = manifest['participants']
sensors = manifest['sensor_types']

## 1. Coverage Matrix

In [None]:
# Create coverage matrix: one row per participant holding, for every sensor,
# the number of recorded rows and the span in days between the first and the
# last timestamp.
def _sensor_coverage(csv_path):
    """Return ``(rows, days)`` for a single sensor CSV file.

    ``rows`` is the number of data rows in the file. ``days`` is the number
    of whole days between the earliest and latest value of the ``timestamp``
    column, parsed as seconds since the Unix epoch. ``days`` is 0 when the
    file has no ``timestamp`` column or contains no usable timestamps.
    """
    df = pd.read_csv(csv_path)
    rows = len(df)
    if rows == 0 or 'timestamp' not in df.columns:
        return rows, 0

    # Parse once and reuse the result for both ends of the range.
    timestamps = pd.to_datetime(df['timestamp'], unit='s').dropna()
    if timestamps.empty:
        # max() - min() of an empty series is NaT, whose .days is NaN. A NaN
        # would be skipped by later row-wise min/mean and hide the gap, so
        # report an explicit zero instead.
        return rows, 0

    return rows, (timestamps.max() - timestamps.min()).days


coverage_data = []

for participant in participants:
    coverage = {'participant': participant}

    for sensor in sensors:
        sensor_dir = SENSING_PATH / sensor
        # Sort the matches so the same file is picked on every run; glob
        # order is otherwise arbitrary.
        # NOTE(review): only the first match is read, and the pattern is a
        # substring match on the participant id - confirm there is exactly
        # one file per participant per sensor and that no id is contained
        # in another.
        files = sorted(sensor_dir.glob(f"*{participant}*.csv"))

        if files:
            rows, days = _sensor_coverage(files[0])
        else:
            # No file for this participant/sensor pair: zero coverage.
            rows, days = 0, 0

        coverage[f'{sensor}_rows'] = rows
        coverage[f'{sensor}_days'] = days

    coverage_data.append(coverage)

coverage_df = pd.DataFrame(coverage_data)

# Make sure the output directory exists before writing the matrix.
output_path = Path('../../data/processed/participant_quality.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
coverage_df.to_csv(output_path, index=False)
print("Coverage matrix created")

## 2. Quality Tiers

In [None]:
# Summarise per-sensor coverage for each participant. Quality tiers are
# assigned from min_days (the sensor with the shortest span); avg_days is
# kept alongside it for reference.
# The derived columns also end in '_days', so exclude them explicitly:
# otherwise re-running this cell would fold the previous avg/min values
# into the new statistics.
derived_cols = ('avg_days', 'min_days')
day_cols = [
    c for c in coverage_df.columns
    if c.endswith('_days') and c not in derived_cols
]
coverage_df['avg_days'] = coverage_df[day_cols].mean(axis=1)
coverage_df['min_days'] = coverage_df[day_cols].min(axis=1)

def assign_tier(row):
    """Return the quality-tier label for one participant row.

    The label is chosen from ``row['min_days']``: thresholds are checked
    from highest to lowest and the first one met wins. Values below every
    threshold fall into the lowest tier.
    """
    shortest_span = row['min_days']

    # (minimum days required, tier label), strictest first.
    tier_floors = (
        (60, 'Tier 1: Excellent'),  # ~9 weeks minimum
        (45, 'Tier 2: Good'),       # ~6-7 weeks
        (30, 'Tier 3: Moderate'),   # ~4-5 weeks
    )

    for floor, label in tier_floors:
        if shortest_span >= floor:
            return label

    return 'Tier 4: Low Coverage'

# Label every participant with a tier, one row at a time.
tier_labels = coverage_df.apply(assign_tier, axis=1)
coverage_df['quality_tier'] = tier_labels

# Show how many participants fall into each tier.
tier_counts = coverage_df['quality_tier'].value_counts()
print(tier_counts)

# Persist only the participant id, the coverage summaries and the tier.
tier_columns = ['participant', 'avg_days', 'min_days', 'quality_tier']
tier_table = coverage_df[tier_columns]
tier_table.to_csv('../../data/processed/participant_tiers.csv', index=False)

print("\nParticipant tiers saved")

## 3. Visualizations

In [None]:
# Heatmap of coverage: one row per sensor, one column per participant,
# coloured by the number of days spanned by that sensor's data.
plt.figure(figsize=(14, 10))
# Keep only the per-sensor columns. The derived avg_days/min_days summaries
# also end in '_days' and would otherwise show up as extra "sensor" rows.
sensor_day_cols = [
    c for c in coverage_df.columns
    if c.endswith('_days') and c not in ('avg_days', 'min_days')
]
# Index by participant before transposing so the heatmap columns carry the
# participant identifiers.
heatmap_data = coverage_df.set_index('participant')[sensor_day_cols].T
sns.heatmap(heatmap_data, cmap='YlGnBu', cbar_kws={'label': 'Days of Data'})
plt.title('Data Coverage by Participant and Sensor')
plt.tight_layout()
plt.show()

# Bar chart of how many participants landed in each quality tier.
tier_counts = coverage_df['quality_tier'].value_counts()

plt.figure(figsize=(10, 6))
ax = tier_counts.plot(kind='bar')
ax.set_title('Participant Quality Tier Distribution')
ax.set_xlabel('Quality Tier')
ax.set_ylabel('Number of Participants')
# Tilt the tier labels so the long names do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()