# 1. Dataset Construction and Preprocessing

This notebook constructs the master Human Activity Recognition (HAR) dataset from the raw PAMAP2 `.dat` files, performing initial cleaning and organization for downstream analysis.

- **Data Source:** Raw sensor data from 9 subjects performing various activities.
- **Processing Steps:** Parse individual subject files, remove transient activities, ensure data types, and merge into a master CSV.
- **Outputs:** Per-subject CSVs, master dataset, schema JSON, and processing report.
- **Purpose:** Prepare clean, structured data for exploratory data analysis and modeling in subsequent notebooks.

In [None]:
# Imports & path setup
import os, re, json
from pathlib import Path
import pandas as pd
import numpy as np

# Machine-specific default locations. Set the HAR_RAW_DIR / HAR_PROCESSED_DIR
# environment variables to run the notebook on another machine without
# editing this cell; when they are unset the original paths are used.
RAW_DIR = Path(os.environ.get(
    'HAR_RAW_DIR',
    'C:\\Users\\ASUS\\Desktop\\AAAA\\final_project\\data\\raw',
))
PROCESSED_DIR = Path(os.environ.get(
    'HAR_PROCESSED_DIR',
    'C:\\Users\\ASUS\\Desktop\\AAAA\\final_project\\data\\processed',
))
# Create the output folder up front so later writes cannot fail on a missing directory.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print('Raw exists:', RAW_DIR.exists(), 'Processed exists:', PROCESSED_DIR.exists())

Raw exists: True Processed exists: True


In [None]:
# Column names & activity mapping
# Define the expected column structure for PAMAP2 data files
def get_column_names():
    """Build the ordered list of 54 column labels for a PAMAP2 data file.

    The first three labels are the global columns (timestamp, activity ID,
    heart rate). They are followed by 17 IMU channel labels repeated for
    each sensor placement, each prefixed with the placement name.
    """
    header = ['timestamp', 'activity_id', 'heart_rate']
    placements = ('hand', 'chest', 'ankle')
    channels = (
        'imu_temp',
        'acc_x_16g', 'acc_y_16g', 'acc_z_16g',
        'acc_x_6g', 'acc_y_6g', 'acc_z_6g',
        'gyro_x', 'gyro_y', 'gyro_z',
        'mag_x', 'mag_y', 'mag_z',
        'ori_w', 'ori_x', 'ori_y', 'ori_z',
    )
    # Placement is the outer loop, so each sensor's channels stay contiguous.
    names = header + [
        f'{place}_{chan}' for place in placements for chan in channels
    ]
    # Sanity check: 3 global columns + 3 placements * 17 channels.
    assert len(names) == 54, f'Expected 54 columns, got {len(names)}'
    return names

# Mapping of activity IDs to descriptive names.
# The IDs are not contiguous (8, 14, 15 and 21-23 have no entry), so labels
# must be looked up by key rather than by position.
ACTIVITY_MAP = {
    0: 'Other (transient)',
    1: 'Lying',
    2: 'Sitting',
    3: 'Standing',
    4: 'Walking',
    5: 'Running',
    6: 'Cycling',
    7: 'Nordic walking',
    9: 'Watching TV',
    10: 'Computer work',
    11: 'Car driving',
    12: 'Ascending stairs',
    13: 'Descending stairs',
    16: 'Vacuum cleaning',
    17: 'Ironing',
    18: 'Folding laundry',
    19: 'House cleaning',
    20: 'Playing soccer',
    24: 'Rope jumping'
}
# Activity ID that marks transient / unlabeled samples; rows carrying it are
# dropped in parse_subject_dat.
TRANSIENT_ID = 0

## 1.2 Data Schema and Activity Mapping

This section defines the 54-column structure of the PAMAP2 sensor data and maps activity IDs to human-readable labels. Besides the timestamp, activity ID, and heart rate, each of the three IMUs (hand, chest, ankle) contributes 17 channels: temperature, two 3-axis accelerometers (16g and 6g), a 3-axis gyroscope, a 3-axis magnetometer, and a 4-component orientation.

In [None]:
# Parser for a single subject .dat file
def parse_subject_dat(file_path: Path) -> pd.DataFrame:
    """Load one subject's raw .dat file into a cleaned DataFrame.

    The file is read as headerless, whitespace-separated values and labelled
    with ``get_column_names()``. A ``subject_id`` column is added from the
    digits in the file name, rows whose ``activity_id`` equals
    ``TRANSIENT_ID`` are dropped, every column except ``activity_id`` and
    ``subject_id`` is coerced to numeric (unparseable values become NaN),
    and the rows are ordered by ``timestamp``.

    Args:
        file_path: Path to a raw file named like ``subject101.dat``.

    Returns:
        The cleaned frame. ``subject_id`` is ``None`` for every row when the
        file name contains no ``subject<digits>`` part.

    Raises:
        ValueError: If the number of columns in the file differs from the
            number of names returned by ``get_column_names()`` (raised by
            the column assignment).
    """
    # The default C parser handles the special-cased r'\s+' separator itself;
    # forcing engine='python' only made these large files slower to load.
    df = pd.read_csv(file_path, sep=r'\s+', header=None)
    df.columns = get_column_names()

    # Extract subject ID from filename (e.g., subject101.dat -> 101)
    m = re.search(r'subject(\d+)', file_path.name)
    subject_id = int(m.group(1)) if m else None
    df['subject_id'] = subject_id

    # Remove transient / unlabeled activities (activity_id == 0)
    df = df[df['activity_id'] != TRANSIENT_ID].copy()

    # Ensure numeric types for sensor columns, coerce errors to NaN
    num_cols = [c for c in df.columns if c not in ('activity_id', 'subject_id')]
    df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

    # Stable sort: rows that share a timestamp keep their original file
    # order instead of being shuffled by the default quicksort.
    df = df.sort_values('timestamp', kind='mergesort')
    return df

## 1.3 Subject Data Parsing

This section defines a function to parse individual subject .dat files into clean DataFrames. It handles column assignment, subject ID extraction, transient activity removal, data type conversion, and timestamp sorting.

In [None]:
# Process all raw files and write per-subject + master CSV
def process_all_raw() -> pd.DataFrame | None:
    """Parse every ``subject*.dat`` file in RAW_DIR and write the CSV outputs.

    Each parsed subject is saved as ``subject_<id>.csv`` in PROCESSED_DIR.
    All subjects are then concatenated (with a fresh index) and saved as
    ``dataset_master.csv`` in the same directory. Progress is printed for
    every file written.

    Returns:
        The concatenated master frame, or ``None`` when no usable raw file
        was found.
    """
    # Find all subject .dat files in the raw directory
    files = sorted(RAW_DIR.glob('subject*.dat'))
    if not files:
        print('No .dat files found in', RAW_DIR)
        return None

    # The glob also matches names without a numeric ID (e.g. subject_notes.dat).
    # Resolve the ID once, before parsing, instead of failing with an
    # AttributeError on a missing regex match after the file was already read.
    id_pattern = re.compile(r'subject(\d+)')

    per_subject = []
    for f in files:
        id_match = id_pattern.search(f.name)
        if id_match is None:
            print('Skipping', f, '(no subject ID in file name)')
            continue
        sid = int(id_match.group(1))

        # Parse the subject's data and save its individual CSV
        df = parse_subject_dat(f)
        out_path = PROCESSED_DIR / f'subject_{sid}.csv'
        df.to_csv(out_path, index=False)
        print('Wrote', out_path, 'rows:', len(df))
        per_subject.append(df)

    if not per_subject:
        # pd.concat raises on an empty list; treat it like the no-files case.
        print('No subject files with a numeric ID found in', RAW_DIR)
        return None

    # Concatenate all subjects into master dataset
    master = pd.concat(per_subject, ignore_index=True)
    master_out = PROCESSED_DIR / 'dataset_master.csv'
    # Save master CSV
    master.to_csv(master_out, index=False)
    print('Wrote', master_out, 'rows:', len(master))
    return master

# Execute the processing; master_df is None when process_all_raw found
# nothing to process, so later cells must check for that.
master_df = process_all_raw()

Wrote C:\Users\ASUS\Desktop\AAAA\final_project\data\processed\subject_101.csv rows: 249957
Wrote C:\Users\ASUS\Desktop\AAAA\final_project\data\processed\subject_102.csv rows: 263349
Wrote C:\Users\ASUS\Desktop\AAAA\final_project\data\processed\subject_103.csv rows: 174338
Wrote C:\Users\ASUS\Desktop\AAAA\final_project\data\processed\subject_104.csv rows: 231421
Wrote C:\Users\ASUS\Desktop\AAAA\final_project\data\processed\subject_105.csv rows: 272442
Wrote C:\Users\ASUS\Desktop\AAAA\final_project\data\processed\subject_106.csv rows: 250096
Wrote C:\Users\ASUS\Desktop\AAAA\final_project\data\processed\subject_107.csv rows: 232776
Wrote C:\Users\ASUS\Desktop\AAAA\final_project\data\processed\subject_108.csv rows: 262102
Wrote C:\Users\ASUS\Desktop\AAAA\final_project\data\processed\subject_109.csv rows: 6391
Wrote C:\Users\ASUS\Desktop\AAAA\final_project\data\processed\dataset_master.csv rows: 1942872


## 1.4 Batch Processing and Master Dataset Creation

This section processes all raw .dat files in the raw directory, parses each subject's data, saves individual CSVs, and concatenates them into a master dataset for analysis.

In [None]:
# Save schema.json and a simple processing report
def save_schema_and_report(df: pd.DataFrame) -> None:
    """Write ``schema.json`` and ``processing_report.md`` into PROCESSED_DIR.

    The schema records the target, subject and timestamp column names, the
    dtype of every column in ``df``, and ACTIVITY_MAP. The report lists the
    total row count, the row count per subject, and every column that
    contains missing values together with its NaN count.

    Args:
        df: The master dataset; it must contain a ``subject_id`` column.
    """
    # Create schema dictionary with dataset metadata
    schema = {
        'target': 'activity_id',
        'subject_col': 'subject_id',
        'timestamp_col': 'timestamp',
        'columns': [{'name': c, 'dtype': str(df[c].dtype)} for c in df.columns],
        # JSON object keys are strings, so the integer activity IDs are
        # written as "0", "1", ... and must be converted back when loading.
        'activity_map': ACTIVITY_MAP
    }
    # Explicit UTF-8 so the output does not depend on the platform's locale
    # encoding (which is not UTF-8 by default on Windows).
    with open(PROCESSED_DIR / 'schema.json', 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2)

    # Generate processing report
    report_lines = [
        '# Processing Report',
        f'Total rows: {len(df)}',
        'Rows per subject:',
    ]
    # groupby sorts by key, so subjects are listed in ascending ID order.
    rows_per_subject = df.groupby('subject_id').size()
    report_lines.extend(
        f'- subject {sid}: {cnt}' for sid, cnt in rows_per_subject.items()
    )

    # Report missing values, worst columns first
    na_counts = df.isna().sum()
    top_na = na_counts[na_counts > 0].sort_values(ascending=False)
    report_lines.append('Missing values (non-zero):')
    report_lines.extend(f'- {name}: {cnt}' for name, cnt in top_na.items())

    # Save report as Markdown
    with open(PROCESSED_DIR / 'processing_report.md', 'w', encoding='utf-8') as f:
        f.write('\n'.join(report_lines))
    print('Saved schema.json and processing_report.md')

# Execute only if master_df was created; process_all_raw returns None when
# there was nothing to process, and the report needs a real DataFrame.
if master_df is not None:
    save_schema_and_report(master_df)

Saved schema.json and processing_report.md


## 1.5 Schema and Report Generation

This section generates two metadata files: a JSON schema describing the dataset structure and the activity mapping, and a Markdown report summarizing processing statistics such as row counts per subject and missing-value counts.

## 1.6 Next Steps

- **EDA and Modeling:** Use the merged `dataset_master.csv` for exploratory data analysis and model training in Notebook 2.
- **Subject Splits:** Respect the fixed subject-based train/validation/test splits to prevent data leakage.
- **Advanced Processing:** Implement within-subject imputations, activity restriction, and window-level feature engineering in subsequent notebooks for robust HAR modeling.