In [1]:
pip install pandas numpy matplotlib seaborn scikit-learn requests tf2onnx skl2onnx onnx onnxmltools keras_tuner



In [21]:
import os
import pandas as pd
import numpy as np
from scipy import stats

# --- Configuration ---
# Input and output locations (relative to the current working directory).
RAW_DATA_DIR = 'data/raw/MHEALTHDATASET'
PROCESSED_DATA_DIR = 'data/processed'

# Windowing: 100 samples per window (2 seconds @ 50Hz),
# advancing half a window at a time (50% overlap).
WINDOW_SIZE = 100
STEP_SIZE = WINDOW_SIZE // 2

# Make sure the output directory exists before anything is written to it.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

def load_raw_data(raw_dir=None):
    """Load every available mHealth subject log and stack them into one DataFrame.

    Looks for ``mHealth_subject1.log`` through ``mHealth_subject10.log`` inside
    ``raw_dir``. Each file is read as tab-separated text without a header row,
    so the resulting columns are integer-named (0, 1, 2, ...).

    Args:
        raw_dir: Directory containing the subject logs. Defaults to the
            module-level ``RAW_DATA_DIR`` when omitted.

    Returns:
        pandas.DataFrame: Rows of all subject files that were found,
        concatenated in subject order with a fresh 0..N-1 index.

    Raises:
        FileNotFoundError: If none of the expected subject files exist.
    """
    if raw_dir is None:
        raw_dir = RAW_DATA_DIR

    subjects = [f'mHealth_subject{i}.log' for i in range(1, 11)]
    data_list = []
    missing = []

    print("Loading subject log files...")
    for filename in subjects:
        filepath = os.path.join(raw_dir, filename)
        if not os.path.exists(filepath):
            # Keep going with the subjects we do have, but remember the gap
            # so it can be reported instead of being silently ignored.
            missing.append(filename)
            continue
        # header=None: the logs carry no header row.
        data_list.append(pd.read_csv(filepath, sep='\t', header=None))

    if not data_list:
        raise FileNotFoundError("No data found. Run 01-data-collection.ipynb first.")

    if missing:
        print(f"Warning: {len(missing)} subject file(s) not found and skipped: {', '.join(missing)}")

    return pd.concat(data_list, ignore_index=True)

def process_data(raw_df=None, window_size=None, step_size=None, output_dir=None):
    """Turn the raw sample stream into fixed-length labelled windows and save them.

    Steps: load the raw logs, drop null-class rows (label 0), cut the remaining
    rows into overlapping windows, label each window with its most frequent
    sample label, shift labels down by one, and write ``X.npy`` / ``y.npy``.

    Args:
        raw_df: Optional pre-loaded frame laid out like the raw logs: the
            first 23 columns are features and the column at position 23 is
            the label. When omitted, ``load_raw_data()`` is called.
        window_size: Number of rows per window. Defaults to ``WINDOW_SIZE``.
        step_size: Number of rows between consecutive window starts.
            Defaults to ``STEP_SIZE``.
        output_dir: Directory that receives the ``.npy`` files (created if
            needed). Defaults to ``PROCESSED_DATA_DIR``.

    Returns:
        tuple: ``(X, y)`` where ``X`` has shape
        ``(n_windows, window_size, 23)`` and ``y`` has shape ``(n_windows,)``
        holding the per-window label minus one.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive.
    """
    # Resolve defaults lazily so explicit arguments never touch the globals.
    if raw_df is None:
        raw_df = load_raw_data()
    if window_size is None:
        window_size = WINDOW_SIZE
    if step_size is None:
        step_size = STEP_SIZE
    if output_dir is None:
        output_dir = PROCESSED_DATA_DIR
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive integers")

    # Clean: remove the null class (label 0). The label sits at position 23.
    print(f"Raw shape: {raw_df.shape}")
    clean_df = raw_df[raw_df.iloc[:, 23] != 0].copy()
    print(f"Cleaned shape (no null class): {clean_df.shape}")

    X_raw = clean_df.iloc[:, :23].values
    y_raw = clean_df.iloc[:, 23].values

    # Sliding window (2D rows -> 3D windows).
    # NOTE(review): windows are cut from the filtered row sequence as-is, so a
    # window can straddle a point where null-class rows were removed (and, for
    # the default input, where two subject files were joined). Consider
    # segmenting before windowing if that matters downstream.
    print("Creating sliding windows...")
    X_frames, y_frames = [], []
    # "+ 1" keeps the window that ends exactly on the last row; without it that
    # window is dropped (and exactly `window_size` rows would yield no window).
    for start in range(0, len(X_raw) - window_size + 1, step_size):
        stop = start + window_size
        X_frames.append(X_raw[start:stop])
        # Window label is the mode of the per-row labels inside the window.
        y_frames.append(stats.mode(y_raw[start:stop], keepdims=True)[0][0])

    if X_frames:
        X = np.array(X_frames)
        y = np.array(y_frames)
    else:
        # Too few rows for even one window: keep the 3D/1D layout so callers
        # still see consistently shaped (empty) arrays.
        X = np.empty((0, window_size, X_raw.shape[1]), dtype=X_raw.dtype)
        y = np.empty((0,), dtype=y_raw.dtype)

    # Remap labels to 0-11 (originally 1-12) for OHE compatibility
    y = y - 1

    print(f"Final Processed Data: X={X.shape}, y={y.shape}")

    # Save
    os.makedirs(output_dir, exist_ok=True)
    np.save(os.path.join(output_dir, 'X.npy'), X)
    np.save(os.path.join(output_dir, 'y.npy'), y)
    print(f"Saved X.npy and y.npy to '{output_dir}/'")

    return X, y

# Run the full preprocessing pipeline when this cell/script is executed directly
# (a notebook kernel runs cells with __name__ set to "__main__").
if __name__ == "__main__":
    process_data()

Loading subject log files...
Raw shape: (1215745, 24)
Cleaned shape (no null class): (343195, 24)
Creating sliding windows...
Final Processed Data: X=(6862, 100, 23), y=(6862,)
Saved X.npy and y.npy to 'data/processed/'
