### Run this notebook from the directory that contains the `capture24` dataset folder

In [42]:
import os
import glob
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Move into the dataset folder only when we are not already inside it, so that
# re-running this cell (or Restart & Run All after a manual cd) does not fail.
# The annotation dictionary file is used as the marker for "already there".
if not os.path.exists("annotation-label-dictionary.csv"):
    os.chdir("capture24")

/Users/akashmurali/Documents/capstone/project/capture24


In [None]:
# Annotation dictionary: indexed by the raw annotation string, with every
# column read as a string. map_annotation looks up its 'label:<schema>'
# columns to translate raw annotations into schema labels.
annotation_df = pd.read_csv("annotation-label-dictionary.csv", index_col ="annotation", dtype = "string")

In [None]:
# load each participant's data into its own DataFrame
def load_participant_df(filepath):
    """Read one participant CSV into a time-indexed DataFrame.

    Parameters
    ----------
    filepath : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the parsed ``time`` column; ``x``/``y``/``z`` are read as
        32-bit floats and ``annotation`` as pandas strings.
    """
    column_dtypes = {'x':'f4', 'y':'f4', 'z':'f4', 'annotation':'string'}
    return pd.read_csv(
        filepath,
        index_col="time",
        parse_dates=['time'],
        dtype=column_dtypes,
    )

In [26]:
# function to map to a given schema
def map_annotation(df, annotation_df, schemas):
    """Add one label column per schema by translating ``df['annotation']``.

    For each schema name ``s`` the lookup column ``'label:<s>'`` of
    ``annotation_df`` (indexed by raw annotation) is used as a mapping and the
    result is stored in ``df[s]``. Schemas without a matching lookup column
    are skipped silently, so the corresponding ``df[s]`` is not created.

    Note: ``df`` is modified in place and also returned.
    """
    known_columns = annotation_df.columns

    for schema_name in schemas:
        source_col = f'label:{schema_name}'
        if source_col not in known_columns:
            continue
        # raw annotation string -> label for this schema
        lookup = annotation_df[source_col].to_dict()
        df[schema_name] = df['annotation'].map(lookup)

    return df

In [55]:
# check validity of the window
def is_valid_window(window, excepected_length):
    """Return True when a window is complete and free of missing samples.

    A window is valid if it has exactly ``excepected_length`` rows and none of
    its ``x``/``y``/``z`` values is NaN. (The parameter keeps its original
    spelling so existing callers are unaffected.)
    """
    has_full_length = len(window) == excepected_length
    # Short-circuit: the axis columns are only inspected for full-length windows.
    return has_full_length and not window[['x', 'y', 'z']].isna().to_numpy().any()


In [None]:
def extract_windows(df, schemas, window_size = 10, sample_rate = 100): # default to 10sec and 100hz
    """Cut a recording into fixed-length, non-overlapping labelled windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Time-indexed frame (it is resampled by time) with ``x``, ``y``, ``z``
        columns and one label column per entry of ``schemas``.
    schemas : sequence of str
        Names of the label columns to summarise per window.
    window_size : int
        Window length in seconds.
    sample_rate : int
        Samples per second; a window must contain exactly
        ``window_size * sample_rate`` rows to be kept.

    Returns
    -------
    X : numpy.ndarray of shape (n_windows, window_size * sample_rate, 3)
    Y : dict mapping each schema to an array of n_windows labels
    T : numpy.ndarray with the start timestamp of each kept window

    When no window qualifies (including when a schema column is missing from
    ``df``) the function returns ``(None, None, None)`` instead of letting
    ``np.stack`` fail on an empty list, so callers can test ``X is None``.
    """
    expected_length = window_size * sample_rate

    # A column absent from df is absent from every window, so nothing could
    # qualify; decide once instead of re-checking inside the loop.
    if any(schema not in df.columns for schema in schemas):
        return None, None, None

    X, T = [], []
    Y = {schema: [] for schema in schemas}

    for timestamp, window in df.resample(f'{window_size}s', origin = "start"):
        if not is_valid_window(window, expected_length):
            continue

        window_labels = {}
        labels_valid = True
        for schema in schemas:
            # dropna=False lets "missing" compete as a value, so a window that
            # is mostly unannotated is kept with a missing label rather than
            # being labelled by its minority annotation.
            # NOTE(review): filter missing labels downstream if unwanted.
            label_mode = window[schema].mode(dropna = False)
            if len(label_mode) == 0:
                labels_valid = False
                break
            # Ties are resolved by taking the first mode returned.
            window_labels[schema] = label_mode.iloc[0]

        if not labels_valid:
            continue

        X.append(window[['x','y','z']].to_numpy())
        T.append(timestamp)
        for schema in schemas:
            Y[schema].append(window_labels[schema])

    if not X:
        return None, None, None

    X = np.stack(X)
    T = np.array(T)
    for schema in schemas:
        Y[schema] = np.array(Y[schema])

    return X,Y,T

In [69]:
def preprocess_all(annotation_df, schemas, output_dir = "preprocessed", window_size = 10, sample_rate = 100):
    """Window every participant file and save the combined arrays as .npy.

    Files matching ``P[0-9][0-9][0-9].csv*`` in the current working directory
    are loaded one by one, annotated for each schema, and cut into windows.
    A file that raises during processing is reported and skipped.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Annotation dictionary passed to ``map_annotation``.
    schemas : sequence of str
        Label schemas to export.
    output_dir : str
        Directory (created if needed) that receives ``X.npy``, ``T.npy``,
        ``P.npy`` and one ``Y_<schema>.npy`` per schema.
    window_size, sample_rate : int
        Forwarded to ``extract_windows``.

    Raises
    ------
    ValueError
        If no window could be extracted from any file (nothing to save).
    """
    print("=" * 70)
    print("loading all participants data")
    print("=" * 70)
    participant_files = sorted(glob.glob('P[0-9][0-9][0-9].csv*'))
    print("total files found",len(participant_files))

    all_X = [] # all x,y,z acceleration windows
    all_Y = {schema:[] for schema in schemas} # per-schema window labels
    all_T = [] # all window start timestamps
    all_P = [] # participant id of every window

    for filepath in tqdm(participant_files, desc = "processing participants"):
        # basename first, so a directory component containing '.' cannot
        # truncate the participant id
        p_id = os.path.basename(filepath).split('.')[0]
        try:
            df = load_participant_df(filepath)
            df = map_annotation(df, annotation_df, schemas)
            X,Y,T = extract_windows(df, schemas, window_size, sample_rate)
        except Exception as e:
            # best effort: one bad file must not abort the whole run
            print(f"error processing {filepath}: {e}")
            continue

        if X is None or len(X) == 0:
            print(f"no valid window for {p_id}")
            continue

        # Append to all accumulators together, outside the try block, so they
        # can never end up with different numbers of entries.
        all_X.append(X)
        all_T.append(T)
        all_P.append(np.array([p_id] * len(X)))
        for schema in schemas:
            all_Y[schema].append(Y[schema])

    if not all_X:
        raise ValueError(
            "no windows were extracted from any participant file; "
            "check the working directory and the schema names"
        )

    # combining data from all participants
    X_combined = np.vstack(all_X)
    T_combined = np.hstack(all_T)
    P_combined = np.hstack(all_P)

    Y_combined = {}
    for schema in schemas:
        Y_combined[schema] = np.hstack(all_Y[schema])

    os.makedirs(output_dir, exist_ok=True)

    np.save(os.path.join(output_dir, 'X.npy'), X_combined)
    np.save(os.path.join(output_dir, 'T.npy'), T_combined)
    np.save(os.path.join(output_dir, 'P.npy'), P_combined)

    for schema in schemas:
        np.save(os.path.join(output_dir, f'Y_{schema}.npy'), Y_combined[schema])

# Label schemas to export; each name is looked up as a 'label:<name>' column
# of annotation_df by map_annotation.
schemas = ["WillettsSpecific2018"]
# Runs the whole pipeline and writes the .npy files into ./preprocessed.
# The recorded run below took roughly 29 minutes for 151 participant files.
preprocess_all(annotation_df, schemas)

loading all participants data
total files found 151


processing participants: 100%|██████████| 151/151 [28:43<00:00, 11.41s/it]


#### Load the preprocessed `.npy` files

In [71]:
def load_prepared_data(data_dir='preprocessed', schema='WillettsSpecific2018'):
    """Load the saved window arrays for one label schema.

    Reads ``X.npy``, ``Y_<schema>.npy``, ``T.npy`` and ``P.npy`` from
    ``data_dir``, prints a short summary, and returns ``(X, Y, T, P)``.
    """
    print(f"Loading prepared data from: {data_dir}")

    def _read(filename, **load_kwargs):
        # every array lives directly inside data_dir
        return np.load(os.path.join(data_dir, filename), **load_kwargs)

    # NOTE: allow_pickle=True unpickles arbitrary Python objects; only use this
    # on files you produced yourself, never on untrusted downloads.
    X = _read('X.npy')
    Y = _read(f'Y_{schema}.npy', allow_pickle=True)
    T = _read('T.npy', allow_pickle=True)
    P = _read('P.npy', allow_pickle=True)

    participant_count = len(np.unique(P))

    print("\nLoaded data:")
    print(f"  X shape: {X.shape}")
    print(f"  Y shape: {Y.shape}")
    print(f"  Number of participants: {participant_count}")

    return X, Y, T, P


# Usage: reload the arrays written by preprocess_all for one label schema
X, Y, T, P = load_prepared_data(schema='WillettsSpecific2018')

Loading prepared data from: preprocessed

Loaded data:
  X shape: (1398022, 1000, 3)
  Y shape: (1398022,)
  Number of participants: 151
