In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
from collections import Counter

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GroupShuffleSplit

from collections import Counter

# Preprocessing Pipeline

In [5]:
def get_pt_id(file_path):
    """Derive the participant ID from a recording's file name.

    The ID is the part of the base file name that precedes the first
    '_TUG' marker (upper case). When the marker is absent, the whole
    base name is returned unchanged.
    """
    base_name = os.path.basename(file_path)
    # partition() yields (before, separator, after); only the prefix is needed
    prefix, _, _ = base_name.partition('_TUG')
    return prefix
    
# 5-sec windows with 20% overlap (window_length=5, stepsize=4 -> 1 s shared between neighbours).
def create_sliding_windows(df, window_length=5, stepsize=4):
    """Cut a recording into fixed-length, possibly overlapping time windows.

    Windows are half-open intervals [start, start + window_length) on the
    'Time' column. The first window starts at the smallest 'Time' value and
    each following window starts `stepsize` later. A window is only emitted
    when it ends at or before the largest 'Time' value, so a trailing partial
    window is dropped. With the defaults, consecutive windows overlap by
    1 of 5 time units (20%).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'Time' column.
    window_length : float
        Window duration, in the units of 'Time'.
    stepsize : float
        Offset between consecutive window starts; must be > 0.

    Returns
    -------
    list of dict
        One dict per window with keys 'start', 'end' and 'row indices'
        (the index labels of the rows of `df` that fall in the window).
        Empty when the recording is shorter than one window.

    Raises
    ------
    ValueError
        If `stepsize` is not positive (the window would never advance).
    """
    if stepsize <= 0:
        raise ValueError(f"stepsize must be positive, got {stepsize!r}")

    windows = []  # to save output
    start = df['Time'].min()
    end_time = df['Time'].max()

    # For an empty frame min()/max() are NaN, the comparison is False and
    # the loop body never runs, so an empty list is returned.
    while start + window_length <= end_time:
        end = start + window_length
        in_window = (df['Time'] >= start) & (df['Time'] < end)

        windows.append({
            'start': start,
            'end': end,
            'row indices': df.index[in_window].tolist()
        })

        # slide window
        start += stepsize

    return windows


def preprocess(file):
    """Load one recording CSV and prepare it for window-level extraction.

    Steps performed:
      1. read the CSV and drop columns that are entirely NA;
      2. derive the participant ID from the file name;
      3. add a constant 'ClinicalLabel' column: 0 when the ID starts with
         "HC" or "WHC", otherwise 1;
      4. convert 'Time' to float (values such as "1.23 sec" have the
         " sec" suffix stripped; an already-numeric column is left as is);
      5. build sliding windows over 'Time';
      6. select the sensor columns to use as features.

    No rows are filtered here.

    Parameters
    ----------
    file : str
        Path to the CSV file.

    Returns
    -------
    df : pandas.DataFrame
        The cleaned recording.
    participant_id : str
        ID derived from the file name.
    chosen_variables : list of str
        Feature column names, in the order they appear in `df`, each listed once.
    windows : list of dict
        Output of `create_sliding_windows(df)`.
    """
    df = pd.read_csv(file)
    # remove columns with NA
    df.dropna(axis=1, how='all', inplace=True)

    # generate ID
    participant_id = get_pt_id(file)

    # diagnosis
    df['ClinicalLabel'] = 0 if participant_id.startswith(("HC", "WHC")) else 1

    # fix Time data type; cast to str first so that non-string cells in a
    # mixed-type column are parsed instead of being turned into NaN by .str
    if not pd.api.types.is_numeric_dtype(df['Time']):
        df['Time'] = (
            df['Time'].astype(str).str.replace(" sec", "", regex=False).astype(float)
        )

    # create sliding windows
    windows = create_sliding_windows(df)

    # feature engineering per window
    names = ['FreeAcc', 'Gyr', 'VelInc', 'Roll', 'Pitch', 'Yaw']
    # 'LowerBack' columns are skipped: missing in majority of pts.
    # any() keeps a column once even if it matches several name tokens.
    chosen_variables = [
        col for col in df.columns.tolist()
        if 'LowerBack' not in col and any(n in col for n in names)
    ]

    return df, participant_id, chosen_variables, windows

In [6]:
def feature_extraction_raw(df, participant_id, features, windows, window_samples=500):
    """Turn the windows of one recording into raw per-window sample matrices.

    Parameters
    ----------
    df : pandas.DataFrame
        Recording containing a 'ClinicalLabel' column and the feature columns.
    participant_id : str
        Identifier repeated once per kept window (used for grouped splits).
    features : list of str
        Column names to extract; a name missing from `df` yields a NaN column.
    windows : list of dict
        Each dict holds the row labels of one window under 'row indices'.
    window_samples : int, default 500
        Exact number of rows a window must contain to be kept. Windows with
        any other row count are discarded so every kept matrix has the same
        number of rows.

    Returns
    -------
    X : list of numpy.ndarray
        One (window_samples, len(features)) array per kept window.
    y : list
        The recording's label (first 'ClinicalLabel' value), once per kept window.
    groups : list
        `participant_id`, once per kept window.
    """
    X, y, groups = [], [], []

    # Nothing to extract; also avoids indexing into an empty frame below.
    if not windows:
        return X, y, groups

    # The label is constant within a recording, so the first row is enough.
    clinical_label = df['ClinicalLabel'].iloc[0]

    for window in windows:
        window_data = df.loc[window['row indices']].reindex(columns=features).to_numpy()
        if window_data.shape[0] == window_samples:  # keep complete windows only
            X.append(window_data)
            y.append(clinical_label)
            groups.append(participant_id)

    return X, y, groups

# Code to pre-process all files via pipeline

In [7]:
# define paths, then collect the full path of every csv file into all_files
csv_folders = ['/home/sbinder3/wearables/synapse_data', '/home/sbinder3/wearables/synapse_data_PD', '/home/sbinder3/wearables/adtl_data']
all_files = []
for folder in csv_folders:
    # glob returns matches in a file-system dependent order; sorting makes the
    # order of windows in all_X / all_y / all_groups identical on every run
    all_files += sorted(glob.glob(os.path.join(folder, '*.csv')))

# Y labels
# all_clinical = pd.read_csv('/home/sbinder3/wearables/all_clinical.csv')
# clinical_lookup = all_clinical.set_index('Participant')['Bradykinesia'].to_dict()

all_X, all_y, all_groups = [], [], []

# run every file through the pipeline
for file in all_files:
    df, participant_id, chosen_variables, windows = preprocess(file)
    X, y, groups = feature_extraction_raw(df, participant_id, chosen_variables, windows)
    # all_X is kept as a list of arrays (one per window), not stacked here
    all_X.extend(X)
    all_y.extend(y)
    all_groups.extend(groups)

  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


In [8]:
# Labels, windows and group IDs must all have the same number of entries
for collected in (all_y, all_X, all_groups):
    print(len(collected))

5353
5353
5353


# Fix Inconsistent Dimensions and NA Values

In [9]:

# Tally how many windows exist for each (rows, features) shape
shapes = list(map(np.shape, all_X))
shape_counts = Counter()
shape_counts.update(shapes)
print(shape_counts)


Counter({(500, 150): 4872, (500, 144): 333, (500, 138): 65, (500, 126): 48, (500, 66): 35})


In [10]:
# count NA features
# all_X is a sequence of (n_samples, n_features) windows
def count_NA_feats(all_X):
    """Count, per feature column, how many windows contain at least one NaN.

    Only windows whose number of columns equals the most common column count
    in `all_X` are inspected; windows with any other width are skipped as
    malformed. (Using the most common width instead of the first window's
    width keeps the result independent of which window happens to come first.)

    Parameters
    ----------
    all_X : sequence of numpy.ndarray
        2-D float arrays, one per window.

    Returns
    -------
    collections.Counter
        Maps column index -> number of inspected windows with a NaN in that
        column. Columns without any NaN are absent. Empty for empty input.
    """
    feature_nan_counts = Counter()

    if len(all_X) == 0:
        return feature_nan_counts

    # Reference width: the column count shared by the largest number of windows
    # (ties resolve to the width seen first).
    expected_width = Counter(window.shape[1] for window in all_X).most_common(1)[0][0]

    for window in all_X:
        if window.shape[1] != expected_width:
            continue  # skip malformed windows
        nan_mask = np.isnan(window).any(axis=0)
        for i, is_nan in enumerate(nan_mask):
            if is_nan:
                feature_nan_counts[i] += 1

    return feature_nan_counts
# Per-feature NaN tally over the raw windows; keys are column indices
feature_nan_counts = count_NA_feats(all_X)

In [12]:
# Drop worst features (missing in > 1000 windows)

bad_features = [i for i, count in feature_nan_counts.items() if count > 1000]
print(bad_features)

# Highest column index to remove; -1 when nothing qualifies, so that no
# window is skipped and np.delete receives an empty index list (a no-op)
# instead of max() raising on an empty sequence.
highest_bad = max(bad_features, default=-1)

# drop them from each window
all_X_clean = []
for window in all_X:
    # NOTE(review): a window whose width equals highest_bad passes this check
    # but has no column with that index, so np.delete would raise IndexError.
    # The mask that filters all_y / all_groups uses the same threshold, so
    # both comparisons must be tightened together if this is changed.
    if window.shape[1] < highest_bad:  # skip malformed
        continue
    clean_window = np.delete(window, bad_features, axis=1)
    all_X_clean.append(clean_window)


[144, 145, 146, 147, 148, 149]


In [13]:
# How many windows still hold at least one NaN once the bad features are gone
count = sum(1 for window in all_X_clean if np.isnan(window).sum() > 0)
print(f'Remaining num windows with NA: {count}')

Remaining num windows with NA: 394


In [14]:
# Per-feature NaN tally over the cleaned windows; keys are column indices
remaining_feat = count_NA_feats(all_X_clean)

In [15]:
# Keep y and groups only for the windows that were kept in all_X_clean

# Same threshold as the window filter; -1 (keep everything) when no feature
# was flagged, instead of max() raising on an empty list.
highest_bad = max(bad_features, default=-1)
mask = [x.shape[1] >= highest_bad for x in all_X]
all_y_clean = [y for y, keep in zip(all_y, mask) if keep]
all_groups_clean = [g for g, keep in zip(all_groups, mask) if keep]

# Labels and groups must stay aligned one-to-one with the cleaned windows
assert len(all_y_clean) == len(all_groups_clean) == len(all_X_clean)


In [16]:
# Windows, labels and group IDs should report the same count after filtering
for cleaned in (all_X_clean, all_y_clean, all_groups_clean):
    print(len(cleaned))

4872
4872
4872


In [40]:
# Keep only the windows without any NaN, together with their label and group
complete = [
    (x, y_, g)
    for x, y_, g in zip(all_X_clean, all_y_clean, all_groups_clean)
    if not np.isnan(x).any()
]

# Stack each component into its own array
final_X = np.array([x for x, _, _ in complete])
final_y = np.array([y_ for _, y_, _ in complete])
final_groups = np.array([g for _, _, g in complete])

print(f"Final clean dataset shape: {final_X.shape}")


Final clean dataset shape: (4478, 500, 144)


In [41]:
# Report the number and share of windows per class label
classes, counts = np.unique(final_y, return_counts=True)

for idx, cls in enumerate(classes):
    count = counts[idx]
    print(f"Class {cls}: {count} samples ({count / len(final_y):.1%})")


Class 0: 2250 samples (50.2%)
Class 1: 2228 samples (49.8%)


In [20]:
# Persist the cleaned arrays as .npy files so CNN_train can load them

for filename, array in (
    ("final_X.npy", final_X),
    ("final_y.npy", final_y),
    ("final_groups.npy", final_groups),
):
    np.save(filename, array)

# Create Hold Out Set

In [43]:
# Hold out a share of the participants purely for testing:
# every window of a held-out participant stays out of the training data.

# single grouped shuffle split, 20% test size, fixed seed
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)

# the splitter yields index arrays; only the first (and only) split is used
train_idx, test_idx = next(gss.split(X=final_X, y=final_y, groups=final_groups))

# training subset (windows, labels, participant IDs)
X_train, y_train, groups_train = final_X[train_idx], final_y[train_idx], final_groups[train_idx]
# hold-out subset (windows, labels)
X_test, y_test = final_X[test_idx], final_y[test_idx]


In [44]:
# Persist the train / hold-out split. np.save appends ".npy" when the name
# lacks it, so spelling the extension out everywhere produces the same files
# while keeping the names consistent with what a loader has to open.
np.save("X_train.npy", X_train)
np.save("y_train.npy", y_train)
np.save("X_test.npy", X_test)
np.save("y_test.npy", y_test)
np.save("groups_train.npy", groups_train)

# Test Case

In [73]:
# Run a single healthy-control recording through the pipeline as a smoke test
hdf, hparticipant_id, hchosen_variables, hwindows = preprocess('/home/sbinder3/wearables/synapse_data/HC114_TUG.csv')
# feature_extraction_raw expects (df, participant_id, features, windows) and
# returns the tuple (X, y, groups); unpack it so Xh is the list of windows
Xh, yh, groups_h = feature_extraction_raw(hdf, hparticipant_id, hchosen_variables, hwindows)

In [76]:
# Size of the object produced by the test-case extraction
len(Xh)

32

In [70]:
# Manually rebuild the windows of the test-case recording and show their shapes
X = []

for window in hwindows:
    # use hdf (the frame hwindows was built from), not the `df` left over
    # from the file-loading loop, whose row labels belong to another recording
    window_data = hdf.loc[window['row indices']].reindex(columns=hchosen_variables).to_numpy()
    print(window_data.shape)
    # same completeness criterion as feature_extraction_raw (500 rows per window)
    if window_data.shape[0] == 500:
        X.append(window_data)
X = np.array(X)

(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)
(200, 162)


[array([[-0.082683,  0.017232, -0.023378, ...,       nan,       nan,
               nan],
        [-0.088264, -0.001931,  0.021178, ...,       nan,       nan,
               nan],
        [-0.081623,  0.05753 ,  0.020825, ...,       nan,       nan,
               nan],
        ...,
        [-0.086034,  0.00209 ,  0.021198, ...,       nan,       nan,
               nan],
        [-0.064707, -0.030613, -0.016915, ...,       nan,       nan,
               nan],
        [-0.099144, -0.026574,  0.00721 , ...,       nan,       nan,
               nan]], shape=(200, 162)),
 array([[-8.7266e-02, -5.0000e-06,  2.1187e-02, ...,         nan,
                 nan,         nan],
        [-8.4395e-02,  9.5640e-03, -1.0920e-03, ...,         nan,
                 nan,         nan],
        [-8.0782e-02, -3.0706e-02,  3.5549e-02, ...,         nan,
                 nan,         nan],
        ...,
        [-6.2370e-02,  1.1993e-02,  2.7277e-02, ...,         nan,
                 nan,         nan],
      

In [None]:
# FIXME(review): incomplete assignment - nothing follows the '=', so this cell cannot run as written
X = 

In [13]:
# Extraction for HC114 using the current signature (df, participant_id,
# features, windows) and the variables produced by the test-case preprocess
# call; the previously used `hctug` and `features` are never defined in this notebook.
feature_extraction_raw(hdf, 'HC114', hchosen_variables, hwindows)

  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)
  → window shape: (200, 156)


[array([[ 2.95080000e-02,  7.41370000e-02, -3.93500000e-03, ...,
         -8.78168800e+01,  1.61535360e+01,  1.62923242e+02],
        [ 6.26660000e-02,  7.79230000e-02,  2.86600000e-03, ...,
         -8.79721370e+01,  1.61022780e+01,  1.62663530e+02],
        [ 3.34810000e-02,  5.15640000e-02, -1.42910000e-02, ...,
         -8.81324680e+01,  1.60660850e+01,  1.62445276e+02],
        ...,
        [ 1.42534000e-01, -3.61130000e-02, -8.54000000e-04, ...,
         -9.33552130e+01,  2.23596900e+01,  1.42520756e+02],
        [-7.82980000e-02, -1.40790000e-02,  1.83430000e-02, ...,
         -9.27175680e+01,  2.17860120e+01,  1.43733073e+02],
        [-3.20160000e-02,  3.75380000e-02, -6.54840000e-02, ...,
         -9.21073650e+01,  2.11782750e+01,  1.44966598e+02]],
       shape=(200, 156)),
 array([[ 3.97990000e-02,  3.80650000e-02, -8.41500000e-03, ...,
         -7.47337020e+01,  1.01860020e+01, -1.55952782e+02],
        [ 1.07470000e-02,  1.18070000e-02, -2.58280000e-02, ...,
         -7.5