## EDA 1 gave us a hint that, with PCA, the activities are spatially separated, with static activities towards the left and dynamic activities towards the right. Based on this knowledge, we use a different set of labels [sleep, light, sedentary, moderate-vigorous] to hide away the hierarchy.

In [1]:
# Move to the project directory if it exists on this machine.
# The location can be overridden with the CAPSTONE_PROJECT_DIR environment
# variable; when the directory is missing we stay where we are instead of
# failing, so the notebook still runs on other machines.
import os
from pathlib import Path

PROJECT_DIR = Path(os.environ.get("CAPSTONE_PROJECT_DIR", "/Users/akashmurali/Documents/capstone/project"))
if PROJECT_DIR.is_dir():
    os.chdir(PROJECT_DIR)
else:
    print(f"Project directory {PROJECT_DIR} not found; staying in the current directory")
print(Path.cwd())

[Errno 2] No such file or directory: '/Users/akashmurali/Documents/capstone/project'
/content


In [None]:
import pandas as pd
import os

# Read the P147 recording into a DataFrame indexed by the parsed "time"
# column; the three axes are stored as float32 and annotations as strings.
df_147 = pd.read_csv(
    '/data/P147.csv',
    index_col="time",
    parse_dates=["time"],
    dtype={'x': 'f4', 'y': 'f4', 'z': 'f4', 'annotation': 'string'},
)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Capstone Project/Data/P147.csv'

In [None]:
# Annotation dictionary used to remap raw annotation strings to proper labels.
# It is indexed by the raw annotation and every column is read as a string.
annotation_df = pd.read_csv(
    "/data/annotation-label-dictionary.csv",
    dtype="string",
    index_col="annotation",
)
annotation_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 206 entries, 7030 sleeping;MET 0.95 to vehicle;MET 1.3
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   label:WillettsSpecific2018  206 non-null    string
 1   label:WillettsMET2018       206 non-null    string
 2   label:DohertySpecific2018   206 non-null    string
 3   label:Willetts2018          206 non-null    string
 4   label:Doherty2018           206 non-null    string
 5   label:Walmsley2020          206 non-null    string
dtypes: string(6)
memory usage: 11.3 KB


In [5]:
# Preview the first rows of the annotation-to-label dictionary.
annotation_df.head()

Unnamed: 0_level_0,label:WillettsSpecific2018,label:WillettsMET2018,label:DohertySpecific2018,label:Willetts2018,label:Doherty2018,label:Walmsley2020
annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7030 sleeping;MET 0.95,sleep,sleep,sleep,sleep,sleep,sleep
occupation;office and administrative support;11580 office/computer work general;MET 1.5,sitting,sitstand+lowactivity,sedentary-screen,sit-stand,sedentary,sedentary
home activity;household chores;preparing meals/cooking/washing dishes;5035 kitchen activity general cooking/washing/dishes/cleaning up;MET 3.3,household-chores,sitstand+activity,tasks-moderate,mixed,moderate,light
occupation;office and administrative support;11580 office wok/computer work general;MET 1.5,sitting,sitstand+lowactivity,sedentary-screen,sit-stand,sedentary,sedentary
home activity;miscellaneous;sitting;9060 sitting/lying reading or without observable/identifiable activities;MET 1.3,sitting,sitstand+lowactivity,sedentary-non-screen,sit-stand,sedentary,sedentary


In [6]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [7]:
# Look up each row's raw annotation in the dictionary and keep the
# Walmsley2020 label; annotations without a dictionary entry become missing.
df_147['activity_label'] = df_147['annotation'].map(
    annotation_df['label:Walmsley2020']
)

# Report how many rows ended up without a label.
print(f"Unmapped rows: {df_147['activity_label'].isna().sum()}")

Unmapped rows: 2866205


In [8]:
# Number of distinct activity labels present after mapping (missing values are not counted).
df_147['activity_label'].nunique()

3

In [9]:
from tqdm import tqdm
import numpy as np

def extract_windows(data, winsize='10s', winlen=1000):
    """
    Split a time-indexed recording into fixed-length, labelled windows.

    Parameters:
    -----------
    data : pd.DataFrame
        Datetime-indexed frame with 'x', 'y', 'z' and 'activity_label' columns.
    winsize : str
        Resampling rule giving the duration of each window.
    winlen : int
        Number of samples a window must contain to be kept. The default of
        1000 matches the default '10s' window if the data is sampled at
        100 Hz.

    Returns:
    --------
    X : np.array
        Shape (n_windows, winlen, 3) - the x/y/z samples of each kept window
    Y : np.array
        Shape (n_windows,) - the most frequent activity label of each window
    """
    X, Y = [], []
    for _, w in tqdm(data.resample(winsize, origin='start')):

        # Keep only complete windows: correct length and no NaNs in any column
        if len(w) != winlen or w.isna().any().any():
            continue

        X.append(w[['x', 'y', 'z']].to_numpy())
        # mode() returns every tied value, so .item() would raise on a tie;
        # take the first of the (sorted) modes to stay deterministic.
        Y.append(w['activity_label'].mode(dropna=False).iloc[0])

    # np.stack refuses an empty list; return correctly shaped empty arrays
    # when no window qualifies.
    if not X:
        return np.empty((0, winlen, 3)), np.empty((0,), dtype=object)

    return np.stack(X), np.array(Y)

In [10]:
# Cut the labelled recording into windows using the function's defaults.
X, Y = extract_windows(df_147)

100%|██████████| 9181/9181 [00:11<00:00, 806.70it/s]


In [11]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def apply_pca_visualization(X, Y, n_components=2):
    """
    Apply PCA to visualize activity clusters in 2D

    Each window is flattened into a single feature vector, standardized
    per feature and projected with PCA. The first two components are drawn
    as a scatter plot coloured by activity.

    Parameters:
    -----------
    X : np.array
        Shape (n_windows, n_samples, 3) - windowed acceleration data
    Y : np.array
        Shape (n_windows,) - activity labels
    n_components : int
        Number of principal components to fit. At least two are needed
        because the plot uses PC1 and PC2; further components are fitted
        and reported but not plotted.

    Returns:
    --------
    fig : plotly figure
        Scatter of PC1 against PC2 coloured by activity
    pca : PCA
        The fitted PCA model
    df_pca : pd.DataFrame
        Columns 'PC1', 'PC2' and 'Activity', one row per window

    Raises:
    -------
    ValueError
        If fewer than two principal components are requested or produced.
    """
    # The plot indexes PC1 and PC2, so reject a too-small request up front
    # instead of failing later with an opaque IndexError.
    if isinstance(n_components, (int, np.integer)) and n_components < 2:
        raise ValueError(
            f"n_components must be at least 2 to plot PC1 vs PC2, got {n_components}"
        )

    X = np.asarray(X)

    # Flatten each window into a feature vector
    # From (n_windows, n_samples, 3) to (n_windows, n_samples * 3)
    X_flattened = X.reshape(X.shape[0], -1)

    print(f"Original shape: {X.shape}")
    print(f"Flattened shape: {X_flattened.shape}")

    # Standardize features (PCA needs normalization)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_flattened)

    # Apply PCA
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    # Non-integer n_components settings can still yield a single component
    if X_pca.shape[1] < 2:
        raise ValueError(
            f"PCA produced {X_pca.shape[1]} component(s); at least 2 are needed to plot"
        )

    # Create dataframe for plotting
    df_pca = pd.DataFrame({
        'PC1': X_pca[:, 0],
        'PC2': X_pca[:, 1],
        'Activity': Y
    })

    # Print explained variance
    explained_var = pca.explained_variance_ratio_
    print("\nExplained variance ratio:")
    for i, var in enumerate(explained_var):
        print(f"  PC{i+1}: {var*100:.2f}%")
    print(f"  Total: {sum(explained_var)*100:.2f}%")

    # Create visualization
    fig = px.scatter(
        df_pca,
        x='PC1',
        y='PC2',
        color='Activity',
        title=f'PCA: Activity Clustering <br>Explained Variance: PC1={explained_var[0]*100:.1f}%, PC2={explained_var[1]*100:.1f}%',
        labels={'PC1': f'PC1 ({explained_var[0]*100:.1f}%)',
                'PC2': f'PC2 ({explained_var[1]*100:.1f}%)'},
        opacity=0.6,
        height=600
    )

    fig.update_traces(marker=dict(size=5))

    return fig, pca, df_pca

# Apply 2D PCA to the windows and render the scatter plot; the fitted model
# and the projected coordinates are kept in pca_model and df_pca.
fig_2d, pca_model, df_pca = apply_pca_visualization(X, Y, n_components=2)
fig_2d.show()

Original shape: (6224, 1000, 3)
Flattened shape: (6224, 3000)

Explained variance ratio:
  PC1: 31.83%
  PC2: 28.82%
  Total: 60.64%


#### The separation is much more evident after hiding away the hierarchy.

## Calculate euclidean norm minus 1

Compute the Euclidean norm of the 'x', 'y', and 'z' columns for each data point in each window in `X`, and then subtract 1 to remove the gravity component.


In [12]:
# Compute the Euclidean norm minus 1 for every sample of every window.
# X is overwritten with the 2D result, so guard on dimensionality: re-running
# this cell is then a no-op instead of raising an axis error on the
# already-reduced array.
if X.ndim == 3:
    X = np.sqrt(np.sum(X**2, axis=2)) - 1

## Feature extraction function

Create a function that takes a window of data (Euclidean norm minus 1 values) and calculates moment and quantile features (mean, standard deviation, skewness, kurtosis, 25th percentile, median (50th percentile), 75th percentile) together with lag-1 autocorrelation, spectral, and peak features.


In [19]:
from statsmodels.tsa.stattools import acf
from scipy.signal import welch, find_peaks
from scipy.integrate import simpson
from scipy.stats import skew, kurtosis

def extract_window_features(window_data, fs=100):
    """
    Calculates statistical, autocorrelation, spectral, and peak features
    from a single window of data (Euclidean norm minus 1).

    Parameters:
    -----------
    window_data : np.array
        A 1D numpy array representing a single window of transformed data.
    fs : float
        Sampling frequency in Hz used for the spectral features. The default
        of 100 assumes 1000 samples per 10 s window.

    Returns:
    --------
    list
        The 13 features, in this order: mean, standard deviation, skewness,
        kurtosis, 25th percentile, median, 75th percentile, lag-1
        autocorrelation, dominant frequency, power at the dominant
        frequency, total power, number of peaks, mean peak height.
    """
    # Accept any array-like; the peak indexing below needs an ndarray
    window_data = np.asarray(window_data)

    # Statistical features
    mean = np.mean(window_data)
    std_dev = np.std(window_data)
    window_skewness = skew(window_data)
    window_kurtosis = kurtosis(window_data)
    q1, median, q3 = (np.percentile(window_data, q) for q in (25, 50, 75))

    # Autocorrelation at lag 1
    autocorr_lag1 = acf(window_data, nlags=1)[1]

    # Spectral features: one Welch segment spanning the whole window
    freqs, psd = welch(window_data, fs=fs, nperseg=len(window_data))
    dominant_freq_idx = np.argmax(psd)
    dominant_freq = freqs[dominant_freq_idx]
    power_dominant_freq = psd[dominant_freq_idx]
    # Total power by Simpson's rule. x must be passed by keyword because
    # newer SciPy releases reject it as a positional argument.
    total_power = simpson(psd, x=freqs)

    # Peak features
    peaks, _ = find_peaks(window_data)
    num_peaks = len(peaks)
    # A window without any local maximum reports a mean peak height of 0
    mean_peak_height = np.mean(window_data[peaks]) if num_peaks > 0 else 0

    return [mean, std_dev, window_skewness, window_kurtosis, q1, median, q3,
            autocorr_lag1, dominant_freq, power_dominant_freq, total_power,
            num_peaks, mean_peak_height]

## Apply feature extraction


Apply the feature extraction function to the data windows (`X`) to generate a new list of feature vectors.


In [20]:
# Build one feature vector per window, in window order, with a progress bar.
feature_list_updated = [extract_window_features(window) for window in tqdm(X)]

100%|██████████| 6224/6224 [00:20<00:00, 299.25it/s]


In [21]:
# Tabulate the feature vectors (one row per window) and attach each
# window's activity label as the last column.
df_features_updated = pd.DataFrame(
    feature_list_updated,
    columns=[
        'mean', 'std_dev', 'skewness', 'kurtosis',
        'q1', 'median', 'q3',
        'autocorr_lag1',
        'dominant_freq', 'power_dominant_freq', 'total_power',
        'num_peaks', 'mean_peak_height',
    ],
).assign(activity_label=Y)
display(df_features_updated.head())

Unnamed: 0,mean,std_dev,skewness,kurtosis,q1,median,q3,autocorr_lag1,dominant_freq,power_dominant_freq,total_power,num_peaks,mean_peak_height,activity_label
0,-0.001677,0.004933,0.375726,-0.952936,-0.006659,-0.000289,0.001541,0.179116,7.7,5e-06,2.5e-05,231,0.003434,sleep
1,-0.002034,0.005164,0.582792,-0.248734,-0.006659,-0.000289,0.001541,0.205702,8.7,5e-06,2.7e-05,249,0.003366,sleep
2,-0.002699,0.004752,0.897768,0.349347,-0.006659,-0.006659,0.001541,0.142531,8.5,5e-06,2.2e-05,241,0.002802,sleep
3,-0.002903,0.004555,0.707957,-0.094587,-0.006659,-0.006659,-0.000289,0.152025,8.1,4e-06,2.1e-05,236,0.002482,sleep
4,-0.003025,0.004433,0.939997,0.258811,-0.006659,-0.006659,-0.000289,0.188552,9.3,4e-06,2e-05,224,0.00227,sleep


## Create feature dataframe

Create a pandas DataFrame with the expanded set of extracted features and corresponding activity labels (`Y`).


In [22]:
# NOTE(review): this cell rebuilds exactly the same frame as the preceding
# code cell; one of the two is redundant.
# Column names follow the order in which the features are returned.
df_features_updated = pd.DataFrame(
    feature_list_updated,
    columns=['mean', 'std_dev', 'skewness', 'kurtosis', 'q1', 'median', 'q3',
             'autocorr_lag1', 'dominant_freq', 'power_dominant_freq',
             'total_power', 'num_peaks', 'mean_peak_height'],
)
# Attach the per-window activity labels
df_features_updated['activity_label'] = Y
display(df_features_updated.head())

Unnamed: 0,mean,std_dev,skewness,kurtosis,q1,median,q3,autocorr_lag1,dominant_freq,power_dominant_freq,total_power,num_peaks,mean_peak_height,activity_label
0,-0.001677,0.004933,0.375726,-0.952936,-0.006659,-0.000289,0.001541,0.179116,7.7,5e-06,2.5e-05,231,0.003434,sleep
1,-0.002034,0.005164,0.582792,-0.248734,-0.006659,-0.000289,0.001541,0.205702,8.7,5e-06,2.7e-05,249,0.003366,sleep
2,-0.002699,0.004752,0.897768,0.349347,-0.006659,-0.006659,0.001541,0.142531,8.5,5e-06,2.2e-05,241,0.002802,sleep
3,-0.002903,0.004555,0.707957,-0.094587,-0.006659,-0.006659,-0.000289,0.152025,8.1,4e-06,2.1e-05,236,0.002482,sleep
4,-0.003025,0.004433,0.939997,0.258811,-0.006659,-0.006659,-0.000289,0.188552,9.3,4e-06,2e-05,224,0.00227,sleep


## Display feature dataframe

Display the first few rows of the feature DataFrame to show the newly added features.


In [23]:
# Show the first rows of the feature table built above.
display(df_features_updated.head())

Unnamed: 0,mean,std_dev,skewness,kurtosis,q1,median,q3,autocorr_lag1,dominant_freq,power_dominant_freq,total_power,num_peaks,mean_peak_height,activity_label
0,-0.001677,0.004933,0.375726,-0.952936,-0.006659,-0.000289,0.001541,0.179116,7.7,5e-06,2.5e-05,231,0.003434,sleep
1,-0.002034,0.005164,0.582792,-0.248734,-0.006659,-0.000289,0.001541,0.205702,8.7,5e-06,2.7e-05,249,0.003366,sleep
2,-0.002699,0.004752,0.897768,0.349347,-0.006659,-0.006659,0.001541,0.142531,8.5,5e-06,2.2e-05,241,0.002802,sleep
3,-0.002903,0.004555,0.707957,-0.094587,-0.006659,-0.006659,-0.000289,0.152025,8.1,4e-06,2.1e-05,236,0.002482,sleep
4,-0.003025,0.004433,0.939997,0.258811,-0.006659,-0.006659,-0.000289,0.188552,9.3,4e-06,2e-05,224,0.00227,sleep
