# P-147 EDA

In [39]:
import pandas as pd

# Read P147.csv into a DataFrame indexed by the parsed `time` column.
# The x/y/z columns are stored as 32-bit floats, the annotation as nullable string.
df_147 = pd.read_csv(
    "P147.csv",
    index_col="time",
    parse_dates=["time"],
    dtype={
        'x': 'f4',
        'y': 'f4',
        'z': 'f4',
        'annotation': 'string',
    },
)

In [41]:
# Read annotation-label-dictionary.csv (indexed by the raw annotation string);
# it is used to remap the annotations to proper labels. All columns are strings.
annotation_df = pd.read_csv(
    "annotation-label-dictionary.csv",
    index_col="annotation",
    dtype="string",
)
annotation_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 206 entries, 7030 sleeping;MET 0.95 to vehicle;MET 1.3
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   label:WillettsSpecific2018  206 non-null    string
 1   label:WillettsMET2018       206 non-null    string
 2   label:DohertySpecific2018   206 non-null    string
 3   label:Willetts2018          206 non-null    string
 4   label:Doherty2018           206 non-null    string
 5   label:Walmsley2020          206 non-null    string
dtypes: string(6)
memory usage: 11.3+ KB


In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## Map the activity annotations from CPA codes to Willetts labels

In [42]:
# Look up every raw annotation in the dictionary's WillettsSpecific2018 column
willetts_lookup = annotation_df['label:WillettsSpecific2018']
df_147['activity_label'] = df_147['annotation'].map(willetts_lookup)

# Count the rows that ended up without a label
n_unmapped = df_147['activity_label'].isna().sum()
print(f"Unmapped rows: {n_unmapped}")

Unmapped rows: 2788381


In [45]:
# Number of distinct activity labels after the mapping (nunique ignores missing values by default)
df_147['activity_label'].nunique()

8

## handling missing data

In [46]:
# Percentage of missing values in each column of the data
n_rows = len(df_147)
null_counts = df_147.isnull().sum() / n_rows * 100
# The missing labels need to be taken care of separately; for now nothing special is done with them.
print(null_counts)

x                  0.000000
y                  0.000000
z                  0.000000
annotation        28.687044
activity_label    28.687044
dtype: float64


## Windowing approach to chunk the data. We choose win_size = 10 s, so at 100 Hz each window holds 1000 samples.

In [49]:
from tqdm import tqdm
import numpy as np

def extract_windows(data, winsize='10s', sample_rate=100):
    """
    Chunk a time-indexed recording into fixed-length, fully populated windows.

    Parameters:
    -----------
    data : pd.DataFrame
        Frame with a datetime index and (at least) the columns 'x', 'y', 'z'
        and 'activity_label'
    winsize : str
        Window duration, passed to ``data.resample`` (default '10s')
    sample_rate : int or float
        Sampling frequency in Hz; together with `winsize` it determines how
        many rows a complete window must contain (default 100, i.e. 1000 rows
        for a 10 s window)

    Returns:
    --------
    X : np.array
        Shape (n_windows, rows_per_window, 3) - x/y/z values of each kept window
    Y : np.array
        Shape (n_windows,) - most frequent activity label of each kept window
    """
    # Rows a complete window must have, e.g. 10 s * 100 Hz = 1000
    win_len = int(round(pd.Timedelta(winsize).total_seconds() * sample_rate))

    X, Y = [], []
    for _, w in tqdm(data.resample(winsize, origin='start')):

        # Keep only complete windows that have no missing value in any column
        if len(w) != win_len or w.isna().any().any():
            continue

        X.append(w[['x', 'y', 'z']].to_numpy())
        # mode() returns every tied value in sorted order; taking the first one
        # keeps a tie between two labels from raising (as `.item()` would)
        Y.append(w['activity_label'].mode(dropna=False).iloc[0])

    if not X:
        # np.stack refuses an empty list; hand back correctly shaped empty arrays
        return np.empty((0, win_len, 3)), np.array([], dtype=str)

    return np.stack(X), np.array(Y)

In [50]:
# Build the windowed dataset: X holds the x/y/z samples of every kept window, Y its most frequent activity label
X, Y =extract_windows(df_147)

100%|██████████| 9721/9721 [00:02<00:00, 4264.23it/s]


In [55]:
# (number of kept windows, rows per window, acceleration axes)
X.shape

(6826, 1000, 3)

In [66]:
# Peek at the raw x/y/z values of the windows
print(X)

[[[-0.3381657  -0.05728219 -0.9303979 ]
  [-0.3381657  -0.05728219 -0.9303979 ]
  [-0.3381657  -0.07305303 -0.9303979 ]
  ...
  [-0.3381657  -0.07305303 -0.9303979 ]
  [-0.3381657  -0.07305303 -0.9303979 ]
  [-0.3381657  -0.07305303 -0.9303979 ]]

 [[-0.3381657  -0.07305303 -0.94610983]
  [-0.3381657  -0.07305303 -0.9303979 ]
  [-0.3381657  -0.07305303 -0.9303979 ]
  ...
  [-0.64837205 -0.59349126 -0.2547866 ]
  [-0.8810268  -0.5461787  -0.19193906]
  [-1.03613    -0.38847014 -0.20765096]]

 [[-1.1912332  -0.404241    0.01231551]
  [-1.2532744  -0.4357827   0.373689  ]
  [-0.4312276  -0.7985124   0.10658685]
  ...
  [-0.91204745 -0.6092621   0.18514633]
  [-0.91204745 -0.6408038   0.24799389]
  [-0.83449584 -0.6092621   0.32655334]]

 ...

 [[ 0.67015004  0.02166779 -0.7236136 ]
  [ 0.67015004  0.03743864 -0.7236136 ]
  [ 0.67015004  0.03743864 -0.7236136 ]
  ...
  [ 0.68566036  0.03743864 -0.7236136 ]
  [ 0.68566036  0.03743864 -0.7236136 ]
  [ 0.68566036  0.03743864 -0.7236136 ]]

 [

## activity distribution

In [52]:
# Share of labelled samples per activity (value_counts leaves out missing labels)
print("Activity Label Distribution:")
label_share = df_147['activity_label'].value_counts(normalize=True)
print(label_share)

Activity Label Distribution:
activity_label
sleep               0.424144
mixed-activity      0.126134
sitting             0.111694
household-chores     0.10894
walking              0.10291
bicycling           0.058025
manual-work         0.043194
standing             0.02496
Name: proportion, dtype: Float64


In [57]:
def plot_activity_distribution(df, sample_rate=100):
    """
    Visualize distribution of activities

    Parameters:
    -----------
    df : pd.DataFrame
        Frame with one row per recorded sample and an 'activity_label' column
    sample_rate : int or float
        Sampling frequency in Hz, used to convert sample counts into hours
        (default 100)

    Returns:
    --------
    plotly figure
        Bar chart of hours per activity; each bar is annotated with that
        activity's percentage of all rows in `df`
    """
    # Samples per activity; value_counts leaves out rows with a missing label
    activity_counts = df['activity_label'].value_counts()

    # Percentage relative to ALL rows (unlabelled ones included), so the bars
    # add up to less than 100% when some rows have no label
    activity_pct = (activity_counts / len(df) * 100).round(2)

    # Convert sample counts to hours for better interpretation:
    # samples * seconds-per-sample / seconds-per-hour
    sample_period = 1 / sample_rate
    activity_hours = (activity_counts * sample_period / 3600).round(2)

    fig = px.bar(
        x=activity_counts.index,
        y=activity_hours.values,
        labels={'x': 'Activity', 'y': 'Duration (hours)'},
        title='Activity Distribution - Time Spent in Each Activity',
        text=activity_pct.values
    )

    fig.update_traces(texttemplate='%{text}%', textposition='outside')
    fig.update_layout(
        xaxis_tickangle=-45,
        height=500,
        showlegend=False
    )

    return fig

# Bar chart of the time spent in each activity in df_147
plot_activity_distribution(df_147)

## visualization

In [63]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

def plot_acceleration_by_activity_lineplot(X, Y, samples_per_activity=5, sample_rate=100, seed=None):
    """
    line plots of acceleration patterns for different activities

    Parameters:
    -----------
    X : np.array
        Array of shape (n_windows, n_samples, 3) containing acceleration data
    Y : np.array
        Array of shape (n_windows,) containing activity labels
    samples_per_activity : int
        Number of example windows to plot per activity
    sample_rate : int or float
        Sampling frequency in Hz, used to build the time axis (default 100)
    seed : int or None
        Seed for picking the example windows. None (default) draws from
        NumPy's global random state.

    Returns:
    --------
    plotly figure
        Grid with one row per axis (x, y, z) and one column per activity
    """
    activities_to_plot = np.unique(Y)
    n_activities = len(activities_to_plot)
    if n_activities == 0:
        raise ValueError("Y contains no activity labels to plot")

    # Create subplots: 3 rows (x, y, z) x number of activities
    fig = make_subplots(
        rows=3,
        cols=n_activities,
        subplot_titles=[f"{act}" for act in activities_to_plot] * 3,
        vertical_spacing=0.08,
        horizontal_spacing=0.05,
        row_titles=['X-axis', 'Y-axis', 'Z-axis']
    )

    # One timestamp per sample: 0, 1/fs, 2/fs, ...  The end point is excluded
    # on purpose so the spacing is exactly 1/fs, and the length follows the
    # actual window size instead of assuming 1000 samples.
    time_axis = np.arange(X.shape[1]) / sample_rate

    colors = ['blue', 'orange', 'green', 'red', 'purple']

    # Seeded generator when a seed is given, global NumPy state otherwise
    rng = np.random if seed is None else np.random.default_rng(seed)

    for col_idx, activity in enumerate(activities_to_plot, start=1):
        # Get indices for this activity
        activity_indices = np.where(Y == activity)[0]

        if len(activity_indices) == 0:
            continue

        # Sample random windows for this activity (never more than exist)
        n_samples = min(samples_per_activity, len(activity_indices))
        sample_indices = rng.choice(activity_indices, size=n_samples, replace=False)

        for sample_num, idx in enumerate(sample_indices):
            window = X[idx]  # Shape: (n_samples, 3)
            # Same colour for the three axes of one example window
            line_color = colors[sample_num % len(colors)]

            # Row 1 = x, row 2 = y, row 3 = z
            for axis_idx in range(3):
                fig.add_trace(
                    go.Scatter(
                        x=time_axis,
                        y=window[:, axis_idx],
                        mode='lines',
                        line=dict(color=line_color, width=1),
                        showlegend=False,
                        opacity=0.7
                    ),
                    row=axis_idx + 1, col=col_idx
                )

    # Update axes labels: time on the bottom row, acceleration on the first column
    for col_idx in range(1, n_activities + 1):
        fig.update_xaxes(title_text="Time (s)", row=3, col=col_idx)

    for row_idx in range(1, 4):
        fig.update_yaxes(title_text="Acceleration (g)", row=row_idx, col=1)

    fig.update_layout(
        height=800,
        title_text="Acceleration Patterns by Activity (X, Y, Z axes)",
        showlegend=False
    )

    return fig

# Plot up to two randomly chosen example windows for every activity present in Y
fig = plot_acceleration_by_activity_lineplot(X, Y, samples_per_activity=2)
fig.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def apply_pca_visualization(X, Y, n_components=2, random_state=None):
    """
    Apply PCA to visualize activity clusters in 2D

    Parameters:
    -----------
    X : np.array
        Shape (n_windows, 1000, 3) - windowed acceleration data
    Y : np.array
        Shape (n_windows,) - activity labels
    n_components : int
        Passed to sklearn's PCA (default 2). The fit must yield at least two
        components because the scatter plot uses PC1 and PC2; any further
        components are fitted and reported but not plotted.
    random_state : int or None
        Passed to sklearn's PCA so that its randomized solver, when used,
        gives repeatable results (default None)

    Returns:
    --------
    fig : plotly figure
        Scatter of PC1 vs PC2 coloured by activity
    pca : PCA
        The fitted PCA model
    df_pca : pd.DataFrame
        Columns 'PC1', 'PC2' and 'Activity', one row per window

    Raises:
    -------
    ValueError
        If the fitted PCA has fewer than two components
    """
    # Flatten each window into a feature vector
    # From (n_windows, 1000, 3) to (n_windows, 3000)
    X_flattened = X.reshape(X.shape[0], -1)

    print(f"Original shape: {X.shape}")
    print(f"Flattened shape: {X_flattened.shape}")

    # Standardize features (PCA needs normalization)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_flattened)

    # Apply PCA
    pca = PCA(n_components=n_components, random_state=random_state)
    X_pca = pca.fit_transform(X_scaled)

    # The plot below indexes PC1 and PC2; fail with a clear message instead of
    # an IndexError when only one component was produced
    if X_pca.shape[1] < 2:
        raise ValueError(
            "apply_pca_visualization needs at least 2 principal components "
            f"to draw the scatter plot, got {X_pca.shape[1]}"
        )

    # Create dataframe for plotting
    df_pca = pd.DataFrame({
        'PC1': X_pca[:, 0],
        'PC2': X_pca[:, 1],
        'Activity': Y
    })

    # Print explained variance
    explained_var = pca.explained_variance_ratio_
    print(f"\nExplained variance ratio:")
    for i, var in enumerate(explained_var):
        print(f"  PC{i+1}: {var*100:.2f}%")
    print(f"  Total: {sum(explained_var)*100:.2f}%")

    # Create visualization
    fig = px.scatter(
        df_pca,
        x='PC1',
        y='PC2',
        color='Activity',
        title=f'PCA: Activity Clustering <br>Explained Variance: PC1={explained_var[0]*100:.1f}%, PC2={explained_var[1]*100:.1f}%',
        labels={'PC1': f'PC1 ({explained_var[0]*100:.1f}%)',
                'PC2': f'PC2 ({explained_var[1]*100:.1f}%)'},
        opacity=0.6,
        height=600
    )

    fig.update_traces(marker=dict(size=5))

    return fig, pca, df_pca

# Apply 2D PCA to the windowed data; the call also returns the fitted PCA model and the projected points
fig_2d, pca_model, df_pca = apply_pca_visualization(X, Y, n_components=2)
fig_2d.show()

Original shape: (6826, 1000, 3)
Flattened shape: (6826, 3000)

Explained variance ratio:
  PC1: 31.17%
  PC2: 28.30%
  Total: 59.47%


### Takeaways from the PCA visualization above:
    1. Sleep detection (almost perfect separation)
    2. Bicycling detection (distinct cluster)
    3. Walking vs static activities (clear left-right separation)

Seeing this makes me wonder whether a hierarchical classification approach would work well in this case.

In [65]:
# Display the annotation-to-label dictionary
annotation_df

Unnamed: 0_level_0,label:WillettsSpecific2018,label:WillettsMET2018,label:DohertySpecific2018,label:Willetts2018,label:Doherty2018,label:Walmsley2020
annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7030 sleeping;MET 0.95,sleep,sleep,sleep,sleep,sleep,sleep
occupation;office and administrative support;11580 office/computer work general;MET 1.5,sitting,sitstand+lowactivity,sedentary-screen,sit-stand,sedentary,sedentary
home activity;household chores;preparing meals/cooking/washing dishes;5035 kitchen activity general cooking/washing/dishes/cleaning up;MET 3.3,household-chores,sitstand+activity,tasks-moderate,mixed,moderate,light
occupation;office and administrative support;11580 office wok/computer work general;MET 1.5,sitting,sitstand+lowactivity,sedentary-screen,sit-stand,sedentary,sedentary
home activity;miscellaneous;sitting;9060 sitting/lying reading or without observable/identifiable activities;MET 1.3,sitting,sitstand+lowactivity,sedentary-non-screen,sit-stand,sedentary,sedentary
...,...,...,...,...,...,...
transportation;walking;17250 walking as the single means to a destination not to work or class;MET 3.0,mixed-activity,walking,walking,mixed,walking,moderate-vigorous
transportation;walking;17270 walking as the single means to work or class (not from);MET 3.5,walking,walking,walking,walking,walking,moderate-vigorous
transportation;public transportation;16016 riding in a bus or train;MET 1.3,vehicle,vehicle,vehicle,vehicle,sedentary,sedentary
household-chores;sitstand+lowactivity;MET 2.8,household-chores,sitstand+lowactivity,tasks-light,mixed,tasks-light,light
