In [1]:
import os
import json
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Read the preprocessed sensor export; the first CSV column becomes the row index.
df = pd.read_csv(
    filepath_or_buffer='../data/preprocessed_data.csv',
    index_col=0,
)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0_level_0,ax,gz,gx,az,gy,ay,id,side,time_diff
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-06-14 07:19:32.380,0.031798,-0.988506,-0.843705,-1.004893,0.523758,0.063902,MRBF3DNuWq0zhSXajwPy,R,
2024-06-14 07:19:32.382,0.031909,-0.989812,-0.838482,-1.004933,0.519988,0.063587,MRBF3DNuWq0zhSXajwPy,R,0.002
2024-06-14 07:19:32.384,0.032016,-0.990767,-0.833668,-1.004969,0.513473,0.063274,MRBF3DNuWq0zhSXajwPy,R,0.002
2024-06-14 07:19:32.387,0.032117,-0.991283,-0.829232,-1.004998,0.503391,0.062965,MRBF3DNuWq0zhSXajwPy,R,0.003
2024-06-14 07:19:32.389,0.032212,-0.991273,-0.825124,-1.00502,0.488897,0.062665,MRBF3DNuWq0zhSXajwPy,R,0.002


In [4]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
filtered_df = df.copy(deep=True)
# Show (rows, columns) of the loaded frame.
df.shape

(2248628, 9)

#### Acceleration Magnitude (acc_magnitude)
- Combines ax, ay, and az into a single feature representing the total movement intensity.
- Helps distinguish between static periods and movement.
- The acceleration magnitude gives an overall measure of movement, making it useful for detecting foot strikes.

In [None]:
# Acceleration magnitude: the Euclidean (L2) norm of the ax, ay, az columns,
# collapsing the three axes into one per-row value.
filtered_df['acc_magnitude'] = np.sqrt(
    filtered_df['ax'].pow(2)
    .add(filtered_df['ay'].pow(2))
    .add(filtered_df['az'].pow(2))
)


#### Compute Rate of Change for Gyroscope Data (x, y, z)
- The gyroscope measures angular velocity, which can help detect step swings.
- I compute the first difference between consecutive samples (a per-sample approximation of the rate of change) to detect changes in movement intensity.

# Row-to-row difference of each gyroscope axis; the first row of each new
# column is NaN because it has no predecessor.
for source_col, target_col in (
    ("gz", "gz_change"),
    ("gy", "gy_change"),
    ("gx", "gx_change"),
):
    filtered_df[target_col] = filtered_df[source_col].diff()

#### Gyroscope Change Rate (gyro_change)
- Measures how rapidly the gyroscope values (gx, gy, gz) change.
- Useful for detecting sudden rotational changes, such as turning.

In [None]:

# Combined gyroscope change per row: the absolute row-to-row differences of
# gx, gy and gz, added together across the three axes.
filtered_df['gyro_change'] = (
    filtered_df.loc[:, ['gx', 'gy', 'gz']]
    .diff()
    .abs()
    .sum(axis='columns')
)

#### Step Detection (is_peak)

- Uses find_peaks() to locate peaks in acc_magnitude, assuming steps create spikes in movement.
- Parameters:
     - height=1.2: Ensures only significant movements are detected.
     - distance=20: Avoids detecting multiple peaks for the same step.

In [None]:
from scipy.signal import find_peaks

# Detect peaks in the acceleration magnitude (candidate steps).
# height=1.2 keeps only samples whose magnitude is at least 1.2;
# distance=20 requires at least 20 samples between neighbouring peaks.
# Both thresholds are dataset-dependent and should be tuned.
peaks, _ = find_peaks(
    filtered_df['acc_magnitude'].to_numpy(), height=1.2, distance=20
)

# find_peaks returns *positional* offsets into the input array, while
# filtered_df is indexed by the first CSV column (timestamps). The flags are
# therefore written by position; label-based `.loc[peaks, ...]` would look the
# integers up as index labels and fail or mark the wrong rows.
is_peak_flags = np.zeros(len(filtered_df), dtype=np.int64)
is_peak_flags[peaks] = 1
filtered_df['is_peak'] = is_peak_flags  # 1 at detected peaks, 0 elsewhere


#### Compute Step Time Intervals
- Time between steps is a key gait metric.
- Compute time difference between consecutive step detections (step peaks).

In [None]:
# Time elapsed between consecutive detected steps.
#
# The step flag column created by the peak-detection cell is `is_peak`
# (there is no `is_step` column). `time_diff` holds the gap between a row and
# the previous row, so its running sum is the elapsed time at each row; the
# interval between two steps is the difference of that running sum at the two
# peak rows. Everything is done by position so repeated index labels cannot
# misalign the result.
step_mask = filtered_df["is_peak"].eq(1).to_numpy()
# The first row has no predecessor (NaN gap); treat it as time zero.
elapsed_time = filtered_df["time_diff"].fillna(0).cumsum().to_numpy()

step_time_interval = np.full(len(filtered_df), np.nan)
# The first detected step has no previous step, hence the NaN prepend.
step_time_interval[step_mask] = np.diff(elapsed_time[step_mask], prepend=np.nan)
# NaN on every non-step row and on the first step.
filtered_df["step_time_interval"] = step_time_interval

#### Symmetry & Imbalance Features
- Walking asymmetry (left vs. right foot differences) is important for gait analysis.
- If left and right foot data are available, we compute differences in acceleration and step count.

In [None]:
# Example: compute left-right imbalance features when per-foot columns exist.
# Each feature is guarded by the columns it actually reads, so a frame that has
# the acceleration pair but not the step-flag pair (or vice versa) no longer
# raises a KeyError.
if {"ax_left", "ax_right"}.issubset(filtered_df.columns):
    # Per-row difference in x-axis acceleration between the two feet.
    filtered_df["acc_diff"] = filtered_df["ax_left"] - filtered_df["ax_right"]

if {"is_step_left", "is_step_right"}.issubset(filtered_df.columns):
    # Per-row difference between the left and right step flags.
    filtered_df["step_diff"] = filtered_df["is_step_left"] - filtered_df["is_step_right"]


In [11]:
# Print summary statistics (count, mean, std, min, quartiles, max) of the
# numeric columns as plain text.
print(filtered_df.describe(percentiles=[0.25, 0.5, 0.75]))

                 ax            gz            gx            az            gy  \
count  2.248628e+06  2.248628e+06  2.248628e+06  2.248628e+06  2.248628e+06   
mean   1.200217e-01 -2.658855e-01 -4.722533e-01 -1.272025e+00  2.483511e-01   
std    6.227705e-01  2.819690e+01  3.196942e+01  4.222818e-01  1.310447e+02   
min   -2.246148e+00 -2.689279e+02 -2.742725e+02 -3.901363e+00 -4.407931e+02   
25%   -6.503511e-02 -7.226034e+00 -8.046268e+00 -1.482821e+00 -9.359933e+01   
50%    1.326400e-02 -3.780756e-01 -3.948044e-01 -1.063839e+00  2.366885e-01   
75%    1.057819e-01  3.759453e+00  5.770688e+00 -1.009376e+00  4.497699e+00   
max    5.292321e+00  3.681251e+02  2.536795e+02  3.389588e-01  5.722992e+02   

                 ay     time_diff  
count  2.248628e+06  2.248627e+06  
mean   3.753519e-03  9.169069e-03  
std    1.570025e-01  2.196743e+00  
min   -1.090997e+00  0.000000e+00  
25%   -2.504415e-02  1.000000e-03  
50%    2.288007e-03  1.000000e-03  
75%    4.073420e-02  2.000000e-03  


In [8]:
# List the column labels currently present on filtered_df.
filtered_df.columns

Index(['ax', 'gz', 'gx', 'az', 'gy', 'ay', 'id', 'side', 'time_diff'], dtype='object')

#### Time Between Steps (time_diff_steps)
- Captures the temporal relationship between detected steps.
- Helps identify walking/running patterns.

In [None]:

# Time elapsed between consecutive detected steps (peaks in acc_magnitude).
#
# `time_diff` is the gap between a row and the previous row, so the running
# sum gives the elapsed time at each row; subtracting that running sum at
# consecutive peak rows yields the step-to-step interval. (Differencing the
# `time_diff` values themselves would only compare two sampling gaps.)
step_mask = filtered_df['is_peak'].eq(1).to_numpy()
peak_indices = filtered_df.index[step_mask]  # index labels of the peak rows

# The first row has no predecessor (NaN gap); treat it as time zero.
elapsed_time = filtered_df['time_diff'].fillna(0).cumsum().to_numpy()

# Write by position rather than by label so repeated index labels cannot
# misalign the assignment.
time_diff_steps = np.full(len(filtered_df), np.nan)
# The first detected step has no previous step, hence the NaN prepend.
time_diff_steps[step_mask] = np.diff(elapsed_time[step_mask], prepend=np.nan)
filtered_df['time_diff_steps'] = time_diff_steps

# Fill the remaining NaNs (non-peak rows and the first step) with the median
# interval. Assign the result back: a chained `fillna(..., inplace=True)` on a
# column selection is deprecated and has no effect under copy-on-write.
filtered_df['time_diff_steps'] = filtered_df['time_diff_steps'].fillna(
    filtered_df['time_diff_steps'].median()
)


#### Selecting Window Size
The window size (in rows) used by the rolling features below; adjust it based on the sampling rate and the duration of the movements of interest.

In [None]:
# Rolling-window length, counted in rows; tune as needed.
WINDOW_SIZE: int = 50

#### Rolling Window Features (Short-Term Trends)
- Compute rolling mean and standard deviation over a fixed time window to smooth out noise and capture local trends.
- This helps detect sustained motion patterns instead of isolated peaks.

#### Rolling Mean & Standard Deviation (acc_mag_mean, acc_mag_std)
- Why? Helps detect motion consistency and eliminates small noise variations.
- How?
     - rolling(window=WINDOW_SIZE, min_periods=1).mean() → Smooths fluctuations.
     - rolling(window=WINDOW_SIZE, min_periods=1).std() → Measures motion instability.

In [None]:
# Short-term trend of the acceleration magnitude: mean and standard deviation
# over a trailing window of WINDOW_SIZE rows. min_periods=1 lets the first rows
# use however many samples are available instead of yielding NaN for the mean.
acc_mag_window = filtered_df['acc_magnitude'].rolling(WINDOW_SIZE, min_periods=1)
filtered_df['acc_mag_mean'] = acc_mag_window.mean()
filtered_df['acc_mag_std'] = acc_mag_window.std()

#### Variance & Range Over Time Windows
Helps differentiate between stable movements (e.g., standing) and dynamic actions (e.g., walking, running).

#### Gyroscope Variance (gyro_variance)
- Identifies fast rotations (e.g., sharp turns or sudden changes in movement).
- By: Rolling window variance on gyro_change → Higher values indicate erratic (irregular) motion.

#### Acceleration Range (acc_range)
- Differentiates between stillness and intense motion (e.g., jumping or sprinting).
- By: Difference between max and min acceleration within a rolling window.

In [None]:
# Rolling variance of the combined gyroscope change over WINDOW_SIZE rows.
gyro_change_window = filtered_df['gyro_change'].rolling(WINDOW_SIZE, min_periods=1)
filtered_df['gyro_variance'] = gyro_change_window.var()

# Rolling range of the acceleration magnitude: windowed maximum minus
# windowed minimum over the same window length.
acc_range_window = filtered_df['acc_magnitude'].rolling(WINDOW_SIZE, min_periods=1)
filtered_df['acc_range'] = acc_range_window.max() - acc_range_window.min()


#### Peak-to-Peak Time Interval
- Captures how frequently a person is stepping or making significant movements.
- Helps in step frequency detection for activity recognition.

#### Step Count (step_count)
- Helps identify activity intensity over a window (e.g., slow walking vs. running).
- rolling(window=WINDOW_SIZE, min_periods=1).sum() → Counts steps in a moving window.

In [None]:
# Step count: number of flagged peaks inside the trailing WINDOW_SIZE-row
# window (partial windows at the start are allowed via min_periods=1).
filtered_df['step_count'] = (
    filtered_df['is_peak']
    .rolling(WINDOW_SIZE, min_periods=1)
    .sum()
)

#### Why Do These Features Improve Machine Learning Models?
- Reduces Noise → Rolling averages smooth data fluctuations.
- Captures Temporal Trends → Features reflect short-term movement patterns.
- Improves Step Recognition → Detects patterns in step timing and frequency.
- Enhances Classification → Models can differentiate between walking, running, and stillness.

#### Correlation Matrix

#### Feature Selection
I created a lot of features left and right :) but it's time to determine which ones are the most relevant and useful.