In [None]:
import os
import pandas as pd
import numpy as np
import sys
import warnings
warnings.filterwarnings("ignore")

# PROJECT_ROOT = os.getcwd()  
# sys.path.append(os.path.join(PROJECT_ROOT))
RAW_DATA_PATH = "../data/data_raw"
EXTRACT_DIR = "../data/data_extracted"
 
sys.path.append(os.path.join('../src'))

from data_loader import DataLoader

data_loader = DataLoader(RAW_DATA_PATH)
df = data_loader.load_json_data()

# to save the raw extracted data
# path_to_save = os.path.join(EXTRACT_DIR, 'raw_extracted_data.csv')
# df.to_csv(path_to_save, index=False)
# print('SUCCESS')

### Alternative Import
import it from saved ones

In [173]:


# df = pd.read_csv('../data/data_extracted/raw_extracted_data.csv', index_col=0, parse_dates=["time"])
df = pd.read_csv('../data/data_extracted/raw_extracted_data.csv', parse_dates=["time"])
df.head()

Unnamed: 0,time,ax,gz,gx,az,gy,ay,id,side
0,2024-06-14 12:20:22.332,0.052216,0.0,-0.42,-1.077016,3.01,0.028792,qOFuUi37Xw1hextwdqbQ,L
1,2024-06-14 12:20:22.334,0.054168,0.07,-0.49,-1.081896,2.73,0.025864,qOFuUi37Xw1hextwdqbQ,L
2,2024-06-14 12:20:22.336,0.053192,0.14,-0.28,-1.079944,2.73,0.028792,qOFuUi37Xw1hextwdqbQ,L
3,2024-06-14 12:20:22.339,0.052704,0.07,-0.35,-1.077016,2.73,0.027816,qOFuUi37Xw1hextwdqbQ,L
4,2024-06-14 12:20:22.341,0.049288,0.07,-0.42,-1.07848,2.8,0.0244,qOFuUi37Xw1hextwdqbQ,L


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2248628 entries, 0 to 2248627
Data columns (total 9 columns):
 #   Column  Dtype         
---  ------  -----         
 0   time    datetime64[ns]
 1   ax      float64       
 2   gz      float64       
 3   gx      float64       
 4   az      float64       
 5   gy      float64       
 6   ay      float64       
 7   id      object        
 8   side    object        
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 154.4+ MB


In [80]:
df.isna().sum()

time    2230
ax         0
gz         0
gx         0
az         0
gy         0
ay         0
id         0
side       0
dtype: int64

In [174]:
# missing values
df["time"] = df["time"].fillna(method="ffill")

In [175]:
# sort
df.set_index('time', inplace=True)
df = df.sort_index()

In [176]:
df['time_diff'] = df.index.to_series().diff().dt.total_seconds()
# df['time_diff'].describe()

In [177]:
from scipy.signal import butter, filtfilt
fs = 1 / df['time_diff'].median()  # Hz (sampling frequency)

def low_pass_filter(data, cutoff=5, fs=fs, order=4):
    nyquist = 0.5 * fs  # Nyquist Frequency
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return filtfilt(b, a, data)


# only applying filter on gyroscope because there are a lot of fluctuation 
sensor_columns = ["gx", "gy", "gz"]
# sensor_columns = ["ax", "ay", "az", "gx", "gy", "gz"]

for col in sensor_columns:
    df[col] = low_pass_filter(df[col])


In [178]:
df['acc_magnitude'] = np.sqrt(df['ax']**2 + df['ay']**2 + df['az']**2)
df['gyro_magnitude'] = np.sqrt(df['gx']**2 + df['gy']**2 + df['gz']**2)

### Feature Engineering

In [154]:
# Reset index to bring 'time' back as a column
df_reset = df.reset_index()

# Group by 'id' and aggregate numerical features
session_features = df_reset.groupby("id").agg({
    "ax": ["mean", "std", "min", "max"],
    "ay": ["mean", "std", "min", "max"],
    "az": ["mean", "std", "min", "max"],
    "gx": ["mean", "std", "min", "max"],
    "gy": ["mean", "std", "min", "max"],
    "gz": ["mean", "std", "min", "max"],
    "acc_magnitude": ["mean", "std", "min", "max"],
    "gyro_magnitude": ["mean", "std", "min", "max"],
}).reset_index()

# Flatten MultiIndex column names
# session_features.columns = ["_".join(col).strip() if isinstance(col, tuple) else col for col in session_features.columns]
session_features.columns = ["_".join(col).strip() for col in session_features.columns.values]
session_features = session_features.rename(columns={"id_": "id"})

# Compute start_time and end_time
time_stats = df_reset.groupby("id")["time"].agg(["min", "max"]).reset_index()
time_stats.rename(columns={"min": "start_time", "max": "end_time"}, inplace=True)

# Calculate session duration in seconds
time_stats["session_duration"] = (time_stats["end_time"] - time_stats["start_time"]).dt.total_seconds()

# Calculate num_measurements using size()
num_measurements = df_reset.groupby("id").size().reset_index(name="num_measurements")

# Merge all computed features
session_features = session_features.merge(time_stats, on="id").merge(num_measurements, on="id")

In [155]:
# in order to see all the columns in dataframe
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

session_features.head()

Unnamed: 0,id,ax_mean,ax_std,ax_min,ax_max,ay_mean,ay_std,ay_min,ay_max,az_mean,az_std,az_min,az_max,gx_mean,gx_std,gx_min,gx_max,gy_mean,gy_std,gy_min,gy_max,gz_mean,gz_std,gz_min,gz_max,acc_magnitude_mean,acc_magnitude_std,acc_magnitude_min,acc_magnitude_max,gyro_magnitude_mean,gyro_magnitude_std,gyro_magnitude_min,gyro_magnitude_max,start_time,end_time,session_duration,num_measurements
0,033nuFnKoOjj4NeIt9FS,0.145444,1.096804,-7.631832,9.104616,-0.010198,0.317348,-7.801656,7.797264,-1.323169,0.873987,-15.910264,5.766696,2.258583,23.254951,-46.035638,81.707701,0.334082,144.814593,-243.747121,339.899397,-1.192993,19.582512,-56.050126,47.338059,1.603012,1.126874,0.182386,18.854183,116.984669,90.638412,0.583855,351.501588,2024-06-14 12:57:50.585,2024-06-14 12:58:02.630,12.045,9530
1,0373xrf1eaJoc8IcE6Gc,0.061137,0.706941,-3.072936,6.193208,-0.029542,0.235787,-4.68968,3.992816,-1.243727,0.615819,-15.767768,10.004488,-0.018356,18.235415,-70.317366,72.982832,-0.010302,99.339888,-189.802508,292.703551,-9.042328,32.769593,-177.501529,46.560558,1.378055,0.766161,0.237786,15.774594,72.178656,78.399415,0.526052,294.842479,2024-06-14 09:58:41.709,2024-06-14 09:59:01.296,19.587,15659
2,04SwmTFshylAIDUNCYTh,0.105682,0.750896,-2.777208,5.259664,-0.01188,0.233459,-3.365248,2.351184,-1.275432,0.623397,-13.039848,6.210776,-0.973189,10.924582,-50.466418,42.774182,-0.268863,87.416961,-181.390801,200.462326,-9.011266,35.031561,-195.731268,96.067855,1.428928,0.776604,0.354213,13.114848,70.081566,64.485414,0.303466,200.90762,2024-06-14 10:06:32.231,2024-06-14 10:06:49.748,17.517,6806
3,0AxduuyH7QvfV841ANdD,0.17725,0.714278,-2.987536,5.20696,9.2e-05,0.341121,-4.249992,2.898232,-1.291535,0.431663,-6.796864,1.918816,-0.665033,24.790192,-51.842123,74.864055,1.369568,116.657076,-177.372507,293.259517,-0.070815,23.579128,-51.298438,61.004087,1.446304,0.64849,0.32869,8.971079,91.899045,79.595951,0.243727,305.233376,2024-06-14 08:14:32.793,2024-06-14 08:14:46.480,13.687,10892
4,0bYDrU653eQr2GwcMXXw,0.111582,1.038022,-5.911144,6.0878,0.002004,0.270595,-3.474072,6.768072,-1.30526,0.6762,-10.455888,2.7816,-1.021105,30.038174,-70.417917,76.356968,1.103814,144.109075,-226.493813,325.675396,1.278941,19.458435,-37.587349,59.91754,1.532167,0.988192,0.370423,13.034219,110.677028,99.003615,0.553425,336.603153,2024-06-14 09:31:03.983,2024-06-14 09:31:17.668,13.685,10892


### Calculating the Height

In [157]:
# using mean of acc_magnitude
mean_acc_magnitude = df["acc_magnitude"].mean()
std_acc_magnitude = df["acc_magnitude"].std()

mean_height = mean_acc_magnitude + 0.5 * std_acc_magnitude  

peaks, _ = find_peaks(df["acc_magnitude"], height=mean_height)

print("peaks", len(peaks))
print("mean_az",mean_acc_magnitude)
print("std_az",std_acc_magnitude)
print("height", mean_height)

peaks 303275
mean_az 1.4666781837688312
std_az 0.9405900599158591
height 1.9369732137267608


### Finding the steps

In [161]:
from scipy.signal import find_peaks

fs = 1 / df['time_diff'].median()  # Sampling rate in Hz (samples / seconds)
min_distance = int(0.5 * fs)  # Minimum distance between peaks (0.5 seconds)
# every step takes around 0.5 seconds

def count_peaks(series):
    peaks, _ = find_peaks(series, height=mean_height, distance=min_distance)  
    return len(peaks)

# Count steps based on side (L or R)
step_counts = df.groupby(["id", "side"])["acc_magnitude"].apply(count_peaks).unstack(fill_value=0)

# Check if 'side' values contain 'L' (left) and 'R' (right), then assign them explicitly
step_counts['left_steps'] = step_counts.get('L', 0)
step_counts['right_steps'] = step_counts.get('R', 0)

# Remove the temporary 'L' and 'R' columns to avoid keeping them in the final output
step_counts = step_counts.drop(columns=['L', 'R'], errors='ignore')

session_features = session_features.rename(columns={"id_": "id"})
final_df = session_features.merge(step_counts, on="id")

final_df.head()


Unnamed: 0,id,ax_mean,ax_std,ax_min,ax_max,ay_mean,ay_std,ay_min,ay_max,az_mean,az_std,az_min,az_max,gx_mean,gx_std,gx_min,gx_max,gy_mean,gy_std,gy_min,gy_max,gz_mean,gz_std,gz_min,gz_max,acc_magnitude_mean,acc_magnitude_std,acc_magnitude_min,acc_magnitude_max,gyro_magnitude_mean,gyro_magnitude_std,gyro_magnitude_min,gyro_magnitude_max,start_time,end_time,session_duration,num_measurements,left_steps,right_steps
0,033nuFnKoOjj4NeIt9FS,0.145444,1.096804,-7.631832,9.104616,-0.010198,0.317348,-7.801656,7.797264,-1.323169,0.873987,-15.910264,5.766696,2.258583,23.254951,-46.035638,81.707701,0.334082,144.814593,-243.747121,339.899397,-1.192993,19.582512,-56.050126,47.338059,1.603012,1.126874,0.182386,18.854183,116.984669,90.638412,0.583855,351.501588,2024-06-14 12:57:50.585,2024-06-14 12:58:02.630,12.045,9530,4,5
1,0373xrf1eaJoc8IcE6Gc,0.061137,0.706941,-3.072936,6.193208,-0.029542,0.235787,-4.68968,3.992816,-1.243727,0.615819,-15.767768,10.004488,-0.018356,18.235415,-70.317366,72.982832,-0.010302,99.339888,-189.802508,292.703551,-9.042328,32.769593,-177.501529,46.560558,1.378055,0.766161,0.237786,15.774594,72.178656,78.399415,0.526052,294.842479,2024-06-14 09:58:41.709,2024-06-14 09:59:01.296,19.587,15659,6,7
2,04SwmTFshylAIDUNCYTh,0.105682,0.750896,-2.777208,5.259664,-0.01188,0.233459,-3.365248,2.351184,-1.275432,0.623397,-13.039848,6.210776,-0.973189,10.924582,-50.466418,42.774182,-0.268863,87.416961,-181.390801,200.462326,-9.011266,35.031561,-195.731268,96.067855,1.428928,0.776604,0.354213,13.114848,70.081566,64.485414,0.303466,200.90762,2024-06-14 10:06:32.231,2024-06-14 10:06:49.748,17.517,6806,4,4
3,0AxduuyH7QvfV841ANdD,0.17725,0.714278,-2.987536,5.20696,9.2e-05,0.341121,-4.249992,2.898232,-1.291535,0.431663,-6.796864,1.918816,-0.665033,24.790192,-51.842123,74.864055,1.369568,116.657076,-177.372507,293.259517,-0.070815,23.579128,-51.298438,61.004087,1.446304,0.64849,0.32869,8.971079,91.899045,79.595951,0.243727,305.233376,2024-06-14 08:14:32.793,2024-06-14 08:14:46.480,13.687,10892,5,6
4,0bYDrU653eQr2GwcMXXw,0.111582,1.038022,-5.911144,6.0878,0.002004,0.270595,-3.474072,6.768072,-1.30526,0.6762,-10.455888,2.7816,-1.021105,30.038174,-70.417917,76.356968,1.103814,144.109075,-226.493813,325.675396,1.278941,19.458435,-37.587349,59.91754,1.532167,0.988192,0.370423,13.034219,110.677028,99.003615,0.553425,336.603153,2024-06-14 09:31:03.983,2024-06-14 09:31:17.668,13.685,10892,5,5


In [164]:
display(final_df[['left_steps', 'right_steps', 'id', 'acc_magnitude_mean', 'session_duration', 'num_measurements']].max())
print("*" * 60)
display(final_df[['left_steps', 'right_steps', 'id', 'acc_magnitude_mean', 'session_duration', 'num_measurements']].min())

left_steps                              11
right_steps                              9
id                    zmCkqhYTl03DYdauXAtc
acc_magnitude_mean                1.741766
session_duration                 15506.002
num_measurements                     39944
dtype: object

************************************************************


left_steps                               2
right_steps                              0
id                    033nuFnKoOjj4NeIt9FS
acc_magnitude_mean                1.092013
session_duration                    10.938
num_measurements                      4082
dtype: object

### Saving Preprocessed Data:

In [165]:
# to save the raw extracted data
path_to_save = os.path.join(EXTRACT_DIR, 'preprocessed_data.csv')
final_df.to_csv(path_to_save, index=False)
print('SUCCESS')

SUCCESS


In [29]:
from watermark import watermark
print(watermark())

Last updated: 2025-02-22T11:20:39.835645+01:00

Python implementation: CPython
Python version       : 3.10.14
IPython version      : 8.27.0

Compiler    : Clang 12.0.0 (clang-1200.0.32.2)
OS          : Darwin
Release     : 19.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit

