# Pipeline for Production
For simplicity, this notebook reads the input data from a DataFrame. In a real-world scenario the data could be uploaded or pulled from another service, in which case the code can be adapted accordingly. The Python code for the end-to-end pipeline is provided in: <br>
https://github.com/arifhaidari/step_detection_data_science/blob/main/src/calculated_steps_generator.py <br>
If you run this Python script (make sure the paths are correct and the input data is available), it will generate a calculated_steps.json file from the raw sensor data.

In [4]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Load the preprocessed sensor readings. The first CSV column is used as the
# index and the "time" column is parsed as datetimes.
df = pd.read_csv('../data/preprocessed_data.csv', index_col=0, parse_dates=["time"])
# Display the first rows as a quick sanity check of the loaded data.
df.head()

Unnamed: 0_level_0,ax,gz,gx,az,gy,ay,id,side,time_diff
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-06-14 07:19:32.380,0.031798,-0.988506,-0.843705,-1.004893,0.523758,0.063902,MRBF3DNuWq0zhSXajwPy,R,
2024-06-14 07:19:32.382,0.031909,-0.989812,-0.838482,-1.004933,0.519988,0.063587,MRBF3DNuWq0zhSXajwPy,R,0.002
2024-06-14 07:19:32.384,0.032016,-0.990767,-0.833668,-1.004969,0.513473,0.063274,MRBF3DNuWq0zhSXajwPy,R,0.002
2024-06-14 07:19:32.387,0.032117,-0.991283,-0.829232,-1.004998,0.503391,0.062965,MRBF3DNuWq0zhSXajwPy,R,0.003
2024-06-14 07:19:32.389,0.032212,-0.991273,-0.825124,-1.00502,0.488897,0.062665,MRBF3DNuWq0zhSXajwPy,R,0.002


In [2]:
import joblib
# Load the persisted model from disk (presumably a random forest, going by the
# file name).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
loaded_model = joblib.load('../models/random_forest_model.pkl')

# Example of using the loaded model to make predictions (X_test is not defined
# in this notebook):
# predictions = loaded_model.predict(X_test)

In [45]:
from scipy.signal import find_peaks

def extract_features(df, peak_height=-1.66):
    """
    Extract session-level features from raw sensor data.

    The input frame is never modified; all work happens on derived copies.

    Parameters:
        df (pd.DataFrame): Sensor data containing the columns
            ['id', 'ax', 'ay', 'az', 'gx', 'gy', 'gz', 'side']. Timestamps
            are taken from a 'time' column when present, otherwise from the
            index (converted to datetimes if necessary).
        peak_height (float): Minimum 'az' value for a local maximum to be
            counted; forwarded to scipy.signal.find_peaks as ``height``.

    Returns:
        pd.DataFrame: One row per session id with, in this order: 'id',
            mean/std/min/max of every sensor axis, 'left_steps',
            'right_steps', 'session_duration' (seconds), 'num_measurements',
            'start_time' and 'end_time'. The time-based values are computed
            per session, not over the whole input.

    Raises:
        KeyError: If a required column is missing.
    """
    sensor_axes = ["ax", "ay", "az", "gx", "gy", "gz"]
    missing = [col for col in ["id", "side", *sensor_axes] if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Build a datetime-indexed frame without touching the caller's object.
    if "time" in df.columns:
        df = df.assign(time=pd.to_datetime(df["time"])).set_index("time")
    elif not isinstance(df.index, pd.DatetimeIndex):
        df = df.set_axis(pd.to_datetime(df.index), axis=0)

    # Peak counting depends on row order, so always work chronologically.
    # A stable sort keeps the original order of rows sharing a timestamp.
    df = df.sort_index(kind="mergesort")

    # Aggregate sensor statistics per session and flatten the column names
    # to '<axis>_<stat>' (e.g. 'ax_mean').
    session_features = df.groupby("id")[sensor_axes].agg(["mean", "std", "min", "max"])
    session_features.columns = ["_".join(col) for col in session_features.columns]
    session_features = session_features.reset_index()

    def count_peaks(series):
        # Number of local maxima in the signal that reach `peak_height`.
        peaks, _ = find_peaks(series, height=peak_height)
        return len(peaks)

    # Count 'az' peaks per session and side; a side that never occurs gets 0.
    per_side = df.groupby(["id", "side"])["az"].apply(count_peaks).unstack(fill_value=0)
    # Keep exactly the L/R counts so unexpected 'side' values cannot leak into
    # the feature set as extra columns.
    step_counts = (
        per_side.reindex(columns=["L", "R"], fill_value=0)
        .astype(int)
        .rename(columns={"L": "left_steps", "R": "right_steps"})
        .rename_axis(columns=None)
        .reset_index()
    )

    final_df = session_features.merge(step_counts, on="id")

    # Time-based features, computed per session id so that inputs holding
    # several sessions get each session's own duration, size and bounds.
    timestamps = df.index.to_series().groupby(df["id"].to_numpy())
    start_by_id = timestamps.min()
    end_by_id = timestamps.max()
    session_ids = final_df["id"]
    final_df["session_duration"] = session_ids.map(
        (end_by_id - start_by_id).dt.total_seconds()
    )
    final_df["num_measurements"] = session_ids.map(timestamps.size())
    final_df["start_time"] = session_ids.map(start_by_id)
    final_df["end_time"] = session_ids.map(end_by_id)

    return final_df

In [46]:
# Generate Predictions for a New Session
def predict_steps(new_session_data, model=None):
    """
    Predict left/right step counts for the session(s) in raw sensor data.

    Parameters:
        new_session_data (pd.DataFrame): Raw sensor data in the format
            accepted by extract_features.
        model: Fitted estimator whose ``predict`` returns a 2-D array with
            the left-step prediction in column 0 and the right-step
            prediction in column 1. Defaults to the module-level
            ``loaded_model``.

    Returns:
        list[dict]: One record per session with the keys 'id', 'start_time',
            'end_time' (both formatted as '%Y-%m-%dT%H:%M:%S'), 'left_steps'
            and 'right_steps' (predictions rounded to integers).
    """
    if model is None:
        model = loaded_model

    features = extract_features(new_session_data)

    # Identifiers, timestamps and the peak-based step counts are not model
    # inputs; everything else is passed to the model as-is.
    non_model_columns = ["id", "start_time", "end_time", "left_steps", "right_steps"]
    X_new = features.drop(columns=non_model_columns)

    predicted = model.predict(X_new)

    # Assemble the output as a fresh frame instead of renaming a column
    # selection in place, which avoids pandas' SettingWithCopy pitfalls and
    # leaves the feature frame untouched.
    timestamp_format = "%Y-%m-%dT%H:%M:%S"
    output = pd.DataFrame({
        "id": features["id"],
        "start_time": features["start_time"].dt.strftime(timestamp_format),
        "end_time": features["end_time"].dt.strftime(timestamp_format),
        "left_steps": predicted[:, 0].round().astype(int),
        "right_steps": predicted[:, 1].round().astype(int),
    })

    return output.to_dict(orient="records")

# Example prediction: select the rows of a single session by its id and run
# the full feature-extraction + prediction pipeline on them.
example_session = df[df["id"] == "MRBF3DNuWq0zhSXajwPy"]
# Debug output of the selected rows (disabled):
# print('value of example_session')
# print(example_session)
predicted_steps = predict_steps(example_session)
# predict_steps returns a list of per-session records.
print(f"Predicted Steps:\n {predicted_steps}")


Predicted Steps:
 [{'id': 'MRBF3DNuWq0zhSXajwPy', 'start_time': '2024-06-14T07:19:32', 'end_time': '2024-06-14T07:19:44', 'left_steps': 99, 'right_steps': 106}]
