# Preprocessing for Asphalt Data

In [16]:
# Enable autoload for just updated files
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
import sys
import numpy as np
import pandas as pd
sys.path.append('../../')   # Add parent directory to Python path
from utils.preprocessing import *
from utils.segmentation import *
from utils.visualization import *

## Person 2

In [None]:
# Raw handlebar accelerometer recording for P2/E1.
df_one = pd.read_csv(
    '../../data/RoadRoughness/Raw/Asphalt/P2/E1/HandleBar/Accelerometer/Accelerometer.0.csv'
)

# Per-column count of missing values, as a quick integrity check on the raw file.
print(df_one.isna().sum())

# Plot the untrimmed signal (second argument is passed as None throughout this notebook).
plot_accelerometer_data(df_one, None)

Date         0
NTP          0
GNSS-Time    0
Acc-X        0
Acc-Y        0
Acc-Z        0
dtype: int64


In [22]:
def trim_initial_quiet_period(df: pd.DataFrame, threshold: float = 0.5, window_size: int = 100) -> pd.DataFrame:
    """
    Automatically trims the initial quiet period in accelerometer data where no significant movement occurs.

    Activity is detected from the rolling standard deviation of the 'Acc-X',
    'Acc-Y' and 'Acc-Z' columns: the three per-axis values are summed and the
    first row (at or after position ``window_size``) whose sum exceeds
    ``threshold`` marks the start of activity. The returned frame starts half
    a window before that row.

    Args:
        df: DataFrame with accelerometer data. Must contain the columns
            'Acc-X', 'Acc-Y', 'Acc-Z' and 'NTP'.
        threshold: Standard deviation threshold to detect activity
        window_size: Size of the rolling window for standard deviation calculation

    Returns:
        Trimmed DataFrame (a copy) starting from where activity begins. If no
        activity is detected -- including when ``df`` has no more than
        ``window_size`` rows -- the original DataFrame is returned unchanged.

    Note:
        All positions are row positions (as used by ``iloc``), not index
        labels, so the function behaves the same for any index. For the
        default RangeIndex the two coincide.
    """

    # Rolling standard deviation per axis, summed so that movement along any
    # direction contributes to the activity measure.
    roll_std_x = df['Acc-X'].rolling(window=window_size).std()
    roll_std_y = df['Acc-Y'].rolling(window=window_size).std()
    roll_std_z = df['Acc-Z'].rolling(window=window_size).std()
    combined_std = roll_std_x + roll_std_y + roll_std_z

    # Boolean mask of rows above the threshold, skipping the first
    # `window_size` rows. Working on the plain array keeps everything
    # positional; NaN entries compare as False.
    above_threshold = combined_std.gt(threshold).to_numpy()[window_size:]

    # An all-False (or empty) mask means nothing ever exceeded the threshold.
    # This has to be checked explicitly: argmax/idxmax on an all-False mask
    # still returns the first element, which would look like a detection.
    if not above_threshold.any():
        print("No significant activity detected in the dataset.")
        return df

    # Position of the first row above the threshold, relative to the full frame.
    activity_starts = window_size + int(above_threshold.argmax())

    # Keep half a window of samples before the detected start as a buffer.
    buffer = window_size // 2
    start_idx = max(0, activity_starts - buffer)

    trimmed_df = df.iloc[start_idx:].copy()

    # Print info about the trimming
    start_time = df.iloc[activity_starts]['NTP']
    original_len = len(df)
    trimmed_len = len(trimmed_df)
    removed_percentage = ((original_len - trimmed_len) / original_len) * 100

    print(f"Activity detected starting at index {activity_starts}")
    print(f"Trimmed {original_len - trimmed_len} datapoints ({removed_percentage:.1f}% of the dataset)")
    print(f"Activity start time (NTP): {start_time}")

    return trimmed_df

In [28]:
# Remove the quiet lead-in from the P2 recording, using a rolling-std
# threshold of 0.9 for this file.
trimmed_df_one = trim_initial_quiet_period(
    df_one,
    threshold=0.9,
)

# Plot the trimmed signal to check where the cut landed.
plot_accelerometer_data(trimmed_df_one, None)

Activity detected starting at index 3007
Trimmed 2957 datapoints (28.0% of the dataset)
Activity start time (NTP): 2024-11-18 15:38:54.207000


## Person 3

In [46]:
# Raw handlebar accelerometer recording for P3/E2.
df_three = pd.read_csv(
    '../../data/RoadRoughness/Raw/Asphalt/P3/E2/HandleBar/Accelerometer/Accelerometer.0.csv'
)

# Plot the untrimmed signal (second argument is passed as None throughout this notebook).
plot_accelerometer_data(df_three, None)


In [47]:
# Remove the quiet lead-in from the P3 recording, using a rolling-std
# threshold of 1.5 for this file.
trimmed_df_three = trim_initial_quiet_period(
    df_three,
    threshold=1.5,
)

# Plot the trimmed signal to check where the cut landed.
plot_accelerometer_data(trimmed_df_three, None)

Activity detected starting at index 2707
Trimmed 2657 datapoints (33.9% of the dataset)
Activity start time (NTP): 2024-11-18 16:56:14.189000


## Person 4

In [48]:
# Raw handlebar accelerometer recording for P4/E1.
df_four = pd.read_csv(
    '../../data/RoadRoughness/Raw/Asphalt/P4/E1/HandleBar/Accelerometer/Accelerometer.0.csv'
)

# Plot the untrimmed signal (second argument is passed as None throughout this notebook).
plot_accelerometer_data(df_four, None)

In [49]:
# Remove the quiet lead-in from the P4 recording, using a rolling-std
# threshold of 0.9 for this file.
trimmed_df_four = trim_initial_quiet_period(
    df_four,
    threshold=0.9,
)

# Plot the trimmed signal to check where the cut landed.
plot_accelerometer_data(trimmed_df_four, None)

Activity detected starting at index 2326
Trimmed 2276 datapoints (32.2% of the dataset)
Activity start time (NTP): 2024-11-23 15:05:37.941000
