# Preprocessing for Dirt Road Data

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported project modules (e.g. the utils.* helpers) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import numpy as np
sys.path.append('../../')   # Add parent directory to Python path
from utils.preprocessing import *
from utils.segmentation import *
from utils.visualization import *

# P1 Dirt Road E1-E3

In [3]:
# Load the P1 handlebar accelerometer recordings of experiments E1, E2 and E3,
# stack them into a single frame, and report the missing-value count per column.
# No imputation or row dropping happens here; the nulls are only counted.
df_one, df_two, df_three = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/RoadRoughness/Raw/Dirt/P1/E1/HandleBar/Accelerometer/Accelerometer.0.csv',
        '../../data/RoadRoughness/Raw/Dirt/P1/E2/HandleBar/Accelerometer/Accelerometer.0.csv',
        '../../data/RoadRoughness/Raw/Dirt/P1/E3/HandleBar/Accelerometer/Accelerometer.0.csv',
    )
)

# ignore_index=True gives the stacked frame a fresh 0..n-1 row index.
df_combined = pd.concat((df_one, df_two, df_three), ignore_index=True)

null_counts = df_combined.isna().sum()
print(null_counts)
print(df_combined.shape)

Date         0
NTP          0
GNSS-Time    0
Acc-X        0
Acc-Y        0
Acc-Z        0
dtype: int64
(37533, 6)


In [4]:
# Display the combined E1-E3 frame; the notebook renders a truncated
# head/tail preview rather than every row.
df_combined

Unnamed: 0,Date,NTP,GNSS-Time,Acc-X,Acc-Y,Acc-Z
0,2024-11-18T16:43:48.217,2024-11-18 16:43:47.930,-1,0.188734,3.580859,9.596416
1,2024-11-18T16:43:48.220,2024-11-18 16:43:47.932,-1,0.306281,3.691228,9.253345
2,2024-11-18T16:43:48.224,2024-11-18 16:43:47.937,-1,0.276371,4.219444,9.180363
3,2024-11-18T16:43:48.229,2024-11-18 16:43:47.942,-1,0.420539,4.119543,8.195717
4,2024-11-18T16:43:48.232,2024-11-18 16:43:47.944,-1,0.499801,4.034897,8.148159
...,...,...,...,...,...,...
37528,2024-11-18T16:47:59.085,2024-11-18 16:47:58.798,-1,-0.382553,4.278367,8.721540
37529,2024-11-18T16:47:59.089,2024-11-18 16:47:58.802,-1,-0.474377,4.296612,8.516953
37530,2024-11-18T16:47:59.097,2024-11-18 16:47:58.809,-1,-0.704686,4.242774,8.593225
37531,2024-11-18T16:47:59.097,2024-11-18 16:47:58.809,-1,-0.579063,4.229613,8.708977


In [5]:
# Plot the raw combined signal. df_combined is the concatenation of the E1, E2
# and E3 recordings, so the title names all three experiments instead of E1 only.
plot_accelerometer_data(df_combined, 'E1-E3 P1 Handlebar Accelerometer Data of Dirt Road')

In [None]:
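# NTP time ranges (on 2024-11-18) to remove from the combined recording:
# 1. before 16:45:45
# 2. between 16:44:31 and 16:45:44
# 3. between 16:46:10 and 16:47:35
# NOTE: as written, range 2 lies entirely inside range 1 -- TODO confirm the
# intended cut-off time for range 1.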


In [6]:
# Parse the NTP column into datetimes so it can be compared with the cut-off times.
df_combined['NTP'] = pd.to_datetime(df_combined['NTP'])
ntp = df_combined['NTP']

# A row is discarded when its NTP timestamp falls in any of these three ranges.
# NOTE(review): the first range (< 16:45:45) already contains the whole second
# range (16:44:31 - 16:45:44), so the second condition removes nothing extra.
# Presumably the first cut-off was meant to be an earlier time -- TODO confirm
# before changing it, since the saved outputs were produced with these values.
in_range_1 = ntp < '2024-11-18 16:45:45'
in_range_2 = (ntp >= '2024-11-18 16:44:31') & (ntp <= '2024-11-18 16:45:44')
in_range_3 = (ntp >= '2024-11-18 16:46:10') & (ntp <= '2024-11-18 16:47:35')

# Keep only rows outside every range (De Morgan form of ~(a | b | c)).
mask = ~in_range_1 & ~in_range_2 & ~in_range_3

# Overwrites df_combined with the kept rows and renumbers the index from 0.
df_combined = df_combined.loc[mask].reset_index(drop=True)

In [7]:
# Report the (rows, columns) of df_combined as it stands after the time-range filter.
print(df_combined.shape)

(12074, 6)


In [8]:
# Persist the filtered frame as CSV; index=False keeps the row index out of the file.
filtered_csv_path = '../../data/RoadRoughness/Raw/Dirt/P1/Accelerometer_filtered.csv'
df_combined.to_csv(filtered_csv_path, index=False)

In [9]:
# Downsampling: re-read the filtered CSV, keep the timestamp plus the three
# acceleration axes, and produce a 100 Hz and a 30 Hz version of the signal.
# Each call is also given an output_path for its result.
# NOTE(review): the filtered file had an interior NTP range removed, so its
# timeline presumably contains a gap; how downsample_to_frequency treats such
# gaps is not visible from this notebook -- TODO confirm.
df = pd.read_csv('../../data/RoadRoughness/Raw/Dirt/P1/Accelerometer_filtered.csv')
df_selected = df.loc[:, ['NTP', 'Acc-X', 'Acc-Y', 'Acc-Z']].copy()

# Keyword arguments shared by both downsampling calls.
shared_downsample_kwargs = dict(timestamp_col='NTP', categorical_attributes=None)

df_100hz = downsample_to_frequency(
    df_selected,
    target_hz=100,
    output_path='../../data/RoadRoughness/Raw/Dirt/P1/Accelerometer_filtered_combined_100hz.csv',
    **shared_downsample_kwargs,
)
df_30hz = downsample_to_frequency(
    df_selected,
    target_hz=30,
    output_path='../../data/RoadRoughness/Raw/Dirt/P1/Accelerometer_filtered_combined_30hz.csv',
    **shared_downsample_kwargs,
)

In [10]:
# Plot the downsampled P1 signals at both target rates.
# The titles previously said "P3", but every frame and path in this notebook
# belongs to participant P1, so the labels are corrected to match the data.
name = 'P1 Handlebar Accelerometer Data of 100Hz'
plot_accelerometer_data(df_100hz, name)
name = 'P1 Handlebar Accelerometer Data of 30Hz'
plot_accelerometer_data(df_30hz, name)

In [None]:
# 100 Hz signal, 0.5 s windows, 50% overlap between consecutive windows.
# Reads the 100 Hz CSV, cuts it into windows over the three acceleration axes,
# prints the resulting array shape and stores the array under the key `segments`.
df_combined_100hz = pd.read_csv(
    '../../data/RoadRoughness/Raw/Dirt/P1/Accelerometer_filtered_combined_100hz.csv'
)
segments_100hz = segment_acceleration_data_overlapping_numpy(
    df_combined_100hz,
    channels=['Acc-X', 'Acc-Y', 'Acc-Z'],
    window_size=50,  # samples per window: 0.5 s at 100 Hz
    overlap=50,      # 50% overlap
)
print(segments_100hz.shape)
np.savez(
    '../../data/RoadRoughness/Raw/Dirt/P1/segments_100hz_0.5s_50overlap.npz',
    segments=segments_100hz,
)

(534, 50, 3)

In [None]:
# 30 Hz signal, 0.5 s windows, 50% overlap between consecutive windows.
# Reads the 30 Hz CSV, cuts it into windows over the three acceleration axes,
# prints the resulting array shape and stores the array under the key `segments`.
# NOTE(review): half of a 15-sample window is 7.5 samples, which is not an
# integer step; how the segmentation helper rounds it is not visible here --
# TODO confirm the effective overlap.
df_combined_30hz = pd.read_csv(
    '../../data/RoadRoughness/Raw/Dirt/P1/Accelerometer_filtered_combined_30hz.csv'
)
segments = segment_acceleration_data_overlapping_numpy(
    df_combined_30hz,
    channels=['Acc-X', 'Acc-Y', 'Acc-Z'],
    window_size=15,  # samples per window: 0.5 s at 30 Hz
    overlap=50,      # 50% overlap
)
print(segments.shape)
np.savez(
    '../../data/RoadRoughness/Raw/Dirt/P1/segments_30hz_0.5s_50overlap.npz',
    segments=segments,
)

(578, 15, 3)


In [3]:
# 100 Hz signal, 1 s windows, 50% overlap between consecutive windows.
# Reads the 100 Hz CSV, cuts it into windows over the three acceleration axes,
# prints the resulting array shape and stores the array under the key `segments`.
# NOTE: this rebinds `df_combined_100hz` and `segments_100hz`, the same names
# the 0.5 s / 100 Hz cell assigns, so their contents depend on which cell ran last.
df_combined_100hz = pd.read_csv(
    '../../data/RoadRoughness/Raw/Dirt/P1/Accelerometer_filtered_combined_100hz.csv'
)
segments_100hz = segment_acceleration_data_overlapping_numpy(
    df_combined_100hz,
    channels=['Acc-X', 'Acc-Y', 'Acc-Z'],
    window_size=100,  # samples per window: 1 s at 100 Hz
    overlap=50,       # 50% overlap
)
print(segments_100hz.shape)
np.savez(
    '../../data/RoadRoughness/Raw/Dirt/P1/segments_100hz_1s_50overlap.npz',
    segments=segments_100hz,
)

(266, 100, 3)


In [4]:
# 30 Hz signal, 1 s windows, 50% overlap between consecutive windows.
# Reads the 30 Hz CSV, cuts it into windows over the three acceleration axes,
# prints the resulting array shape and stores the array under the key `segments`.
# NOTE: this rebinds `df_combined_30hz` and `segments`, the same names the
# 0.5 s / 30 Hz cell assigns, so their contents depend on which cell ran last.
df_combined_30hz = pd.read_csv(
    '../../data/RoadRoughness/Raw/Dirt/P1/Accelerometer_filtered_combined_30hz.csv'
)
segments = segment_acceleration_data_overlapping_numpy(
    df_combined_30hz,
    channels=['Acc-X', 'Acc-Y', 'Acc-Z'],
    window_size=30,  # samples per window: 1 s at 30 Hz
    overlap=50,      # 50% overlap
)
print(segments.shape)
np.savez(
    '../../data/RoadRoughness/Raw/Dirt/P1/segments_30hz_1s_50overlap.npz',
    segments=segments,
)

(269, 30, 3)
