## Combine all datasets and Train Test Split

In [19]:
%load_ext autoreload
%autoreload 2

In [41]:
import sys
import numpy as np
sys.path.append('../../')   # Add parent directory to Python path
from utils.preprocessing import *
from utils.segmentation import *
from utils.visualization import *
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
np.random.seed(42)  # For reproducibility

In [21]:
def select_random_samples(data, n=190, rng=None):
    """Return ``n`` rows of ``data`` drawn at random without replacement.

    Parameters
    ----------
    data : np.ndarray
        Array to sample from; rows are selected along the first axis.
    n : int, default 190
        Number of rows to draw. Must not exceed ``data.shape[0]``.
    rng : np.random.Generator or np.random.RandomState, optional
        Explicit source of randomness. When omitted, the global ``np.random``
        state is used, which matches the previous behaviour. Passing a seeded
        generator makes the draw reproducible regardless of which other cells
        have consumed the global random state.

    Returns
    -------
    np.ndarray
        ``data`` indexed by the sampled row indices, i.e. ``n`` rows.

    Raises
    ------
    ValueError
        If ``n`` is larger than ``data.shape[0]`` (raised by ``choice``
        because sampling is done without replacement).
    """
    sampler = np.random if rng is None else rng
    indices = sampler.choice(data.shape[0], n, replace=False)
    return data[indices]

## 1. Combine all datasets

In [None]:
# All segment archives share the same file name; only the directory differs.
SEGMENTS_FILE = 'segments_100hz_0.5s_50overlap.npz'
# Number of segments kept per road-surface class.
# NOTE(review): presumably chosen to roughly match the size of the curb classes - confirm.
N_SAMPLES_PER_SURFACE = 190


def _load_npz_arrays(path, *keys):
    """Read the arrays stored under `keys` from an .npz archive.

    The archive is opened in a `with` block so the underlying file handle is
    closed as soon as the arrays have been read.
    """
    with np.load(path) as archive:
        return tuple(archive[key] for key in keys)


def _load_surface_sample(surface):
    """Load the 'segments' array for one road surface and randomly subsample it."""
    (segments,) = _load_npz_arrays(
        f'../../data/RoadRoughness/Raw/{surface}/P1/{SEGMENTS_FILE}', 'segments'
    )
    return select_random_samples(segments, N_SAMPLES_PER_SURFACE)


# Curb data: both segment arrays are used in full (no subsampling).
data_curb_0, data_curb_1 = _load_npz_arrays(
    f'../../data/Curb/P3/handlebar/Accelerometer/{SEGMENTS_FILE}',
    'segments_0', 'segments_1',
)

# Road-surface data: the call order is kept fixed because each call consumes
# the global random state, so reordering would change which segments are drawn.
data_asphalt = _load_surface_sample('Asphalt')
data_cobblestone = _load_surface_sample('Cobblestone')
data_compact_gravel = _load_surface_sample('CompactGravel')
data_Dirt = _load_surface_sample('Dirt')
data_PavingStone = _load_surface_sample('PavingStone')



In [24]:
# Pair each loaded segment array with the class name it represents
segment_groups = [
    data_curb_0,
    data_curb_1,
    data_asphalt,
    data_cobblestone,
    data_compact_gravel,
    data_Dirt,
    data_PavingStone,
]
group_labels = [
    "curb_0",
    "curb_1",
    "asphalt",
    "cobblestone",
    "compact_gravel",
    "dirt",
    "paving_stone",
]
datasets = list(zip(segment_groups, group_labels))

# Flatten into one list of (sensor_values, label) tuples, one entry per segment
combined_dataset = []
for segments, surface_label in datasets:
    combined_dataset.extend((segment, surface_label) for segment in segments)

In [None]:
# save the combined dataset
with open('../../data/TrainTest/combined_dataset.pkl', 'wb') as f:
    pickle.dump(combined_dataset, f)

In [25]:
from collections import Counter

# Overall size of the combined dataset
print("Number of samples:", len(combined_dataset))

# Shape of the sensor values held by the first (segment, label) pair
first_segment = combined_dataset[0][0]
print("Shape of first sample's sensor values:", first_segment.shape)

# Per-class sample counts, used to check that the classes are balanced
label_counts = Counter(pair[1] for pair in combined_dataset)
print("Number of samples per label:")
for class_name, n_samples in label_counts.items():
    print(f"{class_name}: {n_samples}")

Number of samples: 1326
Shape of first sample's sensor values: (50, 3)
Number of samples per label:
curb_0: 188
curb_1: 188
asphalt: 190
cobblestone: 190
compact_gravel: 190
dirt: 190
paving_stone: 190


## 2. Train Test Split


In [30]:
from sklearn.model_selection import train_test_split

# Stratified 80% / 20% split: stratifying on the label keeps the class
# proportions the same in both splits; random_state fixes the shuffle.
all_labels = [pair[1] for pair in combined_dataset]
train_set, test_set = train_test_split(
    combined_dataset,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=all_labels,
)
print(len(train_set), len(test_set))

# Unzip the (sensor_values, label) pairs into parallel lists
X_train, y_train = (list(column) for column in zip(*train_set))
X_test, y_test = (list(column) for column in zip(*test_set))
print(f"Label train: {len(y_train)}, Label test: {len(y_test)}")

1060 266
Label train: 1060, Label test: 266


In [None]:
# save the train and test sets
with open('../../data/TrainTest/train_set.pkl', 'wb') as f:
    pickle.dump(train_set, f)
with open('../../data/TrainTest/test_set.pkl', 'wb') as f:
    pickle.dump(test_set, f)


## 3. Normalise dataset

In [37]:
# Display the first training segment as-is (values before any scaling is applied)
X_train[0]

array([[-2.94318687,  4.45824211,  5.85580769],
       [-2.94290638,  4.458214  ,  5.85604172],
       [-2.94262588,  4.45818589,  5.85627574],
       [-2.94234539,  4.45815778,  5.85650977],
       [-2.9420649 ,  4.45812967,  5.85674379],
       [-2.94178441,  4.45810157,  5.85697782],
       [-2.94150391,  4.45807346,  5.85721184],
       [-2.94122342,  4.45804535,  5.85744586],
       [-2.94094293,  4.45801724,  5.85767989],
       [-2.94066243,  4.45798913,  5.85791391],
       [-2.94038194,  4.45796102,  5.85814794],
       [-2.94010145,  4.45793292,  5.85838196],
       [-2.93982095,  4.45790481,  5.85861599],
       [-2.93954046,  4.4578767 ,  5.85885001],
       [-2.93925997,  4.45784859,  5.85908403],
       [-2.93897947,  4.45782048,  5.85931806],
       [-2.93869898,  4.45779237,  5.85955208],
       [-2.93841849,  4.45776426,  5.85978611],
       [-2.93813799,  4.45773616,  5.86002013],
       [-2.9378575 ,  4.45770805,  5.86025416],
       [-2.93757701,  4.45767994,  5.860

In [36]:
# Stack the list of per-segment arrays along a new leading (sample) axis
X_train_array = np.stack(X_train, axis=0)

# Show the stacked shape, then the first segment as a sanity check
train_shape = X_train_array.shape
print(train_shape)
first_train_segment = X_train_array[0]
print(first_train_segment)

(1060, 50, 3)
[[-2.94318687  4.45824211  5.85580769]
 [-2.94290638  4.458214    5.85604172]
 [-2.94262588  4.45818589  5.85627574]
 [-2.94234539  4.45815778  5.85650977]
 [-2.9420649   4.45812967  5.85674379]
 [-2.94178441  4.45810157  5.85697782]
 [-2.94150391  4.45807346  5.85721184]
 [-2.94122342  4.45804535  5.85744586]
 [-2.94094293  4.45801724  5.85767989]
 [-2.94066243  4.45798913  5.85791391]
 [-2.94038194  4.45796102  5.85814794]
 [-2.94010145  4.45793292  5.85838196]
 [-2.93982095  4.45790481  5.85861599]
 [-2.93954046  4.4578767   5.85885001]
 [-2.93925997  4.45784859  5.85908403]
 [-2.93897947  4.45782048  5.85931806]
 [-2.93869898  4.45779237  5.85955208]
 [-2.93841849  4.45776426  5.85978611]
 [-2.93813799  4.45773616  5.86002013]
 [-2.9378575   4.45770805  5.86025416]
 [-2.93757701  4.45767994  5.86048818]
 [-2.93729652  4.45765183  5.8607222 ]
 [-2.93701602  4.45762372  5.86095623]
 [-2.93673553  4.45759561  5.86119025]
 [-2.93645504  4.45756751  5.86142428]
 [-2.936174

In [None]:
# Stack X_train to shape (N, L, C), where N = number of samples,
# L = segment length (timesteps) and C = number of channels per timestep.
X_train_array = np.stack(X_train)
N, L, C = X_train_array.shape

# StandardScaler expects 2D input, so merge the sample and timestep axes:
# (N, L, C) -> (N*L, C). Each channel is then standardised using statistics
# computed over every timestep of every training segment.
X_train_reshaped = X_train_array.reshape(-1, C)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_reshaped)

# Reshape back to original shape
X_train_normalized = X_train_scaled.reshape(N, L, C)

# Scale the test split with the statistics learned from the training split
# (transform only, never re-fit) so both splits share one scale and no
# information from the test data influences the scaler.
X_test_array = np.stack(X_test)
X_test_normalized = scaler.transform(X_test_array.reshape(-1, C)).reshape(X_test_array.shape)

## 4. Labels from string to integer

In [40]:
# Learn the string -> integer mapping from the training labels only,
# then reuse that same mapping for the test labels.
label_encoder = LabelEncoder()
y_train_int = label_encoder.fit_transform(y_train)
y_test_int = label_encoder.transform(y_test)

class_names = label_encoder.classes_
print("Classes:", class_names)
print("First 10 y_train_int:", y_train_int[:10])
print("First 10 y_test_int:", y_test_int[:10])

# Print the integer code assigned to each class name
for class_index in range(len(class_names)):
    print(f"{class_index}: {class_names[class_index]}")


Classes: ['asphalt' 'cobblestone' 'compact_gravel' 'curb_0' 'curb_1' 'dirt'
 'paving_stone']
First 10 y_train_int: [6 5 5 5 1 1 4 4 2 6]
First 10 y_test_int: [0 3 4 2 6 5 4 2 5 6]
0: asphalt
1: cobblestone
2: compact_gravel
3: curb_0
4: curb_1
5: dirt
6: paving_stone


## 5. One-hot encode the labels

In [42]:
# Pin the one-hot width to the number of classes known to the encoder.
# Without num_classes, to_categorical infers the width from the largest label
# present in each array, so a split missing the highest class would produce
# a narrower matrix than the other split.
num_classes = len(label_encoder.classes_)
y_train_onehot = to_categorical(y_train_int, num_classes=num_classes)
y_test_onehot = to_categorical(y_test_int, num_classes=num_classes)

print(y_train_onehot.shape)
print(y_test_onehot.shape)

(1060, 7)
(266, 7)


In [44]:
# Spot-check one random row per split: the position of the 1 in the one-hot
# vector must equal the integer label it was built from.
for onehot_rows, int_labels in (
    (y_train_onehot, y_train_int),
    (y_test_onehot, y_test_int),
):
    r = np.random.randint(len(int_labels))
    assert onehot_rows[r].argmax() == int_labels[r]

## 6. Save train, test data and labels