## Combine all datasets and Train/Test Split

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import numpy as np
# '../../' is two directory levels above the working directory; it is added
# presumably so the project-local `utils` package imported below can be found.
sys.path.append('../../')   # Add the directory two levels up to the Python path
import pickle
# NOTE(review): the wildcard imports hide which names this notebook takes from
# `utils`; none of the cells visible here appears to call one — confirm they are needed.
from utils.preprocessing import *
from utils.segmentation import *
from utils.visualization import *

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Seeds NumPy's global RNG (used by np.random.choice / np.random.randint below).
# The train_test_split calls pass their own random_state=42.
np.random.seed(42)  # For reproducibility


In [49]:
def select_random_samples(data, n=190):
    """Return `n` distinct entries of `data`, drawn at random along axis 0.

    Sampling is without replacement and goes through NumPy's global RNG, so
    the selection depends on the current `np.random` seed state.

    Args:
        data: Array exposing `.shape` and supporting integer-array indexing.
        n: Number of entries to draw (default 190).

    Returns:
        `data` indexed by the drawn positions, in the order they were drawn.

    Raises:
        ValueError: From `np.random.choice` when `n` exceeds `data.shape[0]`.
    """
    row_count = data.shape[0]
    chosen_rows = np.random.choice(row_count, size=n, replace=False)
    return data[chosen_rows]

## 1. Combine all datasets

In [5]:
# Each surface type has one pickle under ../data_all.
# pickle.load can execute code embedded in the file, so only load trusted files.

# The curb pickle is indexed by scene key; keep one variable per scene.
with open('../data_all/curb_0.5s_combined_all.pkl', 'rb') as curb_file:
    curb_data = pickle.load(curb_file)
data_curb_0 = curb_data['scene_0']
data_curb_1 = curb_data['scene_1']

# The remaining pickles are used directly as loaded (they expose `.shape` below).
with open('../data_all/asphalt_0.5s_combined_all.pkl', 'rb') as asphalt_file:
    data_asphalt = pickle.load(asphalt_file)

with open('../data_all/cobblestone_0.5s_combined_all.pkl', 'rb') as cobblestone_file:
    data_cobblestone = pickle.load(cobblestone_file)

with open('../data_all/compactgravel_0.5s_combined_all.pkl', 'rb') as gravel_file:
    data_compact_gravel = pickle.load(gravel_file)

with open('../data_all/dirt_0.5s_combined_all.pkl', 'rb') as dirt_file:
    data_Dirt = pickle.load(dirt_file)

with open('../data_all/pavingstone_0.5s_combined_all.pkl', 'rb') as paving_file:
    data_PavingStone = pickle.load(paving_file)

# Report the shape of every loaded array as a quick sanity check.
for title, segments in (
    ("Curb (scene 0):", data_curb_0),
    ("Curb (scene 1):", data_curb_1),
    ("Asphalt:", data_asphalt),
    ("Cobblestone:", data_cobblestone),
    ("Compact Gravel:", data_compact_gravel),
    ("Dirt:", data_Dirt),
    ("Paving Stone:", data_PavingStone),
):
    print(title, segments.shape)


Curb (scene 0): (1189, 50, 3)
Curb (scene 1): (1189, 50, 3)
Asphalt: (1413, 50, 3)
Cobblestone: (985, 50, 3)
Compact Gravel: (970, 50, 3)
Dirt: (1166, 50, 3)
Paving Stone: (1299, 50, 3)


In [6]:
# Pair each per-surface array with the class name its segments will carry.
# The two curb scenes stay separate classes ("curb_0" and "curb_1").
datasets = [
    (data_curb_0, "curb_0"),
    (data_curb_1, "curb_1"),
    (data_asphalt, "asphalt"),
    (data_cobblestone, "cobblestone"),
    (data_compact_gravel, "compact_gravel"),
    (data_Dirt, "dirt"),
    (data_PavingStone, "paving_stone"),
]

# Flatten into one list of (sensor_values, label) tuples, keeping the order above.
combined_dataset = [
    (segment, label)
    for data, label in datasets
    for segment in data
]

In [7]:
# Persist the full labelled list before any splitting.
# open() does not create missing directories, so ../data_all/TrainTest must already exist.
combined_path = '../data_all/TrainTest/combined_dataset.pkl'
with open(combined_path, 'wb') as combined_file:
    pickle.dump(combined_dataset, combined_file)

In [8]:
from collections import Counter

# Overall size and the shape of a single segment.
print("Number of samples:", len(combined_dataset))
first_segment = combined_dataset[0][0]
print("Shape of first sample's sensor values:", first_segment.shape)

# Class balance: count how often each label (second tuple element) occurs.
label_counts = Counter(entry[1] for entry in combined_dataset)
print("Number of samples per label:")
for name, total in label_counts.items():
    print(f"{name}: {total}")

Number of samples: 8211
Shape of first sample's sensor values: (50, 3)
Number of samples per label:
curb_0: 1189
curb_1: 1189
asphalt: 1413
cobblestone: 985
compact_gravel: 970
dirt: 1166
paving_stone: 1299


## 2. Train/Test Split


In [9]:
# Hold out 20% of the labelled segments as the test set (80% remain for training).
# Stratifying on the label keeps every class's share equal in both parts;
# random_state pins the shuffle.
all_labels = [label for _, label in combined_dataset]
train_set, test_set = train_test_split(
    combined_dataset,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=all_labels,
)
print(len(train_set), len(test_set))

# Unzip the (sensor_values, label) pairs into parallel lists.
X_train = [pair[0] for pair in train_set]
y_train = [pair[1] for pair in train_set]
X_test = [pair[0] for pair in test_set]
y_test = [pair[1] for pair in test_set]
print(f"Label train: {len(y_train)}, Label test: {len(y_test)}")

6568 1643
Label train: 6568, Label test: 1643


## 3. Normalise dataset

In [12]:
# Peek at the first training segment (sensor values only, before normalisation).
X_train[0]

array([[-1.67996092e+00,  3.95304273e+00,  1.03451697e+01],
       [-1.06012012e+00,  5.73828797e+00,  6.95753070e+00],
       [-1.46754798e+00,  7.08828850e+00,  7.79297480e+00],
       [ 1.82811650e+00,  6.70812865e+00,  5.70718100e+00],
       [ 9.21934633e-01,  6.30229603e+00,  8.07477933e+00],
       [ 9.38285550e-01,  7.18983400e+00,  5.40643325e+00],
       [ 1.23449707e+00,  6.59761007e+00,  5.02213583e+00],
       [-2.51645187e-01,  4.56759937e+00,  1.03034940e+01],
       [-1.58973160e+00,  2.58305220e+00,  1.35897370e+01],
       [ 3.53041048e-01,  4.43180657e+00,  8.56301517e+00],
       [ 3.55732961e-01,  3.44476737e+00,  9.74447133e+00],
       [ 8.84746215e-01,  2.73365035e+00,  1.47447225e+01],
       [-1.41844533e+00,  4.95892560e+00,  7.00179770e+00],
       [ 2.07397903e+00,  4.39611357e+00,  8.68465000e+00],
       [-9.72383289e-01,  3.75224520e+00,  1.14302148e+01],
       [-2.06036985e+00,  5.83968400e+00,  7.98260550e+00],
       [-1.45952217e+00,  5.07567547e+00

In [57]:
# Stack the list of per-segment arrays into one array and show its shape and first segment.
# NOTE(review): the stored output of this cell reports (1060, 50, 3) and its
# execution count (57) is out of sequence, whereas the train/test split above
# reports 6568 training samples — the output looks left over from a different
# kernel state. The next cell performs the same np.stack; confirm whether this
# cell is still needed.
X_train_array = np.stack(X_train)
print(X_train_array.shape, X_train_array[0], sep="\n")

(1060, 50, 3)
[[-2.94318687  4.45824211  5.85580769]
 [-2.94290638  4.458214    5.85604172]
 [-2.94262588  4.45818589  5.85627574]
 [-2.94234539  4.45815778  5.85650977]
 [-2.9420649   4.45812967  5.85674379]
 [-2.94178441  4.45810157  5.85697782]
 [-2.94150391  4.45807346  5.85721184]
 [-2.94122342  4.45804535  5.85744586]
 [-2.94094293  4.45801724  5.85767989]
 [-2.94066243  4.45798913  5.85791391]
 [-2.94038194  4.45796102  5.85814794]
 [-2.94010145  4.45793292  5.85838196]
 [-2.93982095  4.45790481  5.85861599]
 [-2.93954046  4.4578767   5.85885001]
 [-2.93925997  4.45784859  5.85908403]
 [-2.93897947  4.45782048  5.85931806]
 [-2.93869898  4.45779237  5.85955208]
 [-2.93841849  4.45776426  5.85978611]
 [-2.93813799  4.45773616  5.86002013]
 [-2.9378575   4.45770805  5.86025416]
 [-2.93757701  4.45767994  5.86048818]
 [-2.93729652  4.45765183  5.8607222 ]
 [-2.93701602  4.45762372  5.86095623]
 [-2.93673553  4.45759561  5.86119025]
 [-2.93645504  4.45756751  5.86142428]
 [-2.936174

In [13]:
# Channel-wise standardisation of the training segments.
# StandardScaler expects 2-D input, so the (N, L, C) stack is flattened to
# (N*L, C): every timestep of every segment becomes one row, and each of the
# C channels is scaled with a single mean / standard deviation.
# NOTE(review): `scaler` is fitted here but no cell shown in this notebook
# applies it to X_test or saves it — confirm how test data gets normalised.
X_train_array = np.stack(X_train)
N, L, C = X_train_array.shape  # samples, timesteps per segment, channels

X_train_reshaped = X_train_array.reshape(N * L, C)
scaler = StandardScaler().fit(X_train_reshaped)
X_train_scaled = scaler.transform(X_train_reshaped)

# Restore the per-segment (N, L, C) layout.
X_train_normalized = X_train_scaled.reshape(X_train_array.shape)

## 4. Encode labels from strings to integers

In [14]:
# Integer-encode the string labels. The encoder learns its classes from the
# training labels only; the test labels are mapped with that same encoder.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_int = label_encoder.transform(y_train)
y_test_int = label_encoder.transform(y_test)

class_names = label_encoder.classes_
print("Classes:", class_names)
print("First 10 y_train_int:", y_train_int[:10])
print("First 10 y_test_int:", y_test_int[:10])

# Show the integer code assigned to each class name.
for code, name in enumerate(class_names):
    print(f"{code}: {name}")


Classes: ['asphalt' 'cobblestone' 'compact_gravel' 'curb_0' 'curb_1' 'dirt'
 'paving_stone']
First 10 y_train_int: [0 3 0 4 0 0 6 4 6 5]
First 10 y_test_int: [5 4 1 5 4 4 5 1 0 5]
0: asphalt
1: cobblestone
2: compact_gravel
3: curb_0
4: curb_1
5: dirt
6: paving_stone


## 5. One-hot encode the labels

In [15]:
# One-hot encode the integer labels.
# The width is pinned to the number of classes the encoder learned. Without
# `num_classes`, to_categorical sizes each matrix from the largest label
# present in that array, so a split missing the highest class would come out
# narrower than the other one.
num_classes = len(label_encoder.classes_)
y_train_onehot = to_categorical(y_train_int, num_classes=num_classes)
y_test_onehot = to_categorical(y_test_int, num_classes=num_classes)

print(y_train_onehot.shape)
print(y_test_onehot.shape)

(6568, 7)
(1643, 7)


In [16]:
# Check that every one-hot row decodes back to its integer label.
# This covers all rows of both splits instead of one randomly drawn index,
# and it no longer draws from NumPy's global RNG.
assert np.array_equal(y_train_onehot.argmax(axis=1), y_train_int), \
    "train one-hot labels do not match y_train_int"
assert np.array_equal(y_test_onehot.argmax(axis=1), y_test_int), \
    "test one-hot labels do not match y_test_int"

## 6. Save test data and labels

In [None]:
# Write the held-out test split to disk.
# NOTE(review): X_test is the list produced by the train/test split; no cell
# in this notebook passes it through `scaler`, so it is stored un-normalised —
# confirm the consumer standardises it the same way as the training data.
test_outputs = (
    ('../data_all/TrainTest/X_test_data.pkl', X_test),
    ('../data_all/TrainTest/y_test_onehot.pkl', y_test_onehot),
)
for out_path, payload in test_outputs:
    with open(out_path, 'wb') as out_file:
        pickle.dump(payload, out_file)

## 7. Train/Validation Split

In [None]:
# Carve a validation set (20%) out of the normalised training data.
# Stratify on the integer labels so the validation set keeps the class
# balance, matching the stratified train/test split earlier in the notebook.
# NOTE: this rebinds X_train / y_train. From here on they are the normalised
# array and one-hot labels of the reduced training split, no longer the raw
# lists from the train/test split. Re-running the normalisation cell after
# this one would standardise already-scaled data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_normalized,
    y_train_onehot,
    test_size=0.2,           # 20% for validation
    random_state=42,
    shuffle=True,
    stratify=y_train_int,
)

print("Train shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)

Train shape: (5254, 50, 3) (5254, 7)
Validation shape: (1314, 50, 3) (1314, 7)


In [None]:
# Persist the train / validation split produced by the train/validation split cell.
# - The training files hold the post-split X_train / y_train. Writing
#   X_train_normalized / y_train_onehot would store the full pre-split set,
#   which still contains every validation sample.
# - The validation labels are `y_val`; the name `y_val_onehot` is never
#   defined in this notebook and raised NameError.
# All payloads are resolved before any file is opened, so a missing variable
# cannot leave a truncated pickle behind.
assert len(X_train) == len(y_train), "training data and labels differ in length"
assert len(X_val) == len(y_val), "validation data and labels differ in length"

split_outputs = (
    ('../data_all/TrainTest/X_train_data_normalized.pkl', X_train),
    ('../data_all/TrainTest/X_val_data_normalized.pkl', X_val),
    ('../data_all/TrainTest/y_train_onehot.pkl', y_train),
    ('../data_all/TrainTest/y_val_onehot.pkl', y_val),
)
for out_path, payload in split_outputs:
    with open(out_path, 'wb') as out_file:
        pickle.dump(payload, out_file)