In [None]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from matplotlib import rc
from pylab import rcParams
from sklearn.model_selection import train_test_split
from torch import nn, optim

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

RANDOM_SEED = 42
# Seed NumPy and PyTorch as well (not only the sklearn splits), so that weight
# initialisation and training are repeatable after Restart & Run All
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [2]:
# Read the combined, segmented accelerometer CSV
DATA_CSV = '../../data/handlebar/Accelerometer_data_combined_segmented.csv'
df = pd.read_csv(DATA_CSV)
# Class balance of the label column: 0 for normal, 1 for abnormal
df['curb_scene'].value_counts()

curb_scene
0.0    4351
1.0      76
Name: count, dtype: int64

In [3]:
# df is the full DataFrame and df['curb_scene'] is the label column.
# Hold out 20% of the rows as the test set. The label is heavily imbalanced
# (only a small fraction of rows have curb_scene == 1), so each split is
# stratified on it to keep the abnormal share the same in every subset.
train_data, test_data, train_labels, test_labels = train_test_split(
    df, df['curb_scene'],
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=df['curb_scene'])

# Split the remaining rows again: 25% of them (20% of all rows) become validation
train_data, val_data, train_labels, val_labels = train_test_split(
    train_data, train_labels,
    test_size=0.25,
    random_state=RANDOM_SEED,
    stratify=train_labels)

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Standardise the feature columns (everything from the 4th column onwards),
# because neural networks work better with scaled data.
# The scaler is fitted on the training split only and then applied to all splits.
scaler = StandardScaler()
data_scaled = scaler.fit(train_data.iloc[:, 3:])
train_data_scaled, val_data_scaled, test_data_scaled = (
    scaler.transform(split.iloc[:, 3:]) for split in (train_data, val_data, test_data)
)
# Report the shape of each scaled split: train, validation, test
for scaled_split in (train_data_scaled, val_data_scaled, test_data_scaled):
    print(scaled_split.shape)

(2655, 100)
(886, 100)
(886, 100)


In [5]:
def _with_metadata(scaled_values, source_frame):
    """Wrap scaled feature values in a DataFrame and re-attach the metadata columns.

    The feature columns take their names from ``source_frame.columns[3:]``;
    'curb_scene', 'start_time' and 'end_time' are then added in that order,
    copied positionally from ``source_frame``.
    """
    features = pd.DataFrame(scaled_values, columns=source_frame.columns[3:])
    return features.assign(
        curb_scene=source_frame['curb_scene'].values,
        start_time=source_frame['start_time'].values,
        end_time=source_frame['end_time'].values,
    )


# Scaled splits as DataFrames again, each with its label and time columns appended
train_data_scaled_df = _with_metadata(train_data_scaled, train_data)
test_data_scaled_df = _with_metadata(test_data_scaled, test_data)
val_data_scaled_df = _with_metadata(val_data_scaled, val_data)

In [6]:
def _features_with_label(frame, label):
    """Return, as a numpy.ndarray, the rows of ``frame`` whose 'curb_scene'
    equals ``label``, with the last three columns dropped."""
    selected = frame[frame['curb_scene'] == label]
    return selected.values[:, :-3]


# Split every scaled frame into normal (0.0) and abnormal (1.0) feature arrays
normal_train_data_scaled = _features_with_label(train_data_scaled_df, 0.0)
abnormal_train_data_scaled = _features_with_label(train_data_scaled_df, 1.0)
normal_val_data_scaled = _features_with_label(val_data_scaled_df, 0.0)
abnormal_val_data_scaled = _features_with_label(val_data_scaled_df, 1.0)
# All validation rows, regardless of label, without the last three columns
val_data_scaled = val_data_scaled_df.values[:, :-3]
normal_test_data_scaled = _features_with_label(test_data_scaled_df, 0.0)
abnormal_test_data_scaled = _features_with_label(test_data_scaled_df, 1.0)

In [7]:
def create_dataset(df):
  """Turn a 2-D table of sequences into a list of per-sequence tensors.

  Parameters
  ----------
  df : array-like of shape (number of sequences, sequence length)
      One sequence per row. A numpy array or a DataFrame is accepted; values
      are converted to float32.

  Returns
  -------
  dataset : list of torch.Tensor
      One float32 tensor of shape (sequence length, 1) per input row.
      Empty when ``df`` has no rows.
  sequence_length : int
  number_features : int
      Always 1, because each time step holds a single value.

  Raises
  ------
  ValueError
      If ``df`` is not two-dimensional.
  """
  # np.asarray handles ndarray (including object dtype) and DataFrame input alike
  values = np.asarray(df, dtype=np.float32)
  if values.ndim != 2:
    raise ValueError(
      f"expected a 2-D (sequences x time steps) input, got {values.ndim} dimension(s)")
  # Add a trailing feature axis: (number of sequences, sequence length, 1)
  stacked = torch.tensor(values).unsqueeze(-1)
  # Reading the shape from the stacked tensor also works for zero rows,
  # where stacking an empty list of tensors would fail
  _, sequence_length, number_features = stacked.shape
  # Iterating a tensor yields its slices along the first axis
  dataset = list(stacked)
  return dataset, sequence_length, number_features

In [8]:
# Build the per-sequence tensor lists. The sequence length and feature count
# are taken from the normal training data; for the other subsets only the
# tensor list (first return value) is kept.
normal_train_data_scaled_tensor, sequence_length, number_features = create_dataset(
    normal_train_data_scaled
)
normal_val_data_scaled_tensor = create_dataset(normal_val_data_scaled)[0]
normal_test_data_scaled_tensor = create_dataset(normal_test_data_scaled)[0]
abnormal_test_data_scaled_tensor = create_dataset(abnormal_test_data_scaled)[0]

In [None]:
#print(normal_val_data_scaled_tensor[0], sequence_length,number_features)
#tensor, 100, 1, 33:19

In [11]:
#LSTM Autoencoder
#Based on https://github.com/shobrook/sequitur

class Encoder(nn.Module):
  """
  An LSTM-based Encoder for the Recurrent Autoencoder.

  Two LSTM layers are applied in sequence; the final hidden state of the
  second layer is returned as the embedding of the whole sequence.

  Parameters
  ----------
  seq_len : int
      The length of the input sequences.
  n_features : int
      The number of features per time step in the input sequences.
  embedding_dim : int, optional
      The number of features in the bottleneck layer (default is 32).
      The first LSTM layer uses ``2 * embedding_dim`` hidden units.
  """
  def __init__(self, seq_len, n_features, embedding_dim=32):
    super().__init__()
    self.seq_len, self.n_features = seq_len, n_features
    self.embedding_dim, self.hidden_dim = embedding_dim, 2 * embedding_dim
    # First LSTM layer: n_features -> hidden_dim
    self.rnn1 = nn.LSTM(
      input_size=n_features,
      hidden_size=self.hidden_dim,
      num_layers=1,
      batch_first=True
    )
    # Second LSTM layer: hidden_dim -> embedding_dim
    self.rnn2 = nn.LSTM(
      input_size=self.hidden_dim,
      hidden_size=embedding_dim,
      num_layers=1,
      batch_first=True
    )

  def forward(self, x):
    """Encode one sequence.

    Parameters
    ----------
    x : torch.Tensor
        A single sequence with ``seq_len * n_features`` elements, e.g. of
        shape (seq_len, n_features).

    Returns
    -------
    torch.Tensor
        The embedding, of shape (1, embedding_dim).
    """
    # Reshape input to (batch_size=1, seq_len, n_features)
    x = x.reshape((1, self.seq_len, self.n_features))
    # Pass through the first LSTM layer; only the per-step outputs are needed
    x, _ = self.rnn1(x)
    # Pass through the second LSTM layer; keep its final hidden state
    _, (hidden_n, _) = self.rnn2(x)
    # hidden_n holds embedding_dim values (one layer, batch of one) whatever
    # n_features is, so flatten it to (1, embedding_dim). Reshaping to
    # (n_features, embedding_dim) only worked for n_features == 1.
    return hidden_n.reshape((1, self.embedding_dim))

In [12]:
class Decoder(nn.Module):
    """
    An LSTM-based Decoder for the Recurrent Autoencoder.

    The embedding is repeated once per output time step, passed through two
    LSTM layers and mapped to ``n_features`` values per step by a linear layer.

    Parameters
    ----------
    seq_len : int
        The length of the sequences to reconstruct.
    input_dim : int, optional
        The size of the embedding fed to the decoder (default is 32).
        The second LSTM layer uses ``2 * input_dim`` hidden units.
    n_features : int, optional
        The number of features in the output sequences (default is 1).
    """
    def __init__(self, seq_len, input_dim=32, n_features=1):
        super().__init__()
        self.seq_len, self.input_dim = seq_len, input_dim
        self.hidden_dim, self.n_features = 2 * input_dim, n_features
        # First LSTM layer: input_dim -> input_dim
        self.rnn1 = nn.LSTM(
            input_size=input_dim,
            hidden_size=input_dim,
            num_layers=1,
            batch_first=True
        )
        # Second LSTM layer: input_dim -> hidden_dim
        self.rnn2 = nn.LSTM(
            input_size=input_dim,
            hidden_size=self.hidden_dim,
            num_layers=1,
            batch_first=True
        )
        # Output layer to map the hidden state to the desired number of features
        self.output_layer = nn.Linear(self.hidden_dim, n_features)

    def forward(self, x):
        """Decode one embedding into a sequence.

        Parameters
        ----------
        x : torch.Tensor
            An embedding with ``input_dim`` elements, e.g. of shape (1, input_dim).

        Returns
        -------
        torch.Tensor
            The reconstructed sequence, of shape (seq_len, n_features).
        """
        # Feed the same embedding at every time step: (batch=1, seq_len, input_dim).
        # The batch size is always 1 here; tying it to n_features (as before)
        # broke the reshape below whenever n_features > 1.
        x = x.reshape((1, 1, self.input_dim)).repeat(1, self.seq_len, 1)
        x, _ = self.rnn1(x)
        x, _ = self.rnn2(x)
        # Drop the batch axis so the linear layer maps each step to n_features values
        x = x.reshape((self.seq_len, self.hidden_dim))
        return self.output_layer(x)

In [13]:
class RecurrentAutoencoder(nn.Module):
    """Autoencoder that chains an Encoder and a Decoder.

    Parameters
    ----------
    seq_len : int
        Passed to both the encoder and the decoder.
    n_features : int
        Passed to both the encoder and the decoder.
    embedding_dim : int, optional
        Bottleneck size handed to the encoder and used as the decoder's
        input size (default is 32).
    """
    def __init__(self, seq_len, n_features, embedding_dim=32):
        super().__init__()
        # Build each half and move it to the module-level `device`
        encoder = Encoder(seq_len, n_features, embedding_dim)
        self.encoder = encoder.to(device)
        decoder = Decoder(seq_len, embedding_dim, n_features)
        self.decoder = decoder.to(device)

    def forward(self, x):
        """Encode ``x`` and return the decoder's output for that encoding."""
        return self.decoder(self.encoder(x))

In [14]:
# Build the autoencoder with a 64-dimensional bottleneck and place it on `device`
model = RecurrentAutoencoder(
    sequence_length,
    number_features,
    embedding_dim=64,
).to(device)

In [None]:
def train_model(model, train_dataset, val_dataset, n_epochs, lr=1e-3):
  """Train ``model`` to reconstruct its input, one sequence at a time.

  Each epoch runs one optimisation step per training sequence and then
  evaluates on the validation sequences without gradients. The weights with
  the lowest mean validation loss are restored before returning.

  Parameters
  ----------
  model : nn.Module
      Model whose output has the same shape as its input sequence.
  train_dataset, val_dataset : iterable of torch.Tensor
      Sequences to train on / validate on. They are iterated once per epoch
      and should be non-empty.
  n_epochs : int
      Number of passes over ``train_dataset``.
  lr : float, optional
      Learning rate for the Adam optimiser (default is 1e-3).

  Returns
  -------
  model : nn.Module
      The model in eval mode, loaded with the best weights seen.
  history : dict
      ``{'train': [...], 'val': [...]}`` with the mean loss of every epoch.
  """
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)
  # Run on whatever device the model already lives on, rather than relying on
  # a global variable
  model_device = next(model.parameters()).device
  # Summed absolute error over all elements of a sequence
  criterion = nn.L1Loss(reduction='sum').to(model_device)
  # Per-epoch mean losses
  history = dict(train=[], val=[])
  best_model_wts = copy.deepcopy(model.state_dict())
  # Start from +inf so the first epoch always counts as an improvement, however
  # large the summed loss is (a fixed sentinel such as 10000 could never be beaten)
  best_loss = float('inf')
  for epoch in range(1, n_epochs + 1):
    model.train()
    train_losses = []
    # One optimisation step per training sequence
    for seq_true in train_dataset:
      optimizer.zero_grad()
      seq_true = seq_true.to(model_device)
      seq_pred = model(seq_true)
      loss = criterion(seq_pred, seq_true)
      loss.backward()
      optimizer.step()
      train_losses.append(loss.item())
    val_losses = []
    model.eval()
    with torch.no_grad():
      for seq_true in val_dataset:
        seq_true = seq_true.to(model_device)
        seq_pred = model(seq_true)
        loss = criterion(seq_pred, seq_true)
        val_losses.append(loss.item())
    train_loss = np.mean(train_losses)
    val_loss = np.mean(val_losses)
    history['train'].append(train_loss)
    history['val'].append(val_loss)
    # Snapshot the weights whenever the validation loss improves
    if val_loss < best_loss:
      best_loss = val_loss
      best_model_wts = copy.deepcopy(model.state_dict())
    print(f'Epoch {epoch}: train loss {train_loss} val loss {val_loss}')
  model.load_state_dict(best_model_wts)
  return model.eval(), history