# **IMPORT LIBS**

In [None]:
import warnings

import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.optimizers import Adam


import keras
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

import keras.backend as K

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#from skimage.transform import rotate, shear, zoom
#from imgaug import augmenters as iaa

warnings.filterwarnings('ignore')

In [None]:
# Create the auxiliary output folders (CSV dumps and saved models).
# `exist_ok=True` makes the cell safe to re-run and avoids the
# check-then-create race of `os.path.exists` followed by `os.makedirs`.
for runtime_folder in ('runtime_saves', 'runtime_saves/models', 'runtime_saves/train&test'):
    os.makedirs(runtime_folder, exist_ok=True)

current_dir = os.getcwd()

# root_dir is three directory levels above the current working directory.
root_dir = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir, os.pardir))

datasetUAH_dir = os.path.join(root_dir, 'datasets', 'UAH-DRIVESET-v1', 'UAH-Processed')

print(f'Root directory: {root_dir}')
print(f'Dataset directory: {datasetUAH_dir}')

# **AUX FUNCTIONS**

In [None]:
def save_manovers_positions_to_csv_file(gps_positions, manovers, filename):
  """Write the GPS positions flagged as maneuvers to a CSV file.

  Rows of `gps_positions` whose entry in `manovers` equals 1 are copied; all
  other rows stay zero and are then dropped, so a flagged position that is
  itself all zeros is dropped as well. The result is written to
  'runtime_saves/<filename>' with nine decimal places.

  Args:
      gps_positions: 2-D array of positions, one row per sample.
      manovers: Sequence of flags aligned with the rows of `gps_positions`.
      filename: Name of the CSV file inside the 'runtime_saves' folder.
  """
  selected = np.zeros_like(gps_positions)

  for row in range(len(manovers)):
    if manovers[row] != 1:
      continue
    selected[row] = gps_positions[row]

  # Discard the rows that were never filled (still all zeros).
  filled_rows = ~np.all(selected == 0, axis=1)
  selected = selected[filled_rows]

  np.savetxt('runtime_saves/' + filename, selected, delimiter=',', fmt='%.9f')



def separate_positives_negatives(data):
  """Split a signal into its positive part and its sign-flipped negative part.

  Args:
      data: Array-like of numbers; it is converted to a NumPy array first.

  Returns:
      A tuple `(positives, negatives)` of two arrays shaped like `data`.
      `positives` keeps the values greater than zero and is 0 elsewhere;
      `negatives` holds `-value` for the values below zero and is 0 elsewhere.
  """
  values = np.array(data)
  zeros = np.zeros_like(values)

  positives = np.where(values > 0, values, zeros)
  negatives = np.where(values < 0, -values, zeros)

  return (positives, negatives)

def normalize_between_0_and_max(data):
  """Divide `data` by its own maximum.

  Args:
      data: Array of numbers; the maximum is taken over all of its elements.

  Returns:
      `data / np.max(data)`.
  """
  peak = np.max(data)
  scaled = data / peak
  return scaled

def normalize_between_0_and_max_v2(data, max_value):
  """Divide `data` by a maximum supplied by the caller.

  Args:
      data: Values to scale.
      max_value: Divisor applied to every element of `data`.

  Returns:
      `data / max_value`.
  """
  scaled = data / max_value
  return scaled

def split_train_test(data, test_size=0.2):
  """Split `data` into a leading training part and a trailing test part.

  The split is positional (no shuffling): the first
  `int(len(data) * (1 - test_size))` samples form the training set and the
  remaining samples form the test set.

  Args:
      data: Sequence or array indexed by sample along its first axis.
      test_size: Fraction of the samples assigned to the test set, in [0, 1].

  Returns:
      A tuple `(train_data, test_data)`.

  Raises:
      ValueError: If `test_size` is outside [0, 1].
  """
  if not 0 <= test_size <= 1:
    raise ValueError("test_size must be between 0 and 1.")

  # len() rather than data.shape[0] so that plain sequences work as well.
  num_samples = len(data)
  train_size = int(num_samples * (1 - test_size))

  return data[:train_size], data[train_size:]

def y_classification(data, threshold):
  """Label each value 1 when it reaches a fraction of its column maximum.

  Works for any number of columns (the column count is taken from `data`).

  Args:
      data: 2-D array of shape (samples, columns).
      threshold: Fraction of the column maximum a value must reach to be
          labelled 1.

  Returns:
      Integer array shaped like `data`, holding 1 where
      `value >= column_max * threshold` and 0 elsewhere.
  """
  data = np.asarray(data)
  if data.size == 0:
    # Nothing to take a maximum of; return an (empty) label array.
    return np.zeros_like(data, dtype=int)

  # NOTE(review): a column whose maximum is 0 gets a threshold of 0, so every
  # non-negative value in it is labelled 1 — confirm this is intended.
  column_thresholds = np.max(data, axis=0) * threshold
  return (data >= column_thresholds).astype(int)

def max_of_vectors(vec1, vec2, vec3, vec4, vec5, vec6):
  """Return the largest value found across the six vectors.

  Args:
      vec1, vec2, vec3, vec4, vec5, vec6: Non-empty array-likes of numbers.
          They do not need to have the same length.

  Returns:
      The maximum over all elements of all six vectors.
  """
  vectors = (vec1, vec2, vec3, vec4, vec5, vec6)

  # Reduce each vector on its own and then take the maximum of the six
  # results: this avoids copying everything into one 2-D array and also works
  # when the vectors have different lengths.
  return np.max([np.max(vector) for vector in vectors])

def has_one(data):
  """
  Tell, for each row, whether the row sums to more than zero.

  For an array holding only 0 and 1 this means the row has at least one 1.

  Args:
      data: A 2-D numpy array (e.g. shape (n, 12)) with 0 or 1 values in each cell.

  Returns:
      A boolean numpy array of shape (n, 1): True where the corresponding row
      of `data` sums to a value greater than zero, False otherwise.
  """
  # keepdims keeps the summed axis, giving one column with one entry per row.
  row_totals = np.sum(data, axis=1, keepdims=True)
  return row_totals > 0

# **IMPORT DATA**

In [None]:
# CSV dataset location, resolved relative to the current working directory.
dataset = os.path.join(os.getcwd(), os.pardir, 'docs', 'v2', 'dataset-all.csv')
# dataset = os.path.join(os.getcwd(), os.pardir, 'docs', 'v1', 'Abrantes-Leiria.csv')

df = pd.read_csv(dataset)

# Accelerometer columns, one Series per axis.
acelX, acelY, acelZ = (
    df['accelerometerXAxis'],
    df['accelerometerYAxis'],
    df['accelerometerZAxis'],
)

# Gyroscope columns, one Series per axis.
gyrX, gyrY, gyrZ = (
    df['gyroscopeXAxis'],
    df['gyroscopeYAxis'],
    df['gyroscopeZAxis'],
)

# GPS position columns.
latitude, longitude = df['latitude'], df['longitude']

In [None]:
# Print the pandas summary statistics of every sensor column.
for sensor_column in (
    'accelerometerXAxis',
    'accelerometerYAxis',
    'accelerometerZAxis',
    'gyroscopeXAxis',
    'gyroscopeYAxis',
    'gyroscopeZAxis',
):
    print(df[sensor_column].describe())

# **SEPARATE DATA BY MANEUVER**

In [None]:
# Split every sensor axis into two non-negative signals: the positive part and
# the sign-flipped negative part (see separate_positives_negatives).
# NOTE(review): the right/left and accel/break names assume a particular sign
# convention of the recording device — confirm against the data source.
turnRightX, turnLeftX = separate_positives_negatives(acelX)

accelY, breakY = separate_positives_negatives(acelY)

positiveZ, negativeZ = separate_positives_negatives(acelZ)

# Same split for the three gyroscope axes.
gyrPositiveX, gyrNegativeX = separate_positives_negatives(gyrX)
gyrPositiveY, gyrNegativeY = separate_positives_negatives(gyrY)
gyrPositiveZ, gyrNegativeZ = separate_positives_negatives(gyrZ)

In [None]:
# Display the shape of one of the split signals (notebook cell output).
turnRightX.shape

# **NORMALIZE DATA**

In [None]:
# One shared scale per sensor type: the largest value over all six
# accelerometer signals, and the largest over all six gyroscope signals.
max_accel = max_of_vectors(turnRightX, turnLeftX, accelY, breakY, positiveZ, negativeZ)
max_gyr = max_of_vectors(gyrPositiveX, gyrNegativeX, gyrPositiveY, gyrNegativeY, gyrPositiveZ, gyrNegativeZ)

# Accelerometer signals, each divided by max_accel.
(turnRightXn, turnLeftXn, accelYn, breakYn, positiveZn, negativeZn) = (
    normalize_between_0_and_max_v2(signal, max_accel)
    for signal in (turnRightX, turnLeftX, accelY, breakY, positiveZ, negativeZ)
)

# Gyroscope signals, each divided by max_gyr.
(gyrPositiveXn, gyrNegativeXn, gyrPositiveYn, gyrNegativeYn, gyrPositiveZn, gyrNegativeZn) = (
    normalize_between_0_and_max_v2(signal, max_gyr)
    for signal in (gyrPositiveX, gyrNegativeX, gyrPositiveY, gyrNegativeY, gyrPositiveZ, gyrNegativeZ)
)

# **CREATE AN ARRAY WITH ALL DATA**

In [None]:
# Stack the twelve normalized signals as columns: one row per sample, one
# column per signal. np.column_stack builds the matrix directly instead of
# creating one Python tuple per row with zip().
x = np.column_stack((
    turnRightXn, turnLeftXn, accelYn, breakYn, positiveZn, negativeZn,
    gyrPositiveXn, gyrNegativeXn, gyrPositiveYn, gyrNegativeYn, gyrPositiveZn, gyrNegativeZn,
))

In [None]:
# Display the shape of the feature matrix (notebook cell output).
x.shape

In [None]:
# Label a value 1 when it reaches 30% of the maximum of its column.
y = y_classification(x, 0.3)

# Number of positive labels per column, followed by the full label matrix.
print(np.sum(y, axis=0))
print(y)

# Persist the labels as integers.
filename = 'runtime_saves/' + 'Y.csv'
np.savetxt(filename, y, delimiter=',', fmt='%.0i')

# TODO: implement a y classification based on outliers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

# Load the dataset
dataset = os.path.join(os.getcwd(), os.pardir, 'docs', 'v2', 'dataset-all.csv')
data = pd.read_csv(dataset)

columns = ['accelerometerXAxis', 'accelerometerYAxis', 'accelerometerZAxis', 'gyroscopeXAxis', 'gyroscopeYAxis', 'gyroscopeZAxis']

data = data[columns]

# Convert all columns to float
data = data.astype(float)

# Number of outliers found in each column, and a row mask that is True for
# rows holding an outlier in at least one column. Both are filled inside the
# loop so that no column depends on the mask of the last column processed.
outlier_counts = {}
outlier_rows = pd.Series(False, index=data.index)

# Detect outliers for each sensor column using the IQR method
for column in data.columns:
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 6 * IQR
    upper_bound = Q3 + 6 * IQR

    # True for values INSIDE the bounds; the outliers are the complement.
    mask = (data[column] >= lower_bound) & (data[column] <= upper_bound)

    column_outliers = data.loc[~mask, column]
    outlier_counts[column] = column_outliers.shape[0]
    outlier_rows |= ~mask

    # Plot the results
    # fig, ax = plt.subplots(figsize=(16, 6))
    # fig.suptitle(f'IQR-Based Outlier Detection for {column}', size=20)

    # # Plot without needing a timestamp
    # sns.scatterplot(data=data, x=np.arange(len(data)), y=column, hue=np.where(mask, 'No Outlier', 'Outlier'), ax=ax)

    # plt.tight_layout()
    # plt.show()

    # Print how many outliers the column has, then their values
    print(f'Outliers for {column}:')
    print(column_outliers.shape)
    print(column_outliers)

# Rows that hold an outlier in at least one column
outliers = data[outlier_rows]

agressive_values_positive = {}
agressive_values_negative = {}

for column in outliers.columns:
    print(f"Max value for {column}: {data[column].max()} and Min value for {column}: {data[column].min()}")

    # Average as many extreme values as this column has outliers.
    # A column without outliers yields NaN (mean of an empty selection).
    n_outliers = outlier_counts[column]
    top_positive_values = data[data[column] > 0][column].nlargest(n_outliers)
    top_negative_values = data[data[column] < 0][column].nsmallest(n_outliers)

    agressive_values_positive[column] = top_positive_values.mean()
    agressive_values_negative[column] = top_negative_values.mean()
print()
# Print the representative values for each column
print("Agressive values for top positive values:")
for column, value in agressive_values_positive.items():
    print(f"Value for {column}: {value}")
print()
print("Agressive values for top negative values:")
for column, value in agressive_values_negative.items():
    print(f"Value for {column}: {value}")


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
dataset = os.path.join(os.getcwd(), os.pardir, 'docs', 'v2', 'dataset-all.csv')
data = pd.read_csv(dataset)

columns = ['accelerometerXAxis', 'accelerometerYAxis', 'accelerometerZAxis', 'gyroscopeXAxis', 'gyroscopeYAxis', 'gyroscopeZAxis']

data = data[columns]

# Convert all columns to float
data = data.astype(float)

# Detect extreme outliers for each sensor column using the IQR method
for column in data.columns:
    # Calculate IQR
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1

    # Standard 1.5 * IQR fences
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Mask of regular outliers (values outside the fences)
    mask = (data[column] < lower_bound) | (data[column] > upper_bound)

    # Keep only the values that are also far from the median
    median = data[column].median()
    mask_extreme_outliers = abs(data[column] - median) > 15 * IQR  # Adjust the multiplier (15) as needed for your data

    # Combine the masks to identify extreme outliers
    mask_extreme = mask & mask_extreme_outliers

    # # Plot the results
    # fig, ax = plt.subplots(figsize=(16, 6))
    # fig.suptitle(f'Extreme Outlier Detection for {column}', size=20)

    # # Plot without needing a timestamp
    # sns.scatterplot(data=data, x=np.arange(len(data)), y=column, hue=np.where(mask_extreme, 'Extreme Outlier', 'Normal'), ax=ax)

    # plt.tight_layout()
    # plt.show()

    # Print the extreme outliers (the combined mask, not the 1.5 * IQR mask)
    print(f'Extreme Outliers for {column}:')
    print(data[mask_extreme][column])


In [None]:
import os
import pandas as pd
import numpy as np

# Read the dataset and keep only the six sensor columns, as floats
dataset = os.path.join(os.getcwd(), os.pardir, 'docs', 'v2', 'dataset-all.csv')
data = pd.read_csv(dataset)

columns = ['accelerometerXAxis', 'accelerometerYAxis', 'accelerometerZAxis', 'gyroscopeXAxis', 'gyroscopeYAxis', 'gyroscopeZAxis']

data = data[columns].astype(float)

# How many of the most extreme values are averaged on each side
top_n = 1000

# Per column: mean of the top_n largest positive values and mean of the
# top_n smallest (most negative) values
agressive_values_positive = {}
agressive_values_negative = {}
for column in data.columns:
    print(f"Max value for {column}: {data[column].max()} and Min value for {column}: {data[column].min()}")

    top_positive_values = data.loc[data[column] > 0, column].nlargest(top_n)
    top_negative_values = data.loc[data[column] < 0, column].nsmallest(top_n)

    agressive_values_positive[column] = top_positive_values.mean()
    agressive_values_negative[column] = top_negative_values.mean()
print()

# Report the representative values, positive side first
print("Agressive values for top positive values:")
for column, value in agressive_values_positive.items():
    print(f"Value for {column}: {value}")
print()
print("Agressive values for top negative values:")
for column, value in agressive_values_negative.items():
    print(f"Value for {column}: {value}")


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset and keep only the six sensor columns
dataset = os.path.join(os.getcwd(), os.pardir, 'docs', 'v2', 'dataset-all.csv')
data = pd.read_csv(dataset)

columns = ['accelerometerXAxis', 'accelerometerYAxis', 'accelerometerZAxis', 'gyroscopeXAxis', 'gyroscopeYAxis', 'gyroscopeZAxis']

data = data[columns]

# Convert all columns to float
data = data.astype(float)

# Number of top values to consider
top_n = 5

# Mean and standard deviation of the top_n largest values over ALL columns
top_values = data.stack().nlargest(top_n)
top_stats = {'mean': top_values.mean(), 'std': top_values.std()}

# The bounds depend only on top_stats, so they are identical for every column
# and are computed once. Adjust the multiplier (3) as needed.
# NOTE(review): the bounds are centred on the largest values of the whole
# dataset, so most ordinary samples fall below lower_bound and get flagged —
# confirm this is the intended definition of "outlier".
lower_bound = top_stats['mean'] - 3 * top_stats['std']
upper_bound = top_stats['mean'] + 3 * top_stats['std']

for column in data.columns:
    # NOTE(review): column_stats is computed but not used by the detection.
    column_stats = {
        'mean': data[column].mean(),
        'std': data[column].std()
    }

    # Values outside [lower_bound, upper_bound] are flagged
    mask = (data[column] < lower_bound) | (data[column] > upper_bound)

    # Scatter plot of the column against the sample index, coloured by flag
    fig, ax = plt.subplots(figsize=(16, 6))
    fig.suptitle(f'Outlier Detection for {column}', size=20)

    sns.scatterplot(data=data, x=np.arange(len(data)), y=column, hue=np.where(mask, 'Outlier', 'Normal'), ax=ax)

    plt.tight_layout()
    plt.show()

    # Print the flagged values
    print(f'Outliers for {column}:')
    print(data[mask][column])


# **SEPARATE DATA IN TRAIN AND TEST**

In [None]:
# Build fixed-length windows (sequences) out of the per-sample data.
# The label of a window is computed column by column: a column is labelled
# aggressive (1) when any sample inside the window has a 1 in that column,
# and normal (0) otherwise.
# y is the 2D array with the per-sample labels.

def create_sequences(data, y, sequence_length, step_size=1):
    """Cut `data` into sliding windows and derive one label row per window.

    Args:
        data: Array of samples, indexed by sample along the first axis.
        y: 2-D numpy array of per-sample labels, aligned with `data`.
        sequence_length: Number of consecutive samples in each window.
        step_size: Offset between the starts of two consecutive windows.

    Returns:
        A tuple `(X, Y)` of numpy arrays. `X` stacks the windows of `data`;
        `Y` has one float row per window, holding 1.0 in every label column
        that contains at least one 1 inside the window and 0.0 otherwise.
    """
    windows = []
    window_labels = []

    last_start = len(data) - sequence_length
    for start in range(0, last_start + 1, step_size):
        stop = start + sequence_length
        windows.append(data[start:stop])

        # Column-wise "is there a 1 in this window?", stored as 0.0 / 1.0.
        has_positive = (y[start:stop] == 1).any(axis=0)
        window_labels.append(has_positive.astype(float))

    return np.array(windows), np.array(window_labels)

# Keep the per-sample arrays under new names before x and y are rebound to
# the windowed versions.
xs = x
ys = y

# Windows of 40 samples, with a new window starting every 4 samples.
x, y = create_sequences(xs, ys, sequence_length=40, step_size=4)

# Split the data into training and test sets with sklearn
from sklearn.model_selection import train_test_split

# NOTE(review): consecutive windows overlap (length 40, step 4) and
# train_test_split shuffles by default, so near-identical windows can end up
# on both sides of the split; no random_state is set either, so the split
# changes on every run — confirm both are acceptable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
# x_train, x_test = split_train_test(x, test_size=0.2)

# y_train, y_test = split_train_test(y, test_size=0.2)

# **CREATE THE INPUT TENSOR DATA**

In [None]:
# Keras LSTM layers take 3-D input shaped (samples, timesteps, features).
if x_train.ndim == 3:
    # Windowed data (as produced by create_sequences) already has that
    # layout; reshaping it to (samples, 1, timesteps) would raise ValueError.
    train = x_train
    test = x_test
else:
    # Flat (samples, features) data: treat each sample as a one-step sequence.
    train = x_train.reshape(x_train.shape[0], 1, x_train.shape[1])
    test = x_test.reshape(x_test.shape[0], 1, x_test.shape[1])

print(train.shape)
print(test.shape)

# np.savetxt only accepts 1-D or 2-D arrays, so every sample is flattened to a
# single CSV row (a no-op for data that is already 2-D).
np.savetxt("runtime_saves/train&test/x_train.csv", x_train.reshape(x_train.shape[0], -1), delimiter=',', fmt='%.9f')
np.savetxt("runtime_saves/train&test/x_test.csv", x_test.reshape(x_test.shape[0], -1), delimiter=',', fmt='%.9f')
np.savetxt("runtime_saves/train&test/y_train.csv", y_train, delimiter=',', fmt='%.0i')
np.savetxt("runtime_saves/train&test/y_test.csv", y_test, delimiter=',', fmt='%.0i')

np.savetxt("runtime_saves/train&test/train.csv", train.reshape(train.shape[0], -1), delimiter=',', fmt='%.9f')
np.savetxt("runtime_saves/train&test/test.csv", test.reshape(test.shape[0], -1), delimiter=',', fmt='%.9f')

# **CREATE THE MODEL**

In [None]:
# Start from a clean Keras state so that re-running the cell builds a fresh model.
K.clear_session()

dropout_l0 = 0.2
dropout_l1 = 0.2
# learning_rate = 0.001

# Two stacked LSTM layers followed by a 12-unit sigmoid output layer.
model_lstm = Sequential()
# Take (timesteps, features) from the training tensor itself instead of
# hard-coding a single timestep, so the model always matches its input data.
model_lstm.add(LSTM(50, input_shape=train.shape[1:], return_sequences=True))
model_lstm.add(Dropout(dropout_l0))
model_lstm.add(LSTM(50, return_sequences=False))
model_lstm.add(Dropout(dropout_l1))
model_lstm.add(Dense(12,activation='sigmoid'))

# Compile the model
model_lstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# **TRAIN THE MODEL**

In [None]:
# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.0001)

# Train the model
model_lstm_output = model_lstm.fit(train, y_train, epochs=30, batch_size=32, validation_split=0.2, shuffle=True, callbacks=[early_stopping, reduce_lr])

# **SHOW THE RESULTS**

In [None]:
# Training history: loss on the fitted data and on the validation split, per epoch.
plt.plot(model_lstm_output.history['loss'])
plt.plot(model_lstm_output.history['val_loss'])
plt.title('Historico de train')
plt.xlabel('Epocas de train')
plt.ylabel('Função custo')
# 'val_loss' is measured on the validation split used during fit, not on the
# test set, so the second curve is labelled as validation error.
plt.legend(['Erro train', 'Erro validação'])
plt.show()

In [None]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# taken as the accuracy.
test_metrics = model_lstm.evaluate(test, y_test)
accuracy = test_metrics[1]
print('Test Accuracy:', accuracy)

# **TEST THE NETWORK**

In [None]:
# Evaluate on the test tensor in batches of 16 and report loss and accuracy.
loss, accurary = model_lstm.evaluate(x=test, y=y_test, batch_size=16)
print('Test loss/accurary:', loss, accurary)


## **Confusion Matrix**

In [None]:
# y_pred = model_lstm.predict(treino)
# y_pred_classes = np.argmax(y_pred, axis=1)

# # Plot confusion matrix
# cm = confusion_matrix(y_treino, y_pred_classes)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# disp.plot(cmap=plt.cm.Blues)
# plt.title('Matriz de Confusão')
# plt.show()

# # Additional debugging: Print training history
# print(history.history)

In [None]:
# Inspect the first test sample and the shape of the test tensor
# (in a notebook only the last expression is displayed).
test[0]
test.shape

In [None]:
# Show the model output for the first (up to 100) test samples that carry at
# least one positive label.
num_samples_to_show = min(100, len(test))

# One batched predict call on the already model-shaped test tensor, instead
# of reshaping and predicting sample by sample.
predictions = model_lstm.predict(test[:num_samples_to_show])

for i in range(num_samples_to_show):
    if np.sum(y_test[i]) > 0:
        # Round copies for display only; rounding x_test in place would also
        # alter the data used by any later evaluation.
        prediction = np.round(predictions[i:i + 1], decimals=1)
        print("X [:", np.round(x_test[i], decimals=1))
        print("Y [:", y_test[i])
        print("PC:", prediction)
        print(i)
#PREDICTIONS WITH COLAB MODEL
#prediction = model.predict(teste)
#print("Predicted class:", prediction)
#print("Predicted class:", predicted_class)

In [None]:
# Hand-written sample of twelve feature values, shaped as one batch with one
# timestep and twelve features.
# NOTE(review): the (1, 1, 12) shape must match the input shape the model was
# built with — confirm before running.
test_value = np.array(
    [0., 0.363, 0.313, 0., 0., 0.31, 0.393, 0., 0., 0.244, 0.247, 0.]
).reshape(1, 1, 12)

# Predict and round the output to two decimals for display
prediction = np.round(model_lstm.predict(test_value), decimals=2)

print("Value    :", test_value[0][0])
print("Predicted:", prediction[0])

# **SAVE THE MODEL**

In [None]:
import os
import datetime

# The file name carries the current local date and time.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %HH%Mm%Ss")
model_name = 'lstm_model_' + timestamp + '.h5'

# Save the model in the runtime_saves/models folder
model_path = os.path.join(".", 'runtime_saves', 'models', model_name)
model_lstm.save(model_path)

In [None]:
# Print the Keras model summary
model_lstm.summary()