In [None]:
# Chronological cut-off: rows strictly before this date go to the training
# set, rows on or after it go to the test set.
split_date = '2018-02-01'  # Example split date

# Partition `combined` on its 'UTC_TIME' column using two explicit masks
before_split = combined['UTC_TIME'] < split_date
from_split_onwards = combined['UTC_TIME'] >= split_date
train_df = combined.loc[before_split]
test_df = combined.loc[from_split_onwards]

# Report the (rows, columns) of each partition
print(f'Training data shape: {train_df.shape}')
print(f'Testing data shape: {test_df.shape}')

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Prepare the data for the autoencoder.
# The scaler is fitted on the training split only and re-used on the test
# split, so the test data does not influence the scaling statistics.
scaler = StandardScaler()

# Use only the 'Real_FOB-predicted_FOB' column as the (single) input feature
train_x = scaler.fit_transform(train_df[['Real_FOB-predicted_FOB']])
test_x = scaler.transform(test_df[['Real_FOB-predicted_FOB']])

# The input dimension follows the number of selected feature columns
input_dim = train_x.shape[1]

# Build the autoencoder model
encoding_dim = 12  # Example encoding dimension

inputArray = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='tanh')(inputArray)
# The reconstruction layer must be linear: the inputs are standardised, so
# they are unbounded and can be negative. A softmax here normalises over the
# output units and, with a single unit, always emits 1.0 -- the network
# could never reconstruct its input.
decoded = Dense(input_dim, activation='linear')(encoded)

autoencoder = Model(inputArray, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

autoencoder.summary()

# Train the autoencoder to reproduce its own input; the test split doubles
# as validation data. shuffle=False keeps the row order of train_x.
autoencoder.fit(
    train_x,
    train_x,
    epochs=50,
    batch_size=10,
    shuffle=False,
    validation_data=(test_x, test_x),
)

# Evaluate the autoencoder on test data: per-sample mean squared
# reconstruction error
reconstructed_test = autoencoder.predict(test_x)
loss = np.mean(np.square(test_x - reconstructed_test), axis=1)

# Print some results
print(f'Mean Squared Error on test data: {np.mean(loss)}')

# Set a threshold for reconstruction error to identify anomalies.
# NOTE(review): the percentile is taken over the *test* errors, so roughly 5%
# of the test samples are flagged regardless of the data -- consider deriving
# the threshold from the training errors instead.
threshold = np.percentile(loss, 95)  # for example, 95th percentile

# Convert reconstruction errors to binary labels (1 = above threshold)
predicted_labels = (loss > threshold).astype(int)

# True labels (assuming they are binary: 1 for anomaly, 0 for normal)
# NOTE(review): later cells read the label from 'FAKE_LEAKAGE' rather than
# 'leak' -- confirm which column is the intended ground truth.
true_labels = test_df['leak'].values

# Print the confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(true_labels, predicted_labels)
print(cm)

# Plot the loss distribution with the anomaly threshold marked in red
plt.figure(figsize=(10, 6))
plt.hist(loss, bins=50, density=True, alpha=0.6, color='g')
plt.axvline(threshold, color='r', linestyle='dashed', linewidth=2)
plt.xlabel('Loss')
plt.ylabel('Density')
plt.title('Loss Distribution')
plt.show()


In [None]:
import tensorflow as tf
from tensorflow.keras import layers

class AnomalyDetector(tf.keras.Model):
    """Fully connected autoencoder used to score samples by reconstruction error.

    The encoder narrows the input through Dense layers of 80 -> 20 units and
    the decoder widens it back through 20 -> 80 units before a final
    reconstruction layer.

    Args:
        output_dim: Number of units in the final decoder layer. It should
            equal the number of input features so that the reconstruction has
            the same shape as the input. Defaults to 10.
        output_activation: Activation of the final decoder layer. Use
            "linear" for standardised inputs (which may be negative or larger
            than 1); "sigmoid" only suits inputs scaled to [0, 1]. Defaults
            to "sigmoid".
    """

    def __init__(self, output_dim=10, output_activation="sigmoid"):
        super().__init__()
        self.encoder = tf.keras.Sequential([
            layers.Dense(80, activation="relu"),
            layers.Dense(70, activation="relu"),
            layers.Dense(60, activation="relu"),
            layers.Dense(50, activation="relu"),
            layers.Dense(40, activation="relu"),
            layers.Dense(30, activation="relu"),
            layers.Dense(20, activation="relu")
        ])

        self.decoder = tf.keras.Sequential([
            layers.Dense(20, activation="relu"),
            layers.Dense(30, activation="relu"),
            layers.Dense(40, activation="relu"),
            layers.Dense(50, activation="relu"),
            layers.Dense(60, activation="relu"),
            layers.Dense(70, activation="relu"),
            layers.Dense(80, activation="relu"),
            # Reconstruction layer: width and activation are configurable so
            # the output can match the shape and value range of the input.
            layers.Dense(output_dim, activation=output_activation)
        ])

    def call(self, x):
        """Encode `x`, then decode it and return the reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# The model is trained on `train_x`, so reconstruct exactly as many features
# as `train_x` has (a fixed width of 10 would silently broadcast against the
# input in the loss). `train_x` comes out of a StandardScaler and is therefore
# not confined to (0, 1), hence the linear output.
autoencoder = AnomalyDetector(output_dim=train_x.shape[1], output_activation="linear")


In [None]:
# Configure training: mean-squared-error loss minimised with Adam (lr = 1e-3)
autoencoder.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

In [None]:
# Train the autoencoder to reproduce its own input (target == input).
# The test split doubles as validation data; `history` keeps the returned
# training history object.
history = autoencoder.fit(
    x=train_x,
    y=train_x,
    batch_size=100,
    epochs=300,
    shuffle=True,
    validation_data=(test_x, test_x),
)

In [None]:
# Reconstruct the test samples, then score each row by the mean of its
# squared reconstruction residuals.
test_x_predictions = autoencoder.predict(test_x)
mse = np.power(test_x - test_x_predictions, 2).mean(axis=1)

In [None]:
# Score the model: per-sample mean squared reconstruction error on the test split
test_x_predictions = autoencoder.predict(test_x)
mse = np.mean(np.power(test_x - test_x_predictions, 2), axis=1)

# One row per test sample: its reconstruction error, its ground-truth label
# and the flight it belongs to. Built once, with every column the rest of the
# notebook reads.
error_df = pd.DataFrame({'Reconstruction_error': mse,
                         'True_class': test_df['FAKE_LEAKAGE'],
                         'Flight': test_df['Flight']})

# Summary statistics of the error and the label. Printed explicitly: a bare
# expression in the middle of a cell produces no output.
print(error_df[['Reconstruction_error', 'True_class']].describe())

# Rows whose ground-truth label marks a leakage.
# NOTE: despite the variable names, this selects on 'True_class' (the label),
# not on a model prediction -- no threshold has been applied in this cell.
predicted_leakages = error_df[error_df['True_class'] == 1]

# Print the labelled leakages and the flight they belong to
print(predicted_leakages)

# Group by 'Flight' and count the number of labelled leakages for each flight
predicted_leakages_count = predicted_leakages.groupby('Flight').size()

# Print the result
print(predicted_leakages_count)


In [None]:
#create confusion matrix
from sklearn.metrics import confusion_matrix
# A sample is predicted anomalous when its reconstruction error exceeds this
# fixed cut-off.
threshold_fixed = 0.05
is_detected = error_df['Reconstruction_error'] > threshold_fixed
y_pred = is_detected.astype(int).tolist()

# labels=[0, 1] keeps the matrix 2x2 even when one class is absent from the
# labels or the predictions, so it always matches the tick labels below.
conf_matrix = confusion_matrix(error_df.True_class, y_pred, labels=[0, 1])

plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'], annot=True, fmt="d");
plt.title('Confusion matrix')
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

# Flights in which anomalies were detected, and how many per flight.
# Select on the model's decision (error above the threshold), not on the
# ground-truth label, so the counts really describe *detected* anomalies.
anomalies = error_df.loc[is_detected, ['Flight', 'True_class']]

# Group by 'Flight' and count the number of detected anomalies for each flight
anomalies_count = anomalies.groupby('Flight').size()

# Print the result
print(anomalies_count)


In [None]:
# Bar chart of the per-flight anomaly counts, ordered from most to fewest
ax = anomalies_count.sort_values(ascending=False).plot.bar(figsize=(15, 6))
ax.set_title('Number of anomalies detected in each flight')
ax.set_xlabel('Flight')
ax.set_ylabel('Number of anomalies')
plt.show()

In [None]:
# `autoencoder` is a subclassed tf.keras.Model (AnomalyDetector). Whole-model
# HDF5 saving is only supported for Functional/Sequential models, so persist
# the trained weights instead. To restore: build an AnomalyDetector with the
# same constructor arguments, call it once on a batch, then load_weights().
autoencoder.save_weights('autoencoder_model_new.weights.h5')