In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

2024-08-01 20:10:03.679103: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-01 20:10:03.692779: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-01 20:10:03.696862: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-01 20:10:03.708154: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data_c = pd.read_parquet('./Local/2017_Clean/Combined.parquet')
data_f1 = pd.read_parquet('./Local/2017_Final/Combined_1.parquet')
data_f2 = pd.read_parquet('./Local/2017_Final/Combined_2.parquet')

datasets = {'Clean data': data_c, 'Final data 1': data_f1, 'Final data 2': data_f2}
autoencoders = {}
reconstruction_errors = {}

In [3]:
def preprocess_data(df):
    label_encoder = LabelEncoder()
    
    if 'Label' in df.columns:
        label_col = 'Label'
    elif ' Label' in df.columns:
        label_col = ' Label'
    else:
        raise ValueError("DataFrame does not contain a label column")
    
    df[label_col] = label_encoder.fit_transform(df[label_col])
    
    X = df.drop(label_col, axis=1)
    y = df[label_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

processed_datasets = {name: preprocess_data(df) for name, df in datasets.items()}

In [22]:
batch_size = 256
epochs = 100
encoding_dim = 32

In [23]:
def build_autoencoder(input_dim, encoding_dim):
    # Encoder
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    
    # Decoder
    decoded = Dense(input_dim, activation='sigmoid')(encoded)
    
    # Autoencoder
    autoencoder = Model(input_layer, decoded)
    
    # Compile the model
    autoencoder.compile(optimizer=Adam(), loss='mse')
    
    return autoencoder

def train_autoencoder(autoencoder, X_train, X_test, epochs=epochs, batch_size=batch_size):
    # Train the autoencoder
    history = autoencoder.fit(X_train, X_train,
                              epochs=epochs,
                              batch_size=batch_size,
                              shuffle=True,
                              validation_data=(X_test, X_test),
                              verbose=1)
    return history

def evaluate_autoencoder(autoencoder, X_test):
    # Get the reconstruction errors
    X_test_pred = autoencoder.predict(X_test)
    reconstruction_errors = np.mean(np.square(X_test - X_test_pred), axis=1)
    return reconstruction_errors

def autoencoder_anomaly_detection(X_train, X_test, encoding_dim=encoding_dim, epochs=epochs, batch_size=batch_size):
    input_dim = X_train.shape[1]
    autoencoder = build_autoencoder(input_dim, encoding_dim)
    train_autoencoder(autoencoder, X_train, X_test, epochs, batch_size)
    reconstruction_errors = evaluate_autoencoder(autoencoder, X_test)
    return autoencoder, reconstruction_errors


In [24]:
for name, (X_train, X_test, y_train, y_test) in processed_datasets.items():
    autoencoder, errors = autoencoder_anomaly_detection(X_train, X_test)
    autoencoders[name] = autoencoder
    reconstruction_errors[name] = errors

Epoch 1/100
[1m8040/8040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 749us/step - loss: 0.6925 - val_loss: 0.7708
Epoch 2/100
[1m8040/8040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 727us/step - loss: 0.6623 - val_loss: 0.7700
Epoch 3/100
[1m8040/8040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 709us/step - loss: 0.6570 - val_loss: 0.7698
Epoch 4/100
[1m8040/8040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 710us/step - loss: 0.6021 - val_loss: 0.7698
Epoch 5/100
[1m8040/8040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 705us/step - loss: 0.6465 - val_loss: 0.7698
Epoch 6/100
[1m8040/8040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 717us/step - loss: 0.7241 - val_loss: 0.7697
Epoch 7/100
[1m8040/8040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 713us/step - loss: 0.6149 - val_loss: 0.7697
Epoch 8/100
[1m8040/8040[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 705us/step - loss: 0.6638 - val_loss: 0.7697
