# Autoencoder

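This notebook implements an unsupervised autoencoder to detect anomalies in thermal images collected from Directed Energy Deposition (DED) processes. The model is trained to reconstruct every available frame without using labels; because most frames are expected to be normal, frames whose reconstruction error (MSE) exceeds a percentile threshold (95th or 99th) are flagged as anomalous. When a porosity label file is available, the flags are then evaluated against it.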


In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, confusion_matrix
from keras.models import Model
from keras.layers import Input, Dense
from sklearn.impute import SimpleImputer

In [2]:
# Path to frame directory (update as needed)
frames_dir = 'data/images'

# Collect every per-frame CSV. The sort is lexicographic on the file name; frames are
# later matched to labels by name, so only a stable order is needed here.
frame_files = sorted(
    f for f in os.listdir(frames_dir)
    if f.startswith('Frame_') and f.endswith('.csv')
)

# Fail early with a clear message instead of a cryptic reshape error further down
if not frame_files:
    raise ValueError(f"No 'Frame_*.csv' files found in '{frames_dir}'")

# Load each header-less CSV and flatten it so that every frame becomes one float32 row
all_data = np.array(
    [pd.read_csv(os.path.join(frames_dir, name), header=None).values for name in frame_files],
    dtype=np.float32,
).reshape(len(frame_files), -1)

# Treat 0s as missing values by replacing them with NaN
all_data[all_data == 0] = np.nan

# Use mean imputation to replace NaNs with the column-wise mean.
# NOTE(review): with default settings SimpleImputer discards columns that are entirely
# NaN (here: positions that are 0 in every frame), so the number of features after this
# step may be smaller than the number of pixels per frame - confirm this is intended.
imputer = SimpleImputer(strategy="mean")
all_data = imputer.fit_transform(all_data)

# Normalize data: MinMaxScaler rescales each column independently (default range [0, 1])
scaler = MinMaxScaler()
all_data = scaler.fit_transform(all_data)


In [3]:

# Builder for the single-hidden-layer dense autoencoder
def create_autoencoder(input_dim, encoding_dim=64):
    """Build and compile a dense autoencoder with one hidden (bottleneck) layer.

    Args:
        input_dim: Number of features per sample; also the width of the output layer.
        encoding_dim: Number of units in the ReLU bottleneck layer (default 64).

    Returns:
        A compiled model mapping an ``input_dim``-wide input to an ``input_dim``-wide
        sigmoid output, using the Adam optimizer and mean-squared-error loss.
    """
    inputs = Input(shape=(input_dim,))

    # Encoder: compress the input down to `encoding_dim` units
    bottleneck = Dense(encoding_dim, activation='relu')(inputs)

    # Decoder: expand back to the input width; sigmoid keeps outputs in (0, 1)
    outputs = Dense(input_dim, activation='sigmoid')(bottleneck)

    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mse')
    return model

# Autoencoder setup: the input width is the number of feature columns in all_data
input_dim = all_data.shape[1]
autoencoder = create_autoencoder(input_dim)

# Train the autoencoder to reproduce its own input (no labels are used).
# NOTE(review): no random seed is set in this cell, so the trained weights - and hence
# the MSE values and thresholds below - may differ between runs.
autoencoder.fit(all_data, all_data, epochs=50, batch_size=16, verbose=1)

# Evaluate all frames using the trained model.
# Reconstruct every frame in a single batched predict() call instead of invoking
# predict() once per frame inside a Python loop.
frames_clean = np.nan_to_num(all_data)  # Replace any NaN before prediction
reconstructed = np.nan_to_num(autoencoder.predict(frames_clean))  # Handle NaN in the output

# Per-frame reconstruction error: mean over the feature axis, accumulated in float64
frame_mse = np.mean(np.square(frames_clean - reconstructed), axis=1, dtype=np.float64)

# Collect MSE results in a DataFrame (one row per frame file)
mse_results = pd.DataFrame({'Frame': frame_files, 'MSE': frame_mse})

# Flag frames whose error is strictly above the 99th percentile of all errors.
# result.csv is written at this point, so it holds only Frame, MSE and Predicted_99.
mse_threshold_99 = np.percentile(mse_results['MSE'], 99)
mse_results['Predicted_99'] = (mse_results['MSE'] > mse_threshold_99).astype(int)
mse_results.to_csv('result.csv', index=False)

# Same rule with the 95th percentile; result_95.csv holds both prediction columns.
mse_threshold_95 = np.percentile(mse_results['MSE'], 95)
mse_results['Predicted_95'] = (mse_results['MSE'] > mse_threshold_95).astype(int)
mse_results.to_csv('result_95.csv', index=False)

# Output threshold results
print("Anomaly threshold (99th percentile):", mse_threshold_99)
print("Anomaly threshold (95th percentile):", mse_threshold_95)
print("Anomaly detection completed. Results saved to 'result_95.csv'.")



Epoch 1/50
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - loss: 0.0241
Epoch 2/50
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 0.0047
Epoch 3/50
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - loss: 0.0042
Epoch 4/50
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - loss: 0.0034
Epoch 5/50
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - loss: 0.0031
Epoch 6/50
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - loss: 0.0029
Epoch 7/50
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - loss: 0.0027
Epoch 8/50
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - loss: 0.0026
Epoch 9/50
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - loss: 0.0022
Epoch 10/50
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 0.0025

In [4]:
# If the label file is available, use it to evaluate the predictions
labels_path = 'data/frame_porosity_labels.csv'


def _report_threshold_metrics(merged, predicted_col, title):
    """Print and return accuracy, confusion matrix and classification report.

    Args:
        merged: DataFrame containing a 'Label' column and the column `predicted_col`.
        predicted_col: Name of the 0/1 prediction column to score against 'Label'.
        title: Text inserted into the "Evaluation for ...:" heading line.

    Returns:
        Tuple ``(accuracy, confusion_matrix, classification_report)``.
    """
    accuracy = accuracy_score(merged['Label'], merged[predicted_col])
    conf_matrix = confusion_matrix(merged['Label'], merged[predicted_col])
    report = classification_report(merged['Label'], merged[predicted_col])

    print(f"\nEvaluation for {title}:")
    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(report)
    return accuracy, conf_matrix, report


if os.path.exists(labels_path):
    # Read CSV with the default ',' delimiter
    labels_data = pd.read_csv(labels_path)

    # Ensure column names are stripped of any leading/trailing spaces
    labels_data.columns = labels_data.columns.str.strip()

    # Rename the label column to the name used below ('Frame' already matches)
    labels_data = labels_data.rename(columns={"Porosity Label": "Label"})

    # Build the same 'Frame_<id>' key that the frame file names use
    labels_data['Frame'] = "Frame_" + labels_data['Frame'].astype(str)

    # Fix Frame names in mse_results (remove '.csv' extension) so both sides share a key
    mse_results['Frame'] = mse_results['Frame'].str.replace('.csv', '', regex=False)

    # Merge with predictions; the inner join keeps only frames present on both sides
    merged_data = pd.merge(mse_results, labels_data, on='Frame', how='inner')

    # Check if merged data is empty before evaluation
    if len(merged_data) == 0:
        print("Error: No matching frames between MSE results and labels data!")
    else:
        # Evaluate metrics for 99th percentile threshold
        accuracy_99, conf_matrix_99, classification_rep_99 = _report_threshold_metrics(
            merged_data, 'Predicted_99', "99th Percentile Threshold"
        )

        # Evaluate metrics for 95th percentile threshold
        accuracy_95, conf_matrix_95, classification_rep_95 = _report_threshold_metrics(
            merged_data, 'Predicted_95', "95th Percentile Threshold"
        )
else:
    # Make the skipped evaluation visible instead of finishing silently
    print(f"Labels file not found at '{labels_path}'; skipping evaluation.")



Evaluation for 99th Percentile Threshold:
Accuracy: 0.96
Confusion Matrix:
[[1489    4]
 [  59   12]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1493
           1       0.75      0.17      0.28        71

    accuracy                           0.96      1564
   macro avg       0.86      0.58      0.63      1564
weighted avg       0.95      0.96      0.95      1564


Evaluation for 95th Percentile Threshold:
Accuracy: 0.96
Confusion Matrix:
[[1457   36]
 [  28   43]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1493
           1       0.54      0.61      0.57        71

    accuracy                           0.96      1564
   macro avg       0.76      0.79      0.78      1564
weighted avg       0.96      0.96      0.96      1564

