In [1]:
import pandas as pd
import os

# Directory that holds the per-frame CSV exports
frames_dir = 'data/images'

# Take the first directory entry that looks like a frame file (Frame_*.csv);
# stays None when the directory contains no such file.
sample_file = None
for entry in os.listdir(frames_dir):
    if entry.endswith('.csv') and entry.startswith('Frame_'):
        sample_file = entry
        break

if not sample_file:
    print("❌ No valid .csv frame files found in the directory.")
else:
    # The CSV has no header row: every row of the file is a row of pixels.
    sample_path = os.path.join(frames_dir, sample_file)
    sample_frame = pd.read_csv(sample_path, header=None)
    print(f"✅ Sample frame loaded: {sample_file}")
    print(f"🖼️ Image dimensions: {sample_frame.shape[0]} (height) × {sample_frame.shape[1]} (width)")


✅ Sample frame loaded: Frame_46.csv
🖼️ Image dimensions: 200 (height) × 201 (width)


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Cropping2D
from keras.optimizers import Adam

# Expected dimensions (rows, columns) of every CSV frame
frame_height, frame_width = 200, 201

# === Step 1: Load and preprocess data ===
frames_dir = 'data/images'

# Lexicographic sort (so Frame_10 comes before Frame_2); the order only has to
# be stable, because file names and data rows are paired by position later on.
frame_files = sorted(
    f for f in os.listdir(frames_dir)
    if f.startswith('Frame_') and f.endswith('.csv')
)
if not frame_files:
    # Fail here with a clear message instead of a reshape error further down.
    raise FileNotFoundError(f"No Frame_*.csv files found in '{frames_dir}'")

all_data = []
for file_name in frame_files:
    file_path = os.path.join(frames_dir, file_name)
    frame = pd.read_csv(file_path, header=None).values.astype(np.float32)
    if frame.shape != (frame_height, frame_width):
        # A single odd-sized frame would otherwise break the stacking/reshape
        # steps without saying which file is responsible.
        raise ValueError(
            f"{file_name} has shape {frame.shape}, "
            f"expected ({frame_height}, {frame_width})"
        )
    frame[frame == 0] = np.nan  # Treat 0s as missing
    all_data.append(frame)

all_data = np.array(all_data)  # Shape: (num_samples, frame_height, frame_width)

# Impute missing pixels (NaN) with the mean of the valid pixels in the same
# column of the same frame.
for i in range(all_data.shape[0]):
    frame = all_data[i]  # view into all_data: writes below land in all_data
    nan_mask = np.isnan(frame)
    if not nan_mask.any():
        continue

    # Columns that contain at least one valid pixel
    has_valid = ~nan_mask.all(axis=0)

    # A column with no valid pixel has no mean of its own (np.nanmean would
    # return NaN and the gap would survive into training). Use the frame-wide
    # mean for such columns, or 0 when the whole frame is missing.
    fallback = np.nanmean(frame) if has_valid.any() else 0.0
    col_mean = np.full(frame.shape[1], fallback, dtype=frame.dtype)
    col_mean[has_valid] = np.nanmean(frame[:, has_valid], axis=0)

    # np.where(nan_mask)[1] gives the column index of every missing pixel
    frame[nan_mask] = col_mean[np.where(nan_mask)[1]]

# Min-max scale to [0, 1]. The scaler needs 2-D input, so every frame is
# flattened to one row first and restored to image form (plus a trailing
# channel axis) afterwards.
num_frames = all_data.shape[0]
scaler = MinMaxScaler()
reshaped_data = scaler.fit_transform(all_data.reshape(num_frames, -1))
all_data = reshaped_data.reshape(num_frames, frame_height, frame_width, 1)

# === Step 2: Define convolutional autoencoder ===
def create_conv_autoencoder(input_shape):
    """Build and compile a small convolutional autoencoder.

    The encoder is two Conv2D + BatchNormalization + 2x2 max-pooling stages
    (32 then 16 filters). The decoder is two Conv2D + 2x up-sampling stages
    (16 then 32 filters), a crop back to the input height/width, and a final
    single-filter sigmoid convolution.

    Args:
        input_shape: (height, width, channels) of one input frame.

    Returns:
        A keras Model mapping a frame to its reconstruction, compiled with
        the Adam optimizer and mean-squared-error loss.
    """
    # Imported here because the notebook-level keras import does not include it.
    from keras.layers import BatchNormalization

    def _upsampling_excess(size, fallback):
        """Extra rows/cols left after two 'same' 2x2 pools and two 2x upsamplings."""
        if size is None:
            # Dimension unknown at build time: keep the previously hard-coded crop.
            return fallback
        pooled = (size + 1) // 2    # ceil(size / 2) after the first pooling
        pooled = (pooled + 1) // 2  # ... and after the second pooling
        return 4 * pooled - size

    # 'same' pooling rounds odd sizes up, so the decoder can overshoot the
    # input by up to 3 pixels per axis (0 rows / 3 columns for 200x201).
    crop_bottom = _upsampling_excess(input_shape[0], 0)
    crop_right = _upsampling_excess(input_shape[1], 3)

    input_layer = Input(shape=input_shape)

    # Encoder
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_layer)
    x = BatchNormalization()(x)
    x = MaxPooling2D((2, 2), padding='same')(x)

    x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    encoded = MaxPooling2D((2, 2), padding='same')(x)

    # Decoder
    x = Conv2D(16, (3, 3), activation='relu', padding='same')(encoded)
    x = UpSampling2D((2, 2))(x)
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    # Trim the overshoot so the reconstruction matches the input size.
    x = Cropping2D(cropping=((0, crop_bottom), (0, crop_right)))(x)
    decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer=Adam(), loss='mse')
    return autoencoder

# One frame as the network sees it: height x width x 1 channel
input_shape = (frame_height, frame_width, 1)
autoencoder = create_conv_autoencoder(input_shape)

# === Step 3: Train the model ===
# Input and target are the same array: the network is trained to reproduce
# its own input.
autoencoder.fit(
    x=all_data,
    y=all_data,
    batch_size=16,
    epochs=50,
    shuffle=True,
    verbose=1,
)

# === Step 4: Calculate reconstruction errors ===
reconstructions = autoencoder.predict(all_data)

# One (file name, per-frame MSE) pair per frame; frames, reconstructions and
# file names are matched by position.
recon_errors = [
    (fname, mean_squared_error(original.ravel(), reconstructed.ravel()))
    for original, reconstructed, fname in zip(all_data, reconstructions, frame_files)
]

mse_df = pd.DataFrame(recon_errors, columns=["Frame", "MSE"])

# === Step 5: Thresholding ===
# Cut-offs taken from the reconstruction-error distribution itself.
threshold_99 = np.percentile(mse_df['MSE'], 99)
threshold_95 = np.percentile(mse_df['MSE'], 95)

# Flag a frame (1) when its error is strictly above the cut-off, else 0.
mse_df['Predicted_99'] = mse_df['MSE'].gt(threshold_99).astype('int64')
mse_df['Predicted_95'] = mse_df['MSE'].gt(threshold_95).astype('int64')

# Persist the per-frame errors and flags
mse_df.to_csv("conv_autoencoder_results.csv", index=False)

print(f"✅ Threshold (95th percentile): {threshold_95:.6f}")
print(f"✅ Threshold (99th percentile): {threshold_99:.6f}")
print("📄 Results saved to 'conv_autoencoder_results.csv'")


Epoch 1/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 873ms/step - loss: 0.6570 - val_loss: 0.6065
Epoch 2/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 831ms/step - loss: 0.5547 - val_loss: 0.6045
Epoch 3/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 810ms/step - loss: 0.5543 - val_loss: 0.5935
Epoch 4/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 754ms/step - loss: 0.5534 - val_loss: 0.5881
Epoch 5/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 765ms/step - loss: 0.5546 - val_loss: 0.5716
Epoch 6/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 750ms/step - loss: 0.5545 - val_loss: 0.5660
Epoch 7/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 753ms/step - loss: 0.5546 - val_loss: 0.5615
Epoch 8/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 754ms/step - loss: 0.5531 - val_loss: 0.5605
Epoch 9/40
[1m88/88[0m [32m━━

In [29]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Percentile of the reconstruction-error distribution used as the anomaly
# cut-off for this evaluation.
EVAL_PERCENTILE = 97

# NOTE: the "_90" suffix in threshold_90 / Predicted_90 / y_pred_90 is
# historical and kept only so existing references keep working; the cut-off
# actually applied is EVAL_PERCENTILE, and the printed header reports that value.
threshold_90 = np.percentile(mse_df['MSE'], EVAL_PERCENTILE)
mse_df['Predicted_90'] = (mse_df['MSE'] > threshold_90).astype(int)

# Load the ground-truth labels and bring both tables to the same key format
# ("Frame_<id>", without the ".csv" extension).
labels_path = 'data/frame_porosity_labels.csv'
labels_df = pd.read_csv(labels_path)
labels_df.columns = labels_df.columns.str.strip()
labels_df['Frame'] = "Frame_" + labels_df['Frame'].astype(str)
mse_df['Frame'] = mse_df['Frame'].str.replace('.csv', '', regex=False)

merged = pd.merge(mse_df, labels_df, on='Frame', how='inner')

# The inner join silently drops frames without a label; make that visible so
# the metrics below are not mistaken for a full-dataset evaluation.
n_unlabelled = len(mse_df) - merged['Frame'].nunique()
if n_unlabelled:
    print(f"⚠️ {n_unlabelled} frame(s) have no label and are excluded from the evaluation.")

# Evaluate
y_true = merged['Porosity Label']
y_pred_90 = merged['Predicted_90']

print(f"\n📊 Evaluation at {EVAL_PERCENTILE}th Percentile Threshold:")
print("F1 Score:", f1_score(y_true, y_pred_90))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred_90))
print("Classification Report:\n", classification_report(y_true, y_pred_90))



📊 Evaluation at 90th Percentile Threshold:
F1 Score: 0.5254237288135594
Confusion Matrix:
 [[1477   16]
 [  40   31]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      1493
           1       0.66      0.44      0.53        71

    accuracy                           0.96      1564
   macro avg       0.82      0.71      0.75      1564
weighted avg       0.96      0.96      0.96      1564

