In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel

In [2]:
# Load the dataset
csv_file = "data/thermal-porosity-table.csv"  
df = pd.read_csv(csv_file)

# Separate features (X) and labels (y)
X = df.drop(columns=["Porosity Label"])  
y = df["Porosity Label"]

# Split data into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to balance the dataset
smote = SMOTE(sampling_strategy=0.2, random_state=42)  # Increase anomalies to ~20%
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Verify class distribution after SMOTE
print("Class distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

# Initialize XGBoost classifier
xgb_classifier = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    n_estimators=100,
    learning_rate=0.05,  # Reduce learning rate
    max_depth=3,  # Reduce complexity
    subsample=0.7,  # Reduce overfitting
    colsample_bytree=0.7,
    reg_lambda=1,  # Add regularization
    reg_alpha=0.5,
    random_state=42
)

# Train the model on the resampled data
xgb_classifier.fit(X_train_resampled, y_train_resampled)

# Apply Feature Selection
selector = SelectFromModel(xgb_classifier, threshold="mean", prefit=True)
X_train_selected = selector.transform(X_train_resampled)
X_test_selected = selector.transform(X_test)

# Train the model again with selected features
xgb_classifier.fit(X_train_selected, y_train_resampled)

# Make predictions
y_pred = xgb_classifier.predict(X_test_selected)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print evaluation results
print("\nXGBoost Model Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

# Optional: Save model
xgb_classifier.save_model("xgboost_anomaly_model_smote_feature_selection.json")
print("XGBoost model saved as 'xgboost_anomaly_model_smote_feature_selection.json'")


Class distribution after SMOTE:
Porosity Label
0    1194
1     238
Name: count, dtype: int64

XGBoost Model Evaluation:
Accuracy: 1.00
Confusion Matrix:
[[299   0]
 [  0  14]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       299
           1       1.00      1.00      1.00        14

    accuracy                           1.00       313
   macro avg       1.00      1.00      1.00       313
weighted avg       1.00      1.00      1.00       313

XGBoost model saved as 'xgboost_anomaly_model_smote_feature_selection.json'


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [3]:
print(f"Dataset Shape: {df.shape}")  # Should be (1564, N) if all frames are included


Dataset Shape: (1564, 12)


In [4]:
print("Missing Labels:", df["Porosity Label"].isna().sum())  # Should be 0


Missing Labels: 0


In [5]:
print(df["Porosity Label"].value_counts())  # Should show how many 0s and 1s exist


Porosity Label
0    1493
1      71
Name: count, dtype: int64


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


Train set size: 1251
Test set size: 313
