In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

In [5]:
# Load data
data_file_path = 'dataset/secom.data'
labels_file_path = 'dataset/secom_labels.data'

data = pd.read_csv(data_file_path, sep='\s+', header=None)
labels = pd.read_csv(labels_file_path, sep='\s+', header=None)

# Extract the label column (assuming the first column in labels is the target)
y = labels.iloc[:, 0]


In [6]:
# Handle missing values by imputing with the mean of each column
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(data)

In [7]:
# Balance the dataset using SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [9]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Train a RandomForest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [11]:
# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


Confusion Matrix:
[[448   1]
 [  6 423]]

Classification Report:
              precision    recall  f1-score   support

          -1       0.99      1.00      0.99       449
           1       1.00      0.99      0.99       429

    accuracy                           0.99       878
   macro avg       0.99      0.99      0.99       878
weighted avg       0.99      0.99      0.99       878


Accuracy Score:
0.9920273348519362


In [12]:
# Perform GridSearchCV to find the best parameters for the RandomForest model
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
print("Best Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best))
print("\nBest Model Classification Report:")
print(classification_report(y_test, y_pred_best))
print("\nBest Model Accuracy Score:")
print(accuracy_score(y_test, y_pred_best))


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Model Confusion Matrix:
[[445   4]
 [  5 424]]

Best Model Classification Report:
              precision    recall  f1-score   support

          -1       0.99      0.99      0.99       449
           1       0.99      0.99      0.99       429

    accuracy                           0.99       878
   macro avg       0.99      0.99      0.99       878
weighted avg       0.99      0.99      0.99       878


Best Model Accuracy Score:
0.989749430523918


In [13]:
import joblib

# Save the model
joblib.dump(best_model, 'predictive_quality_control_model.pkl')


['predictive_quality_control_model.pkl']