In [1]:
from sklearn.ensemble import RandomForestClassifier
import pickle
import joblib
import json
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, make_scorer, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the pickled feature tables and labels from ../dataset
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


secom_data = _read_pickle('../dataset/secom_data_smoted.pkl')
secom_data_pca = _read_pickle('../dataset/secom_pca_optimal.pkl')
# squeeze() collapses a single-column label table down to one dimension
secom_labels = _read_pickle('../dataset/secom_labels_smoted.pkl').squeeze()

# Preview what was loaded: first rows of both feature tables, then the labels
for preview in (secom_data.head(), secom_data_pca.head(), secom_labels):
    print(preview)

       0        1          2          3       4      5         6       7    \
0  3031.79  2479.69  2186.9889   949.2201  1.2981  100.0  103.3322  0.1219   
1  3049.97  2441.12  2263.3222  1383.8334  2.3986  100.0   85.1778  0.1196   
2  2870.76  2583.56  2192.1889  1435.9611  2.3870  100.0  107.3989  0.1229   
3  2996.90  2448.07  2162.7556  1041.1557  0.8479  100.0  107.2622  0.1221   
4  2943.83  2445.17  2236.0667  1680.1825  1.4834  100.0   98.6889  0.1221   

      8       9    ...       580         581     582     583     584     585  \
0  1.4636  0.0009  ...  0.010500   70.061800  0.5035  0.0085  0.0024  1.6818   
1  1.3792  0.0054  ...  0.003300   39.552800  0.5053  0.0159  0.0038  3.1456   
2  1.4719 -0.0003  ...  0.006300  193.828600  0.5010  0.0127  0.0037  2.5344   
3  1.4446 -0.0016  ...  0.005372   98.442837  0.5039  0.0169  0.0046  3.3497   
4  1.4674  0.0161  ...  0.003900   49.945400  0.5001  0.0190  0.0043  3.7940   

      586     587     588       589  
0  0.0484  0

In [3]:
# Hold out 30% of the rows for testing; both splits use identical settings
# (same test fraction, same seed) and the same label vector.
split_options = {'test_size': 0.3, 'random_state': 42}

# Split for the full feature table
X_train, X_test, y_train, y_test = train_test_split(
    secom_data, secom_labels, **split_options
)

# Split for the PCA feature table
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    secom_data_pca, secom_labels, **split_options
)

In [4]:
## Model definition

# One seed shared by every stochastic component in this cell
SEED = 42

# Base learner: random forest with balanced class weights
rf_model = RandomForestClassifier(class_weight='balanced', random_state=SEED)

# Bagging ensemble whose members are copies of the random forest above
bagging_rf = BaggingClassifier(estimator=rf_model, random_state=SEED)

# Shuffled, stratified 5-fold CV so each fold keeps the class proportions
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Model-selection metric: F1 computed with label 0 treated as the positive class
# NOTE(review): the evaluation cells report metrics with pos_label=1 -- confirm
# that tuning on class 0 while reporting on class 1 is intended.
f1_scorer = make_scorer(f1_score, pos_label=0)

# Search space; the 'estimator__' prefix routes each setting to the wrapped
# random forest inside the bagging ensemble (3 * 2 * 4 * 3 * 3 = 216 candidates)
param_grid = {
    # number of trees in the forest
    'estimator__n_estimators': [100, 200, 500],
    # number of features considered at each split
    'estimator__max_features': ['sqrt', 'log2'],
    # maximum tree depth (None = unlimited)
    'estimator__max_depth': [None, 10, 20, 30],
    # minimum samples needed to split an internal node
    'estimator__min_samples_split': [2, 5, 10],
    # minimum samples required at a leaf
    'estimator__min_samples_leaf': [1, 2, 4],
}


In [5]:
# Train using the smoted data first

# Exhaustive grid search over param_grid, scored with f1_scorer on the
# stratified folds. Fits run sequentially (no n_jobs set), so this cell is slow.
grid_search = GridSearchCV(
    estimator=bagging_rf,
    param_grid=param_grid,
    cv=stratified_kfold,
    scoring=f1_scorer,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best parameters found
print(f"Best parameters found: {grid_search.best_params_}")

# Use the best found model (refit on the whole training split by GridSearchCV)
best_rf_model = grid_search.best_estimator_

# Predict on the held-out split and report metrics with label 1 as positive.
# The header previously said "SVM", but the model here is a bagged Random Forest.
rf_pred_best = best_rf_model.predict(X_test)
print("Best Random Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, rf_pred_best):.4f}")
print(f"Precision: {precision_score(y_test, rf_pred_best, pos_label=1):.4f}")
print(f"Recall: {recall_score(y_test, rf_pred_best, pos_label=1):.4f}")
print(f"F1-Score: {f1_score(y_test, rf_pred_best, pos_label=1):.4f}")

# Keep the summary objects around under their own names
conf_matrix = confusion_matrix(y_test, rf_pred_best)
f1_score_smoted = f1_score(y_test, rf_pred_best)
report = classification_report(y_test, rf_pred_best)

print("Confusion Matrix:")
print(conf_matrix)

print("\nClassification Report:")
print(report)

print("\n F1 Score:")
print(f1_score_smoted)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  10.5s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100; total time=   9.8s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100; total time=   9.6s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100; total time=   9.7s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  10.0s
[CV] END estimator__max_depth=None, estimator__max_

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found: {'estimator__max_depth': None, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 100}
Best SVM Performance:
Accuracy: 0.9989
Precision: 1.0000
Recall: 0.9977
F1-Score: 0.9989
Confusion Matrix:
[[435   0]
 [  1 442]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       435
           1       1.00      1.00      1.00       443

    accuracy                           1.00       878
   macro avg       1.00      1.00      1.00       878
weighted avg       1.00      1.00      1.00       878


 F1 Score:
0.9988700564971752


In [6]:
# Now train using the PCA data

# Same exhaustive grid search as for the full feature set, but fitted on the
# PCA features. Fits run sequentially (no n_jobs set), so this cell is slow.
grid_search_pca = GridSearchCV(
    estimator=bagging_rf,
    param_grid=param_grid,
    cv=stratified_kfold,
    scoring=f1_scorer,
    verbose=2,
)
grid_search_pca.fit(X_train_pca, y_train_pca)

# Best parameters found
print(f"Best parameters found: {grid_search_pca.best_params_}")

# Use the best found model (refit on the whole PCA training split by GridSearchCV)
best_rf_model_pca = grid_search_pca.best_estimator_

# Predict on the held-out PCA split and report metrics with label 1 as positive.
# The header previously said "SVM", but the model here is a bagged Random Forest.
rf_pred_best_pca = best_rf_model_pca.predict(X_test_pca)
print("Best Random Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test_pca, rf_pred_best_pca):.4f}")
print(f"Precision: {precision_score(y_test_pca, rf_pred_best_pca, pos_label=1):.4f}")
print(f"Recall: {recall_score(y_test_pca, rf_pred_best_pca, pos_label=1):.4f}")
print(f"F1-Score: {f1_score(y_test_pca, rf_pred_best_pca, pos_label=1):.4f}")

# Keep the summary objects around under their own names
conf_matrix_pca = confusion_matrix(y_test_pca, rf_pred_best_pca)
f1_score_pca = f1_score(y_test_pca, rf_pred_best_pca)
report_pca = classification_report(y_test_pca, rf_pred_best_pca)

print("Confusion Matrix:")
print(conf_matrix_pca)

print("\nClassification Report:")
print(report_pca)

print("\n F1 Score:")
print(f1_score_pca)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  13.4s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  13.4s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  13.3s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  13.2s
[CV] END estimator__max_depth=None, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  13.3s
[CV] END estimator__max_depth=None, estimator__max_

In [None]:
# Persist both tuned models with joblib (the files carry a .pkl extension)
for fitted_model, model_path in (
    (best_rf_model, '../trained_models/rf_model.pkl'),
    (best_rf_model_pca, '../trained_models/rf_model_pca.pkl'),
):
    joblib.dump(fitted_model, model_path)

# Store the winning grid-search parameters next to the models as JSON, for reference
for search, params_path in (
    (grid_search, '../trained_models/params/rf_model_params.json'),
    (grid_search_pca, '../trained_models/params/rf_model_pca_params.json'),
):
    with open(params_path, 'w') as file:
        json.dump(search.best_params_, file)

Random Forest Performance:
Accuracy: 0.9331
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Let's try using the unprocessed data

# Raw strings for the regex separator: '\s' in a plain string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
secom_data_raw = pd.read_csv('../dataset/secom.data', sep=r'\s+', header=None)
secom_labels_raw = pd.read_csv('../dataset/secom_labels.data', sep=r'\s+', header=None, usecols=[0], names=['Label'])

# Re-encode the labels in two steps; the order matters (1 -> 0 must run first,
# otherwise the freshly created 1s would be turned into 0s as well).
secom_labels_raw = secom_labels_raw.replace(1, 0)  # Change 1 to 0, fail = 0
secom_labels_raw = secom_labels_raw.replace(-1, 1)  # Change -1 to 1, pass = 1

# Fill the NaN with mean using the imputer
# NOTE(review): the imputer is fitted here on the raw data, not reused from the
# preprocessing that produced the training set -- confirm the columns still
# line up with what the model was trained on.
imputer = SimpleImputer()
secom_data_raw_filled = imputer.fit_transform(secom_data_raw)

# Let's check if our models are saved successfully
# NOTE(review): this rebinds `rf_model`, which the model-definition cell uses
# for the base RandomForestClassifier; re-run that cell before tuning again.
rf_model = joblib.load('../trained_models/rf_model.pkl')

# NOTE(review): presumably the training data was derived from this raw data,
# so these scores are not an independent test -- verify.
new_prediction = rf_model.predict(secom_data_raw_filled)

# The header previously said "SVM", but the loaded model is the Random Forest one.
print("Best Random Forest Performance:")
print(f"Accuracy: {accuracy_score(secom_labels_raw, new_prediction):.4f}")
print(f"Precision: {precision_score(secom_labels_raw, new_prediction, pos_label=1):.4f}")
print(f"Recall: {recall_score(secom_labels_raw, new_prediction, pos_label=1):.4f}")
print(f"F1-Score: {f1_score(secom_labels_raw, new_prediction, pos_label=1):.4f}")

# Keep the summary objects around under their own names
conf_matrix_new = confusion_matrix(secom_labels_raw, new_prediction)
f1_score_new = f1_score(secom_labels_raw, new_prediction)
report_new = classification_report(secom_labels_raw, new_prediction)

print("Confusion Matrix:")
print(conf_matrix_new)

print("\nClassification Report:")
print(report_new)

print("\n F1 Score:")
print(f1_score_new)