In [7]:
import json
import os
import pickle

import joblib
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, make_scorer, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

In [2]:
# Load the three pickled inputs: the SMOTE-d feature table, its PCA
# counterpart, and the matching label object.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


secom_data = _read_pickle('../dataset/secom_data_smoted.pkl')
secom_data_pca = _read_pickle('../dataset/secom_pca_optimal.pkl')
# .squeeze() drops length-1 axes from the stored label object
# (presumably a one-column frame becoming a Series — confirm against the
# notebook that produced the pickle).
secom_labels = _read_pickle('../dataset/secom_labels_smoted.pkl').squeeze()

# Quick look at what was loaded: head of both feature tables, then the labels.
for preview in (secom_data.head(), secom_data_pca.head(), secom_labels):
    print(preview)

       0        1          2          3       4      5         6       7    \
0  3031.79  2479.69  2186.9889   949.2201  1.2981  100.0  103.3322  0.1219   
1  3049.97  2441.12  2263.3222  1383.8334  2.3986  100.0   85.1778  0.1196   
2  2870.76  2583.56  2192.1889  1435.9611  2.3870  100.0  107.3989  0.1229   
3  2996.90  2448.07  2162.7556  1041.1557  0.8479  100.0  107.2622  0.1221   
4  2943.83  2445.17  2236.0667  1680.1825  1.4834  100.0   98.6889  0.1221   

      8       9    ...       580         581     582     583     584     585  \
0  1.4636  0.0009  ...  0.010500   70.061800  0.5035  0.0085  0.0024  1.6818   
1  1.3792  0.0054  ...  0.003300   39.552800  0.5053  0.0159  0.0038  3.1456   
2  1.4719 -0.0003  ...  0.006300  193.828600  0.5010  0.0127  0.0037  2.5344   
3  1.4446 -0.0016  ...  0.005372   98.442837  0.5039  0.0169  0.0046  3.3497   
4  1.4674  0.0161  ...  0.003900   49.945400  0.5001  0.0190  0.0043  3.7940   

      586     587     588       589  
0  0.0484  0

In [3]:
# Hold-out split: 30 % of the rows are reserved for testing.
#
# stratify=secom_labels keeps the class ratio of the full label vector in both
# the train and the test part, matching the StratifiedKFold used for
# cross-validation later in the notebook.
#
# Both calls use the same labels, test_size, random_state and stratification,
# so they should select the same rows; the PCA and non-PCA models are then
# scored on the same hold-out samples.

# Train-test split (full feature set)
X_train, X_test, y_train, y_test = train_test_split(
    secom_data,
    secom_labels,
    test_size=0.3,
    random_state=42,
    stratify=secom_labels,
)

# Train-test split (PCA features)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    secom_data_pca,
    secom_labels,
    test_size=0.3,
    random_state=42,
    stratify=secom_labels,
)

In [20]:
## Model definition

# Base model: a linear classifier fitted with stochastic gradient descent.
# loss='log_loss' makes it a logistic-regression model.
base_sgd_log = SGDClassifier(loss='log_loss', max_iter=1000, random_state=42)

# Wrap the logistic regression model with Bagging
bagging_sgd_log = BaggingClassifier(estimator=base_sgd_log, random_state=42)

# Use StratifiedKFold to maintain the proportion of classes in each fold
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model-selection metric: F1-score of class 1.
# The hold-out evaluation cells report precision / recall / F1 with
# pos_label=1, so the grid search must optimise the same positive class.
# The previous pos_label=0 tuned for the opposite class's F1.
f1_scorer = make_scorer(f1_score, pos_label=1)

# Perform Grid search for best parameters

# Define grid parameters. The 'estimator__' prefix routes each value to the
# SGDClassifier wrapped inside the BaggingClassifier.
# NOTE(review): per the scikit-learn docs eta0 is only used by the 'constant',
# 'invscaling' and 'adaptive' schedules, so the three eta0 values are
# redundant for learning_rate='optimal'. Consider a list of two grids to skip
# those duplicate fits.
param_grid = {
    'estimator__alpha': [0.0001, 0.001, 0.01],  # Regularization strength
    'estimator__penalty': ['l2', 'l1', 'elasticnet'],  # Regularization type
    'estimator__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'estimator__eta0': [0.01, 0.1, 1.0]  # Initial learning rate
}

In [21]:
# --- Stage 1: tune and evaluate on the full SMOTE-d feature set ---

# Exhaustive search over `param_grid`, scored with `f1_scorer` on every
# `stratified_kfold` split. verbose=2 emits one "[CV] END ..." line per fit.
grid_search = GridSearchCV(
    estimator=bagging_sgd_log,
    param_grid=param_grid,
    cv=stratified_kfold,
    scoring=f1_scorer,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and keep the estimator fitted with it.
print(f"Best parameters found: {grid_search.best_params_}")
best_sgd_model = grid_search.best_estimator_

# Hold-out predictions from the tuned ensemble.
sgd_pred_best = best_sgd_model.predict(X_test)

# Headline metrics; class 1 is treated as the positive class.
print("Best Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, sgd_pred_best):.4f}")
print(f"Precision: {precision_score(y_test, sgd_pred_best, pos_label=1):.4f}")
print(f"Recall: {recall_score(y_test, sgd_pred_best, pos_label=1):.4f}")
print(f"F1-Score: {f1_score(y_test, sgd_pred_best, pos_label=1):.4f}")

# Detailed breakdown, kept in named variables so other cells can reuse them.
conf_matrix, f1_score_smoted, report = (
    confusion_matrix(y_test, sgd_pred_best),
    f1_score(y_test, sgd_pred_best),
    classification_report(y_test, sgd_pred_best),
)

for heading, payload in (
    ("Confusion Matrix:", conf_matrix),
    ("\nClassification Report:", report),
    ("\n F1 Score:", f1_score_smoted),
):
    print(heading)
    print(payload)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   1.0s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   0.7s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   0.7s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   0.6s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   0.7s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l1; total time=   1.7s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l1; total time=   1.3s
[CV] E

In [22]:
# --- Stage 2: same tuning and evaluation procedure on the PCA features ---

# Same estimator, grid, CV splitter and scorer as the non-PCA search; only the
# training data differs.
grid_search_pca = GridSearchCV(
    estimator=bagging_sgd_log,
    param_grid=param_grid,
    cv=stratified_kfold,
    scoring=f1_scorer,
    verbose=2,
)
grid_search_pca.fit(X_train_pca, y_train_pca)

# Report the winning combination and keep the estimator fitted with it.
print(f"Best parameters found: {grid_search_pca.best_params_}")
best_sgd_model_pca = grid_search_pca.best_estimator_

# Hold-out predictions from the tuned ensemble.
sgd_pred_best_pca = best_sgd_model_pca.predict(X_test_pca)

# Headline metrics; class 1 is treated as the positive class.
print("Best Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test_pca, sgd_pred_best_pca):.4f}")
print(f"Precision: {precision_score(y_test_pca, sgd_pred_best_pca, pos_label=1):.4f}")
print(f"Recall: {recall_score(y_test_pca, sgd_pred_best_pca, pos_label=1):.4f}")
print(f"F1-Score: {f1_score(y_test_pca, sgd_pred_best_pca, pos_label=1):.4f}")

# Detailed breakdown, kept in named variables so other cells can reuse them.
conf_matrix_pca, f1_score_pca, report_pca = (
    confusion_matrix(y_test_pca, sgd_pred_best_pca),
    f1_score(y_test_pca, sgd_pred_best_pca),
    classification_report(y_test_pca, sgd_pred_best_pca),
)

for heading, payload in (
    ("Confusion Matrix:", conf_matrix_pca),
    ("\nClassification Report:", report_pca),
    ("\n F1 Score:", f1_score_pca),
):
    print(heading)
    print(payload)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   0.4s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   0.2s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   0.1s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   0.1s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   0.2s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l1; total time=   0.3s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l1; total time=   0.4s
[CV] E



[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=l1; total time=   2.6s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   1.1s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.7s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   1.0s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   1.0s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   1.0s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.01, estimator__learning_rate=invscaling, estimator__penalty=l2; total time=   0.3s
[CV] END estimator__alpha=0.0001,



[CV] END estimator__alpha=0.0001, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=l1; total time=   2.7s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   1.3s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.7s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.9s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   1.0s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   1.0s
[CV] END estimator__alpha=0.0001, estimator__eta0=0.1, estimator__learning_rate=invscaling, estimator__penalty=l2; total time=   0.3s
[CV] END estimator__alpha=0.0001, estima



[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=l1; total time=   2.7s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   1.2s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.7s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.9s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   1.0s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   1.0s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=invscaling, estimator__penalty=l2; total time=   0.5s
[CV] END estimator__alpha=0.0001, estima



[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=l1; total time=   7.2s




[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=l1; total time=   7.6s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=l1; total time=   1.3s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=elasticnet; total time=   2.8s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=elasticnet; total time=   2.6s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=elasticnet; total time=   4.0s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=elasticnet; total time=   4.5s
[CV] END estimator__alpha=0.0001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=elasticnet; total time=   3.0s
[CV] END estimator__alpha=0.001, est



[CV] END estimator__alpha=0.001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=l1; total time=   7.8s
[CV] END estimator__alpha=0.001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=elasticnet; total time=   2.2s
[CV] END estimator__alpha=0.001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=elasticnet; total time=   2.0s
[CV] END estimator__alpha=0.001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=elasticnet; total time=   2.1s
[CV] END estimator__alpha=0.001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=elasticnet; total time=   2.1s
[CV] END estimator__alpha=0.001, estimator__eta0=1.0, estimator__learning_rate=adaptive, estimator__penalty=elasticnet; total time=   2.3s
[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=constant, estimator__penalty=l2; total time=   0.3s
[CV] END estimator__alpha=0.01, estimator__



[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=l1; total time=  10.3s
[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=l1; total time=   8.1s
[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=l1; total time=  10.0s




[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=l1; total time=  11.5s
[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=l1; total time=  13.0s
[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.5s
[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.6s
[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.6s
[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.4s
[CV] END estimator__alpha=0.01, estimator__eta0=0.01, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.5s
[CV] END estimator__alpha=0.01, estimator__eta0=0.



[CV] END estimator__alpha=0.01, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=l1; total time=   9.7s
[CV] END estimator__alpha=0.01, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=l1; total time=   8.0s
[CV] END estimator__alpha=0.01, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=l1; total time=   9.7s




[CV] END estimator__alpha=0.01, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=l1; total time=  11.1s
[CV] END estimator__alpha=0.01, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=l1; total time=  13.0s
[CV] END estimator__alpha=0.01, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.5s
[CV] END estimator__alpha=0.01, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.5s
[CV] END estimator__alpha=0.01, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.6s
[CV] END estimator__alpha=0.01, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.4s
[CV] END estimator__alpha=0.01, estimator__eta0=0.1, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.5s
[CV] END estimator__alpha=0.01, estimator__eta0=0.1, esti



[CV] END estimator__alpha=0.01, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=l1; total time=   9.7s
[CV] END estimator__alpha=0.01, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=l1; total time=   8.1s
[CV] END estimator__alpha=0.01, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=l1; total time=   9.8s




[CV] END estimator__alpha=0.01, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=l1; total time=  11.1s
[CV] END estimator__alpha=0.01, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=l1; total time=  12.8s
[CV] END estimator__alpha=0.01, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.5s
[CV] END estimator__alpha=0.01, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.6s
[CV] END estimator__alpha=0.01, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.6s
[CV] END estimator__alpha=0.01, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.6s
[CV] END estimator__alpha=0.01, estimator__eta0=1.0, estimator__learning_rate=optimal, estimator__penalty=elasticnet; total time=   0.5s
[CV] END estimator__alpha=0.01, estimator__eta0=1.0, esti

In [23]:
# Persist both tuned models and the hyper-parameters that produced them.

# Create the output folders up front. Creating the nested 'params' directory
# also creates its parent. Without this, a missing folder raises
# FileNotFoundError only after the long grid searches have finished.
os.makedirs('../trained_models/params', exist_ok=True)

# Save the models (joblib serialisation, stored with a .pkl extension)
joblib.dump(best_sgd_model, '../trained_models/sgd_model.pkl')
joblib.dump(best_sgd_model_pca, '../trained_models/sgd_model_pca.pkl')

# Also save best grid parameters as JSON for references
with open('../trained_models/params/sgd_model_params.json', 'w') as file:
    json.dump(grid_search.best_params_, file)

with open('../trained_models/params/sgd_model_pca_params.json', 'w') as file:
    json.dump(grid_search_pca.best_params_, file)