In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import warnings

# Seed reused by every stochastic step in this notebook (split, resampling, searches).
random_seed = 33

# Suppress FutureWarning messages so the cell outputs stay readable.
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the semicolon-separated baseline dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
dataset_csv = "/home/b.cassoli@PTW.Maschinenbau.TU-Darmstadt.de/projects/phd/phd/cip_dmd/notebooks/cip_dmgd_n5/baselines/dataset_baselines_5.csv"
data = pd.read_csv(dataset_csv, sep=";")

In [None]:
# Prepare the train/test matrices used by every model cell below.
# X holds the features (all columns but the last); y holds the labels
# (0 or 1) taken from the last column.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Hold out 20% for testing. The split is stratified on y so that train and
# test keep the same class ratio: the classes are imbalanced (hence the
# oversampling below), and an unstratified split can leave the test set with
# a distorted share of the minority class.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_seed, stratify=y)

# Balance the classes by random oversampling. Only the training portion is
# resampled; the test set keeps its original class distribution.
# NOTE(review): the resampled set contains duplicated rows, so any later
# cross-validation on X_train can place copies of the same sample in both the
# fitting and validation folds, which makes CV scores optimistic. Consider
# moving the oversampler into an imblearn Pipeline - TODO confirm.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=random_seed)
X_train_resampled, y_train = oversampler.fit_resample(X_train_raw, y_train_raw)

# Standardize features. The scaler is fitted on the training data only and
# then applied unchanged to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test_raw)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

# Logistic regression baseline: randomized hyperparameter search with 3-fold
# cross-validation on the training set, then evaluation on the test set.

# Seed the estimator so the fit is reproducible: according to the
# scikit-learn documentation the 'liblinear' and 'saga' solvers shuffle the data.
model = LogisticRegression(random_state=random_seed)

# Candidate hyperparameter values for the random search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],  # Inverse of regularization strength
    'penalty': ['l1', 'l2'],  # Regularization penalty
    'solver': ['liblinear', 'saga']  # Solver for logistic regression
}

# Create a random search cross-validation object
random_search = RandomizedSearchCV(
    model, param_distributions=param_grid, n_iter=50, cv=3, n_jobs=-1, random_state=random_seed)

# Fit the model using random search and cross-validation
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Best configuration, refitted on the full training set by the search
best_model = random_search.best_estimator_

# Hard class predictions for the threshold-based metrics
y_pred = best_model.predict(X_test)

# Continuous scores for ROC AUC. ROC AUC measures how well the model ranks
# positives above negatives, so it must be computed from scores; feeding it
# 0/1 predictions collapses the curve to a single operating point.
# Column 1 of predict_proba is the probability of classes_[1], i.e. label 1.
y_score = best_model.predict_proba(X_test)[:, 1]

# Report metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
mcc = matthews_corrcoef(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)
print("MCC:", mcc)

In [None]:
# Random Forest
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest
from scipy.stats import randint  # Use randint for integer parameters
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

# Random forest baseline: randomized hyperparameter search with 3-fold
# cross-validation on the training set, then evaluation on the test set.

# Seed the forest: bootstrapping and feature sub-sampling are random, so an
# unseeded model gives different results on every run.
model = RandomForestClassifier(random_state=random_seed)

# Candidate hyperparameter values for the random search
param_dist = {
    'n_estimators': [50, 100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 4, 5],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],  # Minimum number of samples required to be at a leaf node
}

# Create a random search cross-validation object
random_search = RandomizedSearchCV(
    model, param_distributions=param_dist, n_iter=50, cv=3, n_jobs=-1, random_state=random_seed)

# Fit the model using random search and cross-validation
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Best configuration, refitted on the full training set by the search
best_model = random_search.best_estimator_

# Hard class predictions for the threshold-based metrics
y_pred = best_model.predict(X_test)

# Continuous scores for ROC AUC. ROC AUC measures how well the model ranks
# positives above negatives, so it must be computed from scores; feeding it
# 0/1 predictions collapses the curve to a single operating point.
# Column 1 of predict_proba is the probability of classes_[1], i.e. label 1.
y_score = best_model.predict_proba(X_test)[:, 1]

# Report metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
mcc = matthews_corrcoef(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)
print("MCC:", mcc)

In [None]:
# SVM
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC  # Import Support Vector Machine
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

# Support vector machine baseline: randomized hyperparameter search with
# 3-fold cross-validation on the training set, then evaluation on the test set.
model = SVC()

# Candidate hyperparameter values for the random search
param_dist = {
    'C': [0.1, 1.0, 100],  # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
    'degree': [2, 3, 4],  # Degree of the polynomial kernel (only for 'poly' kernel)
    'gamma': ['scale', 'auto'],  # Kernel coefficient setting
}

# Create a random search cross-validation object
random_search = RandomizedSearchCV(
    model, param_distributions=param_dist, n_iter=50, cv=3, n_jobs=-1, random_state=random_seed)

# Fit the model using random search and cross-validation
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Best configuration, refitted on the full training set by the search
best_model = random_search.best_estimator_

# Hard class predictions for the threshold-based metrics
y_pred = best_model.predict(X_test)

# Continuous scores for ROC AUC. ROC AUC measures how well the model ranks
# positives above negatives, so it must be computed from scores; feeding it
# 0/1 predictions collapses the curve to a single operating point.
# SVC is created without probability=True, so predict_proba is unavailable;
# the signed distance to the decision boundary is used as the score instead.
y_score = best_model.decision_function(X_test)

# Report metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
mcc = matthews_corrcoef(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)
print("MCC:", mcc)

In [None]:
# k-Nearest Neighbors
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier  # Import k-Nearest Neighbors
from scipy.stats import randint
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

# k-nearest-neighbors baseline: randomized hyperparameter search with 3-fold
# cross-validation on the training set, then evaluation on the test set.
model = KNeighborsClassifier()

# Candidate hyperparameter values for the random search
param_dist = {
    'n_neighbors': [3, 5, 7],  # Number of neighbors to consider
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
    'p': [1, 2],  # Power parameter for the Minkowski distance metric
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']  # Algorithm used to compute nearest neighbors
}

# Create a random search cross-validation object
random_search = RandomizedSearchCV(
    model, param_distributions=param_dist, n_iter=50, cv=3, n_jobs=-1, random_state=random_seed)

# Fit the model using random search and cross-validation
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Best configuration, refitted on the full training set by the search
best_model = random_search.best_estimator_

# Hard class predictions for the threshold-based metrics
y_pred = best_model.predict(X_test)

# Continuous scores for ROC AUC. ROC AUC measures how well the model ranks
# positives above negatives, so it must be computed from scores; feeding it
# 0/1 predictions collapses the curve to a single operating point.
# Column 1 of predict_proba is the probability of classes_[1], i.e. label 1.
y_score = best_model.predict_proba(X_test)[:, 1]

# Report metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
mcc = matthews_corrcoef(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)
print("MCC:", mcc)

In [None]:
# eXtreme Gradient Boosting
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier  # Import XGBoost
from scipy.stats import uniform, randint
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

# XGBoost baseline: randomized hyperparameter search with 3-fold
# cross-validation on the training set, then evaluation on the test set.

# Pin the seed explicitly so this baseline uses the notebook-wide random_seed.
model = XGBClassifier(random_state=random_seed)

# Candidate hyperparameter values for the random search
param_dist = {
    'n_estimators': [50, 100, 150, 200],  # Number of boosting rounds
    'learning_rate': [0.05, 0.1, 0.2],  # Step size shrinkage to prevent overfitting
    'max_depth': [2, 3, 4, 5],  # Maximum depth of a tree
}

# Create a random search cross-validation object
random_search = RandomizedSearchCV(
    model, param_distributions=param_dist, n_iter=50, cv=3, n_jobs=-1, random_state=random_seed)

# Fit the model using random search and cross-validation
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Best configuration, refitted on the full training set by the search
best_model = random_search.best_estimator_

# Hard class predictions for the threshold-based metrics
y_pred = best_model.predict(X_test)

# Continuous scores for ROC AUC. ROC AUC measures how well the model ranks
# positives above negatives, so it must be computed from scores; feeding it
# 0/1 predictions collapses the curve to a single operating point.
# Column 1 of predict_proba is the probability of the positive class (label 1).
y_score = best_model.predict_proba(X_test)[:, 1]

# Report metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
mcc = matthews_corrcoef(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)
print("MCC:", mcc)

In [None]:
# MLP
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier  # Import MLPClassifier
from scipy.stats import randint
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

# Multi-layer perceptron baseline: randomized hyperparameter search with
# 3-fold cross-validation on the training set, then evaluation on the test set.

# Seed the network: weight initialization and batch sampling are random, so
# an unseeded model gives different results on every run.
model = MLPClassifier(random_state=random_seed)

# Candidate hyperparameter values for the random search
param_dist = {
    'hidden_layer_sizes': [(50,), (100,)],  # Number of units in hidden layers
    'activation': ['relu', 'tanh'],  # Activation function for hidden layers
    'alpha': [0.0001, 0.001, 0.01],  # L2 regularization parameter
}

# Create a random search cross-validation object
random_search = RandomizedSearchCV(
    model, param_distributions=param_dist, n_iter=50, cv=3, n_jobs=-1, random_state=random_seed)

# Fit the model using random search and cross-validation
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best Hyperparameters:", random_search.best_params_)

# Best configuration, refitted on the full training set by the search
best_model = random_search.best_estimator_

# Hard class predictions for the threshold-based metrics
y_pred = best_model.predict(X_test)

# Continuous scores for ROC AUC. ROC AUC measures how well the model ranks
# positives above negatives, so it must be computed from scores; feeding it
# 0/1 predictions collapses the curve to a single operating point.
# Column 1 of predict_proba is the probability of classes_[1], i.e. label 1.
y_score = best_model.predict_proba(X_test)[:, 1]

# Report metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
mcc = matthews_corrcoef(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)
print("MCC:", mcc)