In [None]:
# Link to EPIC_nonDL.ipynb

# https://colab.research.google.com/drive/1TvsNvHjSHeT4NOynNcGrpAQkK2UfW_dU?usp=sharing

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, f1_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the raw CSV, discard its first two columns, then remove every row that
# still contains a missing value.
df = (
    pd.read_csv('dataset_EPICA_raw.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[:2]))
    .dropna()
)

# Target vector is the 'status' column; all remaining columns are features.
y = df['status']
X = df.drop(columns='status')

# Standardization of the feature matrix.
# NOTE(review): the scaler is fit on ALL rows here, i.e. before the later cells
# split the data into train and test sets, so test-row statistics contribute to
# the scaling of X_scaled.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Show the cleaned frame as the cell output.
df

Unnamed: 0,TotW,TotVar,A.phsA,Phv.neut.ang,modbus_speed,scada_q2c_sync_activated,q2c_in_sync,gen1_p_negative,gen2_p_negative,status
0,0.000000,0.000000,0.000,0.000000,-7487.0,0,0,0,0,Normal
1,0.000000,0.000000,0.000,0.000000,-7487.0,0,0,0,0,Normal
2,0.000000,0.000000,0.000,0.000000,-7487.0,0,0,0,0,Normal
3,0.000000,0.000000,0.000,0.000000,-7487.0,0,0,0,0,Normal
4,0.000000,0.000000,0.000,0.000000,-7487.0,0,0,0,0,Normal
...,...,...,...,...,...,...,...,...,...,...
22354,814.100525,514.805420,1.350,-0.164185,-7488.0,1,1,1,0,Normal
22355,787.395569,489.114624,1.275,-0.118637,-7489.0,1,1,1,0,Normal
22356,718.387878,536.451843,1.225,-0.127289,-7488.0,1,1,1,0,Normal
22357,827.416687,468.494415,1.275,-0.180862,-7488.0,1,1,1,0,Normal


# Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter candidates explored by the grid search.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10]
}

# Test-set scores of every trial, keyed by metric name.
results = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

# Three trials; the trial index seeds both the 70/30 split and the tree itself.
for i in range(3):
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.3, random_state=i
    )

    base_model = DecisionTreeClassifier(random_state=i)

    # 5-fold cross-validated search on the training part, ranked by macro F1.
    grid_search = GridSearchCV(base_model, param_grid, cv=5, scoring='f1_macro')
    grid_search.fit(X_train, y_train)

    # Evaluate the refit best candidate on the held-out part.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Compute the four scores (macro-averaged where applicable) and record them.
    trial_scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='macro'),
        'recall': recall_score(y_test, y_pred, average='macro'),
        'f1': f1_score(y_test, y_pred, average='macro'),
    }
    for metric_name, metric_value in trial_scores.items():
        results[metric_name].append(metric_value)

    # Report the winning hyperparameters of this trial.
    print(f"Trial {i+1} - Best Parameters: {grid_search.best_params_}")

# Mean of each metric over the trials.
print("Average Evaluation Results:")
for metric_label, metric_name in (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1 Score", 'f1'),
):
    print(f"{metric_label}: ", np.mean(results[metric_name]))

Trial 1 - Best Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10}
Trial 2 - Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 10}
Trial 3 - Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 10}
Average Evaluation Results:
Accuracy:  0.9074330208493416
Precision:  0.8903569527124753
Recall:  0.889582159812622
F1 Score:  0.8898203419911308


# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Logistic regression: 3 trials of (70/30 split -> 5-fold grid search on the
# training part -> evaluation of the best candidate on the held-out part).
#
# Leakage fix: X_scaled was standardized with statistics of the FULL dataset,
# so test rows (and CV validation folds) influenced the features. The
# classifier is therefore wrapped in a Pipeline whose StandardScaler is refit
# on training rows only (per CV fold during the search, and on the whole
# training part for the final refit). Standardization is invariant to a prior
# per-feature affine rescaling, so re-standardizing X_scaled gives the same
# features as scaling the raw data with training-only statistics.

# Per-trial test-set scores.
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Hyperparameter candidates for the classifier.
param_grid = {
    'solver': ['liblinear', 'lbfgs'],
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2']
}
# Same grid, addressed to the 'clf' step of the pipeline (<step>__<param>).
pipeline_grid = {f'clf__{name}': values for name, values in param_grid.items()}

# Run 3 trials; the trial index seeds the train/test split.
for i in range(3):
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=i)

    # Initialize Logistic Regression model and put a train-only scaler in front of it.
    base_model = LogisticRegression(max_iter=1000)
    pipeline = Pipeline([('scale', StandardScaler()), ('clf', base_model)])

    grid_search = GridSearchCV(pipeline, pipeline_grid, cv=5, scoring='f1_macro')
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the whole pipeline refit on X_train, so predict()
    # scales X_test with training statistics before classifying.
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)

    accuracy_list.append(accuracy_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred, average='macro'))
    recall_list.append(recall_score(y_test, y_pred, average='macro'))
    f1_list.append(f1_score(y_test, y_pred, average='macro'))

    # Show best parameters from this trial (step prefix stripped so the
    # report keeps the plain hyperparameter names).
    best_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
    print(f"Trial {i+1} - Best Parameters: {best_params}")

print("Average Evaluation Results:")
print("Average Accuracy: ", np.mean(accuracy_list))
print("Average Precision: ", np.mean(precision_list))
print("Average Recall: ", np.mean(recall_list))
print("Average F1 Score: ", np.mean(f1_list))

Trial 1 - Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Trial 2 - Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Trial 3 - Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Average Evaluation Results:
Average Accuracy:  0.749449310998412
Average Precision:  0.7305983213693007
Average Recall:  0.6301331307536264
Average F1 Score:  0.6370469546498434


# SVM

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# SVM: 3 trials of (70/30 split -> 5-fold grid search on the training part ->
# evaluation of the best candidate on the held-out part).
#
# Leakage fix: X_scaled was standardized with statistics of the FULL dataset,
# so test rows (and CV validation folds) influenced the features. The
# classifier is therefore wrapped in a Pipeline whose StandardScaler is refit
# on training rows only. Standardization is invariant to a prior per-feature
# affine rescaling, so re-standardizing X_scaled gives the same features as
# scaling the raw data with training-only statistics.

# Per-trial test-set scores.
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# 'gamma' is only used by the rbf/poly/sigmoid kernels; crossing it with the
# linear kernel fits every linear candidate twice for identical results.
# The grid is therefore split so that gamma is searched only where it matters.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]
# Same grids, addressed to the 'clf' step of the pipeline (<step>__<param>).
pipeline_grid = [
    {f'clf__{name}': values for name, values in grid.items()}
    for grid in param_grid
]

# Run 3 trials; the trial index seeds the train/test split.
for i in range(3):
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=i)

    # Base SVC model behind a train-only scaler.
    base_model = SVC()
    pipeline = Pipeline([('scale', StandardScaler()), ('clf', base_model)])

    # GridSearchCV
    grid_search = GridSearchCV(pipeline, pipeline_grid, cv=5, scoring='f1_macro')
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the whole pipeline refit on X_train, so predict()
    # scales X_test with training statistics before classifying.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Append metrics
    accuracy_list.append(accuracy_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred, average='macro'))
    recall_list.append(recall_score(y_test, y_pred, average='macro'))
    f1_list.append(f1_score(y_test, y_pred, average='macro'))

    # Show best parameters from this trial (step prefix stripped; a winning
    # linear kernel reports no 'gamma' because it was not searched for it).
    best_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
    print(f"Trial {i+1} - Best Parameters: {best_params}")

print("Average Evaluation Results:")
print("Average Accuracy: ", np.mean(accuracy_list))
print("Average Precision: ", np.mean(precision_list))
print("Average Recall: ", np.mean(recall_list))
print("Average F1 Score: ", np.mean(f1_list))

Trial 1 - Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Trial 2 - Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Trial 3 - Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Average Evaluation Results:
Average Accuracy:  0.8363301060396496
Average Precision:  0.805883500832462
Average Recall:  0.827480636990746
Average F1 Score:  0.8112044379228468


# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# KNN: 3 trials of (70/30 split -> 5-fold grid search on the training part ->
# evaluation of the best candidate on the held-out part).
#
# Leakage fix: X_scaled was standardized with statistics of the FULL dataset,
# so test rows (and CV validation folds) influenced the distances. The
# classifier is therefore wrapped in a Pipeline whose StandardScaler is refit
# on training rows only. Standardization is invariant to a prior per-feature
# affine rescaling, so re-standardizing X_scaled gives the same features as
# scaling the raw data with training-only statistics.

# Initialize result lists
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Define parameter grid for GridSearchCV.
# 'minkowski' is intentionally not listed: with the estimator's default p=2 it
# is the Euclidean distance, so it only duplicated every 'euclidean' candidate
# (ties resolve to the first, i.e. the 'euclidean', candidate anyway).
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
# Same grid, addressed to the 'clf' step of the pipeline (<step>__<param>).
pipeline_grid = {f'clf__{name}': values for name, values in param_grid.items()}

# Perform 3 trials; the trial index seeds the train/test split.
for i in range(3):
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=i)

    # Base model behind a train-only scaler.
    knn_base = KNeighborsClassifier()
    pipeline = Pipeline([('scale', StandardScaler()), ('clf', knn_base)])

    # GridSearchCV
    grid_search = GridSearchCV(pipeline, pipeline_grid, cv=5, scoring='f1_macro')
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the whole pipeline refit on X_train, so predict()
    # scales X_test with training statistics before classifying.
    best_knn = grid_search.best_estimator_
    y_pred = best_knn.predict(X_test)

    # Append metrics
    accuracy_list.append(accuracy_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred, average='macro'))
    recall_list.append(recall_score(y_test, y_pred, average='macro'))
    f1_list.append(f1_score(y_test, y_pred, average='macro'))

    # Show best parameters from this trial (step prefix stripped so the
    # report keeps the plain hyperparameter names).
    best_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
    print(f"Trial {i+1} - Best Parameters: {best_params}")

# Print average results
print("Average Accuracy: ", np.mean(accuracy_list))
print("Average Precision: ", np.mean(precision_list))
print("Average Recall: ", np.mean(recall_list))
print("Average F1 Score: ", np.mean(f1_list))

Trial 1 - Best Parameters: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
Trial 2 - Best Parameters: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
Trial 3 - Best Parameters: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
Average Accuracy:  0.9081501972235029
Average Precision:  0.8891301041322307
Average Recall:  0.8961184332549035
Average F1 Score:  0.8923569607884297
