# SCADA ML paper

## Import needed libraries

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): this installs a blanket "ignore" filter, so warnings raised
# by later cells are hidden as well. Consider narrowing it to specific
# categories once the noisy warnings have been identified.
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: seaborn "whitegrid" theme and a 12x6 inch
# default figure size for figures that do not pass their own figsize.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)


## Data loading and preprocessing

In [2]:
print("="*70)
print("LOADING AND PREPROCESSING DATA")
print("="*70)

# Location of the SCADA CSV. Set the SCADA_DATA_PATH environment variable to
# run the notebook on another machine; the fallback is the original local path.
DATA_PATH = os.environ.get(
    "SCADA_DATA_PATH",
    r"C:\Users\risto\PycharmProjects\ids-scada-ml\data\scadadataset.csv",
)
df = pd.read_csv(DATA_PATH)

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
df.info()
print("\nClass distribution:")
print(df.iloc[:, -1].value_counts())

# Separate features and target: the last column is the label, everything
# before it is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split the data (80% train, 20% test). Stratifying on y keeps the class
# ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling: the scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

LOADING AND PREPROCESSING DATA
Dataset shape: (7037983, 7)

First few rows:
   Sport  TotPkts  TotBytes  SrcPkts  DstPkts  SrcBytes  Target
0    143        2       180        2        0       180       0
1     68        2       684        2        0       684       0
2      0        1        60        1        0        60       0
3  54949       10       628        4        6       248       0
4  54943        8       496        4        4       248       0

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7037983 entries, 0 to 7037982
Data columns (total 7 columns):
 #   Column    Dtype
---  ------    -----
 0   Sport     int64
 1   TotPkts   int64
 2   TotBytes  int64
 3   SrcPkts   int64
 4   DstPkts   int64
 5   SrcBytes  int64
 6   Target    int64
dtypes: int64(7)
memory usage: 375.9 MB
None

Class distribution:
Target
0    6634581
1     403402
Name: count, dtype: int64

Training set size: 5630386
Test set size: 1407597


## Helper functions

In [5]:
def calculate_metrics(y_true, y_pred):
    """Calculate ACC, FAR, and UND (all in percent) for binary predictions.

    Labels are expected to be encoded as 0 / 1, with 1 treated as the
    positive class (TP is read from the [1, 1] cell of the confusion matrix).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same encoding as ``y_true``.

    Returns
    -------
    dict
        ``'TN'``, ``'FP'``, ``'FN'``, ``'TP'`` as plain ints, plus
        ``'Accuracy'`` = (TP + TN) / total, ``'FAR'`` = FP / (FP + TN) and
        ``'UND'`` = FN / (FN + TP), each multiplied by 100. A ratio whose
        denominator is zero is reported as 0.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # split in which only one class occurs yields a 1x1 matrix and the
    # cm[0, 1] lookup raises IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Row-major order of a 2x2 matrix is TN, FP, FN, TP. Plain ints keep the
    # result printable with integer format specs and easy to serialize.
    TN, FP, FN, TP = (int(count) for count in cm.ravel())

    total = TP + TN + FP + FN
    ACC = ((TP + TN) / total) * 100 if total > 0 else 0

    FAR = (FP / (FP + TN)) * 100 if (FP + TN) > 0 else 0

    UND = (FN / (FN + TP)) * 100 if (FN + TP) > 0 else 0

    return {
        'TN': TN, 'FP': FP, 'FN': FN, 'TP': TP,
        'Accuracy': ACC, 'FAR': FAR, 'UND': UND
    }

def print_metrics(model_name, metrics):
    """Print one model's confusion-matrix counts and percentage metrics.

    ``metrics`` must provide the integer counts 'TN', 'FP', 'FN', 'TP' and the
    numeric percentages 'Accuracy', 'FAR', 'UND'.
    """
    print("\n" + f"{model_name} Results:")
    print("-" * 50)
    print("Confusion Matrix:")
    # One printed row per confusion-matrix row: negatives first, then positives.
    for left, right in (("TN", "FP"), ("FN", "TP")):
        print(f"  {left}: {metrics[left]:6d}  |  {right}: {metrics[right]:6d}")
    print("\nPerformance Metrics:")
    # Each label is paired with the key whose value is printed as a percentage.
    labelled_rates = (
        ("  Accuracy (ACC):        ", "Accuracy"),
        ("  False Alarm Rate (FAR): ", "FAR"),
        ("  Un-Detection Rate (UND): ", "UND"),
    )
    for label, key in labelled_rates:
        print(f"{label}{metrics[key]:.2f}%")

# Shared registry of per-model metrics, keyed by the model's display name.
# The model cells write into it and the comparison cell reads from it.
# Re-running this cell resets it to empty.
results = {}


## Logistic regression

In [6]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 70, "LOGISTIC REGRESSION", "=" * 70, sep="\n")

# Fit on the standardized training features, then predict the scaled test split.
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)

# Score the predictions, store them under the model's display name, and report.
lr_metrics = results['Logistic Regression'] = calculate_metrics(y_test, lr_pred)
print_metrics('Logistic Regression', lr_metrics)


LOGISTIC REGRESSION

Logistic Regression Results:
--------------------------------------------------
Confusion Matrix:
  TN: 1325420  |  FP:   1497
  FN:    382  |  TP:  80298

Performance Metrics:
  Accuracy (ACC):        99.87%
  False Alarm Rate (FAR): 0.11%
  Un-Detection Rate (UND): 0.47%


## Random forest

In [7]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 70, "RANDOM FOREST", "=" * 70, sep="\n")

# The forest is fit on the unscaled features (X_train / X_test).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Score the predictions, store them under the model's display name, and report.
rf_metrics = results['Random Forest'] = calculate_metrics(y_test, rf_pred)
print_metrics('Random Forest', rf_metrics)


RANDOM FOREST

Random Forest Results:
--------------------------------------------------
Confusion Matrix:
  TN: 1326917  |  FP:      0
  FN:      1  |  TP:  80679

Performance Metrics:
  Accuracy (ACC):        100.00%
  False Alarm Rate (FAR): 0.00%
  Un-Detection Rate (UND): 0.00%


In [7]:
# NOTE(review): this cell is an exact duplicate of the Random Forest cell
# directly before it (same code, same execution count). Running it trains the
# 100-tree forest a second time and overwrites results['Random Forest'] with
# a value computed the same way. Consider deleting one of the two cells.
print("\n" + "="*70)
print("RANDOM FOREST")
print("="*70)

# Trained and evaluated on the unscaled features (X_train / X_test).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Score the predictions, store them in the shared results dict, and report.
rf_metrics = calculate_metrics(y_test, rf_pred)
results['Random Forest'] = rf_metrics
print_metrics('Random Forest', rf_metrics)


RANDOM FOREST

Random Forest Results:
--------------------------------------------------
Confusion Matrix:
  TN: 1326917  |  FP:      0
  FN:      1  |  TP:  80679

Performance Metrics:
  Accuracy (ACC):        100.00%
  False Alarm Rate (FAR): 0.00%
  Un-Detection Rate (UND): 0.00%


## Naive Bayes

In [None]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 70, "NAIVE BAYES", "=" * 70, sep="\n")

# Gaussian naive Bayes, fit and evaluated on the standardized features.
nb_model = GaussianNB().fit(X_train_scaled, y_train)
nb_pred = nb_model.predict(X_test_scaled)

# Score the predictions, store them under the model's display name, and report.
nb_metrics = results['Naive Bayes'] = calculate_metrics(y_test, nb_pred)
print_metrics('Naive Bayes', nb_metrics)

## K-Nearest Neighbors (KNN)

In [None]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 70, "K-NEAREST NEIGHBORS (KNN)", "=" * 70, sep="\n")

# 5-nearest-neighbours classifier, fit and evaluated on the standardized features.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
knn_pred = knn_model.predict(X_test_scaled)

# Score the predictions, store them under the model's display name, and report.
knn_metrics = results['KNN'] = calculate_metrics(y_test, knn_pred)
print_metrics('KNN', knn_metrics)

## Decision Tree

In [None]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 70, "DECISION TREE", "=" * 70, sep="\n")

# Depth-capped tree, fit on the unscaled features (X_train / X_test).
dt_model = DecisionTreeClassifier(random_state=42, max_depth=20).fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# Score the predictions, store them under the model's display name, and report.
dt_metrics = results['Decision Tree'] = calculate_metrics(y_test, dt_pred)
print_metrics('Decision Tree', dt_metrics)

## Comparison results

In [None]:
print("\n" + "="*70)
print("GENERATING COMPARISON CHARTS")
print("="*70)

# Fail early with a readable message instead of an opaque max() error from
# the axis-limit computation further down.
if not results:
    raise ValueError("No model results to compare - run the model cells first.")

# Prepare data for visualization (one entry per model, in insertion order)
models = list(results.keys())
accuracy = [results[m]['Accuracy'] for m in models]
far = [results[m]['FAR'] for m in models]
und = [results[m]['UND'] for m in models]

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Model': models,
    'Accuracy (%)': accuracy,
    'FAR (%)': far,
    'UND (%)': und
})

print("\nSummary Table:")
print(comparison_df.to_string(index=False))

# One palette shared by every chart, so a given model keeps the same colour
# in all figures. Colours repeat if there are more models than entries.
MODEL_PALETTE = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#6A994E',
                 '#E63946', '#457B9D']
bar_colors = [MODEL_PALETTE[i % len(MODEL_PALETTE)] for i in range(len(models))]


def _headroom_limit(values, factor=1.2, fallback=1.0):
    """Return an upper y-limit with headroom above the tallest bar.

    Falls back to ``fallback`` when no value is positive, because a [0, 0]
    axis range would be degenerate.
    """
    peak = max(values)
    return peak * factor if peak > 0 else fallback


def _plot_metric_bars(values, ylabel, title, y_top):
    """Draw one annotated bar chart (one bar per model) in its own figure."""
    metric_fig, ax = plt.subplots(figsize=(12, 6))
    bars = ax.bar(models, values, color=bar_colors)
    ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylim([0, y_top])
    ax.grid(axis='y', alpha=0.3)

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.2f}%', ha='center', va='bottom', fontsize=10, fontweight='bold')

    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    metric_fig.tight_layout()
    plt.show()


# Stand-alone charts. Accuracy uses a fixed 0-105 axis; the two rate charts
# scale to their tallest bar.
_plot_metric_bars(accuracy, 'Accuracy (%)',
                  'Accuracy Comparison Across ML Algorithms', 105)
_plot_metric_bars(far, 'False Alarm Rate (%)',
                  'False Alarm Rate (FAR) Comparison Across ML Algorithms',
                  _headroom_limit(far))
_plot_metric_bars(und, 'Un-Detection Rate (%)',
                  'Un-Detection Rate (UND) Comparison Across ML Algorithms',
                  _headroom_limit(und))

# Combined 1x3 overview of the same three metrics (no value labels).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panels = (
    (accuracy, 'Accuracy (%)', 'Accuracy'),
    (far, 'False Alarm Rate (%)', 'False Alarm Rate (FAR)'),
    (und, 'Un-Detection Rate (%)', 'Un-Detection Rate (UND)'),
)
for ax, (values, ylabel, title) in zip(axes, panels):
    ax.bar(models, values, color=bar_colors)
    ax.set_ylabel(ylabel, fontweight='bold')
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)
# Only the accuracy panel gets a fixed axis range.
axes[0].set_ylim([0, 105])

plt.suptitle('Complete Performance Comparison Across ML Algorithms',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Heatmap of all metrics (rows) against models (columns).
# NOTE(review): the reversed colormap paints high values red. That suits FAR
# and UND (lower is better) but also colours high accuracy red - confirm
# this is intended, or plot accuracy separately.
plt.figure(figsize=(10, 6))
heatmap_data = comparison_df.set_index('Model')[['Accuracy (%)', 'FAR (%)', 'UND (%)']]
sns.heatmap(heatmap_data.T, annot=True, fmt='.2f', cmap='RdYlGn_r',
            cbar_kws={'label': 'Percentage (%)'}, linewidths=1)
plt.title('Performance Metrics Heatmap', fontsize=14, fontweight='bold')
plt.ylabel('Metrics', fontweight='bold')
plt.xlabel('ML Algorithms', fontweight='bold')
plt.tight_layout()
plt.show()

print("\n" + "="*70)
print("ANALYSIS COMPLETE!")
print("="*70)