In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib


In [2]:
# Load your unbalanced dataset
data = pd.read_csv('dataset.csv')

# Display the first 5 rows
data.head()


Unnamed: 0,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,Tos,class,attackType,attackID,attackDescription
0,81412.697,TCP,EXT_SERVER,8082,OPENSTACK_NET,56978.0,3057,2.1 M,1,.AP...,0,normal,---,---,---
1,81412.697,TCP,OPENSTACK_NET,56978,EXT_SERVER,8082.0,4748,2.5 M,1,.AP...,0,normal,---,---,---
2,81504.787,TCP,EXT_SERVER,8082,OPENSTACK_NET,56979.0,8639,9.1 M,1,.AP...,0,normal,---,---,---
3,81504.787,TCP,OPENSTACK_NET,56979,EXT_SERVER,8082.0,12024,10.3 M,1,.AP...,0,normal,---,---,---
4,82100.692,TCP,EXT_SERVER,8082,OPENSTACK_NET,51649.0,11012,27.2 M,1,.AP.S.,0,normal,---,---,---


In [3]:
# Separate features and labels
X = data.drop('class', axis=1)  # Replace 'class' with the actual target column name
y = data['class']

# Show dataset shape
print(f"Features Shape: {X.shape}, Labels Shape: {y.shape}")


Features Shape: (15520, 14), Labels Shape: (15520,)


In [4]:
# Identify categorical and numerical columns
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(include=['float64']).columns

print("Categorical Columns:", categorical_columns)
print("Numerical Columns:", numerical_columns)


Categorical Columns: Index(['Proto', 'Src IP Addr', 'Dst IP Addr', 'Bytes', 'Flags', 'attackType',
       'attackID', 'attackDescription'],
      dtype='object')
Numerical Columns: Index(['Duration', 'Dst Pt'], dtype='object')


In [5]:
# Label Encode categorical columns
for col in categorical_columns:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])

# Standardize numerical columns
scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

# Display processed data (first 5 rows)
X.head()


Unnamed: 0,Duration,Proto,Src IP Addr,Src Pt,Dst IP Addr,Dst Pt,Packets,Bytes,Flows,Flags,Tos,attackType,attackID,attackDescription
0,20.551292,2,1160,8082,1160,1.53991,3057,2,1,14,0,0,0,0
1,20.551292,2,1161,56978,1159,-0.544165,4748,4,1,14,0,0,0,0
2,20.574624,2,1160,8082,1160,1.539952,8639,8,1,14,0,0,0,0
3,20.574624,2,1161,56979,1159,-0.544165,12024,10,1,14,0,0,0,0
4,20.725605,2,1160,8082,1160,1.312774,11012,12,1,15,0,0,0,0


In [6]:
# Initialize SMOTE and Tomek Links
smote = SMOTE(sampling_strategy='minority')
tomek = TomekLinks(sampling_strategy='all')
smt = SMOTETomek(smote=smote, tomek=tomek)

# Resample the data
X_resampled, y_resampled = smt.fit_resample(X, y)

# Display new shape of resampled data
print(f"Resampled Features Shape: {X_resampled.shape}, Resampled Labels Shape: {y_resampled.shape}")


Resampled Features Shape: (15510, 14), Resampled Labels Shape: (15510,)


In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

print(f"Training Data Shape: {X_train.shape}, Test Data Shape: {X_test.shape}")


Training Data Shape: (12408, 14), Test Data Shape: (3102, 14)


In [8]:
# Initialize and train the SVM classifier
svm_classifier = SVC(kernel='linear', C=1)
svm_classifier.fit(X_train[0:500], y_train[0:500])  # Training on a subset to avoid overfitting

print("Model training complete.")


Model training complete.


In [9]:
# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.961960025789813
Classification Report:
              precision    recall  f1-score   support

      normal       0.98      0.99      0.98      1047
  suspicious       0.97      0.94      0.96      1062
     unknown       0.93      0.95      0.94       993

    accuracy                           0.96      3102
   macro avg       0.96      0.96      0.96      3102
weighted avg       0.96      0.96      0.96      3102

Confusion Matrix:
[[1036    0   11]
 [   0 1003   59]
 [  21   27  945]]
