In [1]:
# !pip install numpy pandas scikit-learn tensorflow snort

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load NSL-KDD dataset
url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
columns = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", 
           "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in", 
           "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", 
           "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", 
           "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", 
           "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", 
           "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", 
           "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", 
           "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", 
           "dst_host_srv_rerror_rate", "label"]

data = pd.read_csv(url, names=columns)
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,tcp,ftp_data,SF,491,0,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
0,udp,other,SF,146,0,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
0,tcp,private,S0,0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
0,tcp,http,SF,232,8153,0,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
0,tcp,http,SF,199,420,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


3. Feature Engineering

In [3]:
# Convert categorical features to numeric using one-hot encoding
data = pd.get_dummies(data, columns=["protocol_type", "service", "flag"])

# Convert labels to binary: 'normal' as 0 and others as 1
data['label'] = data['label'].apply(lambda x: 0 if x == 'normal' else 1)

# Separate features and labels
X = data.drop(columns=['label'])
y = data['label']

# Identify non-numeric columns
non_numeric_cols = X.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", non_numeric_cols)

# Drop non-numeric columns if necessary
X = X.drop(columns=non_numeric_cols)

# Fill missing values only for numeric columns
X.fillna(X.select_dtypes(include=['number']).mean(), inplace=True)

Non-numeric columns: Index(['duration', 'dst_host_srv_rerror_rate', 'protocol_type_IRC',
       'protocol_type_X11', 'protocol_type_Z39_50', 'protocol_type_aol',
       'protocol_type_auth', 'protocol_type_bgp', 'protocol_type_courier',
       'protocol_type_csnet_ns',
       ...
       'flag_18828976', 'flag_21945520', 'flag_24418776', 'flag_89581520',
       'flag_217277339', 'flag_381709090', 'flag_621568663', 'flag_693375640',
       'flag_1167519497', 'flag_1379963888'],
      dtype='object', length=3424)


4. Behavior Modeling: Anomaly-Based Detection

In [4]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate model
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00     37792

    accuracy                           1.00     37792
   macro avg       1.00      1.00      1.00     37792
weighted avg       1.00      1.00      1.00     37792



5. Behavior Modeling: Signature-Based Detection

In [5]:
def signature_based_detection(X):
    # A simple rule-based system: If 'src_bytes' is very high, flag as intrusion
    return np.where(X['src_bytes'] > 100000, 1, 0)

# Apply signature-based detection
sid_predictions = signature_based_detection(X_test)

6. Model Integration (USAID)

In [7]:
# Combine SID and AID predictions
combined_predictions = np.where((sid_predictions == 1) | (y_pred == 1), 1, 0)

# Evaluate the combined model
print("Combined Model Accuracy:", accuracy_score(y_test, combined_predictions))
print(classification_report(y_test, combined_predictions))

Combined Model Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     37792

    accuracy                           1.00     37792
   macro avg       1.00      1.00      1.00     37792
weighted avg       1.00      1.00      1.00     37792



one for all

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load NSL-KDD dataset
url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
columns = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", 
           "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in", 
           "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", 
           "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", 
           "is_guest_login", "count", "srv_count", "serror_rate", "srv_serror_rate", 
           "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", 
           "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", 
           "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", 
           "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", 
           "dst_host_srv_rerror_rate", "label"]

data = pd.read_csv(url, names=columns)
data.head()

# Convert categorical features to numeric using one-hot encoding
data = pd.get_dummies(data, columns=["protocol_type", "service", "flag"])

# Convert labels to binary: 'normal' as 0 and others as 1
data['label'] = data['label'].apply(lambda x: 0 if x == 'normal' else 1)

# Separate features and labels
X = data.drop(columns=['label'])
y = data['label']

# Identify non-numeric columns
non_numeric_cols = X.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", non_numeric_cols)

# Drop non-numeric columns if necessary
X = X.drop(columns=non_numeric_cols)

# Fill missing values only for numeric columns
X.fillna(X.select_dtypes(include=['number']).mean(), inplace=True)

# 4. Behavior Modeling: Anomaly-Based Detection
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate model
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

# 5. Behavior Modeling: Signature-Based Detection
def signature_based_detection(X):
    # A simple rule-based system: If 'src_bytes' is very high, flag as intrusion
    return np.where(X['src_bytes'] > 100000, 1, 0)

# Apply signature-based detection
sid_predictions = signature_based_detection(X_test)

# 6. Model Integration (USAID)
# Combine SID and AID predictions
combined_predictions = np.where((sid_predictions == 1) | (y_pred == 1), 1, 0)

# Evaluate the combined model
print("Combined Model Accuracy:", accuracy_score(y_test, combined_predictions))
print(classification_report(y_test, combined_predictions))


Non-numeric columns: Index(['duration', 'dst_host_srv_rerror_rate', 'protocol_type_IRC',
       'protocol_type_X11', 'protocol_type_Z39_50', 'protocol_type_aol',
       'protocol_type_auth', 'protocol_type_bgp', 'protocol_type_courier',
       'protocol_type_csnet_ns',
       ...
       'flag_18828976', 'flag_21945520', 'flag_24418776', 'flag_89581520',
       'flag_217277339', 'flag_381709090', 'flag_621568663', 'flag_693375640',
       'flag_1167519497', 'flag_1379963888'],
      dtype='object', length=3424)
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     37792

    accuracy                           1.00     37792
   macro avg       1.00      1.00      1.00     37792
weighted avg       1.00      1.00      1.00     37792

Combined Model Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     37792

    accuracy                           1.00     37792
   macro avg      