<a href="https://colab.research.google.com/github/aerospaceng25/astrodynamics/blob/master/BMUH565_BusraGUL_2307060010__ipynb_adl%C4%B1_not_defterinin_kopyas%C4%B1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import urllib.request

import numpy as np
import pandas as pd
from keras import callbacks
from keras.layers import Convolution1D, Dense, Dropout, Flatten, MaxPooling1D, Activation, Lambda
from keras.layers import LSTM, GRU, SimpleRNN
from keras.models import Sequential
from sklearn import metrics
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

np.random.seed(1337)  # for reproducibility

In [None]:
# Locations of the NSL-KDD training and testing splits on a GitHub mirror.
_NSL_KDD_BASE_URL = "https://raw.githubusercontent.com/HoaNP/NSL-KDD-DataSet/master"
train_url = f"{_NSL_KDD_BASE_URL}/KDDTrain+.txt"
test_url = f"{_NSL_KDD_BASE_URL}/KDDTest+.txt"

In [None]:
# Names for the 43 columns of the NSL-KDD files: 41 features followed by
# the 'attack' label and the 'level' score. The files carry no header row,
# so the names are attached after loading.
columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack', 'level',
]

# Fetch both splits straight from their URLs.
print("Downloading and loading datasets...")
traindata = pd.read_csv(train_url, header=None)
testdata = pd.read_csv(test_url, header=None)

# Give both frames the same column labels.
for frame in (traindata, testdata):
    frame.columns = columns

Downloading and loading datasets...


In [None]:
# Report the size of each split.
print("Training Data Loaded. Shape:", traindata.shape)
print("Test Data Loaded. Shape:", testdata.shape)

# A notebook cell renders only its last expression automatically, so the
# training preview has to be displayed explicitly; otherwise it is computed
# and silently thrown away. `display` is provided by IPython/Jupyter.
display(traindata.head())
testdata.head()

Training Data Loaded. Shape: (125973, 43)
Test Data Loaded. Shape: (22544, 43)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint,15
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan,11


In [None]:
# NOTE: this cell rewrites `traindata`; run it once per fresh load of the data.

# 2. Data Cleaning
# Remove 'level' (difficulty score) as it is not a feature used for detection
traindata = traindata.drop(columns='level')

# 3. Label Encoding (Binary Classification)
# Transform 'normal' to 0 and any attack type to 1
traindata['attack'] = (traindata['attack'] != 'normal').astype(int)

# 4. Categorical Encoding (One-Hot Encoding)
# 'protocol_type', 'service', and 'flag' are non-numeric
traindata = pd.get_dummies(traindata, columns=['protocol_type', 'service', 'flag'])

# 5. Normalization (Min-Max Scaling)
# Identify numerical columns (exclude the target 'attack'). The one-hot
# columns are not int64/int32/float64, so they are left untouched.
numeric_cols = traindata.select_dtypes(include=['float64', 'int64', 'int32']).columns.tolist()
if 'attack' in numeric_cols:
    numeric_cols.remove('attack')

scaler = MinMaxScaler()
traindata[numeric_cols] = scaler.fit_transform(traindata[numeric_cols])

# Result Summary
print("Preprocessing Complete!")
print(f"New Shape: {traindata.shape}")
print(f"Normal (0) vs Attack (1) counts:\n{traindata['attack'].value_counts()}")

# 6. Feature / label split
# Select columns by name rather than by position. After get_dummies the
# 'attack' label sits in the middle of the frame, so a positional slice such
# as iloc[:, 1:42] puts the label into the features and iloc[:, 0] returns
# 'duration' instead of the label.
protocol_cols = [c for c in traindata.columns if c.startswith('protocol_type_')]
feature_cols = numeric_cols + protocol_cols
X = traindata[feature_cols].astype(float)
Y = traindata['attack']

# The network defined later is built with input_shape=(41, 1).
assert X.shape[1] == 41, f"expected 41 feature columns, got {X.shape[1]}"
assert 'attack' not in X.columns, "label column leaked into the features"

traindata.head()

Preprocessing Complete!
New Shape: (125973, 123)
Normal (0) vs Attack (1) counts:
attack
0    67343
1    58630
Name: count, dtype: int64


In [None]:
# NOTE: this cell rewrites `testdata`; run it once per fresh load of the data,
# and only after the training preprocessing cell, because it reuses that
# cell's fitted `scaler` and its `numeric_cols` list.

# 2. Data Cleaning
# Remove 'level' (difficulty score) as it is not a feature used for detection
testdata = testdata.drop(columns='level')

# 3. Label Encoding (Binary Classification)
# Transform 'normal' to 0 and any attack type to 1
testdata['attack'] = (testdata['attack'] != 'normal').astype(int)

# 4. Categorical Encoding (One-Hot Encoding)
# 'protocol_type', 'service', and 'flag' are non-numeric
testdata = pd.get_dummies(testdata, columns=['protocol_type', 'service', 'flag'])

# 5. Normalization (Min-Max Scaling)
# Apply the scaler that was fit on the training data. Fitting a new scaler
# on the test set would map the same raw value to different scaled values in
# train and test, and would use test-set statistics during preprocessing.
testdata[numeric_cols] = scaler.transform(testdata[numeric_cols])

# Result Summary
print("Preprocessing Complete!")
print(f"New Shape: {testdata.shape}")
print(f"Normal (0) vs Attack (1) counts:\n{testdata['attack'].value_counts()}")

# 6. Feature / label split
# Select columns by name rather than by position. After get_dummies the
# 'attack' label sits in the middle of the frame, so a positional slice such
# as iloc[:, 1:42] puts the label into the features and iloc[:, 0] returns
# 'duration' instead of the label.
protocol_cols = [c for c in testdata.columns if c.startswith('protocol_type_')]
feature_cols = numeric_cols + protocol_cols
C = testdata['attack']
T = testdata[feature_cols].astype(float)

# The network defined later is built with input_shape=(41, 1).
assert T.shape[1] == 41, f"expected 41 feature columns, got {T.shape[1]}"
assert 'attack' not in T.columns, "label column leaked into the features"

testdata.head()

Preprocessing Complete!
New Shape: (22544, 117)
Normal (0) vs Attack (1) counts:
attack
1    12833
0     9711
Name: count, dtype: int64


In [None]:
# Pass each feature matrix through its own sklearn Normalizer.
# `scaler` is rebound for each split, exactly as before.
scaler = Normalizer()
trainX = scaler.fit_transform(X)

scaler = Normalizer()
testT = scaler.fit_transform(T)

In [None]:
# Labels as plain NumPy arrays.
y_train = np.array(Y)
y_test = np.array(C)

# Add a trailing axis of length 1 so each 2-D matrix becomes
# (rows, columns, 1), the 3-D layout the Conv1D/LSTM stack is fed with.
X_train = trainX[:, :, np.newaxis]
X_test = testT[:, :, np.newaxis]

In [None]:
# Number of units in the LSTM layer.
lstm_output_size = 70

# Conv1D -> max-pool -> LSTM -> dropout -> single sigmoid output.
# The layers are listed in the order they are stacked.
cnn = Sequential([
    Convolution1D(64, 3, activation="relu", input_shape=(41, 1)),
    MaxPooling1D(pool_size=2),
    LSTM(lstm_output_size),
    Dropout(0.1),
    Dense(1, activation="sigmoid"),
])

In [None]:
# Training hyper-parameters; raise EPOCHS for a longer run.
BATCH_SIZE = 128
EPOCHS = 5

cnn.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# Train on the training split before scoring. Without this step the
# evaluation below would measure a network that still has its initial weights.
cnn.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS)

# Score the trained model on the held-out test split.
loss, accuracy = cnn.evaluate(X_test, y_test)
print("\nLoss: %.2f, Accuracy: %.2f%%" % (loss, accuracy*100))

[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.8446 - loss: 0.6728

Loss: 0.67, Accuracy: 84.36%


In [None]:
# Probability cut-off for calling a sample an attack (class 1).
THRESHOLD = 0.5

y_pred_proba = cnn.predict(X_test)
# The model ends in Dense(1), so predictions come back as one column per
# sample; flatten to 1-D so they line up with the label vector.
y_pred = (y_pred_proba > THRESHOLD).astype(int).ravel()

# The scores below compare hard 0/1 predictions against ground truth, so
# y_test has to hold the binary 'attack' labels. Fail early with an
# actionable message if a continuous feature column was used instead.
if not np.isin(y_test, (0, 1)).all():
    raise ValueError(
        "y_test must contain binary labels (0 = normal, 1 = attack); "
        "build it from the 'attack' column, not from a feature column."
    )

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred , average="binary")
precision = precision_score(y_test, y_pred , average="binary")
f1 = f1_score(y_test, y_pred, average="binary")

[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step


ValueError: Classification metrics can't handle a mix of continuous and binary targets

In [None]:
#np.savetxt('res/expected1.txt', y_test, fmt='%01d')
#np.savetxt('res/predicted1.txt', y_pred, fmt='%01d')

In [None]:
# Summarise the test-set results. Uses y_test, y_pred and the accuracy /
# recall / precision / f1 values computed in the previous cell, plus the
# `sklearn.metrics` module imported at the top of the notebook.
cm = metrics.confusion_matrix(y_test, y_pred)

print("confusion matrix")
print(cm)  # print the matrix under its header
print("----------------------------------------------")
print("accuracy")
print("%.6f" %accuracy)
print("recall")
print("%.6f" %recall)
print("precision")
print("%.6f" %precision)
print("f1score")
print("%.6f" %f1)
print("==============================================")