In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Column names are supplied by hand because the data file is read as header-less.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
    'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack', 'level',
]

# header=None keeps the first line of the file as a data record; with the default
# header inference that record would be consumed as column names and then lost
# when the names are overwritten.
data = pd.read_csv('../input/nslkdd/KDDTrain+.txt', header=None, names=columns)

# 'level' is not used as a feature or target in this notebook.
data = data.drop('level', axis=1)
data.tail()

In [None]:
# header=None keeps the first line of the file as a data record; the default
# header inference would consume it as column names and silently drop it.
data_test = pd.read_csv('../input/nslkdd/KDDTest+.txt', header=None, names=columns)

# 'level' is not used as a feature or target in this notebook.
data_test = data_test.drop('level', axis=1)
data_test.tail()

In [None]:
# Show the distinct values of the 'attack' column in the training frame.
data['attack'].unique()

In [None]:
# Prints True if any cell of the training frame is null, False otherwise.
print(data.isnull().values.any())

## Labeling malicious attacks in one category for binary classification:

In [None]:
# Collapse the target to two classes in both frames: rows labelled "normal"
# stay 'normal', every other label becomes 'malicious'.
for frame in (data, data_test):
    frame["attack"] = np.where(frame["attack"] == "normal", 'normal', 'malicious')

In [None]:
# Pie chart of the class balance in the training frame.
# The former x-tick-label rotation was dropped: a pie chart has no x ticks, so
# the call had no visible effect.
plt.figure(figsize=(10, 10))
r = data.attack.value_counts().plot(
    kind='pie',
    explode=(0.1, 0),  # one offset per slice: relies on exactly two classes
    startangle=90,
    autopct='%1.1f%%',
)
plt.title('Attack variations')
plt.show()

In [None]:
# Column dtypes and non-null counts of the training frame.
data.info()

## Label Encoding:

In [None]:
# Encode the categorical columns as integers.
# Each column gets one encoder fitted on the union of the train and test values,
# so a given category maps to the same integer code in both frames. Fitting a
# separate encoder on the test frame can assign different codes to the same
# category whenever the two frames do not contain the same set of values.
for col in ('protocol_type', 'service', 'flag', 'attack'):
    le = LabelEncoder()
    le.fit(pd.concat([data[col], data_test[col]]))
    data[col] = le.transform(data[col])
    data_test[col] = le.transform(data_test[col])

In [None]:
# Evaluates to True if any value in the (now fully numeric) training frame is NaN or infinite.
False in np.isfinite(data).values

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column of the training frame.
data.describe()

In [None]:
def cross_val(x_train, y_train, model):
    """Return the mean 5-fold cross-validation score of `model`.

    Args:
        x_train: Feature matrix passed to `cross_val_score` as `X`.
        y_train: Target vector passed to `cross_val_score` as `y`.
        model: Estimator to evaluate.

    Returns:
        The mean of the five per-fold scores.
    """
    fold_scores = cross_val_score(model, x_train, y_train, cv=5)
    return fold_scores.mean()

def fit_and_evaluate(model, x_train , x_test , y_train , y_test):
    """Fit `model` on the training data and return its mean cross-validation score.

    Args:
        model: Estimator exposing `fit`; it is fitted in place.
        x_train: Training feature matrix.
        x_test: Unused; kept so existing call sites keep working.
        y_train: Training target vector.
        y_test: Unused; kept so existing call sites keep working.

    Returns:
        The mean 5-fold cross-validation score computed by `cross_val` on
        (`x_train`, `y_train`).
    """
    # The explicit fit is kept because callers use `model.predict(...)`
    # after this function returns.
    model.fit(x_train, y_train)

    # The previous version also predicted on x_test and discarded the result;
    # that dead computation has been removed.
    return cross_val(x_train, y_train, model)

def run_experiment(model, x_train , x_test , y_train , y_test):
    """Fit `model`, predict on `x_test` and print evaluation metrics.

    Prints R^2, MAE and RMSE of the predictions against `y_test`, followed by
    the scikit-learn classification report. Nothing is returned.

    Args:
        model: Estimator exposing `fit` and `predict`; it is fitted in place.
        x_train: Training feature matrix.
        x_test: Feature matrix to predict on.
        y_train: Training target vector.
        y_test: True targets for `x_test`.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # (label, metric) pairs, evaluated and printed one at a time in this order.
    metric_table = (
        ("R^2 : ", r2_score),
        ("MAE :", mean_absolute_error),
        ("RMSE:", lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
    )
    for label, metric in metric_table:
        print(label, metric(y_test, y_pred))

    print(classification_report(y_test, y_pred))

## Train Test split:

In [None]:
# Target is the 'attack' column; every other column is a feature.
y = data['attack'].values
x = data.drop(columns=['attack']).values

In [None]:
# Fit a min-max scaler on the training features and keep both the fitted
# scaler and the scaled matrix.
scaler = MinMaxScaler().fit(x)
x_scaled = scaler.transform(x)

In [None]:
# Split the *scaled* features. Splitting the raw `x` left `x_scaled` unused and
# trained the models on unscaled data while the held-out test set is scaled.
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=5)

In [None]:
# Shapes of the training and validation feature matrices produced by the split.
x_train.shape, x_test.shape

In [None]:
# Features / target of the separate test file.
xl_test = data_test.drop(['attack'], axis=1).values
yl_test = data_test['attack'].values

# Transform with the scaler that was fitted on the training features. Fitting a
# separate scaler on the test set would map the same raw value to different
# scaled values in the two sets.
xl_test = scaler.transform(xl_test)

## Logistic Regression:

In [None]:
# Baseline logistic regression on the train/validation split.
# This cell used `model` and the `*cpy` arrays before any cell defined them, so
# it raised NameError on a fresh kernel; it now builds on names defined above.
model = LogisticRegression()
model_cross = fit_and_evaluate(model, x_train, x_test, y_train, y_test)

In [None]:
# Work on copies so the shared split arrays cannot be modified by the model.
x_traincpy = np.copy(x_train)
y_traincpy = np.copy(y_train)
x_testcpy = np.copy(x_test)
y_testcpy = np.copy(y_test)

logr = LogisticRegression()
logr_cross = fit_and_evaluate(logr, x_traincpy, x_testcpy, y_traincpy, y_testcpy)

# Message typo fixed ('validatoin' -> 'validation').
print('Logistic Regression Performance on the validation set: Cross Validation Score = %0.4f' % logr_cross)

In [None]:
# Score the fitted logistic regression on copies of the separate test set.
xl_testcpy, yl_testcpy = np.copy(xl_test), np.copy(yl_test)

y_pred = logr.predict(xl_testcpy)
print("Accuracy on test dataset: ", accuracy_score(y_true=yl_testcpy, y_pred=y_pred))

In [None]:
# Confusion matrix of the latest predictions, with rows/columns ordered by the
# classes the logistic regression was fitted on.
cm = confusion_matrix(y_true=yl_testcpy, y_pred=y_pred, labels=logr.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=logr.classes_).plot()
plt.show()

## Random Forest:

In [None]:
# Work on copies so the shared split arrays cannot be modified by the model.
x_traincpy = np.copy(x_train)
y_traincpy = np.copy(y_train)
x_testcpy = np.copy(x_test)
y_testcpy = np.copy(y_test)

# random_state fixes the forest's randomness so the reported scores are
# reproducible across Restart & Run All.
random = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
random_cross = fit_and_evaluate(random, x_traincpy, x_testcpy, y_traincpy, y_testcpy)

print('Random Forest Performance on the validation set: Cross Validation Score = %0.4f' % random_cross)

In [None]:
# Score the fitted random forest on copies of the separate test set.
xl_testcpy, yl_testcpy = np.copy(xl_test), np.copy(yl_test)

y_pred = random.predict(xl_testcpy)
print("Accuracy on test: ", accuracy_score(y_true=yl_testcpy, y_pred=y_pred))

In [None]:
# Confusion matrix of the latest predictions, ordered by the random forest's classes.
cm = confusion_matrix(y_true=yl_testcpy, y_pred=y_pred, labels=random.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=random.classes_).plot()
plt.show()

## Decision Tree:

In [None]:
# Work on copies so the shared split arrays cannot be modified by the model.
x_traincpy = np.copy(x_train)
y_traincpy = np.copy(y_train)
x_testcpy = np.copy(x_test)
y_testcpy = np.copy(y_test)

# random_state makes the fitted tree, and therefore the reported scores,
# reproducible across Restart & Run All.
DT = DecisionTreeClassifier(random_state=42)
DT_cross = fit_and_evaluate(DT, x_traincpy, x_testcpy, y_traincpy, y_testcpy)

print('Decision Tree Performance on the validation set: Cross Validation Score = %0.4f' % DT_cross)

In [None]:
# Score the fitted decision tree on copies of the separate test set.
xl_testcpy, yl_testcpy = np.copy(xl_test), np.copy(yl_test)

y_pred = DT.predict(xl_testcpy)
print("Accuracy on test dataset: ", accuracy_score(y_true=yl_testcpy, y_pred=y_pred))

In [None]:
# Confusion matrix of the latest predictions, ordered by the decision tree's classes.
cm = confusion_matrix(y_true=yl_testcpy, y_pred=y_pred, labels=DT.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=DT.classes_).plot()
plt.show()

## KNN:

In [None]:
# Private copies of the train/validation split for this model.
x_traincpy, y_traincpy = np.copy(x_train), np.copy(y_train)
x_testcpy, y_testcpy = np.copy(x_test), np.copy(y_test)

# 5 nearest neighbours under the Minkowski metric with p = 2.
knn = KNeighborsClassifier(p=2, metric='minkowski', n_neighbors=5)
knn_cross = fit_and_evaluate(knn, x_traincpy, x_testcpy, y_traincpy, y_testcpy)

print('KNN Performance on the validation dataset: Cross Validation Score = %0.4f' % knn_cross)

In [None]:
# Score the fitted KNN model on copies of the separate test set.
xl_testcpy, yl_testcpy = np.copy(xl_test), np.copy(yl_test)

y_pred = knn.predict(xl_testcpy)
print("Accuracy on test dataset: ", accuracy_score(y_true=yl_testcpy, y_pred=y_pred))

In [None]:
# Confusion matrix of the latest predictions, ordered by the KNN model's classes.
cm = confusion_matrix(y_true=yl_testcpy, y_pred=y_pred, labels=knn.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=knn.classes_).plot()
plt.show()

## Outliers that may affect classification:

In [None]:
# One box per column of the training frame; labels are rotated so the many
# column names stay readable.
plt.figure(figsize=(25, 8))
u = sns.boxplot(data=data, palette='cool')
u.set_xticklabels(labels=u.get_xticklabels(), rotation=90);

In [None]:
# Interactive box plot of 'dst_bytes' in the training frame.
fig = px.box(data_frame=data, y='dst_bytes')
fig.show()

In [None]:
# Interactive box plot of 'src_bytes' in the training frame.
fig = px.box(data_frame=data, y='src_bytes')
fig.show()

## Removing outliers in the train dataset:

In [None]:
# Drop training rows whose byte counts exceed the chosen thresholds:
# first dst_bytes > 1500, then (on the remaining rows) src_bytes > 750.
data = data.drop(index=data.loc[data['dst_bytes'] > 1500].index)
data = data.drop(index=data.loc[data['src_bytes'] > 750].index)

## Outliers in test dataset:

In [None]:
# One box per column of the test frame; labels are rotated so the many column
# names stay readable.
plt.figure(figsize=(25, 8))
u = sns.boxplot(data=data_test, palette='cool')
u.set_xticklabels(labels=u.get_xticklabels(), rotation=90);

In [None]:
# Interactive box plot of 'dst_bytes' in the test frame.
fig = px.box(data_frame=data_test, y='dst_bytes')
fig.show()

In [None]:
# Interactive box plot of 'src_bytes' in the test frame.
fig = px.box(data_frame=data_test, y='src_bytes')
fig.show()

## Removing outliers in the test dataset:

In [None]:
# Drop test rows whose byte counts exceed the chosen thresholds:
# first dst_bytes > 1750, then (on the remaining rows) src_bytes > 717.
# NOTE(review): these cut-offs differ from the ones applied to the training
# frame, and filtering the test set changes what the reported accuracy
# measures - confirm this is intended.
data_test = data_test.drop(index=data_test.loc[data_test['dst_bytes'] > 1750].index)
data_test = data_test.drop(index=data_test.loc[data_test['src_bytes'] > 717].index)

## Train Test split after removing outliers:

In [None]:
# Features / target of the outlier-filtered training frame.
y_wo = data['attack'].values
x_wo = data.drop(columns=['attack']).values

In [None]:
# Scale the filtered training features in two stages: RobustScaler first, then
# MinMaxScaler on the robust-scaled values. Both fitted scalers are kept.
ro_scaler = RobustScaler()
scaler = MinMaxScaler()
x_wo_scaled = scaler.fit_transform(ro_scaler.fit_transform(x_wo))

In [None]:
# 80/20 train/validation split of the scaled, outlier-filtered data.
x_wo_train, x_wo_test, y_wo_train, y_wo_test = train_test_split(
    x_wo_scaled,
    y_wo,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Shapes of the training and validation feature matrices after outlier removal.
x_wo_train.shape, x_wo_test.shape

In [None]:
# Features / target of the outlier-filtered test frame.
xl_wo_test = data_test.drop(['attack'], axis=1).values
yl_wo_test = data_test['attack'].values

# Apply the same two scalers that were fitted on the training features
# (RobustScaler, then MinMaxScaler). Fitting a fresh scaler on the test set
# would put the test features on a different scale than the one the models
# were trained on.
xl_wo_test = scaler.transform(ro_scaler.transform(xl_wo_test))

## Logistic Regression after removing outliers:

In [None]:
# Work on copies so the shared split arrays cannot be modified by the model.
x_traincpy = np.copy(x_wo_train)
y_traincpy = np.copy(y_wo_train)
x_testcpy = np.copy(x_wo_test)
y_testcpy = np.copy(y_wo_test)

logr = LogisticRegression()
logr_cross = fit_and_evaluate(logr, x_traincpy, x_testcpy, y_traincpy, y_testcpy)

# The score is a cross-validation score on the training split, so the message
# says "validation set" (it previously said "test set").
print('Logistic Regression Performance on the validation set: Cross Validation Score = %0.4f' % logr_cross)

In [None]:
# Fit a fresh logistic regression on the training copies and print its metrics
# on copies of the outlier-filtered test set.
xl_testcpy, yl_testcpy = np.copy(xl_wo_test), np.copy(yl_wo_test)

model = LogisticRegression()
run_experiment(
    model=model,
    x_train=x_traincpy,
    x_test=xl_testcpy,
    y_train=y_traincpy,
    y_test=yl_testcpy,
)

In [None]:
# Accuracy of `model` on the test copies.
y_pred = model.predict(xl_testcpy)
print("Accuracy: ", accuracy_score(y_true=yl_testcpy, y_pred=y_pred))

In [None]:
# `y_pred` was produced by `model`, so take the class order from `model`
# rather than from the separately fitted `logr`.
cm = confusion_matrix(yl_testcpy, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()
plt.show()

## Random Forest after removing outliers:

In [None]:
# Work on copies so the shared split arrays cannot be modified by the model.
x_traincpy = np.copy(x_wo_train)
y_traincpy = np.copy(y_wo_train)
x_testcpy = np.copy(x_wo_test)
y_testcpy = np.copy(y_wo_test)

# random_state fixes the forest's randomness so the reported scores are
# reproducible across Restart & Run All.
random = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
random_cross = fit_and_evaluate(random, x_traincpy, x_testcpy, y_traincpy, y_testcpy)

# The score is a cross-validation score on the training split, so the message
# says "validation set" (it previously said "test set").
print('Random Forest Performance on the validation set: Cross Validation Score = %0.4f' % random_cross)

In [None]:
# Score the fitted random forest on copies of the outlier-filtered test set.
xl_testcpy, yl_testcpy = np.copy(xl_wo_test), np.copy(yl_wo_test)

y_pred = random.predict(xl_testcpy)
print("Accuracy: ", accuracy_score(y_true=yl_testcpy, y_pred=y_pred))

In [None]:
# Confusion matrix of the latest predictions, ordered by the random forest's classes.
cm = confusion_matrix(y_true=yl_testcpy, y_pred=y_pred, labels=random.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=random.classes_).plot()
plt.show()

## KNN after removing outliers:

In [None]:
# Private copies of the outlier-filtered train/validation split for this model.
x_traincpy, y_traincpy = np.copy(x_wo_train), np.copy(y_wo_train)
x_testcpy, y_testcpy = np.copy(x_wo_test), np.copy(y_wo_test)

# 5 nearest neighbours under the Minkowski metric with p = 2.
knn = KNeighborsClassifier(p=2, metric='minkowski', n_neighbors=5)
knn_cross = fit_and_evaluate(knn, x_traincpy, x_testcpy, y_traincpy, y_testcpy)

print('KNN Performance on the validation dataset: Cross Validation Score = %0.4f' % knn_cross)

In [None]:
# Score the fitted KNN model on copies of the outlier-filtered test set.
xl_testcpy, yl_testcpy = np.copy(xl_wo_test), np.copy(yl_wo_test)

y_pred = knn.predict(xl_testcpy)
print("Accuracy on test dataset: ", accuracy_score(y_true=yl_testcpy, y_pred=y_pred))

In [None]:
# Confusion matrix of the latest predictions, ordered by the KNN model's classes.
cm = confusion_matrix(y_true=yl_testcpy, y_pred=y_pred, labels=knn.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=knn.classes_).plot()
plt.show()