In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import roc_curve, roc_auc_score


import warnings

%matplotlib inline

# Notebook-wide plotting defaults. The style sheet is applied first so the
# explicit figure size below is not overwritten by it.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'figure.figsize': [8, 4]})

# Hide FutureWarning messages so cell outputs stay readable.
warnings.filterwarnings(action="ignore", category=FutureWarning)
%config InlineBackend.figure_format = 'retina'

In [None]:
# Read the semicolon-delimited CSV and discard the 'Unnamed: 0' column in one
# chained expression (raises KeyError if that column is absent).
dataset = (
    pd.read_csv("UNSW_2018_IoT_Botnet_Final_10_Best.csv", sep=';')
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Preview the first 10 rows of the freshly loaded frame.
dataset.head(10)

In [None]:
# Column names, dtypes and non-null counts right after loading.
dataset.info()

In [None]:
# Remove the 'pkSeqID' column (presumably a row identifier with no predictive
# value -- confirm against the dataset description).
dataset = dataset.drop(columns=['pkSeqID'])

# Show the frame after the column has been removed.
dataset.head()

In [None]:
# Re-check the schema after dropping 'pkSeqID'.
dataset.info()

In [None]:
# Visualise missing values: one heatmap cell per (row, column) entry of the
# null mask. The final row of the frame is excluded, as in the slice below.
fig, ax = plt.subplots(figsize=(14, 4))
ax.set_title('Dataset Features with Null Value')
null_mask = dataset.isnull().iloc[:-1]
sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='hot', linecolor='green', ax=ax)

In [None]:
# Number of COLUMNS containing at least one missing value
# (not the number of missing cells).
dataset.isna().any().sum()

In [None]:
# Distribution of the 'mean' feature.
# sns.displot is a figure-level function: it always creates its own figure, so
# a preceding plt.figure(figsize=...) only leaves an empty figure behind and
# its size is ignored. The 14x4 inch size is requested through height/aspect
# instead (width = height * aspect).
sns.set_style('whitegrid')
sns.displot(dataset['mean'], kde=False, color='darkred', bins=20, height=4, aspect=3.5)

In [None]:
# Schema check before encoding the non-numeric columns.
dataset.info()

In [None]:
# Columns stored with the generic 'object' dtype.
is_object_col = dataset.dtypes == 'object'
characters = dataset.columns[is_object_col]
characters

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to turn into integer codes.
cols_to_convert = [
    'proto', 'saddr', 'sport', 'daddr',
    'dport', 'category', 'subcategory',
]

# A single encoder is refit for every column, so after the loop `le` only
# holds the mapping of the last column processed.
le = LabelEncoder()
for col in cols_to_convert:
    # Cast to str first so mixed-type columns are encoded uniformly.
    as_text = dataset[col].astype(str)
    dataset[col] = le.fit_transform(as_text)

# `real_data` is another name for the same (now encoded) frame, not a copy.
real_data = dataset

In [None]:
# real_data.head()

In [None]:
# Confirm the dtypes after label encoding.
real_data.info()

In [None]:
# Reorder the columns so that the target 'attack' is the last one; later cells
# split features/target positionally.
arra = real_data.columns.tolist()
arra.remove('attack')
arra.append('attack')
real_data = real_data[arra]
real_data.head()

In [None]:
# Number of columns in the encoded frame that still contain a missing value.
real_data.isnull().any().sum()

In [None]:
# Class balance of the target: one bar per 'attack' value, which makes the
# imbalance of the dataset visible.
plt.figure(figsize=(10,4))
sns.set_style('whitegrid')
sns.countplot(x='attack',data=real_data,palette='Set2_r')

In [None]:
# Histogram of the 'mean' feature restricted to rows labelled as attacks.
fig, ax = plt.subplots(figsize=(10, 6))
attack_mean = real_data.loc[real_data['attack'] == 1, 'mean']
attack_mean.hist(ax=ax, alpha=0.5, color='red', bins=30,
                 label='Fraudulent vs Mean Value=True')
ax.legend()
ax.set_xlabel('Bad network with Mean')

In [None]:
# Count the rows of each class. Note the naming: `neg` holds the attack rows
# (attack == 1) and `pos` the normal rows (attack == 0).
attack_flag = real_data["attack"]
neg = len(real_data[attack_flag == 1])
pos = len(real_data[attack_flag == 0])

print(f"Non-Fraudulent = {pos}")
print(f"Fraudulent = {neg}")
print(f"Ratio of non-Fraudulent networks against Fraudulent networks = {(pos / neg) * 100:.2f}%")

In [None]:
# List any columns that are still stored as 'object' after encoding.
still_object = real_data.dtypes == 'object'
data = real_data.columns[still_object]
data

In [None]:
# Safety net: remove any row that still contains a missing value.
# DataFrame.dropna returns a NEW frame, so the result must be assigned back;
# calling it without assignment leaves `real_data` unchanged.
real_data = real_data.dropna(axis=0)
real_data.head(4)

In [None]:
# Positional split: every column except the last is a feature, the last
# column is the target.
features_frame = real_data.iloc[:, :-1]
target_series = real_data.iloc[:, -1]

X = features_frame.to_numpy()
y = target_series.to_numpy()
y

In [None]:
# Estimate the mutual information between every feature and the target.
# The estimator adds small random noise internally, so a fixed random_state is
# required for the scores (and the threshold-based selection that depends on
# them) to be reproducible across runs.
importance = mutual_info_classif(X, y, random_state=1)

# Label each score with its feature name (all columns except the trailing target).
feat_importance = pd.Series(importance, real_data.columns[:-1])

# Bar chart of the scores.
plt.figure(figsize=(20, 5))
feat_importance.plot(kind='bar', color='r')
plt.title('Feature Selection Using Mutual Information Algorithm')
plt.xticks(rotation=90)
plt.ylabel('Importance')
plt.xlabel('Available features')
plt.legend(['Importance to dependent variable'], loc='upper right')
plt.show()

In [None]:
# Mutual-information score per feature, indexed by feature name.
feature_names = real_data.columns[:-1]
feat_importance = pd.Series(importance, feature_names)

# Minimum score a feature needs in order to be kept.
threshold = 0.05

# Despite its name, `selected_features` lists the features that score BELOW
# the threshold, i.e. the ones that are removed.
low_score_mask = feat_importance < threshold
selected_features = feat_importance[low_score_mask].index.tolist()

# Frame without the low-scoring features (the target column is untouched).
new_data = real_data.drop(columns=selected_features)

In [None]:
# Preview the frame that remains after feature filtering.
new_data.head()

In [None]:
# Build the model inputs.
# 'category' and 'subcategory' name the attack type in the Bot-IoT schema, so
# they are derived from the label itself; keeping them as features leaks the
# target and makes every downstream metric meaningless. They are removed here
# together with the target (errors='ignore' tolerates the case where the
# feature filter already dropped them).
LEAKY_LABEL_COLUMNS = ['category', 'subcategory']
X = new_data.drop(columns=['attack', *LEAKY_LABEL_COLUMNS], errors='ignore')
y = new_data['attack']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The target is heavily imbalanced, so
# the split is stratified on `y` to keep the class ratio identical in both
# partitions (an unstratified split can leave the test set with very few
# minority-class rows).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the mean/variance on the training split only, then apply the same
# transformation to both splits so no test statistics reach the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Balance the classes with RandomOverSampler by duplicating minority-class
# rows. Only the training split is resampled; the test split keeps its natural
# class ratio. A fixed random_state makes the duplicated rows -- and therefore
# the trained models -- reproducible between runs.
rand_oversampler = RandomOverSampler(random_state=1)
X_train, y_train = rand_oversampler.fit_resample(X_train, y_train)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.svm import LinearSVC

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# Fit a linear SVM with L2 regularisation on the resampled training data
# (fit returns the estimator itself), then predict labels for the test split.
svm = LinearSVC(penalty='l2', max_iter=10000).fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [None]:
# Predict on the test split and report per-class metrics plus overall accuracy.
y_pred = svm.predict(X_test)

svm_report = classification_report(y_test, y_pred)
print(svm_report)
print(f'SVM Accuracy based on validation Dataset {100*accuracy_score(y_test,y_pred):.2f}%')

In [None]:
# Confusion matrix of the SVM on the test split (rows: actual, columns: predicted).
fig, ax = plt.subplots(figsize=(10, 5))
svm_cm = confusion_matrix(y_test, y_pred)
sns.heatmap(svm_cm, annot=True, fmt='g', ax=ax)
ax.set_ylabel('Actual Value')
ax.set_xlabel('Predicted Value')
ax.set_title('SVM Confusion Matrix')

In [None]:
# Stop training once the validation loss has not improved for 5 consecutive
# epochs; verbose=1 prints a message when that happens.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

In [None]:
# Fully connected binary classifier: six identical ReLU hidden layers followed
# by a single sigmoid unit that outputs a score in (0, 1).
N_HIDDEN_LAYERS = 6
HIDDEN_UNITS = 19

model = Sequential()
for _ in range(N_HIDDEN_LAYERS):
    model.add(Dense(HIDDEN_UNITS, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# Binary cross-entropy is the training loss; the metrics are only tracked for reporting.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'mse', 'mae'],
)

In [None]:
# Train for at most 100 epochs in mini-batches of 64; the early-stopping
# callback may end training sooner.
# NOTE(review): the held-out test split is passed as validation data, so early
# stopping is tuned on the same rows that are later used for the final
# metrics -- consider carving a separate validation split.
model.fit(x=X_train,y=y_train,
          validation_data=(X_test,y_test),
          callbacks=[early_stop],
          batch_size=64,epochs=100)

### Evaluation

In [None]:
# Per-epoch training/validation metrics recorded by Keras, one column per metric.
model_loss = pd.DataFrame(model.history.history)

# The figure size has to be passed to DataFrame.plot: calling
# plt.plot(figsize=...) with no data draws nothing and never resizes the figure.
ax = model_loss.plot(figsize=(8, 5))
ax.grid(True)
ax.set_ylim(0, 1)  # every metric plotted here is expected to lie in [0, 1]
plt.show()

In [None]:
# Threshold the network outputs at 0.5 to obtain hard 0/1 class labels.
pred= (model.predict(X_test) > 0.5).astype("int32")

In [None]:
# Per-class precision/recall/F1 of the network on the test split.
print(classification_report(y_test,pred))

In [None]:
from sklearn.metrics import accuracy_score

# Overall accuracy of the network on the test split (fraction in [0, 1]).
accuracy_score(y_test, pred)

In [None]:
# Confusion matrix of the network on the test split (rows: actual, columns: predicted).
fig, ax = plt.subplots(figsize=(10, 5))
dnn_cm = confusion_matrix(y_test, pred)
sns.heatmap(dnn_cm, annot=True, fmt='g', ax=ax)
ax.set_ylabel('Actual Value')
ax.set_xlabel('Predicted Value')
ax.set_title('DNN Confusion Matrix')

In [None]:
# Continuous scores for the ROC analysis.
# LinearSVC exposes no public predict_proba. The signed margin returned by
# decision_function ranks the samples in the same order as the sigmoid-squashed
# value from the private `_predict_proba_lr` helper, so the ROC curve is the
# same -- without depending on a private API and without the ties a saturated
# sigmoid can introduce at large margins.
svm_scores = svm.decision_function(X_test)
# Keras returns shape (n_samples, 1); flatten to the 1-D vector sklearn expects.
pred_prob1 = model.predict(X_test).ravel()

# ROC curves of both models (thresholds of the SVM are margins, not probabilities).
fpr, tpr, thresh = roc_curve(y_test, svm_scores, pos_label=1)
fpr1, tpr1, thresh1 = roc_curve(y_test, pred_prob1, pos_label=1)

# No-skill baseline (tpr = fpr): a constant score for every sample. It is the
# same diagonal for both models, so it is computed once.
no_skill_scores = np.zeros(len(y_test))
p_fpr, p_tpr, _ = roc_curve(y_test, no_skill_scores, pos_label=1)

# Area under each ROC curve.
auc_score = roc_auc_score(y_test, svm_scores)
auc_score1 = roc_auc_score(y_test, pred_prob1)


print('AUC Score of the developed model\n')
print(f'The AUC score for the SVM is :{auc_score}:\nThe AUC score for the ANN is :{auc_score1}\n')

In [None]:
# Overlay the ROC curves of both classifiers on one square figure.
fig, ax = plt.subplots(figsize=(10, 10))

model_curves = [
    (fpr, tpr, 'orange', 'SVM Model'),
    (fpr1, tpr1, 'green', 'ANN Model'),
]
for curve_fpr, curve_tpr, curve_color, curve_label in model_curves:
    ax.plot(curve_fpr, curve_tpr, linestyle='--', color=curve_color, label=curve_label)

# Diagonal reference line of a no-skill classifier (left out of the legend).
ax.plot(p_fpr, p_tpr, linestyle='--', color='black')

ax.set_title('ROC curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive rate')

ax.legend(loc='best')
plt.show()
