In [1]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
import tensorflow as tf
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Import Dataset

In [2]:
dataset_dir = '../../Dataset/Revisi 4/'

# Modeling

In [3]:
def predictionStack(clf1, clf2, X_test):
    y_pred = []
    count = 0
    for index, row in X_test.iterrows():
        df_predict = pd.DataFrame([row])

        # Pred 1
        pred1 = clf1.predict(df_predict)
        if pred1[0] == 0:
            pred = 0
        elif pred1[0] == 1:
            # Pred 2
            pred2 = clf2.predict(df_predict)
            if pred2[0] == 0:
                pred = 1
            elif pred2[0] == 1:
                pred = 2
        y_pred.append(pred)
        count = count+1
        # print(count)
    return y_pred

In [None]:
# def predictionStack(clf1, clf2, X_test, botnet_features, spam_features):
#     y_pred = []
#     count = 0
#     for index, row in X_test.iterrows():
#         # create dataframe
#         df_predict = pd.DataFrame([row])
#         df_predict_botnet = df_predict.copy()
        
#         # masking feature
#         selected_features_botnet = botnet_features
#         selected_mask_botnet = np.array(selected_features_botnet, dtype=bool)
#         selected_columns_botnet = df_predict_botnet.columns[selected_mask_botnet]

#         # create feature
#         X_selected_botnet = df_predict_botnet.loc[:, selected_columns_botnet]

#         # predict 1
#         pred1 = clf1.predict(X_selected_botnet)
#         if pred1[0] == 0:
#             pred = 0
#         elif pred1[0] == 1:
#             # masking feature
#             selected_features_spam = spam_features
#             selected_mask_spam = np.array(selected_features_spam, dtype=bool)
#             selected_columns_spam = df_predict.columns[selected_mask_spam]
            
#             # create feature
#             X_selected_spam = df_predict.loc[:, selected_columns_spam]

#             # predict 2
#             pred2 = clf2.predict(X_selected_spam)
#             if pred2[0] == 0:
#                 pred = 1
#             elif pred2[0] == 1:
#                 pred = 2
#         y_pred.append(pred)
#         count = count+1
#         # print(count)
#     return y_pred

# Sensor 3

In [4]:
# Sensor 3
train_freqlabel = pd.read_csv(dataset_dir + 'train3_freqlabelencoded.csv')
test_freqlabel = pd.read_csv(dataset_dir + 'test3_freqlabelencoded.csv')

In [5]:
test_freqlabel['Label'].value_counts()

Label
0    897948
1     67750
2      5750
Name: count, dtype: int64

In [6]:
train_freqlabel.head(1)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,isBotnet,isSpam
0,3.478816,3,163,46,0,188,1707,76,0,0,24,4982,1685,0,0,0


## Freq - Label Encoding

In [7]:
X_botnet = train_freqlabel.drop(columns=['isBotnet', 'Label', 'isSpam'])
y_botnet = train_freqlabel['isBotnet']

In [8]:
spam_data = train_freqlabel[train_freqlabel['isBotnet'] == 1]
X_spam = spam_data.drop(columns=['isSpam', 'Label', 'isBotnet'])
y_spam = spam_data['isSpam']

In [9]:
X_test = test_freqlabel.drop(columns=['Label', 'isBotnet', 'isSpam'])
y_test = test_freqlabel['Label']

### isBotnet

In [10]:
X_botnet.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,3.478816,3,163,46,0,188,1707,76,0,0,24,4982,1685
1,1333.144409,4,2,13,3,335793,315437,9,0,0,6,792,611
2,0.000127,4,478320,44,3,1179362,1190339,9,0,0,2,214,81
3,0.000196,4,478320,33,3,1179362,1190339,9,0,0,2,214,81
4,0.000277,4,436875,73,3,1179362,1190339,9,0,0,2,239,83


In [11]:
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [12]:
X_spam.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
7,0.0,4,3002,178,0,1179362,1190339,107,0,0,1,82,82
24,0.292281,4,4,36,3,13,4,9,0,0,2,336,247
25,0.0,4,3804,203,0,1179362,1190339,107,0,0,1,79,79
51,6.469925,3,3730,51,0,626,3938,76,0,0,34,10396,5712
62,3.444259,4,11218,8,3,3,308,9,0,0,10,3192,1406


In [13]:
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Predictions

In [14]:
# Predict botnet SPAM vs non-SPAM
y_pred = predictionStack(clf1, clf2, X_test)
np.savetxt('../Output/Revisi 4/sensor3_freqlabelenc.txt', y_pred, fmt="%s", delimiter="\n")

### Evaluations

In [15]:
# Eval
print(classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99996   0.99658   0.99827    897948
           1    0.95673   0.99942   0.97761     67750
           2    0.99861   0.99965   0.99913      5750

    accuracy                        0.99680    971448
   macro avg    0.98510   0.99855   0.99167    971448
weighted avg    0.99693   0.99680   0.99683    971448



In [16]:
# Eval
report = classification_report(y_test, y_pred, target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'], output_dict=True)
df_metrics = pd.DataFrame(report).transpose()
df_metrics = df_metrics[['precision', 'recall', 'f1-score']].drop(['accuracy', 'macro avg', 'weighted avg'])
df_metrics = df_metrics.rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.999956,0.996583,0.998267
Botnet Non SPAM,0.956735,0.999424,0.977614
Botnet SPAM,0.99861,0.999652,0.999131


In [17]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9967996228310728


In [18]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[894880   3061      7]
 [    38  67711      1]
 [     1      1   5748]]


## 5 Label Encoding

In [19]:
X_botnet = train_freqlabel.drop(columns=['SrcAddr', 'DstAddr', 'Sport', 'Dport', 'isBotnet', 'Label', 'isSpam'])
y_botnet = train_freqlabel['isBotnet']

In [20]:
spam_data = train_freqlabel[train_freqlabel['isBotnet'] == 1]
X_spam = spam_data.drop(columns=['SrcAddr', 'DstAddr', 'Sport', 'Dport', 'isSpam', 'Label', 'isBotnet'])
y_spam = spam_data['isSpam']

In [21]:
X_test = test_freqlabel.drop(columns=['SrcAddr', 'DstAddr', 'Sport', 'Dport', 'Label', 'isBotnet', 'isSpam'])
y_test = test_freqlabel['Label']

### isBotnet

In [22]:
X_botnet.head()

Unnamed: 0,Dur,Proto,Dir,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,3.478816,3,0,76,0,0,24,4982,1685
1,1333.144409,4,3,9,0,0,6,792,611
2,0.000127,4,3,9,0,0,2,214,81
3,0.000196,4,3,9,0,0,2,214,81
4,0.000277,4,3,9,0,0,2,239,83


In [23]:
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [24]:
X_spam.head()

Unnamed: 0,Dur,Proto,Dir,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
7,0.0,4,0,107,0,0,1,82,82
24,0.292281,4,3,9,0,0,2,336,247
25,0.0,4,0,107,0,0,1,79,79
51,6.469925,3,0,76,0,0,34,10396,5712
62,3.444259,4,3,9,0,0,10,3192,1406


In [25]:
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Prediction

In [26]:
X_test.head()

Unnamed: 0,Dur,Proto,Dir,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,0.331789,3,0,76,0,0,9,1535,699
1,1.10783,3,0,76,0,0,16,5578,1440
2,0.000289,4,3,9,0,0,2,205,65
3,0.000269,4,3,9,0,0,2,240,78
4,0.000277,4,3,9,0,0,2,284,76


In [27]:
# Predict botnet SPAM vs non-SPAM
y_pred = predictionStack(clf1, clf2, X_test)
np.savetxt('../Output/Revisi 4/sensor3_nofreq.txt', y_pred, fmt="%s", delimiter="\n")

### Evaluation

In [28]:
# Eval
print(classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99770   0.94326   0.96972    897948
           1    0.58090   0.96086   0.72406     67750
           2    0.50992   0.92504   0.65744      5750

    accuracy                        0.94438    971448
   macro avg    0.69617   0.94305   0.78374    971448
weighted avg    0.96574   0.94438   0.95074    971448



In [29]:
# Eval
report = classification_report(y_test, y_pred, target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'], output_dict=True)
df_metrics = pd.DataFrame(report).transpose()
df_metrics = df_metrics[['precision', 'recall', 'f1-score']].drop(['accuracy', 'macro avg', 'weighted avg'])
df_metrics = df_metrics.rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.997701,0.943262,0.969718
Botnet Non SPAM,0.580895,0.960856,0.724055
Botnet SPAM,0.509922,0.925043,0.657438


In [30]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.94438096532187


In [31]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[847000  46536   4412]
 [  1952  65098    700]
 [     0    431   5319]]


## 4 Freq Encoding

In [32]:
X_botnet = train_freqlabel.drop(columns=['Proto', 'Dir', 'State', 'sTos', 'dTos', 'isBotnet', 'Label', 'isSpam'])
y_botnet = train_freqlabel['isBotnet']

In [33]:
spam_data = train_freqlabel[train_freqlabel['isBotnet'] == 1]
X_spam = spam_data.drop(columns=['Proto', 'Dir', 'State', 'sTos', 'dTos', 'isSpam', 'Label', 'isBotnet'])
y_spam = spam_data['isSpam']

In [34]:
X_test = test_freqlabel.drop(columns=['Proto', 'Dir', 'State', 'sTos', 'dTos', 'Label', 'isBotnet', 'isSpam'])
y_test = test_freqlabel['Label']

### isBotnet

In [35]:
X_botnet.head()

Unnamed: 0,Dur,SrcAddr,Sport,DstAddr,Dport,TotPkts,TotBytes,SrcBytes
0,3.478816,163,46,188,1707,24,4982,1685
1,1333.144409,2,13,335793,315437,6,792,611
2,0.000127,478320,44,1179362,1190339,2,214,81
3,0.000196,478320,33,1179362,1190339,2,214,81
4,0.000277,436875,73,1179362,1190339,2,239,83


In [36]:
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [37]:
X_spam.head()

Unnamed: 0,Dur,SrcAddr,Sport,DstAddr,Dport,TotPkts,TotBytes,SrcBytes
7,0.0,3002,178,1179362,1190339,1,82,82
24,0.292281,4,36,13,4,2,336,247
25,0.0,3804,203,1179362,1190339,1,79,79
51,6.469925,3730,51,626,3938,34,10396,5712
62,3.444259,11218,8,3,308,10,3192,1406


In [38]:
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Prediction

In [39]:
X_test.head()

Unnamed: 0,Dur,SrcAddr,Sport,DstAddr,Dport,TotPkts,TotBytes,SrcBytes
0,0.331789,436875,49,11044,398218,9,1535,699
1,1.10783,4215,60,336,92953,16,5578,1440
2,0.000289,436875,47,1179362,1190339,2,205,65
3,0.000269,436875,47,1179362,1190339,2,240,78
4,0.000277,70155,73,1179362,1190339,2,284,76


In [40]:
# Predict botnet SPAM vs non-SPAM
y_pred = predictionStack(clf1, clf2, X_test)
np.savetxt('../Output/Revisi 4/sensor3_nolabel.txt', y_pred, fmt="%s", delimiter="\n")

### Evaluation

In [41]:
# Eval
print(classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99997   0.99693   0.99845    897948
           1    0.96098   0.99956   0.97989     67750
           2    0.99878   0.99965   0.99922      5750

    accuracy                        0.99713    971448
   macro avg    0.98658   0.99871   0.99252    971448
weighted avg    0.99724   0.99713   0.99716    971448



In [42]:
# Eval
report = classification_report(y_test, y_pred, target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'], output_dict=True)
df_metrics = pd.DataFrame(report).transpose()
df_metrics = df_metrics[['precision', 'recall', 'f1-score']].drop(['accuracy', 'macro avg', 'weighted avg'])
df_metrics = df_metrics.rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.999966,0.996932,0.998447
Botnet Non SPAM,0.960976,0.999557,0.979887
Botnet SPAM,0.998784,0.999652,0.999218


In [43]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9971310867900289


In [44]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[895193   2749      6]
 [    29  67720      1]
 [     1      1   5748]]


## Only Numerical

In [45]:
X_botnet = train_freqlabel.drop(columns=['Proto', 'Dir', 'State', 'sTos', 'dTos', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'isBotnet', 'Label', 'isSpam'])
y_botnet = train_freqlabel['isBotnet']

In [46]:
spam_data = train_freqlabel[train_freqlabel['isBotnet'] == 1]
X_spam = spam_data.drop(columns=['Proto', 'Dir', 'State', 'sTos', 'dTos', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'isSpam', 'Label', 'isBotnet'])
y_spam = spam_data['isSpam']

In [47]:
X_test = test_freqlabel.drop(columns=['Proto', 'Dir', 'State', 'sTos', 'dTos', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'Label', 'isBotnet', 'isSpam'])
y_test = test_freqlabel['Label']

### isBotnet

In [48]:
X_botnet.head()

Unnamed: 0,Dur,TotPkts,TotBytes,SrcBytes
0,3.478816,24,4982,1685
1,1333.144409,6,792,611
2,0.000127,2,214,81
3,0.000196,2,214,81
4,0.000277,2,239,83


In [49]:
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [50]:
X_spam.head()

Unnamed: 0,Dur,TotPkts,TotBytes,SrcBytes
7,0.0,1,82,82
24,0.292281,2,336,247
25,0.0,1,79,79
51,6.469925,34,10396,5712
62,3.444259,10,3192,1406


In [51]:
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Prediction

In [52]:
X_test.head()

Unnamed: 0,Dur,TotPkts,TotBytes,SrcBytes
0,0.331789,9,1535,699
1,1.10783,16,5578,1440
2,0.000289,2,205,65
3,0.000269,2,240,78
4,0.000277,2,284,76


In [53]:
# Predict botnet SPAM vs non-SPAM
y_pred = predictionStack(clf1, clf2, X_test)
np.savetxt('../Output/Revisi 4/sensor3_onlynumerical.txt', y_pred, fmt="%s", delimiter="\n")

### Evaluation

In [54]:
# Eval
print(classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99708   0.98123   0.98909    897948
           1    0.82386   0.94897   0.88200     67750
           2    0.54677   0.92504   0.68730      5750

    accuracy                        0.97865    971448
   macro avg    0.78924   0.95175   0.85280    971448
weighted avg    0.98233   0.97865   0.97984    971448



In [55]:
# Eval
report = classification_report(y_test, y_pred, target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'], output_dict=True)
df_metrics = pd.DataFrame(report).transpose()
df_metrics = df_metrics[['precision', 'recall', 'f1-score']].drop(['accuracy', 'macro avg', 'weighted avg'])
df_metrics = df_metrics.rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.997077,0.981235,0.989093
Botnet Non SPAM,0.823857,0.948974,0.882001
Botnet SPAM,0.546772,0.925043,0.687298


In [56]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9786524857738139


In [57]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[881098  13316   3534]
 [  2582  64293    875]
 [     1    430   5319]]
