In [19]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
import tensorflow as tf
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, fbeta_score

# Import Dataset

In [2]:
# Folder containing the encoded train/test CSV files (relative path)
dataset_dir = '../../Dataset/Revisi 4/'

# Modeling

In [3]:
def predictionStack(clf1, clf2, X_test):
    """Run the two-stage (stacked) classifier and return one 3-class label per row.

    Stage 1 (``clf1``) is applied to every row. Rows it labels ``1`` are then
    passed to stage 2 (``clf2``). The two binary outputs are combined as:

    * ``0`` -- stage 1 predicted 0
    * ``1`` -- stage 1 predicted 1 and stage 2 predicted 0
    * ``2`` -- stage 1 predicted 1 and stage 2 predicted 1

    Both stages are evaluated with one batched ``predict`` call each instead of
    one call per row, which gives the same labels at a fraction of the cost.

    Parameters
    ----------
    clf1, clf2 : fitted estimators exposing ``predict(X)`` that return 0/1 labels.
    X_test : pandas.DataFrame
        Feature rows; must contain the columns both estimators were fitted on.

    Returns
    -------
    list of int
        Predicted labels (0, 1 or 2) in the row order of ``X_test``.

    Raises
    ------
    ValueError
        If either stage returns a label other than 0 or 1. The previous
        row-by-row loop silently reused the preceding row's prediction in
        that situation (or failed with ``UnboundLocalError`` on the first row).
    """
    n_rows = len(X_test)
    if n_rows == 0:
        # Nothing to classify; mirrors the empty list the row loop produced.
        return []

    # Stage 1 on the whole frame at once.
    stage1 = np.asarray(clf1.predict(X_test))
    flagged = stage1 == 1
    if not np.all(flagged | (stage1 == 0)):
        raise ValueError("clf1.predict returned labels other than 0 and 1")

    # Rows not flagged by stage 1 keep label 0.
    y_pred = np.zeros(n_rows, dtype=int)

    if flagged.any():
        # Stage 2 only sees the rows that stage 1 flagged.
        stage2 = np.asarray(clf2.predict(X_test.loc[flagged]))
        if not np.all((stage2 == 0) | (stage2 == 1)):
            raise ValueError("clf2.predict returned labels other than 0 and 1")
        y_pred[flagged] = np.where(stage2 == 1, 2, 1)

    return y_pred.tolist()

In [None]:
# def predictionStack(clf1, clf2, X_test, botnet_features, spam_features):
#     y_pred = []
#     count = 0
#     for index, row in X_test.iterrows():
#         # create dataframe
#         df_predict = pd.DataFrame([row])
#         df_predict_botnet = df_predict.copy()
        
#         # masking feature
#         selected_features_botnet = botnet_features
#         selected_mask_botnet = np.array(selected_features_botnet, dtype=bool)
#         selected_columns_botnet = df_predict_botnet.columns[selected_mask_botnet]

#         # create feature
#         X_selected_botnet = df_predict_botnet.loc[:, selected_columns_botnet]

#         # predict 1
#         pred1 = clf1.predict(X_selected_botnet)
#         if pred1[0] == 0:
#             pred = 0
#         elif pred1[0] == 1:
#             # masking feature
#             selected_features_spam = spam_features
#             selected_mask_spam = np.array(selected_features_spam, dtype=bool)
#             selected_columns_spam = df_predict.columns[selected_mask_spam]
            
#             # create feature
#             X_selected_spam = df_predict.loc[:, selected_columns_spam]

#             # predict 2
#             pred2 = clf2.predict(X_selected_spam)
#             if pred2[0] == 0:
#                 pred = 1
#             elif pred2[0] == 1:
#                 pred = 2
#         y_pred.append(pred)
#         count = count+1
#         # print(count)
#     return y_pred

# Sensor 3

In [4]:
# Sensor 3: read the encoded train and test splits from dataset_dir
train_freqlabel, test_freqlabel = (
    pd.read_csv(dataset_dir + csv_name)
    for csv_name in ('train3_freqlabelencoded.csv', 'test3_freqlabelencoded.csv')
)

In [5]:
# Class distribution of the 3-class Label target in the test split
test_freqlabel['Label'].value_counts()

Label
0    813365
1      9988
2      2487
Name: count, dtype: int64

In [6]:
# Show one training row to check which columns are available
train_freqlabel.head(1)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,isBotnet,isSpam
0,0.000276,4,437462,76,3,1174661,1185710,8,0.0,0.0,2,234,92,0,0,0


## Freq - Label Encoding

In [7]:
# Stage 1 training data: every flow, with only the three target columns removed;
# the target is the isBotnet flag.
y_botnet = train_freqlabel.loc[:, 'isBotnet']
X_botnet = train_freqlabel.drop(['isBotnet', 'Label', 'isSpam'], axis='columns')

In [8]:
# Stage 2 training data: restricted to flows with isBotnet == 1;
# the target is the isSpam flag.
spam_data = train_freqlabel.loc[train_freqlabel['isBotnet'].eq(1)]
y_spam = spam_data.loc[:, 'isSpam']
X_spam = spam_data.drop(['isSpam', 'Label', 'isBotnet'], axis='columns')

In [9]:
# Test data: same feature columns as training; evaluated against the 3-class Label.
y_test = test_freqlabel.loc[:, 'Label']
X_test = test_freqlabel.drop(['Label', 'isBotnet', 'isSpam'], axis='columns')

### isBotnet

In [10]:
# Preview the stage-1 (isBotnet) training features
X_botnet.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,0.000276,4,437462,76,3,1174661,1185710,8,0.0,0.0,2,234,92
1,1.276696,3,3307,140,0,40,91770,69,0.0,0.0,23,10482,4298
2,0.00026,4,474139,39,3,1174661,1185710,8,0.0,0.0,2,214,81
3,0.000289,4,474139,43,3,1174661,1185710,8,0.0,0.0,2,214,81
4,3479.618164,4,662,468,3,24,28,8,0.0,0.0,40,4400,3160


In [11]:
# Stage-1 model: decision tree predicting the isBotnet flag.
# random_state is fixed so the fitted tree is reproducible.
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [12]:
# Preview the stage-2 (isSpam) training features; only rows with isBotnet == 1 remain
X_spam.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
1,1.276696,3,3307,140,0,40,91770,69,0.0,0.0,23,10482,4298
43,0.0,3,3443,172,0,1,8972,216,0.0,0.0,1,62,62
53,2.312834,3,9787,58,0,6,91770,69,0.0,0.0,154,132087,6915
73,0.965746,3,3794,54,0,1524,91770,222,0.0,0.0,6,366,186
113,3.004589,3,3443,56,0,1,3626,216,0.0,0.0,2,124,124


In [13]:
# Stage-2 model: decision tree predicting the isSpam flag, fitted on botnet rows only.
# random_state is fixed so the fitted tree is reproducible.
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Prediction

In [14]:
# Two-stage prediction: 0 = normal, 1 = botnet non-spam, 2 = botnet spam
y_pred = predictionStack(clf1, clf2, X_test)

# Write one label per line. np.savetxt does not create missing folders,
# so make sure the output directory exists before writing.
output_path = '../Output/Revisi 4/sensor3_freqlabelenc.txt'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.savetxt(output_path, y_pred, fmt="%s", delimiter="\n")

### Evaluation

In [15]:
# Per-class precision / recall / F1 on the test set, printed with 5 decimal places
print(classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99993   0.99991   0.99992    813365
           1    0.99290   0.99429   0.99360      9988
           2    1.00000   0.99920   0.99960      2487

    accuracy                        0.99984    825840
   macro avg    0.99761   0.99780   0.99771    825840
weighted avg    0.99984   0.99984   0.99984    825840



In [16]:
# Per-class metrics as a compact table (aggregate rows removed, columns shortened)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'],
    output_dict=True,
)
df_metrics = (
    pd.DataFrame(report)
    .transpose()
    .loc[:, ['precision', 'recall', 'f1-score']]
    .drop(index=['accuracy', 'macro avg', 'weighted avg'])
    .rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
)
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.999929,0.999914,0.999921
Botnet Non SPAM,0.992901,0.994293,0.993597
Botnet SPAM,1.0,0.999196,0.999598


In [17]:
# Overall fraction of test rows whose 3-class label is predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9998437954083116


In [18]:
# Confusion matrix: rows are true labels, columns are predicted labels (order 0, 1, 2)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[813295     70      0]
 [    57   9931      0]
 [     1      1   2485]]


In [None]:
# F0.5 weights precision more heavily than recall
# (suitable when false positives are the costlier error)
f05_score = fbeta_score(y_test, y_pred, beta=0.5, average='weighted')
print(f"F0.5 Score (Weighted): {f05_score}")

F0.5 Score (Weighted): 0.9998438867331548


In [None]:
# F2 weights recall more heavily than precision
# (suitable when false negatives are the costlier error)
f2_score = fbeta_score(y_test, y_pred, beta=2, average='weighted')
print(f"F2 Score (Weighted Average): {f2_score}")

F2 Score (Weighted Average): 0.9998438151489041


## 3 Label Encoding

In [23]:
# Stage 1 training data without the address/port columns
# (SrcAddr, DstAddr, Sport, Dport) and without the three target columns.
y_botnet = train_freqlabel.loc[:, 'isBotnet']
X_botnet = train_freqlabel.drop(
    ['SrcAddr', 'DstAddr', 'Sport', 'Dport', 'isBotnet', 'Label', 'isSpam'],
    axis='columns',
)

In [24]:
# Stage 2 training data: botnet rows only, same reduced feature set as stage 1.
spam_data = train_freqlabel.loc[train_freqlabel['isBotnet'].eq(1)]
y_spam = spam_data.loc[:, 'isSpam']
X_spam = spam_data.drop(
    ['SrcAddr', 'DstAddr', 'Sport', 'Dport', 'isSpam', 'Label', 'isBotnet'],
    axis='columns',
)

In [25]:
# Test data with the same columns removed; evaluated against the 3-class Label.
y_test = test_freqlabel.loc[:, 'Label']
X_test = test_freqlabel.drop(
    ['SrcAddr', 'DstAddr', 'Sport', 'Dport', 'Label', 'isBotnet', 'isSpam'],
    axis='columns',
)

### isBotnet

In [26]:
# Preview the stage-1 (isBotnet) training features for this feature subset
X_botnet.head()

Unnamed: 0,Dur,Proto,Dir,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,0.000276,4,3,8,0.0,0.0,2,234,92
1,1.276696,3,0,69,0.0,0.0,23,10482,4298
2,0.00026,4,3,8,0.0,0.0,2,214,81
3,0.000289,4,3,8,0.0,0.0,2,214,81
4,3479.618164,4,3,8,0.0,0.0,40,4400,3160


In [27]:
# Refit the stage-1 tree (isBotnet target) on this section's feature subset.
# random_state is fixed so the fitted tree is reproducible.
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [28]:
# Preview the stage-2 (isSpam) training features; only rows with isBotnet == 1 remain
X_spam.head()

Unnamed: 0,Dur,Proto,Dir,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
1,1.276696,3,0,69,0.0,0.0,23,10482,4298
43,0.0,3,0,216,0.0,0.0,1,62,62
53,2.312834,3,0,69,0.0,0.0,154,132087,6915
73,0.965746,3,0,222,0.0,0.0,6,366,186
113,3.004589,3,0,216,0.0,0.0,2,124,124


In [29]:
# Refit the stage-2 tree (isSpam target) on the botnet rows of this feature subset.
# random_state is fixed so the fitted tree is reproducible.
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Prediction

In [30]:
# Preview the test features that are passed to both stages
X_test.head()

Unnamed: 0,Dur,Proto,Dir,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,18.350538,4,3,8,0.0,0.0,4,328,142
1,0.000838,4,3,8,0.0,0.0,2,140,79
2,0.000333,4,3,8,0.0,0.0,2,214,81
3,0.000159,4,3,8,0.0,0.0,2,214,81
4,0.000282,4,3,8,0.0,0.0,2,214,81


In [31]:
# Two-stage prediction: 0 = normal, 1 = botnet non-spam, 2 = botnet spam
y_pred = predictionStack(clf1, clf2, X_test)

# Write one label per line. np.savetxt does not create missing folders,
# so make sure the output directory exists before writing.
output_path = '../Output/Revisi 4/sensor3_nofreq.txt'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.savetxt(output_path, y_pred, fmt="%s", delimiter="\n")

### Evaluation

In [32]:
# Per-class precision / recall / F1 on the test set, printed with 5 decimal places
print(classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99650   0.99653   0.99652    813365
           1    0.66839   0.64878   0.65844      9988
           2    0.75018   0.83193   0.78894      2487

    accuracy                        0.99183    825840
   macro avg    0.80502   0.82575   0.81463    825840
weighted avg    0.99179   0.99183   0.99180    825840



In [33]:
# Per-class metrics as a compact table (aggregate rows removed, columns shortened)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'],
    output_dict=True,
)
df_metrics = (
    pd.DataFrame(report)
    .transpose()
    .loc[:, ['precision', 'recall', 'f1-score']]
    .drop(index=['accuracy', 'macro avg', 'weighted avg'])
    .rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
)
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.996505,0.996532,0.996518
Botnet Non SPAM,0.668386,0.648779,0.658436
Botnet SPAM,0.750181,0.831926,0.788942


In [34]:
# Overall fraction of test rows whose 3-class label is predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9918301365882011


In [35]:
# Confusion matrix: rows are true labels, columns are predicted labels (order 0, 1, 2)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[810544   2807     14]
 [  2833   6480    675]
 [    10    408   2069]]


In [36]:
# F0.5 weights precision more heavily than recall
# (suitable when false positives are the costlier error)
f05_score = fbeta_score(y_test, y_pred, beta=0.5, average='weighted')
print(f"F0.5 Score (Weighted): {f05_score}")

F0.5 Score (Weighted): 0.9917965831388593


In [37]:
# F2 weights recall more heavily than precision
# (suitable when false negatives are the costlier error)
f2_score = fbeta_score(y_test, y_pred, beta=2, average='weighted')
print(f"F2 Score (Weighted Average): {f2_score}")

F2 Score (Weighted Average): 0.9918177000375805


## 4 Freq Encoding

In [38]:
# Stage 1 training data without the Proto, Dir and State columns
# and without the three target columns.
y_botnet = train_freqlabel.loc[:, 'isBotnet']
X_botnet = train_freqlabel.drop(
    ['Proto', 'Dir', 'State', 'isBotnet', 'Label', 'isSpam'],
    axis='columns',
)

In [39]:
# Stage 2 training data: botnet rows only, same reduced feature set as stage 1.
spam_data = train_freqlabel.loc[train_freqlabel['isBotnet'].eq(1)]
y_spam = spam_data.loc[:, 'isSpam']
X_spam = spam_data.drop(
    ['Proto', 'Dir', 'State', 'isSpam', 'Label', 'isBotnet'],
    axis='columns',
)

In [40]:
# Test data with the same columns removed; evaluated against the 3-class Label.
y_test = test_freqlabel.loc[:, 'Label']
X_test = test_freqlabel.drop(
    ['Proto', 'Dir', 'State', 'Label', 'isBotnet', 'isSpam'],
    axis='columns',
)

### isBotnet

In [41]:
# Preview the stage-1 (isBotnet) training features for this feature subset
X_botnet.head()

Unnamed: 0,Dur,SrcAddr,Sport,DstAddr,Dport,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,0.000276,437462,76,1174661,1185710,0.0,0.0,2,234,92
1,1.276696,3307,140,40,91770,0.0,0.0,23,10482,4298
2,0.00026,474139,39,1174661,1185710,0.0,0.0,2,214,81
3,0.000289,474139,43,1174661,1185710,0.0,0.0,2,214,81
4,3479.618164,662,468,24,28,0.0,0.0,40,4400,3160


In [42]:
# Refit the stage-1 tree (isBotnet target) on this section's feature subset.
# random_state is fixed so the fitted tree is reproducible.
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [43]:
# Preview the stage-2 (isSpam) training features; only rows with isBotnet == 1 remain
X_spam.head()

Unnamed: 0,Dur,SrcAddr,Sport,DstAddr,Dport,sTos,dTos,TotPkts,TotBytes,SrcBytes
1,1.276696,3307,140,40,91770,0.0,0.0,23,10482,4298
43,0.0,3443,172,1,8972,0.0,0.0,1,62,62
53,2.312834,9787,58,6,91770,0.0,0.0,154,132087,6915
73,0.965746,3794,54,1524,91770,0.0,0.0,6,366,186
113,3.004589,3443,56,1,3626,0.0,0.0,2,124,124


In [44]:
# Refit the stage-2 tree (isSpam target) on the botnet rows of this feature subset.
# random_state is fixed so the fitted tree is reproducible.
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Prediction

In [45]:
# Preview the test features that are passed to both stages
X_test.head()

Unnamed: 0,Dur,SrcAddr,Sport,DstAddr,Dport,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,18.350538,0,57,334237,313801,0.0,0.0,4,328,142
1,0.000838,0,8,197587,185027,0.0,0.0,2,140,79
2,0.000333,474139,50,1174661,1185710,0.0,0.0,2,214,81
3,0.000159,474139,38,1174661,1185710,0.0,0.0,2,214,81
4,0.000282,474139,99,1174661,1185710,0.0,0.0,2,214,81


In [46]:
# Two-stage prediction: 0 = normal, 1 = botnet non-spam, 2 = botnet spam
y_pred = predictionStack(clf1, clf2, X_test)

# Write one label per line. np.savetxt does not create missing folders,
# so make sure the output directory exists before writing.
output_path = '../Output/Revisi 4/sensor3_nolabel.txt'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.savetxt(output_path, y_pred, fmt="%s", delimiter="\n")

### Evaluation

In [47]:
# Per-class precision / recall / F1 on the test set, printed with 5 decimal places
print(classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99992   0.99992   0.99992    813365
           1    0.99369   0.99369   0.99369      9988
           2    1.00000   0.99960   0.99980      2487

    accuracy                        0.99985    825840
   macro avg    0.99787   0.99774   0.99780    825840
weighted avg    0.99985   0.99985   0.99985    825840



In [48]:
# Per-class metrics as a compact table (aggregate rows removed, columns shortened)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'],
    output_dict=True,
)
df_metrics = (
    pd.DataFrame(report)
    .transpose()
    .loc[:, ['precision', 'recall', 'f1-score']]
    .drop(index=['accuracy', 'macro avg', 'weighted avg'])
    .rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
)
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.999923,0.999924,0.999923
Botnet Non SPAM,0.993692,0.993692,0.993692
Botnet SPAM,1.0,0.999598,0.999799


In [49]:
# Overall fraction of test rows whose 3-class label is predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9998474280732346


In [50]:
# Confusion matrix: rows are true labels, columns are predicted labels (order 0, 1, 2)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[813303     62      0]
 [    63   9925      0]
 [     0      1   2486]]


In [51]:
# F0.5 weights precision more heavily than recall
# (suitable when false positives are the costlier error)
f05_score = fbeta_score(y_test, y_pred, beta=0.5, average='weighted')
print(f"F0.5 Score (Weighted): {f05_score}")

F0.5 Score (Weighted): 0.9998474280701017


In [52]:
# F2 weights recall more heavily than precision
# (suitable when false negatives are the costlier error)
f2_score = fbeta_score(y_test, y_pred, beta=2, average='weighted')
print(f"F2 Score (Weighted Average): {f2_score}")

F2 Score (Weighted Average): 0.9998474280138462


## Only Numerical

In [53]:
# Stage 1 training data without Proto, Dir, State, the address/port columns
# (SrcAddr, DstAddr, Sport, Dport) and the three target columns.
y_botnet = train_freqlabel.loc[:, 'isBotnet']
X_botnet = train_freqlabel.drop(
    ['Proto', 'Dir', 'State', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'isBotnet', 'Label', 'isSpam'],
    axis='columns',
)

In [54]:
# Stage 2 training data: botnet rows only, same reduced feature set as stage 1.
spam_data = train_freqlabel.loc[train_freqlabel['isBotnet'].eq(1)]
y_spam = spam_data.loc[:, 'isSpam']
X_spam = spam_data.drop(
    ['Proto', 'Dir', 'State', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'isSpam', 'Label', 'isBotnet'],
    axis='columns',
)

In [55]:
# Test data with the same columns removed; evaluated against the 3-class Label.
y_test = test_freqlabel.loc[:, 'Label']
X_test = test_freqlabel.drop(
    ['Proto', 'Dir', 'State', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'Label', 'isBotnet', 'isSpam'],
    axis='columns',
)

### isBotnet

In [56]:
# Preview the stage-1 (isBotnet) training features for this feature subset
X_botnet.head()

Unnamed: 0,Dur,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,0.000276,0.0,0.0,2,234,92
1,1.276696,0.0,0.0,23,10482,4298
2,0.00026,0.0,0.0,2,214,81
3,0.000289,0.0,0.0,2,214,81
4,3479.618164,0.0,0.0,40,4400,3160


In [57]:
# Refit the stage-1 tree (isBotnet target) on this section's feature subset.
# random_state is fixed so the fitted tree is reproducible.
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [58]:
# Preview the stage-2 (isSpam) training features; only rows with isBotnet == 1 remain
X_spam.head()

Unnamed: 0,Dur,sTos,dTos,TotPkts,TotBytes,SrcBytes
1,1.276696,0.0,0.0,23,10482,4298
43,0.0,0.0,0.0,1,62,62
53,2.312834,0.0,0.0,154,132087,6915
73,0.965746,0.0,0.0,6,366,186
113,3.004589,0.0,0.0,2,124,124


In [59]:
# Refit the stage-2 tree (isSpam target) on the botnet rows of this feature subset.
# random_state is fixed so the fitted tree is reproducible.
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Prediction

In [60]:
# Preview the test features that are passed to both stages
X_test.head()

Unnamed: 0,Dur,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,18.350538,0.0,0.0,4,328,142
1,0.000838,0.0,0.0,2,140,79
2,0.000333,0.0,0.0,2,214,81
3,0.000159,0.0,0.0,2,214,81
4,0.000282,0.0,0.0,2,214,81


In [61]:
# Two-stage prediction: 0 = normal, 1 = botnet non-spam, 2 = botnet spam
y_pred = predictionStack(clf1, clf2, X_test)

# Write one label per line. np.savetxt does not create missing folders,
# so make sure the output directory exists before writing.
output_path = '../Output/Revisi 4/sensor3_onlynumerical.txt'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.savetxt(output_path, y_pred, fmt="%s", delimiter="\n")

### Evaluation

In [62]:
# Per-class precision / recall / F1 on the test set, printed with 5 decimal places
print(classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99629   0.99629   0.99629    813365
           1    0.64951   0.63156   0.64041      9988
           2    0.74810   0.82992   0.78689      2487

    accuracy                        0.99138    825840
   macro avg    0.79796   0.81926   0.80786    825840
weighted avg    0.99135   0.99138   0.99136    825840



In [63]:
# Per-class metrics as a compact table (aggregate rows removed, columns shortened)
report = classification_report(
    y_test,
    y_pred,
    target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'],
    output_dict=True,
)
df_metrics = (
    pd.DataFrame(report)
    .transpose()
    .loc[:, ['precision', 'recall', 'f1-score']]
    .drop(index=['accuracy', 'macro avg', 'weighted avg'])
    .rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
)
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.99629,0.996294,0.996292
Botnet Non SPAM,0.649506,0.631558,0.640406
Botnet SPAM,0.748097,0.829916,0.786885


In [64]:
# Overall fraction of test rows whose 3-class label is predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.991382107914366


In [65]:
# Confusion matrix: rows are true labels, columns are predicted labels (order 0, 1, 2)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[810351   2997     17]
 [  3002   6308    678]
 [    16    407   2064]]


In [66]:
# F0.5 weights precision more heavily than recall
# (suitable when false positives are the costlier error)
f05_score = fbeta_score(y_test, y_pred, beta=0.5, average='weighted')
print(f"F0.5 Score (Weighted): {f05_score}")

F0.5 Score (Weighted): 0.9913498402274886


In [67]:
# F2 weights recall more heavily than precision
# (suitable when false negatives are the costlier error)
f2_score = fbeta_score(y_test, y_pred, beta=2, average='weighted')
print(f"F2 Score (Weighted Average): {f2_score}")

F2 Score (Weighted Average): 0.9913700931139177
