In [25]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
import tensorflow as tf
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Import Dataset

In [26]:
dataset_dir = '../../Dataset/'

# Modeling

In [27]:
def predictionStack(clf1, clf2, X_test):
    """Predict a 3-class label with a two-stage classifier cascade.

    Stage 1 (``clf1``) is applied to every row. Rows it labels 1 are passed to
    stage 2 (``clf2``). The two binary decisions are combined as:

        0 -> clf1 predicted 0
        1 -> clf1 predicted 1 and clf2 predicted 0
        2 -> clf1 predicted 1 and clf2 predicted 1

    Each classifier is called once on a whole batch rather than once per row,
    which avoids building a one-row DataFrame per sample.

    Parameters
    ----------
    clf1, clf2 : objects exposing ``predict(X)``
        Binary classifiers returning labels 0 or 1, one per row of ``X``.
    X_test : pandas.DataFrame
        Feature rows to classify; both classifiers receive the same columns.

    Returns
    -------
    list of int
        One label in {0, 1, 2} per row of ``X_test``, in row order.

    Raises
    ------
    ValueError
        If either classifier returns a label other than 0 or 1. (The per-row
        version silently reused the previous row's prediction in that case.)
    """
    # Nothing to classify; also avoids calling predict() on zero samples.
    if len(X_test) == 0:
        return []

    stage1 = np.asarray(clf1.predict(X_test))
    botnet_mask = stage1 == 1
    if not np.all(botnet_mask | (stage1 == 0)):
        raise ValueError("clf1 must predict only the labels 0 or 1")

    # Start with every row as class 0, then overwrite the rows stage 1 flagged.
    y_pred = np.zeros(len(stage1), dtype=int)
    if botnet_mask.any():
        # Boolean ndarray masks select rows positionally, independent of the index.
        stage2 = np.asarray(clf2.predict(X_test.loc[botnet_mask]))
        spam_mask = stage2 == 1
        if not np.all(spam_mask | (stage2 == 0)):
            raise ValueError("clf2 must predict only the labels 0 or 1")
        y_pred[botnet_mask] = np.where(spam_mask, 2, 1)

    # Plain Python ints in a list, matching the original return type.
    return y_pred.tolist()

In [None]:
# def predictionStack(clf1, clf2, X_test, botnet_features, spam_features):
#     y_pred = []
#     count = 0
#     for index, row in X_test.iterrows():
#         # create dataframe
#         df_predict = pd.DataFrame([row])
#         df_predict_botnet = df_predict.copy()
        
#         # masking feature
#         selected_features_botnet = botnet_features
#         selected_mask_botnet = np.array(selected_features_botnet, dtype=bool)
#         selected_columns_botnet = df_predict_botnet.columns[selected_mask_botnet]

#         # create feature
#         X_selected_botnet = df_predict_botnet.loc[:, selected_columns_botnet]

#         # predict 1
#         pred1 = clf1.predict(X_selected_botnet)
#         if pred1[0] == 0:
#             pred = 0
#         elif pred1[0] == 1:
#             # masking feature
#             selected_features_spam = spam_features
#             selected_mask_spam = np.array(selected_features_spam, dtype=bool)
#             selected_columns_spam = df_predict.columns[selected_mask_spam]
            
#             # create feature
#             X_selected_spam = df_predict.loc[:, selected_columns_spam]

#             # predict 2
#             pred2 = clf2.predict(X_selected_spam)
#             if pred2[0] == 0:
#                 pred = 1
#             elif pred2[0] == 1:
#                 pred = 2
#         y_pred.append(pred)
#         count = count+1
#         # print(count)
#     return y_pred

# Sensor 3

## Frequency + Label Encoding (all features)

In [28]:
# Sensor 3: read the encoded train and test splits from dataset_dir
train_freqlabel, test_freqlabel = (
    pd.read_csv(dataset_dir + csv_name)
    for csv_name in ('train_freqlabelencoded.csv', 'test_freqlabelencoded.csv')
)

In [29]:
train_freqlabel.head(1)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,isBotnet,isSpam
0,0.002248,4,1,106,3,338987,318377,7,0,0,2,548,488,0,0,0


In [30]:
# Stage-1 training set: target is the isBotnet flag, features are every
# column except the three label columns.
y_botnet = train_freqlabel['isBotnet']
X_botnet = train_freqlabel.drop(['isBotnet', 'Label', 'isSpam'], axis=1)

In [31]:
# Stage-2 training set: only rows flagged isBotnet == 1, with isSpam as target.
spam_data = train_freqlabel.loc[train_freqlabel['isBotnet'] == 1]
y_spam = spam_data['isSpam']
X_spam = spam_data.drop(['isSpam', 'Label', 'isBotnet'], axis=1)

In [32]:
# Evaluation set: the 'Label' column is the target; the label columns are
# removed from the features.
y_test = test_freqlabel['Label']
X_test = test_freqlabel.drop(['Label', 'isBotnet', 'isSpam'], axis=1)

### isBotnet

In [12]:
X_botnet.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,0.002248,4,1,106,3,338987,318377,7,0,0,2,548,488
1,0.000298,4,492764,50,3,1211113,1220778,7,0,0,2,214,81
2,1.244523,3,6457,202,0,248,3257,76,0,0,53,17709,2206
3,3251.449219,4,1,6,3,338987,318377,7,0,0,6,1270,232
4,0.000255,4,440828,86,3,1211113,1220778,7,0,0,2,200,75


In [13]:
# Stage-1 decision tree (fixed random_state for repeatable fits), trained on
# X_botnet against the isBotnet target held in y_botnet.
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [19]:
X_spam.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
21,4.689169,3,65253,102,0,751,4724,73,0,0,10,1011,568
23,93.495171,3,16364,833,0,1946,16744,78,0,0,10,2248,2186
27,3566.989502,3,65253,239,0,1375,117676,184,0,0,220,14760,8092
41,16.122133,3,17236,71,0,60,469208,207,0,0,17,7634,863
49,12.689653,3,65253,49,0,2273,9064,73,0,0,28,6190,4506


In [14]:
# Stage-2 decision tree (fixed random_state for repeatable fits), trained on
# the botnet-only rows in X_spam against the isSpam target held in y_spam.
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Predictions

In [18]:
# Run the two-stage cascade: predictionStack returns 0 when clf1 predicts 0,
# otherwise 1 or 2 depending on clf2's prediction (0 or 1).
y_pred = predictionStack(clf1, clf2, X_test)
# Create the output folder first so a missing directory cannot make savetxt
# fail after the prediction pass has already been paid for.
os.makedirs('../Output/Revisi 4', exist_ok=True)
np.savetxt('../Output/Revisi 4/sensor3_freqlabelenc.txt', y_pred, fmt="%s", delimiter="\n")

### Evaluations

In [20]:
# Eval: per-class precision / recall / F1 on the test split, 5 decimals
print(classification_report(y_true=y_test, y_pred=y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99999   0.99997   0.99998    826003
           1    0.99969   0.99993   0.99981     67570
           2    1.00000   1.00000   1.00000      5766

    accuracy                        0.99997    899339
   macro avg    0.99989   0.99997   0.99993    899339
weighted avg    0.99997   0.99997   0.99997    899339



In [21]:
# Eval: per-class precision / recall / F1 as a compact table with named classes
report = classification_report(
    y_test,
    y_pred,
    target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'],
    output_dict=True,
)
df_metrics = (
    pd.DataFrame(report)
    .transpose()
    .loc[:, ['precision', 'recall', 'f1-score']]
    .drop(['accuracy', 'macro avg', 'weighted avg'])
    .rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
)
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.999994,0.999975,0.999984
Botnet Non SPAM,0.999689,0.999926,0.999808
Botnet SPAM,1.0,1.0,1.0


In [22]:
# Overall fraction of test rows whose predicted label matches y_test
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9999710898782328


In [23]:
# Confusion matrix: rows are true labels, columns are predicted labels
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[825982     21      0]
 [     5  67565      0]
 [     0      0   5766]]


## 5 Label-Encoded Features (address/port columns dropped)

### isBotnet

In [25]:
# Build the stage-1 features from the raw training frame instead of trimming
# the previous X_botnet: the cell can then be re-run without a KeyError and it
# no longer depends on which experiment ran before it.
X_botnet = train_freqlabel.drop(
    columns=['isBotnet', 'Label', 'isSpam', 'SrcAddr', 'DstAddr', 'Sport', 'Dport']
)

In [26]:
X_botnet.head()

Unnamed: 0,Dur,Proto,Dir,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,0.002248,4,3,7,0,0,2,548,488
1,0.000298,4,3,7,0,0,2,214,81
2,1.244523,3,0,76,0,0,53,17709,2206
3,3251.449219,4,3,7,0,0,6,1270,232
4,0.000255,4,3,7,0,0,2,200,75


In [27]:
# Refit the stage-1 tree on the current X_botnet feature subset (same seed).
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [28]:
# Build the stage-2 features from the botnet-only rows (spam_data) instead of
# trimming the previous X_spam, so the cell is re-runnable and independent of
# earlier experiments.
X_spam = spam_data.drop(
    columns=['isSpam', 'Label', 'isBotnet', 'SrcAddr', 'DstAddr', 'Sport', 'Dport']
)

In [29]:
X_spam.head()

Unnamed: 0,Dur,Proto,Dir,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
21,4.689169,3,0,73,0,0,10,1011,568
23,93.495171,3,0,78,0,0,10,2248,2186
27,3566.989502,3,0,184,0,0,220,14760,8092
41,16.122133,3,0,207,0,0,17,7634,863
49,12.689653,3,0,73,0,0,28,6190,4506


In [30]:
# Refit the stage-2 tree on the current X_spam feature subset (same seed).
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Prediction

In [32]:
# Build the test features from the raw test frame instead of trimming the
# previous X_test, so the cell is re-runnable and independent of earlier
# experiments. The columns must match those used to fit clf1 / clf2.
X_test = test_freqlabel.drop(
    columns=['Label', 'isBotnet', 'isSpam', 'SrcAddr', 'DstAddr', 'Sport', 'Dport']
)

In [33]:
X_test.head()

Unnamed: 0,Dur,Proto,Dir,State,sTos,dTos,TotPkts,TotBytes,SrcBytes
0,0.015153,4,3,7,0,0,2,335,73
1,0.000743,4,3,7,0,0,2,138,77
2,0.166308,4,3,7,0,0,2,485,145
3,104.185165,4,3,7,0,0,4,499,364
4,0.327292,3,0,73,0,0,9,1535,699


In [34]:
# Run the two-stage cascade: predictionStack returns 0 when clf1 predicts 0,
# otherwise 1 or 2 depending on clf2's prediction (0 or 1).
y_pred = predictionStack(clf1, clf2, X_test)
# Create the output folder first so a missing directory cannot make savetxt
# fail after the prediction pass has already been paid for.
os.makedirs('../Output/Revisi 4', exist_ok=True)
np.savetxt('../Output/Revisi 4/sensor3_nofreq.txt', y_pred, fmt="%s", delimiter="\n")

### Evaluation

In [35]:
# Eval: per-class precision / recall / F1 on the test split, 5 decimals
print(classification_report(y_true=y_test, y_pred=y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99758   0.99639   0.99699    826003
           1    0.95033   0.96081   0.95554     67570
           2    0.88828   0.92525   0.90639      5766

    accuracy                        0.99327    899339
   macro avg    0.94540   0.96082   0.95297    899339
weighted avg    0.99333   0.99327   0.99329    899339



In [36]:
# Eval: per-class precision / recall / F1 as a compact table with named classes
report = classification_report(
    y_test,
    y_pred,
    target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'],
    output_dict=True,
)
df_metrics = (
    pd.DataFrame(report)
    .transpose()
    .loc[:, ['precision', 'recall', 'f1-score']]
    .drop(['accuracy', 'macro avg', 'weighted avg'])
    .rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
)
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.997584,0.996395,0.996989
Botnet Non SPAM,0.950333,0.960811,0.955543
Botnet SPAM,0.888278,0.925251,0.906388


In [37]:
# Overall fraction of test rows whose predicted label matches y_test
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9932650535560006


In [38]:
# Confusion matrix: rows are true labels, columns are predicted labels
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[823025   2964     14]
 [  1991  64922    657]
 [     2    429   5335]]


## 4 Frequency-Encoded Features (label-encoded columns dropped)

### isBotnet

In [45]:
# Build the stage-1 features from the raw training frame. Trimming the
# X_botnet left over from the previous section would also lose SrcAddr,
# DstAddr, Sport and Dport (already dropped there), which this experiment
# needs; deriving from train_freqlabel keeps them on a top-to-bottom run.
X_botnet = train_freqlabel.drop(
    columns=['isBotnet', 'Label', 'isSpam', 'Proto', 'Dir', 'State', 'sTos', 'dTos']
)

In [47]:
X_botnet.head()

Unnamed: 0,Dur,SrcAddr,Sport,DstAddr,Dport,TotPkts,TotBytes,SrcBytes
0,0.002248,1,106,338987,318377,2,548,488
1,0.000298,492764,50,1211113,1220778,2,214,81
2,1.244523,6457,202,248,3257,53,17709,2206
3,3251.449219,1,6,338987,318377,6,1270,232
4,0.000255,440828,86,1211113,1220778,2,200,75


In [48]:
# Refit the stage-1 tree on the current X_botnet feature subset (same seed).
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [49]:
# Build the stage-2 features from the botnet-only rows (spam_data). Trimming
# the X_spam left over from the previous section would also lose SrcAddr,
# DstAddr, Sport and Dport (already dropped there), which this experiment needs.
X_spam = spam_data.drop(
    columns=['isSpam', 'Label', 'isBotnet', 'Proto', 'Dir', 'State', 'sTos', 'dTos']
)

In [50]:
X_spam.head()

Unnamed: 0,Dur,SrcAddr,Sport,DstAddr,Dport,TotPkts,TotBytes,SrcBytes
21,4.689169,65253,102,751,4724,10,1011,568
23,93.495171,16364,833,1946,16744,10,2248,2186
27,3566.989502,65253,239,1375,117676,220,14760,8092
41,16.122133,17236,71,60,469208,17,7634,863
49,12.689653,65253,49,2273,9064,28,6190,4506


In [51]:
# Refit the stage-2 tree on the current X_spam feature subset (same seed).
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Prediction

In [52]:
# Build the test features from the raw test frame. Trimming the X_test left
# over from the previous section would also lose SrcAddr, DstAddr, Sport and
# Dport (already dropped there), so the columns would not match the ones
# clf1 / clf2 are meant to be fitted on in this experiment.
X_test = test_freqlabel.drop(
    columns=['Label', 'isBotnet', 'isSpam', 'Proto', 'Dir', 'State', 'sTos', 'dTos']
)

In [53]:
X_test.head()

Unnamed: 0,Dur,SrcAddr,Sport,DstAddr,Dport,TotPkts,TotBytes,SrcBytes
0,0.015153,32986,36,1211113,1220778,2,335,73
1,0.000743,1,12,197730,185254,2,138,77
2,0.166308,731,246,791,732,2,485,145
3,104.185165,1,85,338987,318377,4,499,364
4,0.327292,440828,32,11057,469208,9,1535,699


In [54]:
# Run the two-stage cascade: predictionStack returns 0 when clf1 predicts 0,
# otherwise 1 or 2 depending on clf2's prediction (0 or 1).
y_pred = predictionStack(clf1, clf2, X_test)
# Create the output folder first so a missing directory cannot make savetxt
# fail after the prediction pass has already been paid for.
os.makedirs('../Output/Revisi 4', exist_ok=True)
np.savetxt('../Output/Revisi 4/sensor3_nolabel.txt', y_pred, fmt="%s", delimiter="\n")

### Evaluation

In [55]:
# Eval: per-class precision / recall / F1 on the test split, 5 decimals
print(classification_report(y_true=y_test, y_pred=y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99999   0.99999   0.99999    826003
           1    0.99985   0.99991   0.99988     67570
           2    1.00000   1.00000   1.00000      5766

    accuracy                        0.99998    899339
   macro avg    0.99995   0.99997   0.99996    899339
weighted avg    0.99998   0.99998   0.99998    899339



In [56]:
# Eval: per-class precision / recall / F1 as a compact table with named classes
report = classification_report(
    y_test,
    y_pred,
    target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'],
    output_dict=True,
)
df_metrics = (
    pd.DataFrame(report)
    .transpose()
    .loc[:, ['precision', 'recall', 'f1-score']]
    .drop(['accuracy', 'macro avg', 'weighted avg'])
    .rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
)
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.999993,0.999988,0.99999
Botnet Non SPAM,0.999852,0.999911,0.999882
Botnet SPAM,1.0,1.0,1.0


In [57]:
# Overall fraction of test rows whose predicted label matches y_test
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9999822091558356


In [58]:
# Confusion matrix: rows are true labels, columns are predicted labels
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[825993     10      0]
 [     6  67564      0]
 [     0      0   5766]]


## Numerical Features Only

### isBotnet

In [11]:
# Build the stage-1 features from the raw training frame. Dropping these nine
# columns from the X_botnet left by earlier sections raises KeyError on a
# top-to-bottom run, because those sections have already removed them.
X_botnet = train_freqlabel.drop(
    columns=[
        'isBotnet', 'Label', 'isSpam',
        'Proto', 'Dir', 'State', 'sTos', 'dTos',
        'SrcAddr', 'DstAddr', 'Sport', 'Dport',
    ]
)

In [12]:
X_botnet.head()

Unnamed: 0,Dur,TotPkts,TotBytes,SrcBytes
0,0.002248,2,548,488
1,0.000298,2,214,81
2,1.244523,53,17709,2206
3,3251.449219,6,1270,232
4,0.000255,2,200,75


In [13]:
# Refit the stage-1 tree on the current X_botnet feature subset (same seed).
clf1 = DecisionTreeClassifier(random_state=42)
clf1.fit(X_botnet, y_botnet)

### isSpam

In [14]:
# Build the stage-2 features from the botnet-only rows (spam_data). Dropping
# these nine columns from the X_spam left by earlier sections raises KeyError
# on a top-to-bottom run, because those sections have already removed them.
X_spam = spam_data.drop(
    columns=[
        'isSpam', 'Label', 'isBotnet',
        'Proto', 'Dir', 'State', 'sTos', 'dTos',
        'SrcAddr', 'DstAddr', 'Sport', 'Dport',
    ]
)

In [15]:
X_spam.head()

Unnamed: 0,Dur,TotPkts,TotBytes,SrcBytes
21,4.689169,10,1011,568
23,93.495171,10,2248,2186
27,3566.989502,220,14760,8092
41,16.122133,17,7634,863
49,12.689653,28,6190,4506


In [16]:
# Refit the stage-2 tree on the current X_spam feature subset (same seed).
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_spam, y_spam)

### Prediction

In [17]:
# Build the test features from the raw test frame. Dropping these nine columns
# from the X_test left by earlier sections raises KeyError on a top-to-bottom
# run, because those sections have already removed them.
X_test = test_freqlabel.drop(
    columns=[
        'Label', 'isBotnet', 'isSpam',
        'Proto', 'Dir', 'State', 'sTos', 'dTos',
        'SrcAddr', 'DstAddr', 'Sport', 'Dport',
    ]
)

In [18]:
X_test.head()

Unnamed: 0,Dur,TotPkts,TotBytes,SrcBytes
0,0.015153,2,335,73
1,0.000743,2,138,77
2,0.166308,2,485,145
3,104.185165,4,499,364
4,0.327292,9,1535,699


In [20]:
# Run the two-stage cascade: predictionStack returns 0 when clf1 predicts 0,
# otherwise 1 or 2 depending on clf2's prediction (0 or 1).
y_pred = predictionStack(clf1, clf2, X_test)
# Create the output folder first so a missing directory cannot make savetxt
# fail after the prediction pass has already been paid for.
os.makedirs('../Output/Revisi 4', exist_ok=True)
np.savetxt('../Output/Revisi 4/sensor3_onlynumerical.txt', y_pred, fmt="%s", delimiter="\n")

### Evaluation

In [21]:
# Eval: per-class precision / recall / F1 on the test split, 5 decimals
print(classification_report(y_true=y_test, y_pred=y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.99753   0.99604   0.99679    826003
           1    0.94610   0.95744   0.95173     67570
           2    0.86215   0.92525   0.89259      5766

    accuracy                        0.99269    899339
   macro avg    0.93526   0.95958   0.94704    899339
weighted avg    0.99280   0.99269   0.99273    899339



In [22]:
# Eval: per-class precision / recall / F1 as a compact table with named classes
report = classification_report(
    y_test,
    y_pred,
    target_names=['Normal', 'Botnet Non SPAM', 'Botnet SPAM'],
    output_dict=True,
)
df_metrics = (
    pd.DataFrame(report)
    .transpose()
    .loc[:, ['precision', 'recall', 'f1-score']]
    .drop(['accuracy', 'macro avg', 'weighted avg'])
    .rename(columns={'precision': 'Pre.', 'recall': 'Rec.', 'f1-score': 'F1'})
)
df_metrics

Unnamed: 0,Pre.,Rec.,F1
Normal,0.997531,0.996044,0.996787
Botnet Non SPAM,0.946095,0.957437,0.951732
Botnet SPAM,0.862153,0.925251,0.892588


In [23]:
# Overall fraction of test rows whose predicted label matches y_test
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.992689074976177


In [24]:
# Confusion matrix: rows are true labels, columns are predicted labels
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[822735   3257     11]
 [  2034  64694    842]
 [     2    429   5335]]
