In [1]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Table display settings: never truncate columns or rows, and render floats
# with thousands separators and two decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.precision = 2
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:
# Folder holding the sensor-3 CSV files. File paths in this notebook are built
# by plain string concatenation, so the trailing slash is required.
dataset_dir = '../../Dataset/Revisi 4/'

In [4]:
os.listdir(dataset_dir)

['sensor3_clean.csv',
 'sensor3_labeled.csv',
 'sensor3_test.csv',
 'sensor3_train.csv',
 'test3_clean.csv',
 'test3_freqlabelencoded.csv',
 'train3_clean.csv',
 'train3_freqlabelencoded.csv']

# Add Target

In [5]:
def add_target(df):
    """Add the binary target columns ``isBotnet`` and ``isSpam`` to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Label`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the two columns are added in place.
        ``isBotnet`` is 1 where ``Label`` is 1 or 2, ``isSpam`` is 1 where
        ``Label`` is 2, and both are 0 everywhere else.
    """
    label = df['Label']
    # Vectorized comparisons instead of a per-row Python lambda; the explicit
    # int64 cast keeps the dtype stable even when the frame is empty.
    df['isBotnet'] = label.isin([1, 2]).astype('int64')
    df['isSpam'] = label.eq(2).astype('int64')
    return df

# Sensor 3

In [6]:
# Sport/Dport come back as object columns. Reading them explicitly as text
# keeps every port value a string, so the frequency maps built from the train
# split are keyed the same way the test split is looked up (chunked type
# inference could otherwise leave a mix of int and str values in one column).
sensor3_train = pd.read_csv(
    dataset_dir + 'train3_clean.csv', dtype={'Sport': str, 'Dport': str}
)
sensor3_test = pd.read_csv(
    dataset_dir + 'test3_clean.csv', dtype={'Sport': str, 'Dport': str}
)

In [7]:
sensor3_train.shape

(2477518, 14)

In [8]:
sensor3_test.shape

(825840, 14)

In [9]:
sensor3_train.dtypes

Dur         float64
Proto        object
SrcAddr      object
Sport        object
Dir          object
DstAddr      object
Dport        object
State        object
sTos        float64
dTos        float64
TotPkts       int64
TotBytes      int64
SrcBytes      int64
Label         int64
dtype: object

In [10]:
sensor3_train.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,0.0,udp,147.32.84.59,56300,<->,147.32.80.9,53,CON,0.0,0.0,2,234,92,0
1,1.28,tcp,147.32.84.192,2921,->,65.54.165.169,443,FSPA_FSPA,0.0,0.0,23,10482,4298,1
2,0.0,udp,147.32.84.138,39007,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
3,0.0,udp,147.32.84.138,43845,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
4,3479.62,udp,147.32.85.56,44076,<->,195.20.24.37,17714,CON,0.0,0.0,40,4400,3160,0


In [11]:
sensor3_test.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,18.35,udp,93.15.196.174,41296,<->,147.32.84.229,13363,CON,0.0,0.0,4,328,142,0
1,0.0,udp,88.112.141.183,32323,<->,147.32.86.165,12114,CON,0.0,0.0,2,140,79,0
2,0.0,udp,147.32.84.138,32923,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
3,0.0,udp,147.32.84.138,34164,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
4,0.0,udp,147.32.84.138,50361,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0


## Mapping using Train Data

In [12]:
# Frequency Encoding
def frequency_mapping(df):
    """Return a ``{value: occurrence count}`` dict for ``df``.

    ``df`` is called with a single DataFrame column in this notebook. Counting
    uses ``value_counts`` with its default arguments.
    """
    return df.value_counts().to_dict()

In [13]:
from sklearn.preprocessing import LabelEncoder
def label_mapping(df):
    """Fit a ``LabelEncoder`` on the string form of ``df`` and return it.

    Values are cast with ``astype(str)`` before fitting, so the learned
    classes are strings. The returned object is the fitted encoder itself.
    """
    encoder = LabelEncoder()
    encoder.fit(df.astype(str))
    return encoder

In [16]:
# Fit one label encoder per categorical column, using the train split only.
# sTos and dTos are not label-encoded.
dir_map, state_map, proto_map = (
    label_mapping(sensor3_train[name]) for name in ('Dir', 'State', 'Proto')
)
# NOTE(review): freqlabelEncoding only reads the first three encoders; the
# address/port encoders below look unused in the visible cells - confirm.
srcaddr_map, dstaddr_map, sport_map, dport_map = (
    label_mapping(sensor3_train[name])
    for name in ('SrcAddr', 'DstAddr', 'Sport', 'Dport')
)

In [17]:
# Order: [Dir, State, Proto, SrcAddr, DstAddr, Sport, Dport]
# There are no sTos/dTos entries, so Proto sits at index 2.
label_map_arr = [
    dir_map, state_map, proto_map,
    srcaddr_map, dstaddr_map, sport_map, dport_map,
]

In [18]:
# Occurrence counts per address/port value, computed on the train split only.
freq_srcaddr_map, freq_dstaddr_map, freq_sport_map, freq_dport_map = (
    frequency_mapping(sensor3_train[name])
    for name in ('SrcAddr', 'DstAddr', 'Sport', 'Dport')
)

In [19]:
# Order: [SrcAddr, DstAddr, Sport, Dport]
freq_map_arr = [freq_srcaddr_map, freq_dstaddr_map, freq_sport_map, freq_dport_map]

In [20]:
def label_transform(column, le):
    """Encode ``column`` with a fitted encoder, giving unseen values new codes.

    Every value is converted with ``str`` and looked up among ``le.classes_``.
    A known value gets its position in ``le.classes_``. An unseen value gets
    the next free integer (``len(le.classes_)``, then +1, ...) in order of
    first appearance and is appended to the encoder's classes.

    Parameters
    ----------
    column : pandas.Series
        Values to encode.
    le : object with a ``classes_`` array (a fitted ``LabelEncoder``)
        Side effect: when unseen values occur, ``le.classes_`` is replaced by
        an array extended with them, so later calls reuse the same codes.

    Returns
    -------
    pandas.Series
        Integer codes, aligned with ``column``.
    """
    classes = list(le.classes_)
    # Dict lookup replaces a linear list search plus a one-element
    # ``le.transform`` call per row. Appended classes are not in sorted order,
    # so the lookup is done here rather than by the encoder.
    code_of = {cls: code for code, cls in enumerate(classes)}
    fitted_count = len(classes)

    def encode(value):
        key = str(value)  # classes are stored as strings
        code = code_of.get(key)
        if code is None:
            # Unseen value: hand out the next free code and remember it.
            code = len(classes)
            code_of[key] = code
            classes.append(key)
        return code

    encoded = column.map(encode)

    # Write the extended class list back once, and only if something was added.
    if len(classes) > fitted_count:
        le.classes_ = np.array(classes)

    return encoded

## Frequency Encoding + Label Encoding

In [21]:
def freqlabelEncoding(df, label_map_arr, freq_map_arr):
    """Label-encode Dir/State/Proto and frequency-encode the address/port columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns Dir, State, Proto, SrcAddr, DstAddr, Sport, Dport.
    label_map_arr : sequence
        Fitted label encoders. Only index 0 (Dir), 1 (State) and 2 (Proto)
        are read here. ``label_transform`` may extend an encoder's
        ``classes_`` when it meets unseen values.
    freq_map_arr : sequence of dict
        Frequency dicts in the order [SrcAddr, DstAddr, Sport, Dport].
        Values missing from a dict are encoded as 0.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame. ``df`` itself is left unchanged, so calling this
        again on the same raw frame gives the same result.
    """
    # Work on a copy: encoding overwrites the raw columns, and doing that on
    # the caller's frame would make a re-run encode already-encoded values.
    encoded = df.copy()

    for position, column in enumerate(('Dir', 'State', 'Proto')):
        encoded[column] = label_transform(encoded[column], label_map_arr[position])

    for position, column in enumerate(('SrcAddr', 'DstAddr', 'Sport', 'Dport')):
        counts = freq_map_arr[position]
        encoded[column] = encoded[column].map(lambda value: counts.get(value, 0))

    return encoded

In [22]:
# Encode a copy of the train split, then attach the binary target columns.
train_freqlabel = add_target(
    freqlabelEncoding(sensor3_train.copy(), label_map_arr, freq_map_arr)
)

In [23]:
# Encode the copy, not sensor3_test itself, so the raw test frame keeps its
# original values and this cell can be re-run safely.
test_freqlabel = sensor3_test.copy()
test_freqlabel = freqlabelEncoding(test_freqlabel, label_map_arr, freq_map_arr)
test_freqlabel = add_target(test_freqlabel)

In [24]:
train_freqlabel.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,isBotnet,isSpam
0,0.0,4,437462,76,3,1174661,1185710,8,0.0,0.0,2,234,92,0,0,0
1,1.28,3,3307,140,0,40,91770,69,0.0,0.0,23,10482,4298,1,1,0
2,0.0,4,474139,39,3,1174661,1185710,8,0.0,0.0,2,214,81,0,0,0
3,0.0,4,474139,43,3,1174661,1185710,8,0.0,0.0,2,214,81,0,0,0
4,3479.62,4,662,468,3,24,28,8,0.0,0.0,40,4400,3160,0,0,0


In [25]:
test_freqlabel.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,isBotnet,isSpam
0,18.35,4,0,57,3,334237,313801,8,0.0,0.0,4,328,142,0,0,0
1,0.0,4,0,8,3,197587,185027,8,0.0,0.0,2,140,79,0,0,0
2,0.0,4,474139,50,3,1174661,1185710,8,0.0,0.0,2,214,81,0,0,0
3,0.0,4,474139,38,3,1174661,1185710,8,0.0,0.0,2,214,81,0,0,0
4,0.0,4,474139,99,3,1174661,1185710,8,0.0,0.0,2,214,81,0,0,0


In [26]:
# Write both encoded splits into the dataset folder, without the index column.
train_freqlabel.to_csv(
    os.path.join(dataset_dir, 'train3_freqlabelencoded.csv'), index=False
)
test_freqlabel.to_csv(
    os.path.join(dataset_dir, 'test3_freqlabelencoded.csv'), index=False
)