In [1]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Table display settings: show every column and row, and render floats
# with thousands separators and two decimals.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.precision = 2
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:
# Directory holding the input CSVs; also used as the output location for the
# encoded files. The path is relative, so it resolves against the kernel's
# current working directory.
dataset_dir = '../../Dataset/Revisi 4/'

In [4]:
# Show which files are available in the dataset directory.
os.listdir(dataset_dir)

['sensor3_test.csv',
 'sensor3_train.csv',
 'test3_clean.csv',
 'train3_clean.csv']

# Add Target

In [5]:
def add_target(df):
    """Add binary target columns derived from ``Label``.

    Two integer columns are written onto ``df``:

    * ``isBotnet`` -- 1 where ``Label`` is 1 or 2, otherwise 0.
    * ``isSpam``   -- 1 where ``Label`` is 2, otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Label`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining.
    """
    labels = df['Label']
    # Vectorised comparisons instead of a per-row Python lambda.
    df['isBotnet'] = labels.isin([1, 2]).astype('int64')
    df['isSpam'] = labels.eq(2).astype('int64')
    return df

# Sensor 3

In [6]:
# Load the cleaned train / test splits for sensor 3.
# NOTE(review): the previews below show an 'Unnamed: 0' column, which suggests
# the CSVs were saved with their index; it is carried through to the encoded
# output as-is -- confirm whether that is intended.
sensor3_train = pd.read_csv(os.path.join(dataset_dir, 'train3_clean.csv'))
sensor3_test = pd.read_csv(os.path.join(dataset_dir, 'test3_clean.csv'))

In [7]:
# Preview the first rows of the raw training frame.
sensor3_train.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,3.48,tcp,109.226.64.144,48292,->,147.32.86.98,22,FSPA_FSPA,0.0,0.0,24,4982,1685,0
1,1333.14,udp,178.162.135.66,28533,<->,147.32.84.229,13363,CON,0.0,0.0,6,792,611,0
2,0.0,udp,147.32.84.138,46958,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
3,0.0,udp,147.32.84.138,36149,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
4,0.0,udp,147.32.84.59,54328,<->,147.32.80.9,53,CON,0.0,0.0,2,239,83,0


In [8]:
# Preview the first rows of the raw test frame.
sensor3_test.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,0.33,tcp,147.32.84.59,64963,->,76.13.114.90,80,FSPA_FSPA,0.0,0.0,9,1535,699,0
1,1.11,tcp,147.32.84.209,3329,->,65.54.234.24,443,FSPA_FSPA,0.0,0.0,16,5578,1440,1
2,0.0,udp,147.32.84.59,43218,<->,147.32.80.9,53,CON,0.0,0.0,2,205,65,0
3,0.0,udp,147.32.84.59,44337,<->,147.32.80.9,53,CON,0.0,0.0,2,240,78,0
4,0.0,udp,147.32.86.20,49288,<->,147.32.80.9,53,CON,0.0,0.0,2,284,76,0


## Mapping using Train Data

In [9]:
# Frequency encoding helper
def frequency_mapping(df):
    """Return a ``{value: occurrence count}`` dict for the given column.

    Despite the parameter name, the callers in this notebook pass a single
    column (a pandas Series). Counts come from ``value_counts()`` with its
    default arguments.
    """
    return df.value_counts().to_dict()

In [10]:
from sklearn.preprocessing import LabelEncoder
def label_mapping(df):
    """Fit a ``LabelEncoder`` on the string form of a column.

    The values are cast to ``str`` before fitting, so the learned classes are
    strings. The returned object is the fitted encoder itself (``fit`` returns
    the encoder), not a plain mapping.
    """
    as_text = df.astype(str)
    encoder = LabelEncoder()
    encoder.fit(as_text)
    return encoder

In [11]:
# Fit one encoder per categorical column, using the training data only.
(
    dir_map, state_map, stos_map, dtos_map, proto_map,
    srcaddr_map, dstaddr_map, sport_map, dport_map,
) = (
    label_mapping(sensor3_train[name])
    for name in ('Dir', 'State', 'sTos', 'dTos', 'Proto',
                 'SrcAddr', 'DstAddr', 'Sport', 'Dport')
)

In [12]:
# Fitted encoders in positional order:
# [Dir, State, sTos, dTos, Proto, SrcAddr, DstAddr, Sport, Dport]
label_map_arr = [
    dir_map, state_map, stos_map, dtos_map, proto_map,
    srcaddr_map, dstaddr_map, sport_map, dport_map,
]

In [13]:
# Count how often each address / port value occurs in the training data.
freq_srcaddr_map, freq_dstaddr_map, freq_sport_map, freq_dport_map = (
    frequency_mapping(sensor3_train[name])
    for name in ('SrcAddr', 'DstAddr', 'Sport', 'Dport')
)

In [14]:
# Frequency maps in positional order: [SrcAddr, DstAddr, Sport, Dport]
freq_map_arr = [freq_srcaddr_map, freq_dstaddr_map, freq_sport_map, freq_dport_map]

In [15]:
def label_transform(column, le):
    """Encode ``column`` with a fitted encoder, giving unseen values new codes.

    Every value is converted with ``str()`` and looked up among
    ``le.classes_``. A known value maps to its position in ``le.classes_``.
    A value that is not present is appended after the existing classes and
    receives the next free integer, in order of first appearance.

    Parameters
    ----------
    column : pandas.Series
        Values to encode.
    le : object with a ``classes_`` attribute (e.g. a fitted LabelEncoder)
        Source of the known classes. When at least one unseen value is
        encountered, ``le.classes_`` is replaced by an array holding the old
        classes followed by the new ones, so later calls reuse the same codes.
        It is left untouched otherwise.

    Returns
    -------
    pandas.Series
        Integer codes, aligned with ``column``'s index.
    """
    classes = list(le.classes_)
    known_count = len(classes)
    # Build the lookup once: avoids a list scan plus an ``le.transform`` call
    # per element, and does not depend on ``classes_`` staying sorted.
    code_of = {cls: position for position, cls in enumerate(classes)}

    def encode(value):
        key = str(value)  # classes are stored as strings
        code = code_of.get(key)
        if code is None:
            # Unseen value: assign the next code after all current classes.
            code = len(classes)
            code_of[key] = code
            classes.append(key)
        return code

    encoded = column.map(encode)

    # Publish newly discovered classes once, after the full pass.
    if len(classes) > known_count:
        le.classes_ = np.array(classes)

    return encoded

## Frequency Encoding + Label Encoding

In [16]:
def freqlabelEncoding(df, label_map_arr, freq_map_arr):
    """Apply label encoding and frequency encoding to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns Dir, State, sTos, dTos, Proto, SrcAddr,
        DstAddr, Sport and Dport. The columns are overwritten in place.
    label_map_arr : sequence of fitted encoders
        Ordered as [Dir, State, sTos, dTos, Proto, SrcAddr, DstAddr, Sport,
        Dport]; only the first five entries are read here.
    freq_map_arr : sequence of dict
        Ordered as [SrcAddr, DstAddr, Sport, Dport]; each maps a value to its
        count.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the encoded columns.
    """
    # Label-encoded columns, in the positional order of label_map_arr.
    label_columns = ('Dir', 'State', 'sTos', 'dTos', 'Proto')
    for position, name in enumerate(label_columns):
        df[name] = label_transform(df[name], label_map_arr[position])

    # Frequency-encoded columns: values missing from the map become 0.
    freq_columns = ('SrcAddr', 'DstAddr', 'Sport', 'Dport')
    for position, name in enumerate(freq_columns):
        counts = freq_map_arr[position]
        df[name] = df[name].map(lambda key, counts=counts: counts.get(key, 0))

    return df

In [17]:
# Encode a copy of the training frame, then attach the binary targets.
train_freqlabel = add_target(
    freqlabelEncoding(sensor3_train.copy(), label_map_arr, freq_map_arr)
)

In [18]:
# Encode a copy of the test frame so the raw sensor3_test stays untouched.
# freqlabelEncoding overwrites columns in place; passing sensor3_test itself
# would corrupt the raw frame and make this cell unsafe to re-run.
test_freqlabel = sensor3_test.copy()
test_freqlabel = freqlabelEncoding(test_freqlabel, label_map_arr, freq_map_arr)
test_freqlabel = add_target(test_freqlabel)

In [19]:
# Preview the encoded training frame, including the added target columns.
train_freqlabel.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,isBotnet,isSpam
0,3.48,3,163,46,0,188,1707,76,0,0,24,4982,1685,0,0,0
1,1333.14,4,2,13,3,335793,315437,9,0,0,6,792,611,0,0,0
2,0.0,4,478320,44,3,1179362,1190339,9,0,0,2,214,81,0,0,0
3,0.0,4,478320,33,3,1179362,1190339,9,0,0,2,214,81,0,0,0
4,0.0,4,436875,73,3,1179362,1190339,9,0,0,2,239,83,0,0,0


In [20]:
# Preview the encoded test frame, including the added target columns.
test_freqlabel.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,isBotnet,isSpam
0,0.33,3,436875,49,0,11044,398218,76,0,0,9,1535,699,0,0,0
1,1.11,3,4215,60,0,336,92953,76,0,0,16,5578,1440,1,1,0
2,0.0,4,436875,47,3,1179362,1190339,9,0,0,2,205,65,0,0,0
3,0.0,4,436875,47,3,1179362,1190339,9,0,0,2,240,78,0,0,0
4,0.0,4,70155,73,3,1179362,1190339,9,0,0,2,284,76,0,0,0


In [21]:
# Write the encoded frames next to the input files, without the index.
train_freqlabel.to_csv(os.path.join(dataset_dir, 'train3_freqlabelencoded.csv'), index=False)
test_freqlabel.to_csv(os.path.join(dataset_dir, 'test3_freqlabelencoded.csv'), index=False)