In [1]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Table display settings: show every column and every row, and render floats
# with two decimals and a thousands separator.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
    'display.precision', 2,
    'display.float_format', '{:,.2f}'.format,
)

In [3]:
# Relative path to the folder holding the sensor CSV files. The trailing slash
# matters: file names are appended to this value by plain string concatenation.
dataset_dir = '../../Dataset/'

In [4]:
# List the entries of the dataset folder.
os.listdir(dataset_dir)

['NCC-2 Dataset Simultaneous Botnet Dataset',
 'Outdated',
 'sensor1_test.csv',
 'sensor1_train.csv',
 'sensor2_test.csv',
 'sensor2_train.csv',
 'sensor3_test.csv',
 'sensor3_train.csv',
 'test1_clean.csv',
 'test2_clean.csv',
 'test3_clean.csv',
 'test_freqlabelencoded.csv',
 'test_removefreqencoded.csv',
 'train1_clean.csv',
 'train2_clean.csv',
 'train3_clean.csv',
 'train_freqlabelencoded.csv',
 'train_removefreqencoded.csv']

# Add Target

In [5]:
def add_target(df):
    """Add the binary target columns `isBotnet` and `isSpam` derived from `Label`.

    `isBotnet` is 1 where `Label` equals 1 or 2 and 0 otherwise; `isSpam` is 1
    where `Label` equals 2 and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a `Label` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the two columns added (or overwritten).
    """
    label = df['Label']
    # Vectorised comparisons replace the per-row Python lambdas; the explicit
    # int64 cast keeps the 0/1 integer columns the lambdas produced.
    df['isBotnet'] = label.isin([1, 2]).astype('int64')
    df['isSpam'] = label.eq(2).astype('int64')
    return df

# Sensor 3

In [6]:
# Load the cleaned sensor-3 train and test splits from the dataset folder.
sensor3_train, sensor3_test = (
    pd.read_csv(dataset_dir + csv_name)
    for csv_name in ('train3_clean.csv', 'test3_clean.csv')
)

In [7]:
# Remove the StartTime column from both splits.
# NOTE: the names are rebound, so re-running this cell without reloading the
# CSVs raises a KeyError (the column is already gone).
sensor3_train = sensor3_train.drop(['StartTime'], axis='columns')
sensor3_test = sensor3_test.drop(['StartTime'], axis='columns')

In [8]:
# Preview the first rows of the training split after dropping StartTime.
sensor3_train.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,0.0,udp,95.52.89.144,1867,<->,147.32.84.229,13363,CON,0.0,0.0,2,548,488,0
1,0.0,udp,147.32.84.138,42444,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
2,1.24,tcp,147.32.85.76,1113,->,77.75.73.170,993,FSPA_FSRPA,0.0,0.0,53,17709,2206,0
3,3251.45,udp,178.74.193.49,12852,<->,147.32.84.229,13363,CON,0.0,0.0,6,1270,232,0
4,0.0,udp,147.32.84.59,49599,<->,147.32.80.9,53,CON,0.0,0.0,2,200,75,0


In [9]:
# Preview the first rows of the test split after dropping StartTime.
sensor3_test.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,0.02,udp,147.32.84.118,65502,<->,147.32.80.9,53,CON,0.0,0.0,2,335,73,0
1,0.0,udp,212.104.119.129,23512,<->,147.32.86.165,12114,CON,0.0,0.0,2,138,77,0
2,0.17,udp,69.104.66.134,59705,<->,147.32.84.192,31037,CON,0.0,0.0,2,485,145,1
3,104.19,udp,85.240.58.228,53275,<->,147.32.84.229,13363,CON,0.0,0.0,4,499,364,0
4,0.33,tcp,147.32.84.59,63491,->,76.13.114.90,80,FSPA_FSPA,0.0,0.0,9,1535,699,0


## Mapping using Train Data

In [10]:
# Frequency Encoding
def frequency_mapping(df):
    """Return a dict mapping each distinct value in `df` to its occurrence count.

    `df` is whatever object exposes `value_counts()`; in this notebook it is
    called with a single DataFrame column.
    """
    return df.value_counts().to_dict()

In [11]:
from sklearn.preprocessing import LabelEncoder
def label_mapping(df):
    """Fit a LabelEncoder on the string form of `df` and return the encoder.

    Despite the name, the return value is the fitted `LabelEncoder` object
    itself (not a dict); its `classes_` attribute holds the learned classes.
    Values are cast with `astype(str)` before fitting.
    """
    return LabelEncoder().fit(df.astype(str))

In [12]:
# Fit one label encoder per categorical column, using the training split only.
# NOTE(review): freqlabelEncoding as defined in this notebook only consumes the
# first five encoders (Dir, State, sTos, dTos, Proto).
(
    dir_map,
    state_map,
    stos_map,
    dtos_map,
    proto_map,
    srcaddr_map,
    dstaddr_map,
    sport_map,
    dport_map,
) = (
    label_mapping(sensor3_train[col_name])
    for col_name in (
        'Dir', 'State', 'sTos', 'dTos', 'Proto',
        'SrcAddr', 'DstAddr', 'Sport', 'Dport',
    )
)

In [13]:
# Fitted encoders in positional order; freqlabelEncoding indexes into this list.
# Order: [Dir, State, sTos, dTos, Proto, SrcAddr, DstAddr, Sport, Dport]
label_map_arr = [
    dir_map, state_map, stos_map, dtos_map, proto_map,
    srcaddr_map, dstaddr_map, sport_map, dport_map,
]

In [14]:
# Build value -> count dictionaries for the address and port columns, using
# the training split only.
freq_srcaddr_map, freq_dstaddr_map, freq_sport_map, freq_dport_map = (
    frequency_mapping(sensor3_train[col_name])
    for col_name in ('SrcAddr', 'DstAddr', 'Sport', 'Dport')
)

In [15]:
# Frequency maps in positional order: [SrcAddr, DstAddr, Sport, Dport]
freq_map_arr = [freq_srcaddr_map, freq_dstaddr_map, freq_sport_map, freq_dport_map]

In [16]:
def label_transform(column, le):
    """Encode `column` with a fitted label encoder, extending it for unseen values.

    Every value is converted with `str` and looked up in `le.classes_`. A known
    value maps to its position in `le.classes_`. A value that is not present is
    appended to the class list and receives the next free integer, in order of
    first appearance.

    Parameters
    ----------
    column : pandas.Series
        Values to encode.
    le : object with a `classes_` attribute (e.g. a fitted LabelEncoder)
        Side effect: when unseen values are met, `le.classes_` is replaced by
        a new array holding the old classes followed by the new ones. It is
        left untouched when every value was already known.

    Returns
    -------
    pandas.Series
        The result of mapping each value of `column` to its integer code.
    """
    classes = list(le.classes_)
    # One lookup table built up front: O(1) per row instead of a linear list
    # scan plus an encoder call per row. It also avoids calling le.transform
    # on an encoder whose classes_ are no longer sorted after being extended.
    code_by_class = {cls: idx for idx, cls in enumerate(classes)}
    original_count = len(classes)

    def transform_value(value):
        key = str(value)  # Ensure the value is treated as a string
        code = code_by_class.get(key)
        if code is None:
            # Unseen value: give it the next free code and remember it
            code = len(classes)
            classes.append(key)
            code_by_class[key] = code
        return code

    encoded = column.map(transform_value)

    # Publish the extended class list once, only if something was added
    if len(classes) != original_count:
        le.classes_ = np.array(classes)

    return encoded

## Frequency Encoding + Label Encoding

In [17]:
def freqlabelEncoding(df, label_map_arr, freq_map_arr):
    """Label-encode the categorical columns and frequency-encode addresses/ports.

    `df` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns Dir, State, sTos, dTos, Proto, SrcAddr,
        DstAddr, Sport and Dport.
    label_map_arr : list
        Fitted encoders ordered as
        [Dir, State, sTos, dTos, Proto, SrcAddr, DstAddr, Sport, Dport];
        only the first five entries are read here.
    freq_map_arr : list of dict
        Value -> count dictionaries ordered as [SrcAddr, DstAddr, Sport, Dport].
        Values missing from a dictionary are encoded as 0.
    """
    label_encoded_cols = ('Dir', 'State', 'sTos', 'dTos', 'Proto')
    for position, col_name in enumerate(label_encoded_cols):
        df[col_name] = label_transform(df[col_name], label_map_arr[position])

    freq_encoded_cols = ('SrcAddr', 'DstAddr', 'Sport', 'Dport')
    for position, col_name in enumerate(freq_encoded_cols):
        counts = freq_map_arr[position]
        df[col_name] = df[col_name].map(lambda raw: counts.get(raw, 0))

    return df

In [19]:
# Encode a copy of the training split, then append the target columns.
train_freqlabel = add_target(
    freqlabelEncoding(sensor3_train.copy(), label_map_arr, freq_map_arr)
)

In [20]:
# Encode a copy of the test split, then append the target columns.
# The copy (not sensor3_test itself) must be passed to freqlabelEncoding:
# that function writes into the frame it receives, so passing sensor3_test
# would overwrite the raw test data and make this cell non-repeatable.
test_freqlabel = sensor3_test.copy()
test_freqlabel = freqlabelEncoding(test_freqlabel, label_map_arr, freq_map_arr)
test_freqlabel = add_target(test_freqlabel)

In [22]:
# Preview the encoded training split, including the added target columns.
train_freqlabel.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,isBotnet,isSpam
0,0.0,4,1,106,3,338987,318377,7,0,0,2,548,488,0,0,0
1,0.0,4,492764,50,3,1211113,1220778,7,0,0,2,214,81,0,0,0
2,1.24,3,6457,202,0,248,3257,76,0,0,53,17709,2206,0,0,0
3,3251.45,4,1,6,3,338987,318377,7,0,0,6,1270,232,0,0,0
4,0.0,4,440828,86,3,1211113,1220778,7,0,0,2,200,75,0,0,0


In [23]:
# Preview the encoded test split, including the added target columns.
test_freqlabel.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,isBotnet,isSpam
0,0.02,4,32986,36,3,1211113,1220778,7,0,0,2,335,73,0,0,0
1,0.0,4,1,12,3,197730,185254,7,0,0,2,138,77,0,0,0
2,0.17,4,731,246,3,791,732,7,0,0,2,485,145,1,1,0
3,104.19,4,1,85,3,338987,318377,7,0,0,4,499,364,0,0,0
4,0.33,3,440828,32,0,11057,469208,73,0,0,9,1535,699,0,0,0


In [24]:
# Write the encoded splits to the dataset folder without the index column.
# Existing files with these names are overwritten.
train_freqlabel.to_csv(
    path_or_buf=dataset_dir + 'train_freqlabelencoded.csv',
    index=False,
)
test_freqlabel.to_csv(
    path_or_buf=dataset_dir + 'test_freqlabelencoded.csv',
    index=False,
)

## Remove the 4 frequency-encoded columns

In [None]:
# Disabled alternative: reload the encoded splits from the CSV files written
# above instead of reusing the in-memory frames. Left commented out.
# train_removefreq = pd.read_csv(dataset_dir + 'train_freqlabelencoded.csv')
# test_removefreq = pd.read_csv(dataset_dir + 'test_freqlabelencoded.csv')

In [None]:
# Variant of the encoded splits without the four frequency-encoded columns.
train_removefreq = train_freqlabel.drop(
    ['SrcAddr', 'DstAddr', 'Sport', 'Dport'], axis='columns'
)
test_removefreq = test_freqlabel.drop(
    ['SrcAddr', 'DstAddr', 'Sport', 'Dport'], axis='columns'
)

In [26]:
# Write the reduced splits to the dataset folder without the index column.
# Existing files with these names are overwritten.
train_removefreq.to_csv(
    path_or_buf=dataset_dir + 'train_removefreqencoded.csv',
    index=False,
)
test_removefreq.to_csv(
    path_or_buf=dataset_dir + 'test_removefreqencoded.csv',
    index=False,
)