# Emotet
- Probably Emotet
- MD5 8baa9b809b591a11af423824f4d9726a
- [VirusTotal](https://www.virustotal.com/gui/file/6393fe8dd4721190f240e22feeb769675b6194a70cabd5a415c2364686a9089c/detection)
- [Malware Link](https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-114-3/)

In [1]:
import pandas as pd

## Data Preparation

### Data Loading

In [3]:
# Define the path to your file
file_path = r"../../data/labelled/static/dridex_data.csv"

df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Duration,Source IP,Destination IP,Source Port,Destination Port,Protocol,Flags,Packets,Bytes,Mean Payload Size,Std Payload Size,Mean Entropy,Inter-Packet Intervals,Label
0,0.0,216.218.185.162,192.168.1.110,80,49194,TCP,"FIN,ACK",1,34,34.0,0.0,0.0,[0],Benign
1,0.0,192.168.1.110,216.218.185.162,49194,80,TCP,RST,1,34,34.0,0.0,0.0,[0],Benign
2,8.473776,192.168.1.110,203.153.165.21,49191,8343,TCP,"FIN,SYN,ACK",5,182,36.4,4.8,0.0,[0.379873 1.022642 7.054314 0.016947],Benign
3,8.473635,203.153.165.21,192.168.1.110,8343,49191,TCP,"FIN,SYN,ACK",5,206,41.2,2.4,0.0,[0.420223 4.874338 2.782373 0.396701],Benign
4,4.833665,192.168.1.110,203.153.165.21,49191,8343,TLS,"PSH,ACK",3,1233,411.0,227.011013,3.377385,[0.389016 4.444649],Benign


### Data Balancaing

In [4]:
df["Label"].value_counts()

#! No need for data balancing

Label
Benign    30130
Dridex    30117
Name: count, dtype: int64

## Filling Null

In [5]:
df["Flags"] = df["Flags"].fillna("UNK")

## Data Engineering

In [6]:
engineering_df = df.copy()

In [7]:
engineering_df

Unnamed: 0,Duration,Source IP,Destination IP,Source Port,Destination Port,Protocol,Flags,Packets,Bytes,Mean Payload Size,Std Payload Size,Mean Entropy,Inter-Packet Intervals,Label
0,0.000000,216.218.185.162,192.168.1.110,80,49194,TCP,"FIN,ACK",1,34,34.0,0.000000,0.000000,[0],Benign
1,0.000000,192.168.1.110,216.218.185.162,49194,80,TCP,RST,1,34,34.0,0.000000,0.000000,[0],Benign
2,8.473776,192.168.1.110,203.153.165.21,49191,8343,TCP,"FIN,SYN,ACK",5,182,36.4,4.800000,0.000000,[0.379873 1.022642 7.054314 0.016947],Benign
3,8.473635,203.153.165.21,192.168.1.110,8343,49191,TCP,"FIN,SYN,ACK",5,206,41.2,2.400000,0.000000,[0.420223 4.874338 2.782373 0.396701],Benign
4,4.833665,192.168.1.110,203.153.165.21,49191,8343,TLS,"PSH,ACK",3,1233,411.0,227.011013,3.377385,[0.389016 4.444649],Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60242,0.016348,192.188.58.163,192.168.1.110,4431,64254,TLS,"FIN,PSH,ACK",5,2898,579.6,508.031338,3.538817,[3.40002589e-05 1.47159998e-02 1.52700022e-03 ...,Benign
60243,0.563029,192.168.1.110,109.74.9.119,64255,4431,TCP,"FIN,SYN,ACK",5,182,36.4,4.800000,0.000000,[1.50999986e-04 1.66292000e-01 3.96414000e-01 ...,Dridex
60244,0.563039,109.74.9.119,192.168.1.110,4431,64255,TCP,"SYN,ACK",5,182,36.4,4.800000,0.000000,[0.000362 0.179966 0.04048 0.342231],Dridex
60245,0.181608,192.168.1.110,109.74.9.119,64255,4431,TLS,"PSH,ACK",3,1137,379.0,257.398524,3.422468,[0.179819 0.001789],Dridex


In [8]:
engineering_df["Duration"] = engineering_df["Duration"].replace(0.000, 0.000001)


engineering_df["Bytes per Packet"] = engineering_df.apply(
    lambda row: row["Bytes"] / row["Packets"] if row["Packets"] > 0 else 0, axis=1
)

# Calculate 'Packets per Second' and 'Bytes per Second' if duration is not zero
engineering_df["Packets per Second"] = engineering_df.apply(
    lambda row: row["Packets"] / row["Duration"] if row["Duration"] > 0 else 0, axis=1
)

engineering_df["Bytes per Second"] = engineering_df.apply(
    lambda row: row["Bytes"] / row["Duration"] if row["Duration"] > 0 else 0, axis=1
)

In [9]:
def clean_port(port):
    try:
        # Attempt to convert port to integer
        return int(port)
    except ValueError:
        # If conversion fails, return 0
        return 0


# Clean the 'Destination Port' data
engineering_df["Destination Port"] = engineering_df["Destination Port"].apply(
    clean_port
)

engineering_df["Source Port"] = engineering_df["Source Port"].apply(clean_port)


def is_encrypted_protocol(port):
    encrypted_ports = {443, 22, 993, 995, 465, 587, 636, 989, 990, 992, 1194, 500}
    return 1 if port in encrypted_ports else 0


def is_common_port(port):
    common_ports = {80, 443, 21, 22, 25, 110, 143, 3306, 3389, 5900, 53, 23}
    return 1 if port in common_ports else 0


engineering_df["Destination Port"] = engineering_df["Destination Port"].astype(int)
engineering_df["Source Port"] = engineering_df["Source Port"].astype(int)


# Apply the encryption check
engineering_df["Is Encrypted Traffic"] = engineering_df["Destination Port"].apply(
    is_encrypted_protocol
)

# Apply the common port check
engineering_df["Common Port Usage"] = engineering_df["Destination Port"].apply(
    is_common_port
)

## Additional Shit

In [10]:
def feature_engineering(df):
    # Flags Count (specific flags)
    df['flags_count'] = df['Flags'].apply(lambda x: x.split(',') if x else [])
    df['syn_count'] = df['flags_count'].apply(lambda x: x.count('SYN'))
    df['ack_count'] = df['flags_count'].apply(lambda x: x.count('ACK'))
    df['fin_count'] = df['flags_count'].apply(lambda x: x.count('FIN'))
    df['flags_count'] = df['flags_count'].apply(lambda x: len(x))


    # Is HTTP Protocol
    df['is_http'] = (df['Protocol'] == 'HTTP').astype(int)

    # Internal IP Check
    df['is_internal_ip'] = df['Source IP'].apply(lambda x: 1 if x.startswith('10.') or x.startswith('172.') or x.startswith('192.') else 0)

    # Packet Direction (assumed local device IP is 10.0.2.102 for example)
    local_device_ip = '192.168.1.110'
    df['direction'] = df['Destination IP'].apply(lambda x: 'inbound' if x == local_device_ip else 'outbound')

    # Short Duration Check
    threshold_duration = 0.1  # Example threshold
    df['short_duration'] = (df['Duration'] < threshold_duration).astype(int)

    # Single Packet Check
    df['single_packet'] = (df['Packets'] == 1).astype(int)

    return df

engineering_df = feature_engineering(engineering_df)


In [11]:
engineering_df.drop(["Destination IP", "Source IP"], axis=1, inplace=True)

In [12]:
df = engineering_df.copy()


df.to_csv(r"../../data/processed/dridex_static.csv", index=False)

In [13]:
df

Unnamed: 0,Duration,Source Port,Destination Port,Protocol,Flags,Packets,Bytes,Mean Payload Size,Std Payload Size,Mean Entropy,...,Common Port Usage,flags_count,syn_count,ack_count,fin_count,is_http,is_internal_ip,direction,short_duration,single_packet
0,0.000001,80,49194,TCP,"FIN,ACK",1,34,34.0,0.000000,0.000000,...,0,2,0,1,1,0,0,inbound,1,1
1,0.000001,49194,80,TCP,RST,1,34,34.0,0.000000,0.000000,...,1,1,0,0,0,0,1,outbound,1,1
2,8.473776,49191,8343,TCP,"FIN,SYN,ACK",5,182,36.4,4.800000,0.000000,...,0,3,1,1,1,0,1,outbound,0,0
3,8.473635,8343,49191,TCP,"FIN,SYN,ACK",5,206,41.2,2.400000,0.000000,...,0,3,1,1,1,0,0,inbound,0,0
4,4.833665,49191,8343,TLS,"PSH,ACK",3,1233,411.0,227.011013,3.377385,...,0,2,0,1,0,0,1,outbound,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60242,0.016348,4431,64254,TLS,"FIN,PSH,ACK",5,2898,579.6,508.031338,3.538817,...,0,3,0,1,1,0,1,inbound,1,0
60243,0.563029,64255,4431,TCP,"FIN,SYN,ACK",5,182,36.4,4.800000,0.000000,...,0,3,1,1,1,0,1,outbound,0,0
60244,0.563039,4431,64255,TCP,"SYN,ACK",5,182,36.4,4.800000,0.000000,...,0,2,1,1,0,0,0,inbound,0,0
60245,0.181608,64255,4431,TLS,"PSH,ACK",3,1137,379.0,257.398524,3.422468,...,0,2,0,1,0,0,1,outbound,0,0
