# Dridex
- **Probable Name**: `Dridex`
- **MD5**: 3635ac6099baedae893b3991f730652c
- **SHA1**: cd08cc349459f99be7d00c046f6b9e5203c0f110
- **SHA256**: 326d9bf458c589d7988886d111b6933db21efc950bfa1b44b1814c9dfdcb674b
- [VirusTotal](https://www.virustotal.com/gui/file/326d9bf458c589d7988886d111b6933db21efc950bfa1b44b1814c9dfdcb674b/detection)<br>
- [Source Link](https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-246-1/)

In [1]:
import pandas as pd

## Data Preparation

### Data Loading

In [None]:
file_path = r"../../data/labelled/dridex_data.csv"

df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Duration,Source IP,Destination IP,Source Port,Destination Port,Protocol,Flags,Packets,Bytes,Mean Payload Size,Std Payload Size,Min Payload Size,Max Payload Size,Mean Entropy,Min Entropy,Max Entropy,Mean Inter-Packet Interval,Min Inter-Packet Interval,Max Inter-Packet Interval,Label
0,0.0,216.218.185.162,192.168.1.110,80,49194,TCP,"FIN,ACK",1,34,34.0,0.0,34,34,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,0.0,192.168.1.110,216.218.185.162,49194,80,TCP,RST,1,34,34.0,0.0,34,34,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,8.473776,192.168.1.110,203.153.165.21,49191,8343,TCP,"FIN,SYN,ACK",5,182,36.4,4.8,34,46,0.0,0.0,0.0,2.118444,0.016947,7.054314,Benign
3,8.473635,203.153.165.21,192.168.1.110,8343,49191,TCP,"FIN,SYN,ACK",5,206,41.2,2.4,40,46,0.0,0.0,0.0,2.118409,0.396701,4.874338,Benign
4,4.833665,192.168.1.110,203.153.165.21,49191,8343,TLS,"PSH,ACK",3,1233,411.0,227.011013,162,711,3.377385,2.98776,3.584673,2.416832,0.389016,4.444649,Benign


### Data Balancaing

In [3]:
df["Label"].value_counts()

#! No need for data balancing

Label
Benign    30130
Dridex    30117
Name: count, dtype: int64

## Filling Null

In [4]:
df["Flags"] = df["Flags"].fillna("UNK")

## Data Engineering

In [5]:
engineering_df = df.copy()

In [7]:
engineering_df["Duration"] = engineering_df["Duration"].replace(0.000, 0.000001)


engineering_df["Bytes per Packet"] = engineering_df.apply(
    lambda row: row["Bytes"] / row["Packets"] if row["Packets"] > 0 else 0, axis=1
)

# Calculate 'Packets per Second' and 'Bytes per Second' if duration is not zero
engineering_df["Packets per Second"] = engineering_df.apply(
    lambda row: row["Packets"] / row["Duration"] if row["Duration"] > 0 else 0, axis=1
)

engineering_df["Bytes per Second"] = engineering_df.apply(
    lambda row: row["Bytes"] / row["Duration"] if row["Duration"] > 0 else 0, axis=1
)

In [8]:
def clean_port(port):
    try:
        # Attempt to convert port to integer
        return int(port)
    except ValueError:
        # If conversion fails, return 0
        return 0


# Clean the 'Destination Port' data
engineering_df["Destination Port"] = engineering_df["Destination Port"].apply(
    clean_port
)

engineering_df["Source Port"] = engineering_df["Source Port"].apply(clean_port)


def is_encrypted_protocol(port):
    encrypted_ports = {443, 22, 993, 995, 465, 587, 636, 989, 990, 992, 1194, 500}
    return 1 if port in encrypted_ports else 0


def is_common_port(port, df = engineering_df):
    common_ports = {80, 443, 21, 22, 25, 110, 143, 3306, 3389, 5900, 53, 23}
    return 1 if port in common_ports else 0


engineering_df["Destination Port"] = engineering_df["Destination Port"].astype(int)
engineering_df["Source Port"] = engineering_df["Source Port"].astype(int)

engineering_df["Destination Common Port Usage"] = engineering_df["Destination Port"].apply(
    is_common_port
)

In [None]:
def further_feature_engineering(df):
    # Flags Count (specific flags)
    df['Flags Count'] = df['Flags'].apply(lambda x: x.split(',') if x else [])
    df['SYN Count'] = df['Flags Count'].apply(lambda x: x.count('SYN'))
    df['ACK Count'] = df['Flags Count'].apply(lambda x: x.count('ACK'))
    df['FIN Count'] = df['Flags Count'].apply(lambda x: x.count('FIN'))
    df['Flags Count'] = df['Flags Count'].apply(lambda x: len(x))

    # Is HTTP Protocol
    df['Is HTTP'] = (df['Protocol'] == 'HTTP').astype(int)

    # Internal IP Check
    df['Is Internal IP'] = df['Source IP'].apply(lambda x: 1 if x.startswith('10.') or x.startswith('172.') or x.startswith('192.') else 0)

    # Packet Direction (assumed local device IP is 10.0.2.102 for example)
    local_device_ip = "192.168.1.110"
    df['Direction'] = df['Destination IP'].apply(lambda x: 'inbound' if x == local_device_ip else 'outbound')

    # Short Duration Check
    threshold_duration = 0.1
    df['Short Duration'] = (df['Duration'] < threshold_duration).astype(int)

    # Single Packet Check
    df['Single Packet'] = (df['Packets'] == 1).astype(int)

    return df

engineering_df = further_feature_engineering(engineering_df)

In [10]:
engineering_df.drop(["Destination IP", "Source IP"], axis=1, inplace=True)

In [11]:
df = engineering_df.copy()


df.to_csv(r"../../data/processed/dridex.csv", index=False)