# Emotet
- Probably Emotet
- MD5 8baa9b809b591a11af423824f4d9726a
- [VirusTotal](https://www.virustotal.com/gui/file/6393fe8dd4721190f240e22feeb769675b6194a70cabd5a415c2364686a9089c/detection)
- [Malware Link](https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-114-3/)

In [1]:
import pandas as pd

## Data Preparation

### Data Loading

In [2]:
# Relative path to the labelled Emotet flow capture (CSV)
file_path = r"../../data/labelled/emotet.csv"

# NOTE(review): the loaded frame contains an "Unnamed: 0" column, which looks like
# a row index that was written into the CSV; confirm whether it should stay a column.
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Duration,Source IP,Destination IP,Source Port,Destination Port,Protocol,Flags,Packets,Bytes,Flows,Label
0,0.093964,10.0.2.102,224.0.0.252,52334,5355,LLMNR,,2,128,1,Benign
1,1.501686,10.0.2.102,10.0.2.255,137,137,NBNS,,3,276,1,Benign
2,0.0,10.0.2.102,8.8.8.8,59869,53,DNS,,1,76,1,Benign
3,0.0,8.8.8.8,10.0.2.102,53,59869,DNS,,1,181,1,Benign
4,0.0,10.0.2.102,8.8.8.8,64935,53,DNS,,1,76,1,Benign


### Data Balancing

In [3]:
# Count the rows per class to decide whether re-balancing is needed
df["Label"].value_counts()

# The two classes are of comparable size, so no re-balancing step is applied

Label
Benign    44252
Emotet    36745
Name: count, dtype: int64

## Filling Null

In [4]:
# Replace missing Flags values with the placeholder string "UNK"
df["Flags"] = df["Flags"].fillna("UNK")

## Data Engineering

In [5]:
# Derived columns are added to a separate copy, not to the frame held in df
engineering_df = df.copy()

In [6]:
# Display the working copy before any derived columns are added
engineering_df

Unnamed: 0,Duration,Source IP,Destination IP,Source Port,Destination Port,Protocol,Flags,Packets,Bytes,Flows,Label
0,0.093964,10.0.2.102,224.0.0.252,52334,5355,LLMNR,UNK,2,128,1,Benign
1,1.501686,10.0.2.102,10.0.2.255,137,137,NBNS,UNK,3,276,1,Benign
2,0.000000,10.0.2.102,8.8.8.8,59869,53,DNS,UNK,1,76,1,Benign
3,0.000000,8.8.8.8,10.0.2.102,53,59869,DNS,UNK,1,181,1,Benign
4,0.000000,10.0.2.102,8.8.8.8,64935,53,DNS,UNK,1,76,1,Benign
...,...,...,...,...,...,...,...,...,...,...,...
80992,9.001526,10.0.2.102,103.1.186.61,54564,8080,TCP,SYN,3,194,1,Benign
80993,16.209324,10.0.2.102,202.44.54.4,54565,8080,TCP,"SYN,ACK",4,228,1,Emotet
80994,15.978455,202.44.54.4,10.0.2.102,8080,54565,TCP,"FIN,SYN,ACK",3,166,1,Emotet
80995,0.000000,10.0.2.102,202.44.54.4,54565,8080,HTTP,"PSH,ACK",1,474,1,Emotet


In [7]:
# Zero durations would make the per-second rates undefined, so they are replaced
# by a tiny positive value (1e-6) first. This overwrites the stored Duration
# values, and such flows end up with very large per-second rates.
engineering_df["Duration"] = engineering_df["Duration"].replace(0.000, 0.000001)

# The ratios below are computed column-wise instead of with a row-by-row
# DataFrame.apply; Series.where then substitutes 0 wherever the denominator is
# not strictly positive (division by zero yields inf/NaN rather than raising).

# Average payload size per packet
engineering_df["Bytes per Packet"] = (
    engineering_df["Bytes"] / engineering_df["Packets"]
).where(engineering_df["Packets"] > 0, 0)

# Packet rate over the flow duration (0 when the duration is not positive)
engineering_df["Packets per Second"] = (
    engineering_df["Packets"] / engineering_df["Duration"]
).where(engineering_df["Duration"] > 0, 0)

# Byte rate over the flow duration (0 when the duration is not positive)
engineering_df["Bytes per Second"] = (
    engineering_df["Bytes"] / engineering_df["Duration"]
).where(engineering_df["Duration"] > 0, 0)

In [8]:
def clean_port(port):
    """Coerce a raw port value to an integer, falling back to 0.

    Parameters
    ----------
    port : Any
        Raw port value, e.g. an int, a numeric string or a missing value.

    Returns
    -------
    int
        ``int(port)`` when the conversion succeeds, otherwise ``0``.
    """
    try:
        return int(port)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects.
        # ValueError: non-numeric strings and NaN.
        # OverflowError: positive or negative infinity.
        return 0


# Normalise both port columns to integers (destination first, then source)
for port_column in ("Destination Port", "Source Port"):
    engineering_df[port_column] = engineering_df[port_column].apply(clean_port)


def is_encrypted_protocol(port):
    """Return 1 if ``port`` is in this notebook's set of encrypted ports, else 0."""
    if port in {443, 22, 993, 995, 465, 587, 636, 989, 990, 992, 1194, 500}:
        return 1
    return 0


def is_common_port(port):
    """Return 1 if ``port`` is in this notebook's set of common ports, else 0."""
    listed = port in {80, 443, 21, 22, 25, 110, 143, 3306, 3389, 5900, 53, 23}
    return int(listed)


# Make sure both port columns are stored with an integer dtype
destination_ports = engineering_df["Destination Port"].astype(int)
engineering_df["Destination Port"] = destination_ports
engineering_df["Source Port"] = engineering_df["Source Port"].astype(int)

# Both indicator columns are derived from the destination port only:
# 1 when the port is in the encrypted-port set, else 0
engineering_df["Is Encrypted Traffic"] = destination_ports.apply(is_encrypted_protocol)

# 1 when the port is in the common-port set, else 0
engineering_df["Common Port Usage"] = destination_ports.apply(is_common_port)

## Additional Features

In [9]:
def feature_engineering(df, local_device_ip='10.0.2.102', threshold_duration=0.1):
    """Add flag, protocol, address, duration and packet indicator columns.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Flags', 'Protocol', 'Source IP',
        'Destination IP', 'Duration' and 'Packets'.
    local_device_ip : str, default '10.0.2.102'
        Address treated as the local device; rows whose 'Destination IP'
        equals it are labelled 'inbound', all other rows 'outbound'.
    threshold_duration : float, default 0.1
        Rows with 'Duration' strictly below this value get short_duration = 1.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with these columns added: flags_count,
        syn_count, ack_count, fin_count, is_http, is_internal_ip, direction,
        short_duration, single_packet.
    """
    import ipaddress  # standard library; imported here so the function is self-contained

    # RFC 1918 private IPv4 ranges. A plain prefix test on '172.' or '192.'
    # would also match public addresses such as 172.217.0.0 or 192.0.2.0.
    private_networks = tuple(
        ipaddress.ip_network(cidr)
        for cidr in ('10.0.0.0/8', '172.16.0.0/12', '192.168.0.0/16')
    )

    def _split_flags(value):
        # Missing (non-string) or empty values carry no flags.
        if isinstance(value, str) and value:
            return value.split(',')
        return []

    def _is_private(address):
        # Values that cannot be parsed as an IP address count as not internal.
        try:
            parsed = ipaddress.ip_address(str(address).strip())
        except ValueError:
            return 0
        return int(any(parsed in network for network in private_networks))

    # Flags Count (specific flags)
    # NOTE: any non-empty string is split, so a placeholder such as 'UNK'
    # counts as one flag in flags_count.
    flag_lists = df['Flags'].apply(_split_flags)
    df['flags_count'] = flag_lists.apply(len)
    df['syn_count'] = flag_lists.apply(lambda flags: flags.count('SYN'))
    df['ack_count'] = flag_lists.apply(lambda flags: flags.count('ACK'))
    df['fin_count'] = flag_lists.apply(lambda flags: flags.count('FIN'))

    # Is HTTP Protocol
    df['is_http'] = (df['Protocol'] == 'HTTP').astype(int)

    # Internal IP Check (source address inside an RFC 1918 private range)
    df['is_internal_ip'] = df['Source IP'].apply(_is_private)

    # Packet Direction relative to the local device
    df['direction'] = df['Destination IP'].apply(
        lambda x: 'inbound' if x == local_device_ip else 'outbound'
    )

    # Short Duration Check
    df['short_duration'] = (df['Duration'] < threshold_duration).astype(int)

    # Single Packet Check
    df['single_packet'] = (df['Packets'] == 1).astype(int)

    return df

# Add the extra indicator columns; the function writes them into the frame it is given
engineering_df = feature_engineering(engineering_df)


In [10]:
# Remove the raw address columns and the Flows column from the working frame (in place)
engineering_df.drop(
    columns=["Destination IP", "Source IP", "Flows"],
    inplace=True,
)

In [11]:
# Rebind df to a copy of the engineered frame; df no longer refers to the frame loaded from disk
df = engineering_df.copy()


# Write the engineered frame to the processed-data folder without the pandas index
df.to_csv(r"../../data/processed/emotet.csv", index=False)

In [None]:
# Display the processed frame that was written to disk
df

Unnamed: 0,Duration,Source Port,Destination Port,Protocol,Flags,Packets,Bytes,Label,Bytes per Packet,Packets per Second,...,Common Port Usage,flags_count,syn_count,ack_count,fin_count,is_http,is_internal_ip,direction,short_duration,single_packet
0,0.093964,52334,5355,LLMNR,UNK,2,128,Benign,64.000000,21.284747,...,0,1,0,0,0,0,1,outbound,1,0
1,1.501686,137,137,NBNS,UNK,3,276,Benign,92.000000,1.997755,...,0,1,0,0,0,0,1,outbound,0,0
2,0.000001,59869,53,DNS,UNK,1,76,Benign,76.000000,1000000.000000,...,1,1,0,0,0,0,1,outbound,1,1
3,0.000001,53,59869,DNS,UNK,1,181,Benign,181.000000,1000000.000000,...,0,1,0,0,0,0,0,inbound,1,1
4,0.000001,64935,53,DNS,UNK,1,76,Benign,76.000000,1000000.000000,...,1,1,0,0,0,0,1,outbound,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80992,9.001526,54564,8080,TCP,SYN,3,194,Benign,64.666667,0.333277,...,0,1,1,0,0,0,1,outbound,0,0
80993,16.209324,54565,8080,TCP,"SYN,ACK",4,228,Emotet,57.000000,0.246772,...,0,2,1,1,0,0,1,outbound,0,0
80994,15.978455,8080,54565,TCP,"FIN,SYN,ACK",3,166,Emotet,55.333333,0.187753,...,0,3,1,1,1,0,0,inbound,0,0
80995,0.000001,54565,8080,HTTP,"PSH,ACK",1,474,Emotet,474.000000,1000000.000000,...,0,2,0,1,0,1,1,outbound,1,1
