In [42]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import numpy as np

In [21]:
# The CICIDS2017 dataset has a LOT of different files
# For practicality + a more global view on different types of attacks, we'll concat all of the files into a complete df
# The paths are sorted because glob.glob gives no ordering guarantee; without it the row order of the
# combined frame (and therefore every seeded .sample() taken from it later) can differ between machines.
files = sorted(glob.glob('../datasets/CICIDS2017/*.csv'))
if not files:
    # Fail with a message that names the real problem instead of pd.concat's generic error on an empty list
    raise FileNotFoundError("No CSV files matched '../datasets/CICIDS2017/*.csv'")
dfs = [pd.read_csv(f, encoding='latin1') for f in files]
combined_df = pd.concat(dfs, ignore_index=True)
CICIDS_2017 = combined_df.copy() # For naming purposes

In [22]:
# Lift pandas' row and column display limits so nothing gets clipped in the output
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
CICIDS_2017.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,0,0,0.0,0.0,4000000.0,666666.6667,3.0,0.0,3,3,3,3.0,0.0,3,3,0,0.0,0.0,0,0,0,0,0,0,40,0,666666.6667,0.0,6,6,6.0,0.0,0.0,0,0,0,0,1,0,0,0,0,9.0,6.0,0.0,40,0,0,0,0,0,0,2,12,0,0,33,-1,1,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,6,6,6.0,0.0,110091.7,18348.62385,109.0,0.0,109,109,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,9174.311927,9174.311927,6,6,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,29,256,0,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,6,6,6.0,0.0,230769.2,38461.53846,52.0,0.0,52,52,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,19230.76923,19230.76923,6,6,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,29,256,0,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,6,6,6.0,0.0,352941.2,58823.52941,34.0,0.0,34,34,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,20,20,29411.76471,29411.76471,6,6,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,31,329,0,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,0,0,0.0,0.0,4000000.0,666666.6667,3.0,0.0,3,3,3,3.0,0.0,3,3,0,0.0,0.0,0,0,0,0,0,0,40,0,666666.6667,0.0,6,6,6.0,0.0,0.0,0,0,0,0,1,0,0,0,0,9.0,6.0,0.0,40,0,0,0,0,0,0,2,12,0,0,32,-1,1,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [23]:
# Some column names carry leading spaces; trim surrounding whitespace from every name for quality-of-life
CICIDS_2017.columns = [column_name.strip() for column_name in CICIDS_2017.columns]

### Despite having different features, we'll try to follow the feature engineering of UNSW_NB15 as closely as possible

In [24]:
# Duration binning
# include_lowest=True closes the first bin on the left, so a flow with duration exactly 0 is
# labelled 'instant'. With the default (left-open) bins a 0 fell outside every interval and became NaN.
# Negative durations still fall outside all bins and stay NaN.
CICIDS_2017['dur_category'] = pd.cut(CICIDS_2017['Flow Duration'],
                                     bins=[0, 1000000, 10000000, 60000000, float('inf')],
                                     labels=['instant', 'short', 'medium', 'long'],
                                     include_lowest=True)

In [25]:
# Packet/byte totals: forward + backward counts per flow
fwd_packet_count = CICIDS_2017['Total Fwd Packets']
bwd_packet_count = CICIDS_2017['Total Backward Packets']
CICIDS_2017['packets_total'] = fwd_packet_count.add(bwd_packet_count)

fwd_byte_count = CICIDS_2017['Total Length of Fwd Packets']
bwd_byte_count = CICIDS_2017['Total Length of Bwd Packets']
CICIDS_2017['bytes_total'] = fwd_byte_count.add(bwd_byte_count)

# The +1 in the denominator keeps the division defined when a flow has zero packets
CICIDS_2017['avg_packet_size'] = CICIDS_2017['bytes_total'].div(CICIDS_2017['packets_total'].add(1))

In [26]:
# Traffic direction asymmetry (forward volume relative to backward volume; +1 avoids dividing by zero)
fwd_to_bwd_packets = CICIDS_2017['Total Fwd Packets'].div(CICIDS_2017['Total Backward Packets'].add(1))
fwd_to_bwd_bytes = CICIDS_2017['Total Length of Fwd Packets'].div(CICIDS_2017['Total Length of Bwd Packets'].add(1))
CICIDS_2017['packet_ratio'] = fwd_to_bwd_packets
CICIDS_2017['byte_ratio'] = fwd_to_bwd_bytes

# A flow is flagged when the packet ratio is above 10 or below 0.1
strongly_forward = CICIDS_2017['packet_ratio'].gt(10)
strongly_backward = CICIDS_2017['packet_ratio'].lt(0.1)
CICIDS_2017['is_asymmetric'] = (strongly_forward | strongly_backward).astype(int)

In [27]:
# Let's group the common ports into their corresponding services
# The table is built once here rather than inside the function, because the function is
# applied to every row of the frame and rebuilding the dict per call is wasted work.
_PORT_TO_SERVICE = {
    80: 'http', 443: 'http', 8080: 'http',
    53: 'dns',
    22: 'ssh',
    21: 'ftp',
    25: 'smtp', 587: 'smtp',
    3389: 'rdp',
    445: 'smb',
    23: 'telnet'
}


def port_to_service(port):
    """Return the service group name for a destination port.

    Parameters
    ----------
    port : hashable
        Port number to look up (e.g. 80).

    Returns
    -------
    str
        The service group from the port table ('http', 'dns', 'ssh', 'ftp',
        'smtp', 'rdp', 'smb' or 'telnet'), or '-' when the port is not listed.
    """
    return _PORT_TO_SERVICE.get(port, '-')

In [29]:
# Translate every destination port into its service group ('-' when the port is not in the table)
destination_ports = CICIDS_2017['Destination Port']
CICIDS_2017['service_grouped'] = destination_ports.map(port_to_service)

In [59]:
# Connection quality indicators
# "completed": at least one FIN flag was counted, or the flow carried more than 10 packets in total
saw_fin_flag = CICIDS_2017['FIN Flag Count'].gt(0)
exceeded_ten_packets = CICIDS_2017['packets_total'].gt(10)
CICIDS_2017['connection_completed'] = (saw_fin_flag | exceeded_ten_packets).astype(int)
# "failed": at least one RST flag was counted
CICIDS_2017['connection_failed'] = CICIDS_2017['RST Flag Count'].gt(0).astype(int)

In [32]:
# Jittery features which show network instability
# avg_jitter is the plain mean of the three inter-arrival-time standard deviations
iat_std_total = CICIDS_2017['Flow IAT Std'] + CICIDS_2017['Fwd IAT Std'] + CICIDS_2017['Bwd IAT Std']
CICIDS_2017['avg_jitter'] = iat_std_total / 3

# high_jitter marks flows strictly above the 90th percentile of avg_jitter
jitter_p90 = CICIDS_2017['avg_jitter'].quantile(0.9)
CICIDS_2017['high_jitter'] = CICIDS_2017['avg_jitter'].gt(jitter_p90).astype(int)

In [57]:
# TCP window features
fwd_init_window = CICIDS_2017['Init_Win_bytes_forward']
bwd_init_window = CICIDS_2017['Init_Win_bytes_backward']

# A flow counts as having TCP info when either direction reports a positive initial window
CICIDS_2017['has_tcp_info'] = ((fwd_init_window > 0) | (bwd_init_window > 0)).astype(int)

# The window columns contain negative values (e.g. -1), which cannot be real window sizes.
# NOTE(review): presumably -1 is a "not available" marker -- confirm against the dataset docs.
# Clamp negatives to 0 before averaging so the marker does not drag the average down
# (previously 33 and -1 averaged to 16, and two missing windows gave -1).
CICIDS_2017['window_size_avg'] = (fwd_init_window.clip(lower=0) + bwd_init_window.clip(lower=0)) / 2

In [58]:
# The characteristics we can observe from multiple connections
# diverse_ports: forward rate above 10 packets/s combined with an average packet size below 200
fast_forward_rate = CICIDS_2017['Fwd Packets/s'].gt(10)
small_average_packet = CICIDS_2017['Average Packet Size'].lt(200)
CICIDS_2017['diverse_ports'] = (fast_forward_rate & small_average_packet).astype(int)
# diverse_src_ports: backward rate above 10 packets/s
CICIDS_2017['diverse_src_ports'] = CICIDS_2017['Bwd Packets/s'].gt(10).astype(int)

In [35]:
# Repeated connection patterns: flag flows whose Down/Up Ratio is exactly 0
zero_down_up_ratio = CICIDS_2017['Down/Up Ratio'].eq(0)
CICIDS_2017['repeated_connection'] = zero_down_up_ratio.astype(int)

In [37]:
# Response size categorization/binning
CICIDS_2017['response_body_len'] = CICIDS_2017['Total Length of Bwd Packets']
# include_lowest=True closes the first bin on the left, so a response of exactly 0 bytes is
# labelled 'small'. With the default (left-open) bins a 0 matched no interval and became NaN.
# Whether any backward packets were seen at all is tracked separately by has_response.
CICIDS_2017['response_size_cat'] = pd.cut(CICIDS_2017['response_body_len'],
                                          bins=[0, 1000, 10000, 100000, float('inf')],
                                          labels=['small', 'medium', 'large', 'very_large'],
                                          include_lowest=True)
CICIDS_2017['has_response'] = (CICIDS_2017['Total Backward Packets'] > 0).astype(int)

In [38]:
# Some simple binary attack labeling: anything whose Label is not exactly 'BENIGN' counts as an attack
label_is_benign = CICIDS_2017['Label'].eq('BENIGN')
CICIDS_2017['is_attack'] = (~label_is_benign).astype(int)
CICIDS_2017['is_benign'] = label_is_benign.astype(int)

In [39]:
# Tried to map these attack groups as similar as possible to how it was done within UNSW_NB15
# Written group-first (group -> raw labels) and then inverted into the label -> group lookup used for mapping
_labels_by_attack_group = {
    'Benign': ['BENIGN'],
    'Backdoor': ['Bot', 'Infiltration'],
    'DoS': ['DDoS', 'DoS GoldenEye', 'DoS Hulk', 'DoS Slowhttptest', 'DoS slowloris'],
    'Exploits': [
        'FTP-Patator',
        'SSH-Patator',
        'Heartbleed',
        'Web Attack – Brute Force',
        'Web Attack – Sql Injection',
        'Web Attack – XSS',
    ],
    'Reconnaissance': ['PortScan'],
}
attack_type_groups = {
    raw_label: group_name
    for group_name, raw_labels in _labels_by_attack_group.items()
    for raw_label in raw_labels
}

In [40]:
# The attack_type_groups keys spell the "Web Attack" labels with an en dash ('–'). The CSVs are read
# with encoding='latin1', under which a Windows-1252 en dash (byte 0x96) decodes to the control
# character '\x96' rather than '–'; such labels would miss their key and silently land in 'Other'.
# NOTE(review): presumably the raw files use byte 0x96 here -- confirm with CICIDS_2017['Label'].unique().
# Normalise whatever single token sits between "Web Attack" and the attack name to ' – ' before mapping.
normalized_labels = CICIDS_2017['Label'].str.replace(
    r'^Web Attack\s+\S+\s+', 'Web Attack – ', regex=True)
# Labels with no entry in attack_type_groups (including missing labels) fall back to 'Other'
CICIDS_2017['attack_group'] = normalized_labels.map(attack_type_groups).fillna('Other')

In [47]:
# General nan/inf handling
# Infinities are first turned into NaN so the fills below treat them like any other missing value
CICIDS_2017.replace([np.inf, -np.inf], np.nan, inplace=True)

# Due to type mismatch issues, we must handle categorical columns differently than numeric ones
categorical_cols = CICIDS_2017.select_dtypes(include=['category']).columns
numeric_cols = CICIDS_2017.select_dtypes(include=[np.number]).columns
object_cols = CICIDS_2017.select_dtypes(include=['object']).columns

# Fill numeric columns w/ a 0
CICIDS_2017[numeric_cols] = CICIDS_2017[numeric_cols].fillna(0)
# Fill object columns w/ dummy val
CICIDS_2017[object_cols] = CICIDS_2017[object_cols].fillna('unknown')

# A categorical column can only be filled with a value that is one of its categories, so
# 'unknown' is registered first. Unlike a str round trip, this keeps the existing categories
# (and their order) intact and cannot confuse a real 'nan' string with a missing value.
for col in categorical_cols:
    cat_column = CICIDS_2017[col]
    if 'unknown' not in cat_column.cat.categories:  # guard keeps the cell safe to re-run
        cat_column = cat_column.cat.add_categories('unknown')
    CICIDS_2017[col] = cat_column.fillna('unknown')

In [51]:
# Subsets are kept the same as in UNSW_NB15 to avoid the large pain of making joint inferences from contrasting subsets
# Registry of subset name -> DataFrame; filled in by the cells below
subsets = dict()

In [60]:
# Balanced subset: up to 15000 benign and up to 15000 attack rows, shuffled together
benign_pool = CICIDS_2017[CICIDS_2017['is_benign'] == 1]
attack_pool = CICIDS_2017[CICIDS_2017['is_attack'] == 1]

# min(...) caps the request at the pool size so sampling cannot ask for more rows than exist
benign = benign_pool.sample(n=min(15000, len(benign_pool)), random_state=42)
attack = attack_pool.sample(n=min(15000, len(attack_pool)), random_state=42)

# sample(frac=1) shuffles the concatenated rows with a fixed seed
balanced_unshuffled = pd.concat([benign, attack])
subsets['balanced'] = balanced_unshuffled.sample(frac=1, random_state=42)

In [53]:
# Attack-focused subset: every attack row plus a small benign sample, shuffled together
benign_pool = CICIDS_2017[CICIDS_2017['is_benign'] == 1]
# Cap the request at the pool size (as the balanced and service subsets do); an uncapped
# sample(n=5000) raises ValueError when fewer than 5000 benign rows are available.
benign_small = benign_pool.sample(n=min(5000, len(benign_pool)), random_state=42)
all_attacks = CICIDS_2017[CICIDS_2017['is_attack'] == 1]
# sample(frac=1) shuffles the concatenated rows with a fixed seed
subsets['attack_focused'] = pd.concat([all_attacks, benign_small]).sample(frac=1, random_state=42)

In [54]:
# Service-specific subsets: up to 5000 rows per service, skipping services with no rows at all
for service in ('http', 'dns', 'smtp', 'ssh', 'ftp'):
    service_rows = CICIDS_2017[CICIDS_2017['service_grouped'] == service]
    if len(service_rows) == 0:
        continue
    sample_size = min(5000, len(service_rows))
    subsets[f'service_{service}'] = service_rows.sample(n=sample_size, random_state=42)

In [55]:
# Write every subset to its own CSV under subsets_cicids/ and report its size and attack count
os.makedirs('subsets_cicids', exist_ok=True)
for subset_name, subset_frame in subsets.items():
    subset_frame.to_csv(f'subsets_cicids/cicids2017_{subset_name}.csv', index=False)
    row_count = len(subset_frame)
    attack_count = subset_frame['is_attack'].sum()
    print(f"Saved {subset_name}: {row_count} rows, {attack_count} attacks")

Saved balanced: 30000 rows, 15000 attacks
Saved attack_focused: 562646 rows, 557646 attacks
Saved service_http: 5000 rows, 1718 attacks
Saved service_dns: 5000 rows, 1 attacks
Saved service_smtp: 439 rows, 318 attacks
Saved service_ssh: 5000 rows, 1794 attacks
Saved service_ftp: 5000 rows, 3022 attacks


In [56]:
# Persist the full engineered frame (without the index column) next to the notebook
CICIDS_2017.to_csv(path_or_buf="CICIDS2017_Modified.csv", index=False)