In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
UNSW_NB15 = pd.read_csv('../datasets/UNSW-NB15/Training and Testing Sets/UNSW_NB15_training-set.csv')

In [5]:
pd.set_option('display.max_rows', None) # So we don't get clipped off by pandas
UNSW_NB15.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [6]:
# Duration categories for visualization
UNSW_NB15['dur_category'] = pd.cut(UNSW_NB15['dur'], 
                                  bins=[0, 1, 10, 60, float('inf')],
                                  labels=['instant', 'short', 'medium', 'long'])

In [7]:
# Rate-based features
UNSW_NB15['packets_total'] = UNSW_NB15['spkts'] + UNSW_NB15['dpkts']
UNSW_NB15['bytes_total'] = UNSW_NB15['sbytes'] + UNSW_NB15['dbytes']
UNSW_NB15['avg_packet_size'] = UNSW_NB15['bytes_total'] / (UNSW_NB15['packets_total'] + 1)

In [8]:
# Traffic direction asymmetry (could pose a problem for stateful devices, but usually doesn't matter too much)
UNSW_NB15['packet_ratio'] = UNSW_NB15['spkts'] / (UNSW_NB15['dpkts'] + 1)  # The ratio of src/dist we have
UNSW_NB15['byte_ratio'] = UNSW_NB15['sbytes'] / (UNSW_NB15['dbytes'] + 1)
UNSW_NB15['is_asymmetric'] = ((UNSW_NB15['packet_ratio'] > 10) | (UNSW_NB15['packet_ratio'] < 0.1)).astype(int)

In [9]:
# For simpler visualization, we'll group protocols
protocol_dict = {'tcp': 'TCP', 'udp': 'UDP', 'icmp': 'ICMP', 'arp': 'ARP'}
UNSW_NB15['protocol_category'] = UNSW_NB15['proto'].map(protocol_dict).fillna('Other')

In [10]:
# Service categories (we basically group rarer services here)
service_counts = UNSW_NB15['service'].value_counts()
top_services = service_counts.head(10).index
UNSW_NB15['service_grouped'] = UNSW_NB15['service'].apply(
    lambda x: x if x in top_services else 'other'
)

In [11]:
UNSW_NB15['service_protocol'] = UNSW_NB15['service_grouped'] + '_' + UNSW_NB15['protocol_category']

In [12]:
# State categories (CON=established, INT=intermediate, etc.)
# These classify the status of network connections for logging + analysis
state_categories = {
    'CON': 'established', # Network ongoing + all communication parameters have been negotiated
    'INT': 'intermediate', # Transitional state, like during initial TCP handshake
    'FIN': 'finished', # Application has initiated a termination of connection 
    'REQ': 'request', # Initial packet(s) requesting connection, such as a synchronize packet during TCP handshake
    'RST': 'reset' # Connection was terminated abruptly, usually due to host crash, connection attempt into closed port, firewall blocking traffic
} 
UNSW_NB15['state_category'] = UNSW_NB15['state'].map(state_categories).fillna('other')

In [13]:
# Connection quality indication in the dataset
UNSW_NB15['connection_completed'] = UNSW_NB15['state'].isin(['FIN', 'CON']).astype(int)
UNSW_NB15['connection_failed'] = UNSW_NB15['state'].isin(['RST', 'REQ']).astype(int)

In [14]:
# Loss features
UNSW_NB15['has_packet_loss'] = ((UNSW_NB15['sloss'] > 0) | (UNSW_NB15['dloss'] > 0)).astype(int)
UNSW_NB15['total_loss'] = UNSW_NB15['sloss'] + UNSW_NB15['dloss']
UNSW_NB15['loss_rate'] = UNSW_NB15['total_loss'] / (UNSW_NB15['packets_total'] + 1)

In [15]:
# Jitter features (this shows network instability)
UNSW_NB15['avg_jitter'] = (UNSW_NB15['sjit'] + UNSW_NB15['djit']) / 2
UNSW_NB15['high_jitter'] = (UNSW_NB15['avg_jitter'] > UNSW_NB15['avg_jitter'].quantile(0.9)).astype(int)

In [16]:
# TCP window sizes (0 might indicate non-TCP)
UNSW_NB15['has_tcp_info'] = ((UNSW_NB15['swin'] > 0) | (UNSW_NB15['dwin'] > 0)).astype(int)
UNSW_NB15['window_size_avg'] = (UNSW_NB15['swin'] + UNSW_NB15['dwin']) / 2

In [17]:
# TCP timing features
UNSW_NB15['has_timing'] = ((UNSW_NB15['tcprtt'] > 0) | (UNSW_NB15['synack'] > 0) | (UNSW_NB15['ackdat'] > 0)).astype(int)
UNSW_NB15['tcp_handshake_time'] = UNSW_NB15['synack'] + UNSW_NB15['ackdat']

In [18]:
# High connection counts (potential scanning)
UNSW_NB15['is_scanner'] = (
    (UNSW_NB15['ct_srv_src'] > 10) |  # many connections to same service
    (UNSW_NB15['ct_srv_dst'] > 10) |  # many connections from same service
    (UNSW_NB15['ct_src_ltm'] > 20)    # many connections from same source recently
).astype(int)

In [20]:
# Port scanning indicators
UNSW_NB15['diverse_ports'] = (UNSW_NB15['ct_src_dport_ltm'] > 5).astype(int)  # scanning for multiple dst ports
UNSW_NB15['diverse_src_ports'] = (UNSW_NB15['ct_dst_sport_ltm'] > 5).astype(int)

In [22]:
# Same IP/port combinations (this COULD be a sign of a potential DoS/persistent connection)
UNSW_NB15['repeated_connection'] = UNSW_NB15['is_sm_ips_ports']

In [23]:
# Indication of a connection burst
UNSW_NB15['connection_burst'] = (
    (UNSW_NB15['ct_state_ttl'] > 15) |  # many connections in same state
    (UNSW_NB15['ct_dst_src_ltm'] > 15)  # many connections between src dst pair
).astype(int)

In [24]:
# Application layer features specific to HTTP 
UNSW_NB15['is_http'] = (UNSW_NB15['ct_flw_http_mthd'] > 0).astype(int)
UNSW_NB15['http_methods_count'] = UNSW_NB15['ct_flw_http_mthd']

In [25]:
# Application layer features specific to FTP
UNSW_NB15['is_ftp'] = UNSW_NB15['is_ftp_login']
UNSW_NB15['ftp_commands_count'] = UNSW_NB15['ct_ftp_cmd']

In [27]:
# When categorizing web attacks, we'll bin them into response size categories
UNSW_NB15['response_size_cat'] = pd.cut(UNSW_NB15['response_body_len'],
                                        bins=[0, 1000, 10000, 100000, float('inf')],
                                        labels=['small', 'medium', 'large', 'huge'])
UNSW_NB15['has_response'] = (UNSW_NB15['response_body_len'] > 0).astype(int) # Generic has response feature

## Attack Labeling/Grouping

In [32]:
UNSW_NB15['is_attack'] = (UNSW_NB15['label'] == 1).astype(int)
UNSW_NB15['is_benign'] = (UNSW_NB15['label'] == 0).astype(int)

#### As stated from the Kaggle page, this dataset has "nine types of attack: Fuzzers, Analysis, Backdoors, DoS, Exploits, Generic, Reconnaissance, Shellcode and Worms"

In [31]:
# Let's group these attack types
attack_type_groups = {
    'Normal': 'Benign',
    'Generic': 'Generic Attack',
    'Exploits': 'Exploits',
    'Fuzzers': 'Fuzzers',
    'DoS': 'DoS',
    'Reconnaissance': 'Reconnaissance',
    'Analysis': 'Reconnaissance',  # This attack seems to be similar to reconnaissance, so we'll group the two together
    'Backdoor': 'Backdoor',
    'Shellcode': 'Shellcode',
    'Worms': 'Worms'
}
UNSW_NB15['attack_group'] = UNSW_NB15['attack_cat'].map(attack_type_groups).fillna('Other')

In [None]:
# We'll score attack impact later
# I'm thinking of either trying our MITRE ATT&CK Impact scores, or maybe trying to build my own custom score calculator
# I will definitely need to do more research on how attack frameworks work though