In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
UNSW_NB15 = pd.read_csv('../datasets/UNSW-NB15/Training and Testing Sets/UNSW_NB15_training-set.csv')

In [18]:
pd.set_option('display.max_rows', None) # So we don't get clipped off by pandas
UNSW_NB15.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label,dur_category,packets_total,bytes_total,avg_packet_size,packet_ratio,byte_ratio,is_asymmetric,protocal_category
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,252,254,14158.94238,8495.365234,0,0,24.2956,8.375,30.177547,11.830604,255,621772692,2202533631,255,0.0,0.0,0.0,43,43,0,0,1,0,1,1,1,1,0,0,0,1,1,0,Normal,0,instant,10,430,39.090909,1.2,1.491329,0,TCP
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,62,252,8395.112305,503571.3125,2,17,49.915,15.432865,61.426934,1387.77833,255,1417884146,3077387971,255,0.0,0.0,0.0,52,1106,0,0,43,1,1,1,1,2,0,0,0,1,6,0,Normal,0,instant,52,42748,806.566038,0.358974,0.01747,0,TCP
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,62,252,1572.271851,60929.23047,1,6,231.875571,102.737203,17179.58686,11420.92623,255,2116150707,2963114973,255,0.111897,0.061458,0.050439,46,824,0,0,7,1,2,1,1,3,0,0,0,2,6,0,Normal,0,short,24,13550,542.0,0.470588,0.027603,0,TCP
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,62,252,2740.178955,3358.62207,1,3,152.876547,90.235726,259.080172,4991.784669,255,1107119177,1047442890,255,0.0,0.0,0.0,52,64,0,0,1,1,2,1,1,3,1,1,0,2,1,0,Normal,0,short,24,1398,55.92,0.923077,0.814527,0,TCP
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,254,252,8561.499023,3987.059814,2,1,47.750333,75.659602,2415.837634,115.807,255,2436137549,1977154190,255,0.128381,0.071147,0.057234,53,45,0,0,43,1,2,2,1,40,0,0,0,2,39,0,Normal,0,instant,16,802,47.176471,1.428571,1.98513,0,TCP


In [8]:
# Duration categories for visualization
UNSW_NB15['dur_category'] = pd.cut(UNSW_NB15['dur'], 
                                  bins=[0, 1, 10, 60, float('inf')],
                                  labels=['instant', 'short', 'medium', 'long'])

In [10]:
# Rate-based features
UNSW_NB15['packets_total'] = UNSW_NB15['spkts'] + UNSW_NB15['dpkts']
UNSW_NB15['bytes_total'] = UNSW_NB15['sbytes'] + UNSW_NB15['dbytes']
UNSW_NB15['avg_packet_size'] = UNSW_NB15['bytes_total'] / (UNSW_NB15['packets_total'] + 1)

In [19]:
# Traffic direction asymmetry (could pose a problem for stateful devices, but usually doesn't matter too much)
UNSW_NB15['packet_ratio'] = UNSW_NB15['spkts'] / (UNSW_NB15['dpkts'] + 1)  # The ratio of src/dist we have
UNSW_NB15['byte_ratio'] = UNSW_NB15['sbytes'] / (UNSW_NB15['dbytes'] + 1)
UNSW_NB15['is_asymmetric'] = ((UNSW_NB15['packet_ratio'] > 10) | (UNSW_NB15['packet_ratio'] < 0.1)).astype(int)

In [24]:
# For simpler visualization, we'll group protocols
protocol_dict = {'tcp': 'TCP', 'udp': 'UDP', 'icmp': 'ICMP', 'arp': 'ARP'}
UNSW_NB15['protocol_category'] = UNSW_NB15['proto'].map(protocol_dict).fillna('Other')

In [25]:
# Service categories (we basically group rarer services here)
service_counts = UNSW_NB15['service'].value_counts()
top_services = service_counts.head(10).index
UNSW_NB15['service_grouped'] = UNSW_NB15['service'].apply(
    lambda x: x if x in top_services else 'other'
)

In [26]:
UNSW_NB15['service_protocol'] = UNSW_NB15['service_grouped'] + '_' + UNSW_NB15['protocol_category']

In [30]:
# State categories (CON=established, INT=intermediate, etc.)
# These classify the status of network connections for logging + analysis
state_categories = {
    'CON': 'established', # Network ongoing + all communication parameters have been negotiated
    'INT': 'intermediate', # Transitional state, like during initial TCP handshake
    'FIN': 'finished', # Application has initiated a termination of connection 
    'REQ': 'request', # Initial packet(s) requesting connection, such as a synchronize packet during TCP handshake
    'RST': 'reset' # Connection was terminated abruptly, usually due to host crash, connection attempt into closed port, firewall blocking traffic
} 
UNSW_NB15['state_category'] = UNSW_NB15['state'].map(state_categories).fillna('other')

In [31]:
# Connection quality indication in the dataset
UNSW_NB15['connection_completed'] = UNSW_NB15['state'].isin(['FIN', 'CON']).astype(int)
UNSW_NB15['connection_failed'] = UNSW_NB15['state'].isin(['RST', 'REQ']).astype(int)

In [33]:
# Loss features
UNSW_NB15['has_packet_loss'] = ((UNSW_NB15['sloss'] > 0) | (UNSW_NB15['dloss'] > 0)).astype(int)
UNSW_NB15['total_loss'] = UNSW_NB15['sloss'] + UNSW_NB15['dloss']
UNSW_NB15['loss_rate'] = UNSW_NB15['total_loss'] / (UNSW_NB15['packets_total'] + 1)

In [36]:
# Jitter features (this shows network instability)
UNSW_NB15['avg_jitter'] = (UNSW_NB15['sjit'] + UNSW_NB15['djit']) / 2
UNSW_NB15['high_jitter'] = (UNSW_NB15['avg_jitter'] > UNSW_NB15['avg_jitter'].quantile(0.9)).astype(int)

In [38]:
# TCP window sizes (0 might indicate non-TCP)
UNSW_NB15['has_tcp_info'] = ((UNSW_NB15['swin'] > 0) | (UNSW_NB15['dwin'] > 0)).astype(int)
UNSW_NB15['window_size_avg'] = (UNSW_NB15['swin'] + UNSW_NB15['dwin']) / 2

In [39]:
# TCP timing features
UNSW_NB15['has_timing'] = ((UNSW_NB15['tcprtt'] > 0) | (UNSW_NB15['synack'] > 0) | (UNSW_NB15['ackdat'] > 0)).astype(int)
UNSW_NB15['tcp_handshake_time'] = UNSW_NB15['synack'] + UNSW_NB15['ackdat']

In [40]:
# High connection counts (potential scanning)
UNSW_NB15['is_scanner'] = (
    (UNSW_NB15['ct_srv_src'] > 10) |  # many connections to same service
    (df['ct_srv_dst'] > 10) |  # many connections from same service
    (df['ct_src_ltm'] > 20)    # many connections from same source recently
).astype(int)

NameError: name 'df' is not defined