# Load Raw Data from S3

This notebook loads the three network traffic datasets (CIC-IDS2017, TON_IOT, UNSW_NB15) from the S3 bucket using Athena.

In [9]:
!pip -q install "PyAthena[SQLAlchemy]" sqlalchemy s3fs

In [33]:
import re
import csv
import boto3
import sagemaker
import pandas as pd
from sqlalchemy import create_engine, text

# Lift pandas' display limits (rows, columns, cell width, total width) so that
# wide query results are rendered in full instead of being truncated.
for display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.max_colwidth",
    "display.width",
):
    pd.set_option(display_option, None)

## AWS/SageMaker context + Athena engine

In [11]:
# SageMaker session (provides the default bucket) and the region of the
# default boto3 session.
sess = sagemaker.Session()
region = boto3.Session().region_name

# Athena stages its query results under this prefix of the default bucket.
results_bucket = sess.default_bucket()
athena_results_path = "s3://{}/athena/staging/".format(results_bucket)

database_name = "aai540_eda"

# SQLAlchemy engine using the PyAthena REST dialect; the staging directory and
# region are handed to the driver through connect_args.
athena_url = f"awsathena+rest://@athena.{region}.amazonaws.com:443/{database_name}"
athena_connect_args = {
    "s3_staging_dir": athena_results_path,
    "region_name": region,
}
engine = create_engine(athena_url, connect_args=athena_connect_args)

print(f"Region: {region}")
print(f"Athena results: {athena_results_path}")

Region: us-east-1
Athena results: s3://sagemaker-us-east-1-128131109986/athena/staging/


In [12]:
# Helper functions for running Athena queries through the shared `engine`.
def exec_ddl(sql: str):
    """Execute a single statement (typically DDL) and discard any result.

    Args:
        sql: The statement text; it is wrapped in ``sqlalchemy.text`` before
            being executed.
    """
    with engine.begin() as connection:
        statement = text(sql)
        connection.execute(statement)


def read_sql(sql: str) -> pd.DataFrame:
    """Run a query against the shared engine and return the rows as a DataFrame.

    Args:
        sql: The query text, passed unchanged to ``pandas.read_sql``.

    Returns:
        The query result as a ``pandas.DataFrame``.
    """
    result_frame = pd.read_sql(sql, engine)
    return result_frame

## Create Athena database

In [13]:
# Create the project database if it is missing; IF NOT EXISTS keeps this cell
# safe to re-run.
exec_ddl(f"CREATE DATABASE IF NOT EXISTS {database_name}")
# List the databases so the output confirms the one above is registered.
read_sql("SHOW DATABASES")

Unnamed: 0,database_name
0,aai540_eda
1,default
2,dsoaws
3,hw2
4,sagemaker_featurestore


## Create TON_IOT raw table

In [14]:
# S3 prefix that the external table below reads from.
ton_location = "s3://aai-540-final-project-group-5/raw/TON_IOT/"

# Drop first: the CREATE below uses IF NOT EXISTS, so without this drop an
# already-registered definition would be kept and edits to the column list in
# this cell would have no effect on re-run.
exec_ddl(f"DROP TABLE IF EXISTS {database_name}.ton_iot_raw") # drop if exists

# Register an external table over the comma-separated text files at
# `ton_location`, parsed with OpenCSVSerde (double-quote as the quote char).
# - This is a non-raw f-string, so the four backslashes given for 'escapeChar'
#   reach Athena as two (\\).
# - 'skip.header.line.count'='1' skips the first line of each file
#   (presumably the CSV header row - confirm against the files).
# - 'use.null.for.invalid.data'='true' is meant to yield NULL for values that
#   do not parse as the declared column type instead of failing the query
#   (NOTE(review): confirm against the Athena OpenCSVSerDe documentation).
exec_ddl(f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.ton_iot_raw (
  ts bigint,
  src_ip string,
  src_port bigint,
  dst_ip string,
  dst_port bigint,
  proto string,
  service string,
  duration double,
  src_bytes bigint,
  dst_bytes bigint,
  conn_state string,
  missed_bytes bigint,
  src_pkts bigint,
  src_ip_bytes bigint,
  dst_pkts bigint,
  dst_ip_bytes bigint,
  dns_query string,
  dns_qclass bigint,
  dns_qtype bigint,
  dns_rcode bigint,
  dns_aa string,
  dns_rd string,
  dns_ra string,
  dns_rejected string,
  ssl_version string,
  ssl_cipher string,
  ssl_resumed string,
  ssl_established string,
  ssl_subject string,
  ssl_issuer string,
  http_trans_depth string,
  http_method string,
  http_uri string,
  http_referrer string,
  http_version string,
  http_request_body_len bigint,
  http_response_body_len bigint,
  http_status_code bigint,
  http_user_agent string,
  http_orig_mime_types string,
  http_resp_mime_types string,
  weird_name string,
  weird_addl string,
  weird_notice string,
  label bigint,
  type string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  'separatorChar' = ',',
  'quoteChar'     = '"',
  'escapeChar'    = '\\\\'
)
STORED AS TEXTFILE
LOCATION '{ton_location}'
TBLPROPERTIES (
  'skip.header.line.count'='1',
  'use.null.for.invalid.data'='true'
)
""")

In [39]:
# Sanity check: preview five rows to confirm the values line up with the
# declared columns.
read_sql(f"SELECT * FROM {database_name}.ton_iot_raw LIMIT 5") # show the first 5 rows

Unnamed: 0,ts,src_ip,src_port,dst_ip,dst_port,proto,service,duration,src_bytes,dst_bytes,conn_state,missed_bytes,src_pkts,src_ip_bytes,dst_pkts,dst_ip_bytes,dns_query,dns_qclass,dns_qtype,dns_rcode,dns_aa,dns_rd,dns_ra,dns_rejected,ssl_version,ssl_cipher,ssl_resumed,ssl_established,ssl_subject,ssl_issuer,http_trans_depth,http_method,http_uri,http_referrer,http_version,http_request_body_len,http_response_body_len,http_status_code,http_user_agent,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice,label,type
0,1556153426,192.168.1.30,4906,192.168.1.190,4906,tcp,-,0.000135,0,0,REJ,0,1,40,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-,1,dos
1,1556153426,192.168.1.30,4906,192.168.1.190,4906,tcp,-,0.000181,0,0,REJ,0,1,40,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-,1,dos
2,1556153426,192.168.1.30,4906,192.168.1.193,4906,tcp,-,4.5e-05,0,0,REJ,0,1,40,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-,1,dos
3,1556153426,192.168.1.30,4906,192.168.1.190,4906,tcp,-,0.000171,0,0,REJ,0,1,40,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-,1,dos
4,1556153426,192.168.1.30,4906,192.168.1.193,4906,tcp,-,1.8e-05,0,0,REJ,0,1,40,1,40,-,0,0,0,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0,0,-,-,-,-,-,-,1,dos


In [40]:
read_sql(f"SHOW COLUMNS FROM {database_name}.ton_iot_raw") # show registered columns

Unnamed: 0,field
0,ts
1,src_ip
2,src_port
3,dst_ip
4,dst_port
5,proto
6,service
7,duration
8,src_bytes
9,dst_bytes


In [42]:
# Counts every row in the table, so Athena has to scan the full dataset.
read_sql(f"SELECT COUNT(*) AS row_count FROM {database_name}.ton_iot_raw") # number of rows

Unnamed: 0,row_count
0,22339021


## Create CIC-IDS2017 raw table

In [16]:
# S3 prefix that the external table below reads from.
cic_location = "s3://aai-540-final-project-group-5/raw/CIC-IDS2017/"

# Drop first: the CREATE below uses IF NOT EXISTS, so without this drop an
# already-registered definition would be kept and edits to the column list in
# this cell would have no effect on re-run.
exec_ddl(f"DROP TABLE IF EXISTS {database_name}.cic_ids2017_raw")

# Register an external table over the comma-separated text files at
# `cic_location`, parsed with OpenCSVSerde (double-quote as the quote char).
# - This is a non-raw f-string, so the four backslashes given for 'escapeChar'
#   reach Athena as two (\\).
# - 'skip.header.line.count'='1' skips the first line of each file
#   (presumably the CSV header row - confirm against the files).
# - 'use.null.for.invalid.data'='true' is meant to yield NULL for values that
#   do not parse as the declared column type instead of failing the query
#   (NOTE(review): confirm against the Athena OpenCSVSerDe documentation).
# - NOTE(review): `fwd_header_length_1` presumably corresponds to a repeated
#   "Fwd Header Length" column in the source files - confirm against the header.
exec_ddl(f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.cic_ids2017_raw (
  destination_port bigint,
  flow_duration double,
  total_fwd_packets double,
  total_backward_packets double,
  total_length_of_fwd_packets double,
  total_length_of_bwd_packets double,
  fwd_packet_length_max double,
  fwd_packet_length_min double,
  fwd_packet_length_mean double,
  fwd_packet_length_std double,
  bwd_packet_length_max double,
  bwd_packet_length_min double,
  bwd_packet_length_mean double,
  bwd_packet_length_std double,
  flow_bytes_s double,
  flow_packets_s double,
  flow_iat_mean double,
  flow_iat_std double,
  flow_iat_max double,
  flow_iat_min double,
  fwd_iat_total double,
  fwd_iat_mean double,
  fwd_iat_std double,
  fwd_iat_max double,
  fwd_iat_min double,
  bwd_iat_total double,
  bwd_iat_mean double,
  bwd_iat_std double,
  bwd_iat_max double,
  bwd_iat_min double,
  fwd_psh_flags bigint,
  bwd_psh_flags bigint,
  fwd_urg_flags bigint,
  bwd_urg_flags bigint,
  fwd_header_length double,
  bwd_header_length double,
  fwd_packets_s double,
  bwd_packets_s double,
  min_packet_length double,
  max_packet_length double,
  packet_length_mean double,
  packet_length_std double,
  packet_length_variance double,
  fin_flag_count bigint,
  syn_flag_count bigint,
  rst_flag_count bigint,
  psh_flag_count bigint,
  ack_flag_count bigint,
  urg_flag_count bigint,
  cwe_flag_count bigint,
  ece_flag_count bigint,
  down_up_ratio double,
  average_packet_size double,
  avg_fwd_segment_size double,
  avg_bwd_segment_size double,
  fwd_header_length_1 double,
  fwd_avg_bytes_bulk double,
  fwd_avg_packets_bulk double,
  fwd_avg_bulk_rate double,
  bwd_avg_bytes_bulk double,
  bwd_avg_packets_bulk double,
  bwd_avg_bulk_rate double,
  subflow_fwd_packets double,
  subflow_fwd_bytes double,
  subflow_bwd_packets double,
  subflow_bwd_bytes double,
  init_win_bytes_forward double,
  init_win_bytes_backward double,
  act_data_pkt_fwd double,
  min_seg_size_forward double,
  active_mean double,
  active_std double,
  active_max double,
  active_min double,
  idle_mean double,
  idle_std double,
  idle_max double,
  idle_min double,
  label string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  'separatorChar' = ',',
  'quoteChar'     = '"',
  'escapeChar'    = '\\\\'
)
STORED AS TEXTFILE
LOCATION '{cic_location}'
TBLPROPERTIES (
  'skip.header.line.count'='1',
  'use.null.for.invalid.data'='true'
)
""")

In [34]:
# Sanity check: preview five rows to confirm the values line up with the
# declared columns.
read_sql(f"SELECT * FROM {database_name}.cic_ids2017_raw LIMIT 5")

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,bwd_packet_length_max,bwd_packet_length_min,bwd_packet_length_mean,bwd_packet_length_std,flow_bytes_s,flow_packets_s,flow_iat_mean,flow_iat_std,flow_iat_max,flow_iat_min,fwd_iat_total,fwd_iat_mean,fwd_iat_std,fwd_iat_max,fwd_iat_min,bwd_iat_total,bwd_iat_mean,bwd_iat_std,bwd_iat_max,bwd_iat_min,fwd_psh_flags,bwd_psh_flags,fwd_urg_flags,bwd_urg_flags,fwd_header_length,bwd_header_length,fwd_packets_s,bwd_packets_s,min_packet_length,max_packet_length,packet_length_mean,packet_length_std,packet_length_variance,fin_flag_count,syn_flag_count,rst_flag_count,psh_flag_count,ack_flag_count,urg_flag_count,cwe_flag_count,ece_flag_count,down_up_ratio,average_packet_size,avg_fwd_segment_size,avg_bwd_segment_size,fwd_header_length_1,fwd_avg_bytes_bulk,fwd_avg_packets_bulk,fwd_avg_bulk_rate,bwd_avg_bytes_bulk,bwd_avg_packets_bulk,bwd_avg_bulk_rate,subflow_fwd_packets,subflow_fwd_bytes,subflow_bwd_packets,subflow_bwd_bytes,init_win_bytes_forward,init_win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
0,443,146817.0,10.0,5.0,1156.0,831.0,917.0,0.0,115.6,285.174099,693.0,0.0,166.2,300.491597,13533.85507,102.168005,10486.93,15765.6,36715.0,2.0,146817.0,16313.0,23572.06,63946.0,2.0,106646.0,26661.5,18214.92,41085.0,582.0,0,0,0,0,332.0,168.0,68.112003,34.056002,0.0,917.0,124.1875,273.018246,74538.9625,0,0,0,1,0,0,0,0,0.0,132.466667,115.6,166.2,332.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1156.0,5.0,831.0,65535.0,362.0,5.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,53,4777306.0,2.0,2.0,131.0,245.0,85.0,46.0,65.5,27.577164,167.0,78.0,122.5,62.932504,78.705446,0.837292,1592435.0,2573841.0,4563673.0,47938.0,4729368.0,4729368.0,0.0,4729368.0,4729368.0,4611611.0,4611611.0,0.0,4611611.0,4611611.0,0,0,0,0,64.0,40.0,0.418646,0.418646,46.0,167.0,84.4,49.530799,2453.3,0,0,0,0,0,0,0,0,1.0,105.5,65.5,122.5,64.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,131.0,2.0,245.0,-1.0,-1.0,1.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,53,30622.0,2.0,2.0,88.0,88.0,44.0,44.0,44.0,0.0,44.0,44.0,44.0,0.0,5747.501796,130.625041,10207.33,17635.47,30571.0,3.0,3.0,3.0,0.0,3.0,3.0,48.0,48.0,0.0,48.0,48.0,0,0,0,0,64.0,64.0,65.31252,65.31252,44.0,44.0,44.0,0.0,0.0,0,0,0,0,0,0,0,0,1.0,55.0,44.0,44.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,88.0,2.0,88.0,-1.0,-1.0,1.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,55672,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,500000.0,4.0,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40.0,0.0,500000.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,6547.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,80,15609178.0,7.0,4.0,2168.0,622.0,1460.0,0.0,309.714286,571.546065,622.0,0.0,155.5,311.0,178.740995,0.704714,1560918.0,3414609.0,9953410.0,52.0,15600000.0,2601529.667,4220807.0,9953410.0,52.0,10100000.0,3358892.0,5783434.0,10000000.0,4260.0,0,0,0,0,152.0,88.0,0.448454,0.256259,0.0,1460.0,232.5,464.137421,215423.5455,0,0,0,1,0,0,0,0,0.0,253.636364,309.714286,155.5,152.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,2168.0,4.0,622.0,8192.0,6547.0,3.0,20.0,123646.0,0.0,123646.0,123646.0,9953410.0,0.0,9953410.0,9953410.0,BENIGN


In [41]:
# List the columns registered for the table.
read_sql(f"SHOW COLUMNS FROM {database_name}.cic_ids2017_raw")

Unnamed: 0,field
0,destination_port
1,flow_duration
2,total_fwd_packets
3,total_backward_packets
4,total_length_of_fwd_packets
5,total_length_of_bwd_packets
6,fwd_packet_length_max
7,fwd_packet_length_min
8,fwd_packet_length_mean
9,fwd_packet_length_std


In [43]:
# Counts every row in the table, so Athena has to scan the full dataset.
read_sql(f"SELECT COUNT(*) AS row_count FROM {database_name}.cic_ids2017_raw")

Unnamed: 0,row_count
0,2830743


## Create UNSW-NB15 raw table

In [20]:
# S3 prefix that the external table below reads from.
unsw_location = "s3://aai-540-final-project-group-5/raw/UNSW_NB15/"

# Drop first: the CREATE below uses IF NOT EXISTS, so without this drop an
# already-registered definition would be kept and edits to the column list in
# this cell would have no effect on re-run.
exec_ddl(f"DROP TABLE IF EXISTS {database_name}.unsw_nb15_raw")

# Register an external table over the comma-separated text files at
# `unsw_location`, parsed with OpenCSVSerde (double-quote as the quote char).
# - This is a non-raw f-string, so the four backslashes given for 'escapeChar'
#   reach Athena as two (\\).
# - 'skip.header.line.count'='0' skips nothing, unlike the other two raw
#   tables (presumably these files carry no header row - confirm against the
#   files; the column names are supplied by this DDL).
# - 'use.null.for.invalid.data'='true' is meant to yield NULL for values that
#   do not parse as the declared column type instead of failing the query
#   (NOTE(review): confirm against the Athena OpenCSVSerDe documentation).
exec_ddl(f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.unsw_nb15_raw (
  srcip string,
  sport bigint,
  dstip string,
  dsport bigint,
  proto string,
  state string,
  dur double,
  sbytes bigint,
  dbytes bigint,
  sttl bigint,
  dttl bigint,
  sloss bigint,
  dloss bigint,
  service string,
  sload double,
  dload double,
  spkts bigint,
  dpkts bigint,
  swin bigint,
  dwin bigint,
  stcpb bigint,
  dtcpb bigint,
  smeansz double,
  dmeansz double,
  trans_depth bigint,
  res_bdy_len bigint,
  sjit double,
  djit double,
  stime bigint,
  ltime bigint,
  sintpkt double,
  dintpkt double,
  tcprtt double,
  synack double,
  ackdat double,
  is_sm_ips_ports bigint,
  ct_state_ttl bigint,
  ct_flw_http_mthd bigint,
  is_ftp_login bigint,
  ct_ftp_cmd bigint,
  ct_srv_src bigint,
  ct_srv_dst bigint,
  ct_dst_ltm bigint,
  ct_src_ltm bigint,
  ct_src_dport_ltm bigint,
  ct_dst_sport_ltm bigint,
  ct_dst_src_ltm bigint,
  attack_cat string,
  label bigint
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  'separatorChar' = ',',
  'quoteChar'     = '"',
  'escapeChar'    = '\\\\'
)
STORED AS TEXTFILE
LOCATION '{unsw_location}'
TBLPROPERTIES (
  'skip.header.line.count'='0',
  'use.null.for.invalid.data'='true'
)
""")

In [36]:
# Sanity check: preview five rows to confirm the values line up with the
# declared columns.
read_sql(f"SELECT * FROM {database_name}.unsw_nb15_raw LIMIT 5")

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,service,sload,dload,spkts,dpkts,swin,dwin,stcpb,dtcpb,smeansz,dmeansz,trans_depth,res_bdy_len,sjit,djit,stime,ltime,sintpkt,dintpkt,tcprtt,synack,ackdat,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,59.166.0.6,26478,149.171.126.5,5190,tcp,FIN,0.004433,1470,1728,31,29,5,4,-,2533724.0,2896458.0,22,14,255,255,865426282,3015282052,67.0,123.0,0,0,0.0,0.387833,1421948648,1421948648,0.211095,0.287308,0.000721,0.000591,0.00013,0,0,0,0,0,2,10,7,2,1,1,1,,0
1,59.166.0.8,48038,149.171.126.8,60018,tcp,FIN,0.033681,3390,42634,31,29,7,22,-,790950.4,9951961.0,56,58,255,255,3130814893,985771159,61.0,735.0,0,0,40.942315,38.220897,1421948648,1421948648,0.606618,0.579579,0.000767,0.000635,0.000132,0,0,0,0,0,20,7,5,4,1,1,2,,0
2,59.166.0.9,7295,149.171.126.9,3920,tcp,FIN,0.027005,3390,42634,31,29,7,22,-,986484.0,12412220.0,56,58,255,255,3490811880,3491155213,61.0,735.0,0,0,27.501907,26.6208,1421948648,1421948648,0.485091,0.462333,0.000777,0.000641,0.000136,0,0,0,0,0,7,9,6,4,1,1,2,,0
3,59.166.0.1,47530,149.171.126.9,111,udp,CON,0.004691,568,312,31,29,0,0,-,726497.6,399062.0,4,4,0,0,0,0,142.0,78.0,0,0,1.763762,1.482803,1421948648,1421948648,1.254667,1.053,0.0,0.0,0.0,0,0,0,0,0,8,9,6,3,1,1,3,,0
4,59.166.0.8,25518,149.171.126.2,53,udp,CON,0.001116,146,178,31,29,0,0,dns,523297.5,637992.8,2,2,0,0,0,0,73.0,89.0,0,0,0.0,0.0,1421948648,1421948648,0.002,0.011,0.0,0.0,0.0,0,0,0,0,0,3,3,2,4,1,1,1,,0


In [37]:
# List the columns registered for the table.
read_sql(f"SHOW COLUMNS FROM {database_name}.unsw_nb15_raw")

Unnamed: 0,field
0,srcip
1,sport
2,dstip
3,dsport
4,proto
5,state
6,dur
7,sbytes
8,dbytes
9,sttl


In [44]:
# Counts every row in the table, so Athena has to scan the full dataset.
read_sql(f"SELECT COUNT(*) AS row_count FROM {database_name}.unsw_nb15_raw")

Unnamed: 0,row_count
0,2540047


In [23]:
# Final check: list the tables now registered in the database.
read_sql(f"SHOW TABLES IN {database_name}")

Unnamed: 0,tab_name
0,cic_ids2017_raw
1,ton_iot_raw
2,unsw_nb15_raw
