In [1]:
import os
import sys
# Put the directory two levels above the working directory on sys.path
# (once), which is what lets the later `ic.*` import resolve.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

import csv

In [2]:
from ic.models.common import process_row

# Collect the distinct values of column 5 (protocol) and of the last column
# (label) over every row that process_row() accepts.
# NOTE(review): process_row comes from ic.models.common and is not visible
# here; rows for which it raises ValueError are deliberately left out.
with open('../../data/Grouped_All.csv') as file:
    protocol_set, label_set = set(), set()

    csv_reader = csv.reader(file)
    next(csv_reader)  # discard the header row

    for row in csv_reader:
        try:
            # The result is not used; the call (and the 2-tuple unpacking)
            # only acts as a validity filter for the row.
            (x, y) = process_row(row)
        except ValueError:
            continue
        protocol_set.add(row[5])
        label_set.add(row[-1])

print("Labels:", label_set, "Protocols:", protocol_set)

Labels: {'Heartbleed', 'DDoS', 'Web Attack  Sql Injection', 'Infiltration', 'Web Attack  XSS', 'SSH-Patator', 'DoS slowloris', 'Web Attack  Brute Force', 'PortScan', 'DoS GoldenEye', 'Bot', 'DoS Slowhttptest', 'FTP-Patator', 'BENIGN', 'DoS Hulk'} Protocols: {'6', '0', '17'}


In [10]:
import csv

def all_headers(path='../../data/Grouped_All.csv'):
    """Return the header row (first CSV record) of ``path``.

    Args:
        path: CSV file to read. Defaults to the grouped data set this
            notebook works on, so existing zero-argument calls are unchanged.

    Returns:
        list[str]: the fields of the first record, exactly as stored
        (no whitespace stripping).

    Raises:
        ValueError: if the file contains no records at all.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects.
    with open(path, newline='') as file:
        header_row = next(csv.reader(file), None)
    if header_row is None:
        raise ValueError("CSV file has no header row: " + str(path))
    return header_row

# Show the complete header row together with the position of one column.
all_available_headers = all_headers()
init_win_fwd_position = all_available_headers.index("Init_Win_bytes_forward")
print(all_available_headers, init_win_fwd_position)

['Flow ID', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count', 

In [3]:
import csv

def get_headers(path='../../data/Grouped_All.csv'):
    """Return the header labels of the columns this notebook inspects.

    The selection is column 5 followed by columns 7 up to (but excluding)
    the last column of the header row.

    Args:
        path: CSV file to read. Defaults to the grouped data set this
            notebook works on, so existing zero-argument calls are unchanged.

    Returns:
        list[str]: the selected header labels, exactly as stored.

    Raises:
        ValueError: if the file contains no records at all.
        IndexError: if the header row has fewer than six columns.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects.
    with open(path, newline='') as file:
        header_row = next(csv.reader(file), None)
    if header_row is None:
        raise ValueError("CSV file has no header row: " + str(path))
    return [header_row[5], *header_row[7:-1]]
    
def gen_general_stats():
    """Return a new dict of row-level counters, every one starting at 0."""
    counter_names = (
        'lines_with_nan',
        'lines_with_inf',
        'lines_with_neg',
        'lines_with_exception',
        'total_lines',
    )
    return dict.fromkeys(counter_names, 0)
            
# Labels of the inspected columns; kept at module level because the later
# analyze() calls in this notebook receive this list as their `headers` argument.
headers = get_headers()
print(headers)

[' Protocol', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count', ' SYN Flag Count', ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count', 

In [4]:
def gen_analysis_dic(header_list):
    """Build an empty per-column tally, keyed by header label.

    Every label in ``header_list`` maps to its own dict holding four integer
    counters (set to 0) and three independent, initially empty sets.
    """
    counter_keys = ('inf_count', 'nan_count', 'neg_count', 'exception_count')
    value_set_keys = ('inf_values', 'nan_values', 'exception_values')

    analysis = {}
    for header_label in header_list:
        entry = dict.fromkeys(counter_keys, 0)
        for key in value_set_keys:
            # A separate set per key and per header, never a shared one.
            entry[key] = set()
        analysis[header_label] = entry
    return analysis

# One empty tally entry per inspected column.
# NOTE(review): the analyze() calls visible in this notebook build their own
# dictionaries with gen_analysis_dic(headers), so nothing shown here reads
# this variable afterwards - confirm whether it is still needed.
analysis_dic = gen_analysis_dic(headers)

In [5]:
import csv, math

def analyze(
    file_paths,
    headers,
    analysis_dic, 
    general_stats
):
    """Scan CSV files and tally negative, infinite, NaN and unparseable cells.

    For every data row the inspected cells are column 5 followed by columns 7
    up to (but excluding) the last column; the i-th inspected cell is
    attributed to ``headers[i]``.

    Args:
        file_paths: Iterable of CSV paths. The first record of each file is
            skipped as a header row.
        headers: Labels of the inspected cells, in the same order as the cells.
        analysis_dic: Per-header tallies, updated in place. Each entry must
            hold the counters 'neg_count', 'inf_count', 'nan_count' and
            'exception_count' and the sets 'inf_values', 'nan_values' and
            'exception_values'.
        general_stats: Row-level tallies, updated in place: 'total_lines' plus
            the 'lines_with_nan' / 'lines_with_inf' / 'lines_with_neg' /
            'lines_with_exception' counters, each of which grows at most once
            per row.

    Returns:
        The tuple ``(analysis_dic, general_stats)`` - the very objects that
        were passed in.
    """
    for file_path in file_paths:
        # newline='' is what the csv module asks for; errors='ignore' is kept
        # so undecodable bytes are dropped instead of aborting the scan.
        with open(file_path, errors='ignore', newline='') as file:
            csv_reader = csv.reader(file)
            next(csv_reader, None)  # skip the header; tolerate an empty file
            for row in csv_reader:
                if not row:
                    # csv.reader yields [] for a blank line: nothing to inspect.
                    continue
                values = [ row[5], *row[7:-1] ]

                nan_flag = False
                inf_flag = False
                neg_flag = False
                exception_flag = False
                general_stats['total_lines'] += 1

                for i, raw_value in enumerate(values):
                    column_stats = analysis_dic[headers[i]]
                    try:
                        float_value = float(raw_value)
                    except ValueError:
                        # Only the float() conversion is guarded, so unrelated
                        # errors (or Ctrl-C) are no longer swallowed.
                        if not exception_flag:
                            exception_flag = True
                            general_stats['lines_with_exception'] += 1
                        column_stats['exception_count'] += 1
                        column_stats['exception_values'].add(raw_value)
                        continue

                    if float_value < 0:
                        if not neg_flag:
                            neg_flag = True
                            general_stats['lines_with_neg'] += 1
                        column_stats['neg_count'] += 1

                    if math.isinf(float_value):
                        if not inf_flag:
                            inf_flag = True
                            general_stats['lines_with_inf'] += 1
                        column_stats['inf_count'] += 1
                        column_stats['inf_values'].add(raw_value)

                    # NaN never compares equal to anything (itself included),
                    # so `== math.nan` could not match; isnan() is required.
                    if math.isnan(float_value):
                        if not nan_flag:
                            nan_flag = True
                            general_stats['lines_with_nan'] += 1
                        column_stats['nan_count'] += 1
                        column_stats['nan_values'].add(raw_value)

    return (analysis_dic, general_stats)
                    
    

In [6]:
# Run the value analysis over the eight raw per-day capture CSVs, starting
# from fresh tallies, then list the columns in which a negative value showed up.
value_analysis = analyze(
    [
        '../../data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
        '../../data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
        '../../data/Friday-WorkingHours-Morning.pcap_ISCX.csv',
        '../../data/Monday-WorkingHours.pcap_ISCX.csv',
        '../../data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
        '../../data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
        '../../data/Tuesday-WorkingHours.pcap_ISCX.csv',
        '../../data/Wednesday-workingHours.pcap_ISCX.csv',
    ],
    headers,
    gen_analysis_dic(headers),
    gen_general_stats(),
)
print(value_analysis)

# value_analysis[0] is the per-column tally returned by analyze().
fields_with_negative = [
    field
    for field, field_stats in value_analysis[0].items()
    if field_stats['neg_count'] > 0
]
print(fields_with_negative)

({' Protocol': {'inf_count': 0, 'nan_count': 0, 'neg_count': 0, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception_values': {''}}, ' Flow Duration': {'inf_count': 0, 'nan_count': 0, 'neg_count': 115, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception_values': {''}}, ' Total Fwd Packets': {'inf_count': 0, 'nan_count': 0, 'neg_count': 0, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception_values': {''}}, ' Total Backward Packets': {'inf_count': 0, 'nan_count': 0, 'neg_count': 0, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception_values': {''}}, 'Total Length of Fwd Packets': {'inf_count': 0, 'nan_count': 0, 'neg_count': 0, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception_values': {''}}, ' Total Length of Bwd Packets': {'inf_count': 0, 'nan_count': 0, 'neg_count': 0, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception

In [7]:
# Same analysis, this time over the single grouped CSV, with fresh tallies.
grouped_analysis = analyze(
    ['../../data/Grouped_All.csv'],
    headers,
    gen_analysis_dic(headers),
    gen_general_stats(),
)
print(grouped_analysis)

({' Protocol': {'inf_count': 0, 'nan_count': 0, 'neg_count': 0, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception_values': {''}}, ' Flow Duration': {'inf_count': 0, 'nan_count': 0, 'neg_count': 115, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception_values': {''}}, ' Total Fwd Packets': {'inf_count': 0, 'nan_count': 0, 'neg_count': 0, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception_values': {''}}, ' Total Backward Packets': {'inf_count': 0, 'nan_count': 0, 'neg_count': 0, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception_values': {''}}, 'Total Length of Fwd Packets': {'inf_count': 0, 'nan_count': 0, 'neg_count': 0, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception_values': {''}}, ' Total Length of Bwd Packets': {'inf_count': 0, 'nan_count': 0, 'neg_count': 0, 'exception_count': 288602, 'inf_values': set(), 'nan_values': set(), 'exception

In [7]:
# The "*.processed.csv" files whose labels are summarised by the loop that
# follows this list.
# NOTE(review): unlike the raw-file list passed to analyze() earlier in this
# notebook, there is no Monday-WorkingHours entry here - confirm that is intended.
processed_files = [
    "../../data/Tuesday-WorkingHours.pcap_ISCX.processed.csv",
    "../../data/Wednesday-workingHours.pcap_ISCX.processed.csv",
    "../../data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.processed.csv",
    "../../data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.processed.csv",
    "../../data/Friday-WorkingHours-Morning.pcap_ISCX.processed.csv",
    "../../data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.processed.csv",
    "../../data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.processed.csv"
]

# For each processed file, report the distinct values of the last column
# (the label) and the number of data rows.
for file_path in processed_files:
    # newline='' lets the csv module do its own newline handling.
    with open(file_path, newline='') as file:
        lc = 0
        csv_reader = csv.reader(file)
        next(csv_reader, None)  # skip the header; tolerate an empty file
        labels = set()

        for row in csv_reader:
            if not row:
                # csv.reader yields [] for a blank line; row[-1] would fail.
                continue
            lc += 1
            labels.add(row[-1])

        # The row count was computed but never handed to print(), so the
        # "line count:" text was emitted without a number.
        print("Labels em", file_path+":", labels, ", line count:", lc)

Labels em ../../data/Tuesday-WorkingHours.pcap_ISCX.processed.csv: {'BENIGN', 'FTP-Patator', 'SSH-Patator'} , line count: 188714
Labels em ../../data/Wednesday-workingHours.pcap_ISCX.processed.csv: {'DoS slowloris', 'Heartbleed', 'DoS GoldenEye', 'DoS Slowhttptest', 'BENIGN', 'DoS Hulk'} , line count: 356284
Labels em ../../data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.processed.csv: {'BENIGN', 'Web Attack  Sql Injection', 'Web Attack  XSS', 'Web Attack  Brute Force'} , line count: 67662
Labels em ../../data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.processed.csv: {'BENIGN', 'Infiltration'} , line count: 123870
Labels em ../../data/Friday-WorkingHours-Morning.pcap_ISCX.processed.csv: {'Bot', 'BENIGN'} , line count: 72330
Labels em ../../data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.processed.csv: {'BENIGN', 'DDoS'} , line count: 137329
Labels em ../../data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.processed.csv: {'PortScan', 'BENIGN'} , line count: 20