In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import functools, math, operator

import numpy as np
import pandas as pd
from IPython.display import display, HTML

from ic.data_treatment import util, preprocess


DATA_ROOT = "../../data/MachineLearningCVE"

In [3]:
csvs = util.files_from_dir(DATA_ROOT, 'csv')
csvs

['../../data/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv',
 '../../data/MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
 '../../data/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
 '../../data/MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv',
 '../../data/MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv',
 '../../data/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv',
 '../../data/MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
 '../../data/MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv']

In [4]:
headers = [ header.strip() for header in util.csv_headers(csvs[0]) ]
headers[55] = 'Fwd Header Length.1'
print(headers)

['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count

In [1]:
def problems_in_chunk(chunk):
    numeric_columns = chunk.columns.tolist()[:-1] # Remove 'Label'
    return chunk[preprocess.problem_filter(chunk, numeric_columns)]


STATS = ['nan values', 'inf values', 'negative values']


def initial_stats(headers):
    return pd.DataFrame(dict(
        [(stat, [ 0 for h in headers ]) for stat in STATS]
    ), index=headers)


def stats_for_series(series):
    return pd.Series(
        [
            sum([ 1 for x in series if pd.isna(x) ]),
            sum([ 1 for x in series if abs(x) == math.inf ]),
            sum([ 1 for x in series if x < 0 ]),
        ], 
        index=STATS
    )


def stats_dataframe(problems_dataframe):
    df = problems_dataframe.drop('Label', axis=1)        
    return df.apply(stats_for_series).transpose()


def drop_zeroes(df):
    return df.loc[df.sum(axis=1) != 0, df.sum() != 0]


def process_file(in_file_path, out_dir):
    first_chunk = True
    total_rows = 0
    total_problems = 0
    
    reader = pd.read_csv(in_file_path, chunksize=1000)
    
    base_name = util.basename(in_file_path)
    out_problems_path = out_dir + '/' + base_name + ".problems.csv"
    out_stats_path = out_dir + '/' + base_name + ".stats.csv"
    
    stats_df = initial_stats(headers[:-1])
    
    for chunk in reader:
        chunk = chunk.rename(columns=lambda c: c.strip())
        total_rows += chunk.shape[0]
        
        problems_dataframe = problems_in_chunk(chunk)
        total_problems += problems_dataframe.shape[0]
        
        if problems_dataframe.shape[0]:
            chunk_stats = stats_dataframe(problems_dataframe)
            stats_df += chunk_stats
        
        mode = 'w' if first_chunk else 'a'
        problems_dataframe.to_csv(out_problems_path, mode=mode, header=first_chunk)
        first_chunk = False
    
    stats_df.to_csv(out_stats_path)
    return (total_rows, total_problems, stats_df)


def report(total_rows, total_problems, stats):
    percentage = (total_problems/total_rows)*100
    print(total_problems, "linhas com problema", "({:.2f}%)".format(round(percentage, 2)))
    display(drop_zeroes(stats))


def main(csvs, out_dir):
    total = 0
    total_problems = 0
    full_stats = initial_stats(headers[:-1])
    
    for file in csvs:
        display(HTML(f'<h3>{util.basename(file)}</h3>'))
        total_rows, problem_rows, stats_df = process_file(file, out_dir)
        
        total += total_rows
        total_problems += problem_rows
        full_stats += stats_df 
        
        report(total_rows, problem_rows, stats_df)
        display(HTML('<hr/>'))
        
    display(HTML('<h3>Total:</h3>'))
    report(total, total_problems, full_stats)
        
        

In [6]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    main(csvs, '../../data/MachineLearningCVEReport')

299810 linhas com problema (56.58%)


Unnamed: 0,nan values,inf values,negative values
Flow Duration,0,0,15
Flow Bytes/s,64,373,9
Flow Packets/s,0,437,15
Flow IAT Mean,0,0,15
Flow IAT Max,0,0,15
Flow IAT Min,0,0,512
Fwd Header Length,0,0,23
Bwd Header Length,0,0,17
Fwd Header Length.1,0,0,23
Init_Win_bytes_forward,0,0,224495


102704 linhas com problema (60.28%)


Unnamed: 0,nan values,inf values,negative values
Flow Duration,0,0,11
Flow Bytes/s,20,115,10
Flow Packets/s,0,135,11
Flow IAT Mean,0,0,11
Flow IAT Max,0,0,11
Flow IAT Min,0,0,263
Init_Win_bytes_forward,0,0,81911
Init_Win_bytes_backward,0,0,102373


88416 linhas com problema (39.17%)


Unnamed: 0,nan values,inf values,negative values
Flow Duration,0,0,2
Flow Bytes/s,4,30,2
Flow Packets/s,0,34,2
Flow IAT Mean,0,0,2
Flow IAT Max,0,0,2
Flow IAT Min,0,0,108
Fwd IAT Min,0,0,6
Init_Win_bytes_forward,0,0,32925
Init_Win_bytes_backward,0,0,88299


118703 linhas com problema (62.14%)


Unnamed: 0,nan values,inf values,negative values
Flow Duration,0,0,9
Flow Bytes/s,28,94,8
Flow Packets/s,0,122,9
Flow IAT Mean,0,0,9
Flow IAT Max,0,0,9
Flow IAT Min,0,0,225
Init_Win_bytes_forward,0,0,95834
Init_Win_bytes_backward,0,0,118419


257195 linhas com problema (57.68%)


Unnamed: 0,nan values,inf values,negative values
Flow Duration,0,0,17
Flow Bytes/s,201,63,0
Flow Packets/s,0,264,17
Flow IAT Mean,0,0,17
Flow IAT Max,0,0,17
Flow IAT Min,0,0,505
Fwd Header Length,0,0,11
Bwd Header Length,0,0,5
Fwd Header Length.1,0,0,11
Init_Win_bytes_forward,0,0,200128


336419 linhas com problema (48.57%)


Unnamed: 0,nan values,inf values,negative values
Flow Duration,0,0,21
Flow Bytes/s,1008,289,16
Flow Packets/s,0,1297,21
Flow IAT Mean,0,0,21
Flow IAT Max,0,0,21
Flow IAT Min,0,0,749
Fwd IAT Min,0,0,11
Fwd Header Length,0,0,1
Fwd Header Length.1,0,0,1
Init_Win_bytes_forward,0,0,203253


77434 linhas com problema (27.03%)


Unnamed: 0,nan values,inf values,negative values
Flow Duration,0,0,36
Flow Bytes/s,15,356,36
Flow Packets/s,0,371,36
Flow IAT Mean,0,0,36
Flow IAT Max,0,0,36
Flow IAT Min,0,0,176
Init_Win_bytes_forward,0,0,60210
Init_Win_bytes_backward,0,0,76942


164732 linhas com problema (57.08%)


Unnamed: 0,nan values,inf values,negative values
Flow Duration,0,0,4
Flow Bytes/s,18,189,4
Flow Packets/s,0,207,4
Flow IAT Mean,0,0,4
Flow IAT Max,0,0,4
Flow IAT Min,0,0,353
Init_Win_bytes_forward,0,0,102433
Init_Win_bytes_backward,0,0,164351


1445413 linhas com problema (51.06%)


Unnamed: 0,nan values,inf values,negative values
Flow Duration,0,0,115
Flow Bytes/s,1358,1509,85
Flow Packets/s,0,2867,115
Flow IAT Mean,0,0,115
Flow IAT Max,0,0,115
Flow IAT Min,0,0,2891
Fwd IAT Min,0,0,17
Fwd Header Length,0,0,35
Bwd Header Length,0,0,22
Fwd Header Length.1,0,0,35


In [13]:
def label_counts(file_path):
    chunks = pd.read_csv(file_path, chunksize=1000)
    mapped = map(lambda chunk: chunk.iloc[:, -1].value_counts(), chunks)
    return functools.reduce(lambda acc, v: acc.add(v, fill_value=0), mapped)

def display_labels(csvs):
    display(HTML("<h3>Labels em cada arquivo:</h3>"))
    for csv in csvs:
        display(HTML((f'<h4>{util.basename(csv)}</h4>')))
        display(label_counts(csv))
        display(HTML('<hr/>'))
        
display_labels(csvs)        

BENIGN    529918
Name:  Label, dtype: int64

BENIGN                        168186.0
Web Attack � Brute Force        1507.0
Web Attack � Sql Injection        21.0
Web Attack � XSS                 652.0
Name:  Label, dtype: float64

BENIGN     97718.0
DDoS      128027.0
Name:  Label, dtype: float64

BENIGN    189067.0
Bot         1966.0
Name:  Label, dtype: float64

BENIGN         432074.0
FTP-Patator      7938.0
SSH-Patator      5897.0
Name:  Label, dtype: float64

BENIGN              440031.0
DoS GoldenEye        10293.0
DoS Hulk            231073.0
DoS Slowhttptest      5499.0
DoS slowloris         5796.0
Heartbleed              11.0
Name:  Label, dtype: float64

BENIGN      127537.0
PortScan    158930.0
Name:  Label, dtype: float64

BENIGN          288566.0
Infiltration        36.0
Name:  Label, dtype: float64