### Imports


In [2]:
import numpy as np
import pandas as pd
import os
from fastai.tabular.all import df_shrink

### Reading local files
The raw CSV files are too large to store on GitHub; the download source is linked in the README.

In [4]:
def read_paths(dir_path):
    """Recursively collect the paths of every ``.csv`` file under ``dir_path``.

    Directories and file names are visited in sorted order so the returned
    list is identical on every run and platform; ``os.walk`` on its own yields
    entries in arbitrary, filesystem-dependent order. Each path found is also
    printed as a progress log.

    Args:
        dir_path: Root directory to search.

    Returns:
        list[str]: Paths built with ``os.path.join(dirname, filename)``, in
        traversal order. Empty when no CSV file is found.
    """
    dspaths = []
    for dirname, subdirs, filenames in os.walk(dir_path):
        # Sorting in place controls the order in which os.walk descends.
        subdirs.sort()
        for filename in sorted(filenames):
            if filename.endswith('.csv'):
                pds = os.path.join(dirname, filename)
                dspaths.append(pds)
                print(pds)
    return dspaths


def read_files(dspaths):
    """Load each CSV path into its own DataFrame.

    Args:
        dspaths: Iterable of paths to comma-separated files.

    Returns:
        list[pandas.DataFrame]: One frame per path, in the same order as
        ``dspaths``.
    """
    return [pd.read_csv(dsp, sep=',') for dsp in dspaths]

In [6]:
# Other input directories, currently disabled:
# labelledTraffic = read_paths('../Local/2017LabelledTraffic/')
# labelledTraffic_dfs = read_files(labelledTraffic)

traffic2017 = read_paths('../Local/2017MachineLearningCVE/')
traffic2017_dfs = read_files(traffic2017)

# traffic2018 = read_paths('../Local/2018/')
# traffic2018_dfs = read_files(traffic2018)

# Later cells read the globals `dspaths` and `individual_dfs` without any
# earlier cell creating them. Bind them here so the notebook runs top to
# bottom on a fresh kernel (Restart & Run All).
dspaths = traffic2017
individual_dfs = traffic2017_dfs

../Local/2017MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
../Local/2017MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
../Local/2017MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv
../Local/2017MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv
../Local/2017MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
../Local/2017MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
../Local/2017MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv
../Local/2017MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv


In [7]:
# Inspect the column dtypes of the first loaded frame, as parsed from CSV.
traffic2017_dfs[0].dtypes

 Destination Port                int64
 Flow Duration                   int64
 Total Fwd Packets               int64
 Total Backward Packets          int64
Total Length of Fwd Packets      int64
                                ...   
Idle Mean                      float64
 Idle Std                      float64
 Idle Max                        int64
 Idle Min                        int64
 Label                          object
Length: 79, dtype: object

# Downsizing

In [6]:
# Column dtypes of the first frame before df_shrink is applied, for comparison with the listing after it.
individual_dfs[0].dtypes

 Destination Port                int64
 Flow Duration                   int64
 Total Fwd Packets               int64
 Total Backward Packets          int64
Total Length of Fwd Packets      int64
                                ...   
Idle Mean                      float64
 Idle Std                      float64
 Idle Max                        int64
 Idle Min                        int64
 Label                          object
Length: 79, dtype: object

In [7]:
# Run fastai's df_shrink over every frame and keep the results under the same name.
individual_dfs = list(map(df_shrink, individual_dfs))

In [8]:
# Column dtypes of the first frame after df_shrink, to compare with the listing taken before it.
individual_dfs[0].dtypes

 Destination Port                 int32
 Flow Duration                    int32
 Total Fwd Packets                int16
 Total Backward Packets           int16
Total Length of Fwd Packets       int32
                                 ...   
Idle Mean                       float32
 Idle Std                       float32
 Idle Max                         int32
 Idle Min                         int32
 Label                         category
Length: 79, dtype: object

# Usuwanie niezdefiniowanych wartości

In [9]:
def drop_nan(individual_dfs):
    """Remove every row containing NaN or +/-infinity from each frame, in place.

    Infinite values are first rewritten to NaN so that a single ``dropna``
    removes both kinds of undefined value. The frames themselves are
    modified; their indexes are not reset here.

    Args:
        individual_dfs: Iterable of DataFrames to clean.

    Returns:
        The same ``individual_dfs`` object that was passed in.
    """
    for df in individual_dfs:
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)
    return individual_dfs

# Drop rows containing NaN or infinite values from every frame.
individual_dfs = drop_nan(individual_dfs)

# Usuwanie zduplikowanych wierszy

In [10]:
def drop_dupes(individual_dfs):
    """Drop fully duplicated rows from each frame, in place, and renumber its index.

    For every frame the number of duplicated rows is printed first; the
    duplicates are then removed (the first occurrence is kept) and the index
    is reset to a fresh 0..n-1 range without keeping the old one as a column.

    Args:
        individual_dfs: Iterable of DataFrames to de-duplicate.

    Returns:
        The same ``individual_dfs`` object that was passed in.
    """
    for df in individual_dfs:
        print(df.duplicated().sum(), "fully duplicate rows to remove")
        df.drop_duplicates(inplace=True)
        df.reset_index(inplace=True, drop=True)
    return individual_dfs

# Remove fully duplicated rows from every frame and reset each index.
individual_dfs = drop_dupes(individual_dfs)

2629 fully duplicate rows to remove
72319 fully duplicate rows to remove
6867 fully duplicate rows to remove
26831 fully duplicate rows to remove
35605 fully duplicate rows to remove
6052 fully duplicate rows to remove
24019 fully duplicate rows to remove
80914 fully duplicate rows to remove


## Zapisywanie danych w CSV

In [9]:
def convert(individual_dfs, paths=None, out_dir="../IDS2017/Clean/"):
    """Write each frame to ``out_dir`` as a CSV named after its source file.

    The output name is the source file's base name with ``.pcap_ISCX``
    removed and its first four characters dropped.

    Note: a later cell in this notebook defines another function that is also
    called ``convert`` (the parquet writer) and rebinds the name.

    Args:
        individual_dfs: Sequence of DataFrames to write.
        paths: Source path of each frame, index-aligned with
            ``individual_dfs``. Defaults to the notebook-level ``dspaths``.
        out_dir: Directory to write into; created if it does not exist.
    """
    if paths is None:
        # Fall back to the global the function has always read.
        paths = dspaths
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    for i, df in enumerate(individual_dfs):
        # basename rather than split('/') so OS-specific separators work too.
        source_name = os.path.basename(paths[i])
        # NOTE(review): the [4:] slice drops the first four characters of the
        # name (e.g. "Monday-..." becomes "ay-...") — confirm this is intended.
        csv_name = source_name.replace('.pcap_ISCX', '')[4:]
        df.to_csv(os.path.join(out_dir, csv_name), index=False)

# Write every cleaned frame to its own CSV file.
convert(individual_dfs)

## Zapisywanie połączonych danych w CSV

In [None]:
# Stack all frames into one with a fresh 0..n-1 index.
combined_df = pd.concat(individual_dfs, ignore_index=True)
# Write the combined frame without the index column.
combined_df.to_csv("../IDS2017/Clean/Combined.csv", index=False)


# Zapisywanie danych w parquet



In [11]:
def convert(individual_dfs, paths=None, out_dir="../IDS2017/Clean/"):
    """Write each frame to ``out_dir`` as a parquet file named after its source file.

    The output name is the source file's base name with ``.pcap_ISCX.csv``
    replaced by ``.parquet`` and its first four characters dropped.

    Note: this definition rebinds ``convert``, replacing the CSV writer of
    the same name defined in an earlier cell of this notebook.

    Args:
        individual_dfs: Sequence of DataFrames to write.
        paths: Source path of each frame, index-aligned with
            ``individual_dfs``. Defaults to the notebook-level ``dspaths``.
        out_dir: Directory to write into; created if it does not exist.
    """
    if paths is None:
        # Fall back to the global the function has always read.
        paths = dspaths
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    for i, df in enumerate(individual_dfs):
        # basename rather than split('/') so OS-specific separators work too.
        source_name = os.path.basename(paths[i])
        # NOTE(review): the [4:] slice drops the first four characters of the
        # name (e.g. "Monday-..." becomes "ay-...") — confirm this is intended.
        parquet_name = source_name.replace('.pcap_ISCX.csv', '.parquet')[4:]
        df.to_parquet(os.path.join(out_dir, parquet_name))

# Write every cleaned frame to its own parquet file.
convert(individual_dfs)

## Zapisywanie połączonych danych w parquet

In [None]:
def convert_to_single_parquet(individual_dfs, output_path):
    """Concatenate all frames and store the result as one parquet file.

    Args:
        individual_dfs: Iterable of DataFrames to stack row-wise; the result
            gets a fresh 0..n-1 index.
        output_path: Destination passed to ``DataFrame.to_parquet``.
    """
    pd.concat(individual_dfs, ignore_index=True).to_parquet(output_path)

# Write all frames, concatenated, into a single parquet file.
convert_to_single_parquet(individual_dfs, "../IDS2017/Clean/Combined.parquet")

# Czytanie plików i misc

## Zamienianie wartości z kolumny Label na BENIGN/ATTACK


In [None]:
# Collapse the Label column to two classes: 'BENIGN' is kept as is and every
# attack name listed here becomes 'ATTACK'.
# NOTE(review): the '�' inside the "Web Attack" keys must match the label text
# exactly — presumably this is how those labels appear in the loaded data; verify.
label_map = {'BENIGN': 'BENIGN', 'DDoS': 'ATTACK', 'DoS Hulk': 'ATTACK', 'DoS GoldenEye': 'ATTACK', 'DoS slowloris': 'ATTACK',
             'FTP-Patator': 'ATTACK', 'SSH-Patator':  'ATTACK', 'DoS Slowhttptest': 'ATTACK', 'PortScan': 'ATTACK',
             'Web Attack � Brute Force' : 'ATTACK', 'Bot': 'ATTACK', 'Web Attack � XSS': 'ATTACK', 'Infiltration': 'ATTACK',
             'Web Attack � Sql Injection': 'ATTACK', 'Heartbleed': 'ATTACK'}
# Labels missing from label_map come back as NaN from .map and are then filled with 'ATTACK'.
# NOTE(review): `combined01` is not assigned in the cells visible here — confirm where it is loaded.
combined01['Label'] = combined01['Label'].map(label_map).fillna('ATTACK')

# Show the class counts after relabelling.
print(combined01['Label'].value_counts())

Label
BENIGN    1886548
ATTACK     316445
Name: count, dtype: int64
