In [1]:
import numpy as np
import pandas as pd
import os
from fastai.tabular.all import df_shrink

### Reading local files
The CSV files containing the data are too large to be uploaded to GitHub. All sources are linked in the README.

In [2]:
# Registry of every CSV path discovered so far. Later cells index into this
# list by position, so it is only ever extended in place, never rebound.
dspaths = []

def read_paths(dir_path):
    """Recursively collect the ``.csv`` files found under ``dir_path``.

    Each path is printed as it is found and appended to the module-level
    ``dspaths`` registry.

    Args:
        dir_path: Directory to walk.

    Returns:
        A new list holding only the paths found by *this* call, in a
        deterministic (sorted) order. Returning a fresh list instead of the
        shared registry means a second call for another directory does not
        hand back, and cause re-reading of, the files from earlier calls.
    """
    found = []
    for dirname, subdirs, filenames in os.walk(dir_path):
        # Sorting in place steers os.walk's descent, so the traversal order
        # no longer depends on the filesystem's directory listing order.
        subdirs.sort()
        for filename in sorted(filenames):
            if filename.endswith('.csv'):
                pds = os.path.join(dirname, filename)
                found.append(pds)
                print(pds)
    dspaths.extend(found)
    return found

def read_files(dspaths):
    """Load each CSV path into its own DataFrame.

    Args:
        dspaths: Iterable of CSV file paths.

    Returns:
        A list of DataFrames, one per path, in the same order as ``dspaths``.
    """
    return [pd.read_csv(dsp) for dsp in dspaths]

In [3]:
# Discover the 2017 CSV files (read_paths prints each path it finds), then
# load one DataFrame per file, in the same order as the paths.
dspath2017 = read_paths('../Local/2017/')
dfs2017 = read_files(dspath2017)

# Warning: more than 30 GB of RAM is needed to run this code
#dspath2018 = read_paths('../Local/2018/')
#dfs2018 = read_files(dspath2018)

../Local/2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
../Local/2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
../Local/2017/Friday-WorkingHours-Morning.pcap_ISCX.csv
../Local/2017/Monday-WorkingHours.pcap_ISCX.csv
../Local/2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
../Local/2017/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
../Local/2017/Tuesday-WorkingHours.pcap_ISCX.csv
../Local/2017/Wednesday-workingHours.pcap_ISCX.csv


In [4]:
# Show the column dtypes of the first loaded frame as a baseline before downsizing.
dfs2017[0].dtypes

 Destination Port                int64
 Flow Duration                   int64
 Total Fwd Packets               int64
 Total Backward Packets          int64
Total Length of Fwd Packets      int64
                                ...   
Idle Mean                      float64
 Idle Std                      float64
 Idle Max                        int64
 Idle Min                        int64
 Label                          object
Length: 79, dtype: object

### Downsizing

In [6]:
# Run fastai's df_shrink over every frame, then display the first frame's
# dtypes so the effect can be compared with the dtypes shown before this step.
dfs2017 = list(map(df_shrink, dfs2017))
dfs2017[0].dtypes

 Destination Port                 int32
 Flow Duration                    int32
 Total Fwd Packets                int16
 Total Backward Packets           int16
Total Length of Fwd Packets       int32
                                 ...   
Idle Mean                       float32
 Idle Std                       float32
 Idle Max                         int32
 Idle Min                         int32
 Label                         category
Length: 79, dtype: object

### Deleting NaN values

In [9]:
def drop_nan(individual_dfs, verbose=False):
    """Remove every row containing a NaN or infinite value, in place.

    Infinite values are first rewritten to NaN so that a single ``dropna``
    removes both kinds of invalid rows.

    Args:
        individual_dfs: List of DataFrames. Each frame is modified in place.
        verbose: When True, print how many rows each frame is about to lose.

    Returns:
        The same ``individual_dfs`` list that was passed in.
    """
    for df in individual_dfs:
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        if verbose:
            print(df.isna().any(axis=1).sum(), "rows with at least one NaN to remove")
        df.dropna(inplace=True)
    return individual_dfs

# drop_nan edits each frame in place and returns the same list it was given.
dfs2017 = drop_nan(dfs2017)

### Deleting duplicate rows

In [10]:
def drop_dupes(individual_dfs, verbose=False):
    """Remove fully duplicated rows from every frame, in place.

    After de-duplication each frame's index is reset to a fresh 0..n-1 range
    and the old index is discarded.

    Args:
        individual_dfs: List of DataFrames. Each frame is modified in place.
        verbose: When True, print how many duplicate rows each frame is
            about to lose.

    Returns:
        The same ``individual_dfs`` list that was passed in.
    """
    for df in individual_dfs:
        if verbose:
            print(df.duplicated().sum(), "fully duplicate rows to remove")
        df.drop_duplicates(inplace=True)
        df.reset_index(inplace=True, drop=True)
    return individual_dfs

# drop_dupes edits each frame in place (including the index reset) and returns the same list.
dfs2017 = drop_dupes(dfs2017)

2629 fully duplicate rows to remove
72319 fully duplicate rows to remove
6867 fully duplicate rows to remove
26831 fully duplicate rows to remove
35605 fully duplicate rows to remove
6052 fully duplicate rows to remove
24019 fully duplicate rows to remove
80914 fully duplicate rows to remove


### Saving data back into CSV

In [11]:
def convert(individual_dfs, paths=None, out_dir="../IDS2017/Clean"):
    """Write each frame to ``out_dir`` as a CSV named after its source file.

    The output name is the source file's base name with the ``.pcap_ISCX``
    marker removed. Frames and paths are matched by position.

    Args:
        individual_dfs: Iterable of DataFrames to save.
        paths: Source paths, positionally aligned with ``individual_dfs``.
            Defaults to the module-level ``dspaths`` list.
        out_dir: Destination directory; created if it does not exist yet.

    Raises:
        IndexError: If there are fewer paths than frames. This is checked
            before anything is written, so a mismatch leaves no partial output.
    """
    frames = list(individual_dfs)
    if paths is None:
        paths = dspaths
    if len(paths) < len(frames):
        raise IndexError(
            f"{len(frames)} frames to save but only {len(paths)} source paths to name them"
        )
    os.makedirs(out_dir, exist_ok=True)
    for df, src in zip(frames, paths):
        # basename handles whichever separator os.path.join/os.walk produced.
        csv_name = os.path.basename(src).replace('.pcap_ISCX', '')
        df.to_csv(os.path.join(out_dir, csv_name), index=False)

# Write one cleaned CSV per source file.
convert(dfs2017)

# Stack all cleaned frames into a single table with a fresh index and save it
# alongside the per-file CSVs.
combined_df = pd.concat(dfs2017, ignore_index=True)
combined_df.to_csv("../IDS2017/Clean/Combined.csv", index=False)

### Converting data to parquet format



In [12]:
def convert(individual_dfs, paths=None, out_dir="../IDS2017/Clean"):
    """Write each frame to ``out_dir`` as a Parquet file named after its source.

    The output name is the source file's base name with the
    ``.pcap_ISCX.csv`` suffix replaced by ``.parquet``. Frames and paths are
    matched by position.

    Note: this definition reuses the name of the CSV ``convert`` defined in
    an earlier cell, so once this cell has run, ``convert`` refers to the
    Parquet writer.

    Args:
        individual_dfs: Iterable of DataFrames to save.
        paths: Source paths, positionally aligned with ``individual_dfs``.
            Defaults to the module-level ``dspaths`` list.
        out_dir: Destination directory; created if it does not exist yet.

    Raises:
        IndexError: If there are fewer paths than frames. This is checked
            before anything is written, so a mismatch leaves no partial output.
    """
    frames = list(individual_dfs)
    if paths is None:
        paths = dspaths
    if len(paths) < len(frames):
        raise IndexError(
            f"{len(frames)} frames to save but only {len(paths)} source paths to name them"
        )
    os.makedirs(out_dir, exist_ok=True)
    for df, src in zip(frames, paths):
        # basename handles whichever separator os.path.join/os.walk produced.
        parquet_name = os.path.basename(src).replace('.pcap_ISCX.csv', '.parquet')
        df.to_parquet(os.path.join(out_dir, parquet_name))

# With cells run top to bottom, this resolves to the Parquet `convert` defined
# just above, which shadows the earlier CSV `convert` of the same name.
convert(dfs2017)

def convert_to_single_parquet(individual_dfs, output_path):
    """Concatenate the given frames and store the result as one Parquet file.

    The frames are stacked row-wise with a freshly numbered index, then
    written to ``output_path``. Nothing is returned.
    """
    pd.concat(individual_dfs, ignore_index=True).to_parquet(output_path)

# Save all cleaned 2017 frames together as a single Parquet file.
convert_to_single_parquet(dfs2017, "../IDS2017/Clean/Combined.parquet")