In [4]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Input CSV files, listed in the order they will be processed.
# The list is assembled weekday by weekday (as named in the files) so the
# multi-file days are easy to spot.
file_paths = [
    "data/Tuesday-WorkingHours.pcap_ISCX.csv",
    "data/Wednesday-workingHours.pcap_ISCX.csv",
]
# Thursday: two files (morning / afternoon).
file_paths += [
    "data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
]
# Friday: three files (morning plus two afternoon files).
file_paths += [
    "data/Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
]

# Module-level preprocessing objects shared by every preprocess_data() call:
# `scaler` standardizes numeric columns, `label_encoder` maps text columns
# to integer codes.
scaler, label_encoder = StandardScaler(), LabelEncoder()

# Preprocessing data function
def preprocess_data(file_paths):
    """Load one CSV file, clean it, scale numeric columns and encode text columns.

    Despite the plural name, ``file_paths`` is a single path (it is handed
    straight to ``pandas.read_csv``). The name is kept so existing callers
    that pass it by keyword keep working.

    Steps, in order:
      1. Read the file (ISO-8859-2 encoding, Python parser engine).
      2. Drop rows with a missing ``' Flow Duration'``, turn +/-inf into NaN
         and drop every row that still contains a NaN.
      3. Cast numeric columns to float32 and standardize them with the
         module-level ``scaler``.
      4. Label-encode every object column except ``' Label'`` with the
         module-level ``label_encoder``.

    NOTE(review): ``fit_transform`` refits the shared scaler and encoder on
    every call, so each call's output is standardized/encoded with that
    call's own statistics and classes. Values are therefore not directly
    comparable between frames produced by different calls -- confirm this
    is intended before concatenating results.

    Args:
        file_paths: Path (or file-like object) of the CSV file to read.

    Returns:
        pandas.DataFrame: the cleaned frame. Numeric columns are float32 and
        standardized, object columns other than ``' Label'`` are integer
        codes, and ``' Label'`` is left untouched.

    Raises:
        KeyError: if the file has no ``' Flow Duration'`` column.
    """
    # Read CSV file
    df = pd.read_csv(file_paths, encoding='iso-8859-2', engine='python')

    # Handle missing and infinite values.
    # Drop rows with a missing " Flow Duration" value (this also fails fast
    # with KeyError when the column is absent).
    df.dropna(subset=[" Flow Duration"], inplace=True)

    # Replace infinite values with NaN so they are removed together with
    # the remaining missing values below.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Drop rows with any remaining NaN value
    df.dropna(inplace=True)

    # Normalize numeric columns
    numeric_columns = df.select_dtypes(include='number').columns
    if len(numeric_columns) > 0:
        df[numeric_columns] = df[numeric_columns].astype(np.float32)
        # StandardScaler raises ValueError on input with zero rows, so a
        # file whose rows were all dropped is returned unscaled (and empty)
        # instead of aborting the whole run.
        if not df.empty:
            df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

    # Encode categorical columns to numeric
    categorical_columns = df.select_dtypes(include='object').columns
    for col in categorical_columns:
        if col == ' Label':
            # The target column keeps its original string values.
            continue
        # LabelEncoder requires uniformly typed input. Replacing the
        # 'Infinity' marker with the integer -1 would mix int and str in one
        # column and make fit_transform raise TypeError, so the column is
        # normalized to str and the marker becomes the string '-1'.
        values = df[col].astype(str).replace('Infinity', '-1')
        df[col] = label_encoder.fit_transform(values)

    return df

# Preprocess every input file independently, then stack the per-file frames
# into one DataFrame with a fresh 0..n-1 index.
processed_dfs = list(map(preprocess_data, file_paths))
final_dataframe = pd.concat(objs=processed_dfs, ignore_index=True)

# Write the combined result to disk (without the index column) and report
# completion.
final_dataframe.to_csv("new_data.csv", index=False)
print("Processing and Saving to CSV file is done.")

Processing and Saving to CSV file is done.
