In [None]:
# Cell 1 - imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Cell 2 - Load Data

# Directory prefix for the CSV files. It is joined to each file name by plain
# string concatenation, so the trailing slash is required.
data_dir_name = "data/"

# Files to read from data_dir_name.
file_list = [
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
]

# Read each CSV into its own frame, then stack them row-wise into one table
# with a fresh 0..n-1 index (ignore_index=True).
# `df` ends up holding the most recently read file; it stays an empty frame
# only if file_list is empty.
df = pd.DataFrame()
data_frames = []
for file_name in file_list:
    df = pd.read_csv(f"{data_dir_name}{file_name}")
    data_frames.append(df)
combined_data = pd.concat(data_frames, ignore_index=True)

# Sanity check of what was loaded: dimensions, column names, first rows.
print("Combined data shape:", combined_data.shape)
print("Columns in the dataset:", combined_data.columns.tolist())
print(combined_data.head())

In [None]:
# Cell 3 - Visualize the distribution of the target variable

# Strip surrounding whitespace from the column names so the target column can
# be addressed as plain 'Label' (a no-op if the headers are already clean).
combined_data.columns = combined_data.columns.str.strip()

# Strip the label values as well, so padded variants of the same class are
# counted together.
combined_data['Label'] = combined_data['Label'].str.strip()

# Number of records per class; value_counts() sorts most frequent first.
label_counts = combined_data['Label'].value_counts()

# Bar chart of the class counts on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=label_counts.index, y=label_counts.values, ax=ax)
# Rotate the class names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# The counts come from combined_data (all files loaded in the Load Data cell),
# so the title must not name a single day.
ax.set_title("Distribution of Traffic Types in Combined Dataset")
ax.set_xlabel("Traffic Label")
ax.set_ylabel("Number of Records")
ax.grid(True)
fig.tight_layout()
plt.show()

# Exact per-class counts, to read the class ratios the chart only sketches.
print(label_counts)

In [None]:
# Cell 4

# Remove identifier-style columns (flow ID, IP addresses, timestamp) to avoid
# data leakage. Names that are not present in the frame are skipped.
# NOTE(review): 'Fwd Header Length.1' is presumably the copy pandas renamed on
# a duplicated header - confirm. Port columns are not in this list.
columns_to_drop = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp', 'Fwd Header Length.1']  # adjust as needed
combined_data.drop(
    columns=[name for name in columns_to_drop if name in combined_data.columns],
    inplace=True,
)

# Remove constant columns: at most one distinct value (nunique() does not
# count NaN), so they carry no information that separates rows.
constant_cols = [
    name for name, distinct in combined_data.nunique().items() if distinct <= 1
]
combined_data.drop(columns=constant_cols, inplace=True)

# Show which columns survived both passes.
print(f"Remaining columns: {combined_data.columns.tolist()}")