In [None]:
import os
import pandas as pd

def load_all_csvs(folder_path):
    """Read every ``.csv`` file directly inside ``folder_path`` into one DataFrame.

    Files are read in sorted filename order so the row order of the result is
    reproducible; ``os.listdir`` returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        pandas.DataFrame: All files concatenated row-wise with a fresh
        0..n-1 index.

    Raises:
        ValueError: If no entry in the folder has a name ending in ``.csv``.
    """
    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".csv")
    )
    if not csv_names:
        # pd.concat([]) would raise a bare "No objects to concatenate";
        # keep the same exception type but name the offending folder.
        raise ValueError(f"No .csv files found in {folder_path!r}")
    frames = [
        pd.read_csv(os.path.join(folder_path, name), low_memory=False)
        for name in csv_names
    ]
    return pd.concat(frames, ignore_index=True)

# Dataset locations -- point these at the CICIDS folders on your machine.
cicids2017_path = "C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset"
cicids2018_path = "C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2018_dataset"

# Load each year's folder into its own frame.
df_2017 = load_all_csvs(cicids2017_path)
df_2018 = load_all_csvs(cicids2018_path)

# Stack both years, discard every row that has at least one missing value,
# then renumber the surviving rows from 0.
df_merged = (
    pd.concat([df_2017, df_2018], ignore_index=True)
    .dropna()
    .reset_index(drop=True)
)

print("Merged Shape:", df_merged.shape)



Merged Shape: (5658770, 79)


In [2]:
from rapidfuzz import fuzz

def fuzzy_deduplicate_columns(df, threshold=90):
    """Drop columns whose name is a near-duplicate of an earlier column's name.

    Names are compared case-insensitively with ``fuzz.ratio``. A column is
    dropped when its name scores strictly above ``threshold`` against the name
    of any column to its left (whether or not that column is itself dropped).

    Only names are compared, never column values.
    NOTE(review): columns holding different data under similar names are
    therefore dropped as well -- confirm this is intended for this dataset.

    Args:
        df: DataFrame to de-duplicate; it is not modified.
        threshold: Similarity score a pair of names must exceed to be
            treated as duplicates.

    Returns:
        A new DataFrame with the surviving columns in their original order.
    """
    # str() lets non-string labels (e.g. integer column names) be compared.
    lowered = [str(label).lower() for label in df.columns]
    drop_positions = set()
    for i in range(len(lowered)):
        for j in range(i + 1, len(lowered)):
            # Skip the score for columns already marked; the outcome is the same.
            if j not in drop_positions and fuzz.ratio(lowered[i], lowered[j]) > threshold:
                drop_positions.add(j)
    # Select by position, not by label: dropping by label would also remove
    # the first occurrence when two columns share exactly the same name.
    keep_positions = [pos for pos in range(len(lowered)) if pos not in drop_positions]
    return df.take(keep_positions, axis=1)

# Replace the merged frame with its name-de-duplicated version and report
# how many columns remain.
df_merged = fuzzy_deduplicate_columns(df_merged)
print("Columns after fuzzy deduplication:", df_merged.shape[1])


Columns after fuzzy deduplication: 56


In [3]:
# Persist the merged frame as a single CSV in the working directory;
# the row index is not written.
output_path = "CICIDS2017_2018_Merged.csv"
df_merged.to_csv(path_or_buf=output_path, index=False)
print("Merged dataset saved to " + output_path)


Merged dataset saved to CICIDS2017_2018_Merged.csv
