In [None]:
import pandas as pd
import glob
import os
from rapidfuzz import process, fuzz

# === CONFIG ===
# Each path is a folder; every "*.csv" directly inside it is loaded and
# concatenated by load_and_merge.
path_2017 = "C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset"
# NOTE(review): this value is identical to path_2017 (".../cicids2017_dataset"),
# so the "2018" frame is built from the 2017 CSVs and the merge would simply
# duplicate the 2017 data. Looks like a copy-paste slip — confirm the real
# CIC-IDS2018 folder and point this at it.
path_2018 = "C:/Users/GPU RTX 5000/Desktop/Major Project Dataset/major/major_merge/cicids2017_dataset"
# Label granularity read by clean_label: 'binary' collapses every label to
# 'Benign' / 'Attack'; any other value keeps the cleaned per-class label.
merge_mode = 'multiclass'  # or 'binary'
# Output CSV name; embeds merge_mode so the two variants do not overwrite
# each other. Relative path, so it is written to the current working directory.
output_file = f"CICIDS2017_2018_Merged_Fuzzy_{merge_mode}.csv"

# === LOAD & MERGE FILES ===
def load_and_merge(folder_path):
    """Load every ``*.csv`` directly inside *folder_path* into one DataFrame.

    Files are read in sorted path order so the row order of the result is
    reproducible across runs and platforms (``glob`` itself returns files in
    arbitrary order). Leading/trailing whitespace is stripped from each
    file's column names before concatenation so that headers such as
    ``" Label"`` and ``"Label"`` line up.

    Args:
        folder_path: Directory to search (non-recursive) for ``*.csv`` files.

    Returns:
        A single DataFrame with the rows of all files stacked and a fresh
        0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``*.csv`` files.
    """
    csv_paths = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not csv_paths:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail early with a message that names the offending folder.
        raise ValueError(f"No CSV files found in: {folder_path}")

    frames = []
    for file in csv_paths:
        print(f"Loading: {file}")
        df = pd.read_csv(file, low_memory=False)
        df.columns = df.columns.str.strip()
        frames.append(df)
    return pd.concat(frames, ignore_index=True)

# Build one DataFrame per dataset year from the folders configured above.
print("📦 Merging 2017...")
df_2017 = load_and_merge(path_2017)

# Same loader, pointed at the folder configured as path_2018.
print("📦 Merging 2018...")
df_2018 = load_and_merge(path_2018)

# === FUZZY MATCH FEATURE NAMES ===
def fuzzy_align_columns(cols_src, cols_target, threshold=85):
    """Build a rename map that aligns *cols_target* names to *cols_src* names.

    For every source column the closest target column is looked up with
    ``fuzz.token_sort_ratio``. Matches scoring below *threshold* are ignored.
    When several source columns pick the same target column, only the
    highest-scoring pair is kept (the first one wins a tie), so an exact
    match is never displaced by a weaker fuzzy match that happens to be
    processed later.

    Args:
        cols_src: Column names whose spelling should be adopted.
        cols_target: Column names that will be renamed.
        threshold: Minimum similarity score (0-100) for a match to count.

    Returns:
        ``{target_name: source_name}``, suitable for
        ``DataFrame.rename(columns=...)`` on the target frame.
    """
    best_for_target = {}  # target column -> (score, source column)
    for col in cols_src:
        result = process.extractOne(col, cols_target, scorer=fuzz.token_sort_ratio)
        if result is None:
            # extractOne yields None when there is nothing to match against.
            continue
        match, score, _ = result
        if score < threshold:
            continue
        current = best_for_target.get(match)
        if current is None or score > current[0]:
            best_for_target[match] = (score, col)
    return {target: col for target, (_, col) in best_for_target.items()}

# Match 2018 columns to 2017's naming
rename_dict = fuzzy_align_columns(df_2017.columns, df_2018.columns)
df_2018_renamed = df_2018.rename(columns=rename_dict)

# Keep only columns that now match
common_cols = df_2017.columns.intersection(df_2018_renamed.columns)
if 'Label' not in common_cols:
    # The label-cleaning step indexes both frames by 'Label'; fail here with
    # a message that explains why instead of a bare KeyError further down.
    raise KeyError("'Label' column is not shared by both datasets after alignment")
# .copy() makes both frames independent of the frames they were sliced from,
# so assigning to their columns afterwards is unambiguous and does not raise
# pandas' SettingWithCopyWarning.
df_2017 = df_2017[common_cols].copy()
df_2018_renamed = df_2018_renamed[common_cols].copy()

# === CLEAN LABELS ===
import unicodedata
import re

def clean_label(label):
    """Normalise a raw class label so both datasets use the same spelling.

    String labels are stripped, lower-cased, reduced to ASCII (accents are
    decomposed; characters with no ASCII form are dropped) and every run of
    characters other than letters and digits is replaced by a single
    underscore. Collapsing whole runs matters: ``"web attack - brute force"``
    and the same label with the dash lost to an encoding error must both
    become ``"web_attack_brute_force"`` rather than two different classes.

    Args:
        label: Raw value from the ``Label`` column.

    Returns:
        ``'Attack'`` for any non-string value (e.g. NaN), in either mode.
        In ``'binary'`` merge mode: ``'Benign'`` if the cleaned label
        contains ``benign`` or ``normal``, otherwise ``'Attack'``.
        In any other mode: the cleaned label itself.
    """
    if not isinstance(label, str):
        return 'Attack'

    label = label.strip().lower()
    label = unicodedata.normalize("NFKD", label).encode("ascii", "ignore").decode()
    # One pass: any run of separators (including existing underscores)
    # becomes exactly one "_", regardless of the run's length.
    label = re.sub(r"[^a-zA-Z0-9]+", "_", label)

    if merge_mode == 'binary':
        return 'Benign' if 'benign' in label or 'normal' in label else 'Attack'
    return label

# Normalise the label spelling of both frames so identical classes merge.
df_2017['Label'] = df_2017['Label'].apply(clean_label)
df_2018_renamed['Label'] = df_2018_renamed['Label'].apply(clean_label)

# === MERGE & SHUFFLE ===
merged_df = pd.concat([df_2017, df_2018_renamed], ignore_index=True)
# sample(frac=1) returns all rows in random order; no random_state is passed,
# so the row order differs from run to run.
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

# === SAVE FILE ===
# Write first, so the success message below is only shown once the file
# actually exists on disk.
merged_df.to_csv(output_file, index=False)

# === OUTPUT STATS ===
print(f"\n✅ Done! Merged file saved as: {output_file}")
print(f"🔢 Number of features (columns): {len(merged_df.columns)}")
print(f"\n📊 Label distribution:\n{merged_df['Label'].value_counts()}")


#### Features
- Flow Duration  
- Flow Bytes/s  
- Flow Packets/s  
- Flow IAT Mean  
- Flow IAT Std  
- Flow IAT Max  
- Flow IAT Min  
- Fwd IAT Total  
- Fwd IAT Mean  
- Fwd IAT Std  
- Fwd IAT Max  
- Fwd IAT Min  
- Bwd IAT Total  
- Bwd IAT Mean  
- Bwd IAT Std  
- Bwd IAT Max  
- Bwd IAT Min  
- Fwd PSH Flags  
- Bwd PSH Flags  
- Fwd URG Flags  
- Bwd URG Flags  
- Fwd Header Length  
- Bwd Header Length  
- Fwd Packets/s  
- Bwd Packets/s  
- FIN Flag Count  
- SYN Flag Count  
- RST Flag Count  
- PSH Flag Count  
- ACK Flag Count  
- URG Flag Count  
- CWE Flag Count  
- ECE Flag Count  
- Down/Up Ratio  
- Avg Fwd Segment Size  
- Avg Bwd Segment Size  
- Fwd Avg Bulk Rate  
- Bwd Avg Bulk Rate  
- Subflow Fwd Packets  
- Subflow Fwd Bytes  
- Subflow Bwd Packets  
- Subflow Bwd Bytes  
- Active Mean  
- Active Std  
- Active Max  
- Active Min  
- Idle Mean  
- Idle Std  
- Idle Max  
- Idle Min  
- Label


### What each feature is used for
1. Flow Duration
Total time (in microseconds) from the first to the last packet in the flow.
Useful to detect long-running connections (e.g., backdoors or slow attacks).

2. Flow Bytes/s
Total number of bytes transferred per second in the flow.
High or very low values may suggest suspicious activity.

3. Flow Packets/s
Number of packets sent per second.
Burst patterns can indicate DoS attacks or scanning.

4. Flow IAT Mean
Average Inter-Arrival Time between packets across the flow.
Captures regularity or burstiness of communication.

5. Flow IAT Std
Standard deviation of packet inter-arrival time.
High variation can suggest abnormal behavior.

6. Flow IAT Max
Maximum time between two packets in the flow.
Unusually high values may indicate idle times or slow attacks.

7. Flow IAT Min
Minimum inter-arrival time.
Helps detect rapid packet bursts (e.g., flooding).

8. Fwd IAT Total
Total time between forward packets (source to destination).
Describes total activity period for the forward direction.

9. Fwd IAT Mean
Mean time between forward packets.
Useful for identifying scan patterns or exfiltration attempts.

10. Fwd IAT Std
Standard deviation of forward packet timing.
Shows irregular communication behavior.

11. Fwd IAT Max
Max time between forward packets.
Useful to identify idle periods in an attack.

12. Fwd IAT Min
Min time between forward packets.
Rapid transmissions may suggest flooding.

13. Bwd IAT Total
Total time between backward packets (destination to source).
Shows total response activity.

14. Bwd IAT Mean
Mean inter-arrival time for backward packets.
Helps understand server/client behavior.

15. Bwd IAT Std
Standard deviation of backward packet times.
Variation could hint at response delays or server overload.

16. Bwd IAT Max
Max time between backward packets.
Can detect idle wait between responses.

17. Bwd IAT Min
Min time between backward packets.
Very low values might suggest fast server responses or echoes.

18. Fwd PSH Flags
Count of PUSH flags in forward direction (TCP).
Used in data transfer; excessive use could be suspicious.

19. Bwd PSH Flags
Same as above, but for backward direction.

20. Fwd URG Flags
Count of URGENT flags in forward packets.
Rarely used in normal traffic — often abused in attacks.

21. Bwd URG Flags
Same for backward packets.

22. Fwd Header Length
Total size of headers in forward packets.
Might indicate packet crafting or tunneling attempts.

23. Bwd Header Length
Same for backward direction.

24. Fwd Packets/s
Packet rate in forward direction.
High rates could mean scanning or exfiltration.

25. Bwd Packets/s
Packet rate in backward direction.

26. FIN Flag Count
Number of packets with FIN flag (TCP session closure).
Helps detect orderly shutdowns or incomplete sessions (e.g., TCP FIN scan).

27. SYN Flag Count
SYN packets count (used to initiate TCP connection).
High counts may indicate SYN floods or scans.

28. RST Flag Count
Number of packets with the RST flag set, which abruptly resets a connection.
Abnormal use may suggest scanning or disruption.

29. PSH Flag Count
Number of packets with the PSH flag set, which asks the receiver to pass data to the application immediately instead of buffering it.
High counts can signal tunneling or custom protocols.

30. ACK Flag Count
Acknowledgment packets count.
Important for validating bidirectional traffic.

31. URG Flag Count
URG flag indicates urgent data.
Rarely used — can be abused in stealthy attacks.

32. CWE Flag Count
Number of packets with the TCP CWR (Congestion Window Reduced) flag set; "CWE" is the column name CICFlowMeter uses for this flag.
Part of ECN congestion signalling, together with the ECE flag below.

33. ECE Flag Count
Explicit Congestion Notification Echo.
Used for congestion control — may help detect overuse or spoofing.

34. Down/Up Ratio
Ratio of downstream to upstream traffic.
Asymmetry could imply upload/download behavior or scanning.

35. Avg Fwd Segment Size
Average size of forward TCP segments.
Can show if the client is pushing large data blocks.

36. Avg Bwd Segment Size
Same for backward segments (e.g., server responses).

37. Fwd Avg Bulk Rate
Bulk data rate of forward flow.
High rate may point to data exfiltration.

38. Bwd Avg Bulk Rate
Same for backward.

39. Subflow Fwd Packets
Number of packets in forward subflows.
Can indicate fragmentation or flow splitting.

40. Subflow Fwd Bytes
Bytes sent in forward subflows.

41. Subflow Bwd Packets
Packets in backward subflows.

42. Subflow Bwd Bytes
Bytes in backward subflows.

43. Active Mean
Mean time the flow was actively transmitting packets.
Useful for spotting silent intervals in attacks.

44. Active Std
Std dev of active times.

45. Active Max
Max duration of activity burst.

46. Active Min
Min burst activity time.

47. Idle Mean
Average idle time (periods with no packets).
High values may indicate slow connections or covert channels.

48. Idle Std
Standard deviation of idle times.

49. Idle Max
Maximum observed idle time.

50. Idle Min
Minimum idle time between packets.

51. Label
Attack name or Benign.
Used as the target class for training the ML model.