In [2]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# ==== LOAD DATA ====
df = pd.read_csv("/content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/UNSW_NB15_merged.csv")

df.drop(columns=['id'], inplace=True)

# Identify categorical and numeric columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Don't scale label column if it's the target
label_col = 'label'
if label_col in num_cols:
    num_cols.remove(label_col)

# ==== 1. FREQUENCY ENCODING FOR CATEGORICAL VARIABLES ====
freq_mappings = {}

for col in cat_cols:
    freqs = df[col].value_counts(normalize=True)  # relative frequencies
    df[col] = df[col].map(freqs)
    freq_mappings[col] = freqs.to_dict()

# ==== 2. SCALING NUMERIC FEATURES ====
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# ==== 3. SAVE PROCESSED DATASET ====
df.to_csv("/content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/UNSW_NB15_freq_scaled.csv", index=False)
print("\nProcessed dataset saved as UNSW_NB15_freq_scaled.csv")

# ==== 4. SAVE FREQUENCY MAPPINGS ====
with open("/content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/categorical_frequency_mappings.txt", "w") as f:
    for col, mapping in freq_mappings.items():
        f.write(f"{col}:\n")
        for category, freq in mapping.items():
            f.write(f"  {category}: {freq:.6f}\n")
            print(f"  {category}: {freq:.6f}\n")
        f.write("\n")

print("\nFrequency mappings saved to categorical_frequency_mappings.txt")

# ==== 5. QUICK CHECK ====
print("\n===== Sample Encoded Data =====")
print(df.head())

print("\n===== Example Frequency Mapping =====")
for col in cat_cols:
    print(f"\n{col} mapping:")
    for category, freq in freq_mappings[col].items():
        print(f"  {category}: {freq:.6f}")
    break  # show only the first categorical mapping here



Processed dataset saved as UNSW_NB15_freq_scaled.csv
  tcp: 0.477508

  udp: 0.359762

  unas: 0.060538

  arp: 0.014926

  ospf: 0.012694

  sctp: 0.005720

  any: 0.001537

  gre: 0.001215

  rsvp: 0.001025

  ipv6: 0.001017

  sun-nd: 0.000990

  swipe: 0.000982

  mobile: 0.000982

  pim: 0.000982

  sep: 0.000974

  xns-idp: 0.000516

  prm: 0.000516

  leaf-1: 0.000516

  cbt: 0.000512

  cphb: 0.000512

  iso-ip: 0.000512

  kryptolan: 0.000512

  cpnx: 0.000512

  il: 0.000512

  3pc: 0.000512

  ipv6-route: 0.000512

  idrp: 0.000512

  bna: 0.000512

  mfe-nsp: 0.000512

  ib: 0.000512

  dgp: 0.000512

  sat-mon: 0.000512

  pri-enc: 0.000512

  pvp: 0.000512

  ipv6-frag: 0.000512

  wsn: 0.000512

  ddp: 0.000512

  snp: 0.000512

  ipv6-opts: 0.000512

  ax.25: 0.000512

  eigrp: 0.000512

  pgm: 0.000512

  idpr-cmtp: 0.000512

  pnni: 0.000512

  gmtp: 0.000512

  narp: 0.000512

  fc: 0.000512

  pipe: 0.000512

  ipcomp: 0.000512

  ipv6-no: 0.000512

  sat-expak: 0.

In [None]:
from joblib import dump

# Save scaler for future use
dump(scaler, "/content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/scaler.joblib")
print("\nScaler saved as scaler.joblib")


Scaler saved as scaler.joblib


# attack_cat as categoriccal instead of frequency map

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# ==== LOAD DATA ====
df = pd.read_csv("/content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/UNSW_NB15_merged.csv")
df.drop(columns=['id'], inplace=True)

# Identify categorical and numeric columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Don't scale label column if it's the target
label_col = 'label'
if label_col in num_cols:
    num_cols.remove(label_col)

# ==== 1. ENCODING CATEGORICAL VARIABLES ====
freq_mappings = {}
attack_cat_mapping = {}

for col in cat_cols:
    if col == "attack_cat":
        # Integer encoding with "Normal" as 0
        unique_vals = sorted(df[col].dropna().unique())
        unique_vals = [v for v in unique_vals if str(v).lower() == "normal"] + \
                      [v for v in unique_vals if str(v).lower() != "normal"]
        attack_cat_mapping = {val: idx for idx, val in enumerate(unique_vals)}
        df[col] = df[col].map(attack_cat_mapping)
    else:
        # Frequency encoding
        freqs = df[col].value_counts(normalize=True)
        df[col] = df[col].map(freqs)
        freq_mappings[col] = freqs.to_dict()

# ==== 2. SCALING NUMERIC FEATURES ====
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# ==== 3. SAVE PROCESSED DATASET ====
processed_path = "/content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/UNSW_NB15_attack_cat.csv"
df.to_csv(processed_path, index=False)
print(f"\nProcessed dataset saved as {processed_path}")

# ==== 4. SAVE ENCODING MAPPINGS ====
mapping_path = "/content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/categorical_frequency_mappings_2.txt"
with open(mapping_path, "w") as f:
    f.write("attack_cat (integer encoding):\n")
    for category, idx in attack_cat_mapping.items():
        f.write(f"  {category}: {idx}\n")
    f.write("\n")
    for col, mapping in freq_mappings.items():
        f.write(f"{col} (frequency encoding):\n")
        for category, freq in mapping.items():
            f.write(f"  {category}: {freq:.6f}\n")
        f.write("\n")

print(f"\nEncoding mappings saved to {mapping_path}")

# ==== 5. QUICK CHECK ====
print("\n===== Sample Encoded Data =====")
print(df.head())

print("\n===== attack_cat mapping =====")
for category, idx in attack_cat_mapping.items():
    print(f"  {category}: {idx}")



Processed dataset saved as /content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/UNSW_NB15_attack_cat.csv

Encoding mappings saved to /content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/categorical_frequency_mappings_2.txt

===== Sample Encoded Data =====
        dur     proto   service   state     spkts     dpkts    sbytes  \
0 -0.188346  0.477508  0.548451  0.4547 -0.101342 -0.129612 -0.047849   
1 -0.099897  0.477508  0.548451  0.4547 -0.042496  0.173998 -0.045110   
2  0.063006  0.477508  0.548451  0.4547 -0.086630 -0.022456 -0.047239   
3  0.072800  0.477508  0.019327  0.4547 -0.057207 -0.058174 -0.045720   
4 -0.133449  0.477508  0.548451  0.4547 -0.071919 -0.111753 -0.046261   

     dbytes      rate      sttl  ...  ct_dst_sport_ltm  ct_dst_src_ltm  \
0 -0.097232 -0.568650  0.702512  ...         -0.520051       -0.658496   
1  0.188966 -0.568623 -1.151363  ...         -0.520051       -0.568574   
2 -0.008217 -0.569024 -1.151363  ...  

# one hot encoding


That's an excellent question, and it's a critical point when applying causal discovery algorithms like DirectLiNGAM.

For DirectLiNGAM, it is generally much better to use one-hot encoding (a form of mapping) than to use frequency encoding or simple integer mapping.

Why One-Hot Encoding is Best
The fundamental assumption of LiNGAM and similar methods is that the relationships between variables are linear and continuous. When you have a categorical variable, you need to transform it into a format that honors this assumption.

Avoids False Order: Simple integer mapping (e.g., A=1, B=2, C=3) imposes an artificial ordinal relationship on the data. DirectLiNGAM might interpret this as a linear effect, incorrectly assuming that C is "more" of something than A simply because its integer value is higher. This can lead to misleading or incorrect causal relationships.

Captures Individual Effects: One-hot encoding creates a new binary variable (0 or 1) for each category. For example, if your variable is color with categories red and blue, you'd get two new features: is_red and is_blue. This allows the model to learn the unique causal effect of being in the red category and the effect of being in the blue category independently, without any false assumptions about their relative order.

Frequency Encoding is Problematic: Frequency encoding replaces each category with its frequency in the dataset. This can also lead to incorrect causal inference, as it loses the distinction between the categories themselves and instead models their causal effect based on how often they occur. Two different categories that have a similar frequency would be treated identically by the model.


One-hot encoding could be considered if:Many Categories with Distinct Effects: If proto has many categories (e.g., 10+ protocols), and each has a unique causal role (e.g., specific protocols strongly tied to is_malicious), one-hot encoding ensures each category’s effect is modeled separately.
Non-Linear Models: If you were using a causal discovery method that handles categorical variables or non-linear relationships (e.g., PC algorithm in causal-learn), one-hot encoding is preferred. However, DirectLiNGAM’s linear assumption makes this less relevant.
Domain Knowledge: If you know protocols have independent causal effects and you want to avoid any ordinal assumptions, one-hot encoding is safer, but you’d need to manage the increased dimensionality.



In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# ==== LOAD DATA ====
df = pd.read_csv("/content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/UNSW_NB15_merged.csv")
df.drop(columns=['id'], inplace=True)

# Identify categorical and numeric columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Don't scale label column if it's the target
label_col = 'label'
if label_col in num_cols:
    num_cols.remove(label_col)

# ==== 2. ONE-HOT ENCODE ALL CATEGORICAL FEATURES ====
df = pd.get_dummies(df, columns=cat_cols, drop_first=False)

# ==== 3. SCALING NUMERIC FEATURES ====
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# ==== 4. SAVE PROCESSED DATASET ====
processed_path = "/content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/UNSW_NB15_all_ohe.csv"
df.to_csv(processed_path, index=False)
print(f"\nProcessed dataset saved as {processed_path}")

# ==== 5. PRINT NEW attack_cat FEATURES ====
attack_cat_cols = [col for col in df.columns if col.startswith("attack_cat_")]
print("\n===== One-Hot Encoded attack_cat Columns =====")
print(attack_cat_cols)



Processed dataset saved as /content/drive/MyDrive/Data-science-project/journal-article/UNSW_NB15/data/UNSW_NB15_all_ohe.csv

===== One-Hot Encoded attack_cat Columns =====
['attack_cat_Analysis', 'attack_cat_Backdoor', 'attack_cat_DoS', 'attack_cat_Exploits', 'attack_cat_Fuzzers', 'attack_cat_Generic', 'attack_cat_Normal', 'attack_cat_Reconnaissance', 'attack_cat_Shellcode', 'attack_cat_Worms']
