Detection Datasets

In [7]:
import pandas as pd

# === Load the source table ===
file_path = "/home/tommy/Projects/cross-architecture/Experiment3.1/dataset/20250509_merged_deduped.csv"
df = pd.read_csv(file_path)

# === Selection criteria ===
target_cpus = ["ARM", "Advanced Micro Devices X86-64", "MIPS R3000", "PowerPC", "Intel 80386"]
benign_label = "benign"
malware_labels = ["mirai", "gafgyt", "tsunami"]

# Keep only rows whose CPU and label are both in scope.
cpu_in_scope = df["CPU"].isin(target_cpus)
label_in_scope = df["label"].isin([benign_label] + malware_labels)
filtered_df = df[cpu_in_scope & label_in_scope]

# === Stratified sampling per CPU ===
# Each entry is (source label, row quota, replacement label or None).
# Benign rows keep their label; every malware family is relabelled "malware".
# Entry order fixes the order in which samples are appended for each CPU.
sampling_plan = [(benign_label, 1200, None)]
sampling_plan += [(family, 400, "malware") for family in malware_labels]

all_samples = []

for cpu in target_cpus:
    cpu_df = filtered_df[filtered_df["CPU"] == cpu]

    for source_label, quota, relabel in sampling_plan:
        candidates = cpu_df[cpu_df["label"] == source_label]
        # Take the quota, or every available row when the group is smaller.
        picked = candidates.sample(n=min(quota, len(candidates)), random_state=42)
        if relabel is not None:
            # Work on an explicit copy before overwriting the label column.
            picked = picked.copy()
            picked["label"] = relabel
        all_samples.append(picked)

# Stack every per-CPU / per-label sample into one frame.
final_df = pd.concat(all_samples)

# === Save the result ===
output_path = "/home/tommy/Projects/cross-architecture/Experiment3.1/dataset/detection_dataset.csv"
final_df.to_csv(output_path, index=False)



Alignment Dataset

In [None]:
import pandas as pd

# Load the combined CSV.
file_name = "/home/tommy/Projects/pcodeFcg/dataset/csv/combined.csv"
df = pd.read_csv(file_name)

# Restrict to the target labels.
target_labels = ["benign", "mirai", "gafgyt", "tsunami"]
df = df[df['label'].isin(target_labels)]

# Build the train/test splits: each split is the set of rows whose CPU string
# contains the given pattern (case-insensitive, missing CPU values excluded),
# projected onto the three columns that are needed downstream.
keep_columns = ['file_name', 'CPU', 'label']
cpu_patterns = {
    "train": "Advanced Micro Devices X86-64",
    "test_arm": "ARM",
    "test_mips": "MIPS",
}
splits = {
    split: df.loc[df['CPU'].str.contains(pattern, case=False, na=False), keep_columns]
    for split, pattern in cpu_patterns.items()
}
train_df = splits["train"]
test_arm_df = splits["test_arm"]
test_mips_df = splits["test_mips"]

# Write each split to its CSV file.
for frame, csv_path in (
    (train_df, "/home/tommy/Projects/pcodeFcg/dataset/csv/train.csv"),
    (test_arm_df, "/home/tommy/Projects/pcodeFcg/dataset/csv/test_arm.csv"),
    (test_mips_df, "/home/tommy/Projects/pcodeFcg/dataset/csv/test_mips.csv"),
):
    frame.to_csv(csv_path, index=False)

# Print the first few rows of each split as a sanity check.
for heading, frame in (
    ("Train:", train_df),
    ("\nTest ARM:", test_arm_df),
    ("\nTest MIPS:", test_mips_df),
):
    print(heading)
    print(frame.head())


Train:
                                              file_name  \
1800  2d65b9d86d771c8d84f995f40f140ee07a1498353fe879...   
1801  afdf94d82b354b659652182e5d6fcc6a3d0d4dc20694c6...   
1802  ceb80fe99245dd78053c95b825d6a8b99767e3dbf2b7e8...   
1803  abef1c340c9c5f1870eb843f4169dceef63a70583ce68b...   
1804  35f8483e994e351d3b0ffba87893f70aa4e65ebf7ed7ff...   

                                CPU   label  
1800  Advanced Micro Devices X86-64  benign  
1801  Advanced Micro Devices X86-64  benign  
1802  Advanced Micro Devices X86-64  benign  
1803  Advanced Micro Devices X86-64  benign  
1804  Advanced Micro Devices X86-64  benign  

Test:
                                           file_name  CPU   label
0  1f20fd8843c6a5145399c66f16e27d6daa4bed92658911...  ARM  benign
1  3e78e1799bfd26e9b9a245bdeaad1ecda258b5ee03cd05...  ARM  benign
2  aa4474d13d7eb07fc790759fb04e73fd3948a53eb4825d...  ARM  benign
3  e905964de7509408a4d44ec7c8e743f23b98b25964c92a...  ARM  benign
4  3862d6a1de842210e07d56

Merge the train and test CSVs, keep only the file_name, CPU, and label columns, and write combined.csv (the file read by the "Alignment Dataset" cell above)

In [4]:
import pandas as pd

# Paths of the two input CSVs.
file_path = "/home/tommy/Projects/cross-architecture/Experiment3.1/dataset/cleaned_20250509_train_450.csv"
file_path_2 = "/home/tommy/Projects/cross-architecture/Experiment3.1/dataset/cleaned_20250509_test_600.csv"

# Read both inputs, first file first.
df, df_2 = (pd.read_csv(csv_path) for csv_path in (file_path, file_path_2))

# Stack the two frames under a fresh 0..n-1 index, then keep only the
# file_name, CPU and label columns (in that order).
stacked = pd.concat([df, df_2], ignore_index=True)
merged_df = stacked.loc[:, ['file_name', 'CPU', 'label']]

# Write the combined table without the index column.
merged_df.to_csv("/home/tommy/Projects/pcodeFcg/dataset/csv/combined.csv", index=False)
