Prepare Cross Architecture Base Dataset

In [None]:
import pandas as pd

benign_file_path = "/home/tommy/datasets/benignware_info.csv"
malware_file_path ="/home/tommy/datasets/202403_Malware(New).csv"


benign_df = pd.read_csv(benign_file_path)
malware_df = pd.read_csv(malware_file_path)

# # Display 10 rows of Malware DataFrame
# print("\nMalware DataFrame:")
# print(malware_df.head(10))

print(benign_df.head(50))


In [None]:
import pandas as pd

# 讀取資料
benign_file_path = "/home/tommy/datasets/benignware_info.csv"
malware_file_path = "/home/tommy/datasets/202403_Malware(New).csv"

benign_df = pd.read_csv(benign_file_path)
malware_df = pd.read_csv(malware_file_path)

# 要關注的家族
target_families = [
    "gafgyt", "tsunami", "mirai", "dofloo", 
    "kaiji", "mobidash", "meterpreter", "wroba"
]

# 要關注的 CPU
target_cpus = [
    "Intel i386-32",
    "ARM-32",
    "MIPS R3000-32",
    "Advanced Micro Devices x86-64",
    "PowerPC-32"
]

# 過濾 malware
filtered = malware_df[
    malware_df["family"].isin(target_families) &
    malware_df["CPU"].isin(target_cpus)
]

# 分組統計
stats = filtered.groupby(["family", "CPU"]).size().unstack(fill_value=0)

print(stats)


In [1]:
import pandas as pd

# Benign 與 Malware 對應關係
benign_to_plan = {
    "Intel 80386": "Intel i386-32",
    "ARM": "ARM-32",
    "MIPS R3000": "MIPS R3000-32",
    "Advanced Micro Devices X86-64": "AMD X86-64",   # benign CSV: 大寫 X
    "PowerPC": "PowerPC-32"
}

malware_to_plan = {
    "Intel i386-32": "Intel i386-32",
    "ARM-32": "ARM-32",
    "MIPS R3000-32": "MIPS R3000-32",
    "Advanced Micro Devices x86-64": "AMD X86-64",   # malware CSV: 小寫 x
    "PowerPC-32": "PowerPC-32"
}

dataset_plan = {
    "ARM-32": {
        "benign": 2000,      
        "dofloo": 250,
        "gafgyt": 250,
        "kaiji": 250,
        "meterpreter": 250,
        "mirai": 750,        
        "mobidash": 250,
        "tsunami": 250,
        "wroba": 22        
    },
    "AMD X86-64": {
        "benign": 2000,
        "dofloo": 250,
        "gafgyt": 250,
        "kaiji": 181,      
        "meterpreter": 250,
        "mirai": 750,        
        "mobidash": 250,
        "tsunami": 250,
        "wroba": 19        
    },
    "Intel i386-32": {
        "benign": 2000,
        "dofloo": 250,
        "gafgyt": 250,
        "kaiji": 250,
        "meterpreter": 250,
        "mirai": 750,      
        "mobidash": 250,
        "tsunami": 250,
        "wroba": 50        
    },
    "MIPS R3000-32": {
        "benign": 2000,
        "dofloo": 250,
        "gafgyt": 250,
        "kaiji": 250,
        "meterpreter": 44,  
        "mirai": 906,        
        "mobidash": 242,    
        "tsunami": 250,
        "wroba": 58        
    },
    "PowerPC-32": {
        "benign": 2000,
        "dofloo": 0,
        "gafgyt": 250,
        "kaiji": 0,
        "meterpreter": 23,  
        "mirai": 1477,       
        "mobidash": 0,
        "tsunami": 250,
        "wroba": 0
    }
}

benign_file_path = "/home/tommy/datasets/benignware_info.csv"
malware_file_path = "/home/tommy/datasets/202403_Malware(New).csv"
output_path = "/home/tommy/Projects/PcodeBERT/dataset/csv/base_dataset.csv"

benign_df = pd.read_csv(benign_file_path)
malware_df = pd.read_csv(malware_file_path)

# 過濾掉 packed 樣本
benign_df = benign_df[benign_df["is_packed"] != 1]
malware_df = malware_df[malware_df["is_packed"] != True]

selected_benign = []
selected_malware = []

# --- 抽 benign ---
for benign_cpu, plan_cpu in benign_to_plan.items():
    target_n = dataset_plan[plan_cpu]["benign"]
    df_cpu = benign_df[benign_df["CPU"] == benign_cpu]
    sampled = df_cpu.sample(n=min(len(df_cpu), target_n), random_state=42)
    sampled = sampled.assign(label=0, family="benign")
    selected_benign.append(sampled)

# --- 抽 malware ---
for malware_cpu, plan_cpu in malware_to_plan.items():
    plan = dataset_plan[plan_cpu]
    df_cpu = malware_df[malware_df["CPU"] == malware_cpu]

    for family, n in plan.items():
        if family == "benign" or n == 0:
            continue
        df_family = df_cpu[df_cpu["family"] == family]
        sampled = df_family.sample(n=min(len(df_family), n), random_state=42)
        sampled = sampled.assign(label=1)
        selected_malware.append(sampled)

# --- 合併 ---
final_df = pd.concat(selected_benign + selected_malware).reset_index(drop=True)

# 只保留需要的欄位
final_df = final_df[["file_name", "CPU", "label", "family"]]

# --- 統一 CPU 標籤 (*** 新增區塊 ***) ---
# 建立一個從原始名稱到目標統一名稱的完整對應字典
cpu_map = {**benign_to_plan, **malware_to_plan}

# 使用 .map() 或 .replace() 將 CPU 欄位的值進行統一
final_df['CPU'] = final_df['CPU'].replace(cpu_map)


# 輸出
final_df.to_csv(output_path, index=False)
print(f"✅ Dataset saved to {output_path}, total rows = {len(final_df)}")

✅ Dataset saved to /home/tommy/Projects/PcodeBERT/dataset/csv/base_dataset.csv, total rows = 20741


In [2]:
import pandas as pd

base_data_csv = "/home/tommy/Projects/PcodeBERT/dataset/csv/base_dataset.csv"
faid_list = "/home/tommy/Projects/PcodeBERT/reverse/timed_out_files.txt"

df = pd.read_csv(base_data_csv)

with open(faid_list, "r") as f:
    faid_set = set(line.strip() for line in f if line.strip())
print(f"Files to exclude (timed out): {len(faid_set)}")

# filter out rows where file_name is in faid_set and save new CSV
filtered_df = df[~df["file_name"].isin(faid_set)]
output_csv = "/home/tommy/Projects/PcodeBERT/dataset/csv/base_dataset_filtered.csv"
filtered_df.to_csv(output_csv, index=False)
print(f"Filtered dataset saved to {output_csv}, total rows = {len(filtered_df)}")




Files to exclude (timed out): 429
Filtered dataset saved to /home/tommy/Projects/PcodeBERT/dataset/csv/base_dataset_filtered.csv, total rows = 20312
