Prepare Cross Architecture Base Dataset

In [1]:
import pandas as pd

input_file = "/home/tommy/Project/PcodeBERT/dataset/csv/merged_adjusted_filtered.csv"
output_file = "/home/tommy/Project/PcodeBERT/dataset/csv/merged_filtered_final.csv"

try:
    df = pd.read_csv(input_file)

    families_to_exclude = ['dofloo', 'kaiji', 'mobidash', 'wroba']
    
    condition_to_exclude = (df['CPU'] == 'ARM') & (df['family'].isin(families_to_exclude))
    
    filtered_df = df[~condition_to_exclude]
    
    filtered_df.to_csv(output_file, index=False)
    
    print(f"資料過濾完成，已儲存至 {output_file}")
    print(f"原始筆數: {len(df)}")
    print(f"過濾後筆數: {len(filtered_df)}")

except FileNotFoundError:
    print(f"錯誤：找不到檔案 {input_file}")
except Exception as e:
    print(f"發生錯誤：{e}")

資料過濾完成，已儲存至 /home/tommy/Project/PcodeBERT/dataset/csv/merged_filtered_final.csv
原始筆數: 15622
過濾後筆數: 15291


Clean new dataset

In [5]:
import pandas as pd
import numpy as np

# -----------------
# 1. 調整後的資料分佈 (沿用您上一回合的結果)
# -----------------
adjusted_data = {
    'ARM': [1470, 182, 378, 93, 70, 517, 68, 246, 52],
    'x86_64': [1851, 49, 607, 54, 157, 514, 12, 575, 10],
    'Intel': [1478, 346, 390, 46, 67, 515, 50, 242, 16],
    'MIPS': [1618, 111, 439, 176, 38, 550, 141, 267, 45],
    'PPC': [1518, 0, 445, 0, 21, 614, 0, 119, 0]
}
families = ['benign', 'dofloo', 'gafgyt', 'kaiji', 'meterpreter', 'mirai', 'mobidash', 'tsunami', 'wroba']
df_adjusted_counts = pd.DataFrame(adjusted_data, index=families)

# -----------------
# 2. 舊名稱到新名稱的對應表
# -----------------
cpu_mapping = {
    'Intel 80386': 'Intel',
    'Advanced Micro Devices X86-64': 'x86_64',
    'MIPS R3000': 'MIPS',
    'PowerPC': 'PPC',
    'ARM': 'ARM',
    # 忽略 'AArch64' 和 'hajime' 相關的樣本，因為它們不在調整分佈表中。
}

# -----------------
# 3. 讀取原始 CSV (使用您提供的路徑)
# -----------------
csv_path = "/home/tommy/Projects/cross-architecture/datasets/scripts/merged_deduped.csv"
try:
    df = pd.read_csv(csv_path)
    print(f"成功讀取檔案: {csv_path}")
except FileNotFoundError:
    print(f"錯誤：找不到檔案 {csv_path}。請檢查路徑是否正確。")
    exit()

# -----------------
# 4. 資料欄位名稱和值轉換
# -----------------

# a. 將原本的 'label' 欄位名稱改為 'family'
df = df.rename(columns={'label': 'family'})

# b. 轉換 CPU 名稱 (使用新的縮寫)
df['CPU'] = df['CPU'].replace(cpu_mapping)

# c. 建立新的 'label' 欄位 (0: benign, 1: malware)
df['label'] = df['family'].apply(lambda x: 0 if x == 'benign' else 1)

# d. 過濾掉不在調整表中的 family 和 CPU
valid_cpus = df_adjusted_counts.columns
valid_families = df_adjusted_counts.index
df = df[df['CPU'].isin(valid_cpus) & df['family'].isin(valid_families)]
print(f"已過濾掉不在調整分佈中 (如 'hajime' 和 'AArch64') 的樣本。")


# -----------------
# 5. 根據調整後的數量進行採樣 (Sampling)
# -----------------
df_new_list = []
RANDOM_SEED = 42 

for cpu in df_adjusted_counts.columns:
    for family in df_adjusted_counts.index:
        target_count = df_adjusted_counts.loc[family, cpu]
        
        # 過濾特定 CPU 和 Family
        df_subset = df[(df['CPU'] == cpu) & (df['family'] == family)]
        
        # 進行採樣
        if target_count > 0 and len(df_subset) >= target_count:
            df_sampled = df_subset.sample(n=target_count, random_state=RANDOM_SEED, replace=False)
            df_new_list.append(df_sampled)
        elif target_count > 0 and len(df_subset) < target_count:
            # 這是針對我們調整的分佈，理論上不應該發生，但作為保護機制
            print(f"警告: {cpu} - {family} 的原始樣本數 {len(df_subset)} 少於目標數 {target_count}。將保留所有原始樣本。")
            df_new_list.append(df_subset)

# 合併採樣後的資料
df_new = pd.concat(df_new_list)

# -----------------
# 6. 輸出最終 CSV
# -----------------

# 選取並重新排序所需的欄位
df_final = df_new[['file_name', 'CPU', 'label', 'family']]

# 儲存新的 DataFrame 到 CSV
output_path = "/home/tommy/Projects/PcodeBERT/dataset/csv/merged_adjusted.csv"
df_final.to_csv(output_path, index=False)

print(f"\n新的調整後資料集已成功儲存至：{output_path}")
print(f"最終樣本總數：{len(df_final)} 筆 (目標為 16157 筆)")

成功讀取檔案: /home/tommy/Projects/cross-architecture/datasets/scripts/merged_deduped.csv
已過濾掉不在調整分佈中 (如 'hajime' 和 'AArch64') 的樣本。

新的調整後資料集已成功儲存至：/home/tommy/Projects/PcodeBERT/dataset/csv/merged_adjusted.csv
最終樣本總數：16157 筆 (目標為 16157 筆)


In [3]:
csv_path = "/home/tommy/Projects/cross-architecture/datasets/scripts/merged_deduped.csv"

#display unique family and cpu 

df = pd.read_csv(csv_path)
unique_families = df['label'].unique()
unique_cpus = df['CPU'].unique()
print("Unique Families:", unique_families)
print("Unique CPUs:", unique_cpus)

Unique Families: ['tsunami' 'mirai' 'gafgyt' 'meterpreter' 'benign' 'kaiji' 'dofloo'
 'mobidash' 'hajime' 'wroba']
Unique CPUs: ['MIPS R3000' 'Advanced Micro Devices X86-64' 'ARM' 'Intel 80386'
 'PowerPC' 'AArch64']


In [None]:
import pandas as pd
import os

# --- 設定路徑 ---
CSV_PATH = "/home/tommy/Projects/PcodeBERT/dataset/csv/merged_adjusted.csv"
DATA_FOLDER = "/home/tommy/Projects/PcodeBERT/outputs/preprocessed/gpickle_new"
OUTPUT_PATH = "/home/tommy/Projects/PcodeBERT/dataset/csv/merged_adjusted_filtered.csv"

# 1. 讀取原始 CSV
df = pd.read_csv(CSV_PATH)
original_count = len(df)

# 2. 遞迴收集所有 .gpickle 檔名
gpickle_filenames = set()
for root, _, files in os.walk(DATA_FOLDER):
    for file in files:
        if file.endswith(".gpickle"):
            print(f"Found gpickle: {file}")
            gpickle_filenames.add(file)

# 3. 篩選 DataFrame 並儲存成功的部分
filtered_df = df[df['file_name'].isin(gpickle_filenames)]
filtered_df.to_csv(OUTPUT_PATH, index=False)
success_count = len(filtered_df)

# 4. 輸出結果
print(f"--- 處理結果 ---")
print(f"在 '{DATA_FOLDER}' 及其子資料夾中找到 {success_count} 個對應的 gpickle 檔案。")
print(f"成功部分已儲存至: '{OUTPUT_PATH}'")

# 5. 計算失敗數量
failed_count = original_count - success_count
print(f"共有 {failed_count} 個檔案未能在資料夾中找到 gpickle。")

# 額外：如果您想看到是哪些檔案失敗了，可以取消註解下面這行
# failed_df = df[~df['file_name'].isin(gpickle_filenames)]
# print("\n--- 部分失敗檔名範例 ---")
# print(failed_df['file_name'].head())

Found gpickle: de0d8f53e51af09f2a78ea0ad33eb9922a5d78d680305d815c8b1af42b9db71e.gpickle
Found gpickle: de4a1b602e6fb6aae6e331619890ff87633a15e1dde58aa9a35692a6427a2e7e.gpickle
Found gpickle: de1030fe637244d263f9deadf8b338196af03cdefa8f42eb775edae0bf9b69b1.gpickle
Found gpickle: de1977eee7b71a1e1153657029528bfaad546146fba216610e802ee35c65c8a8.gpickle
Found gpickle: dee2efedf2438807014b3a1ea39e63825064a182e1cf3bbebe7f54ad2c00cdea.gpickle
Found gpickle: de1bffbf8efecc46e401d970c98c0b81e92fccc6dd6ed9e868e8802acbd3cfd4.gpickle
Found gpickle: de5da89f7ab726ea95382e0810c2b4436181853270c4ca536af6f22f1c62a2c6.gpickle
Found gpickle: de2f7eca0ac6caed191444d0717ea91f24b1778d03ea252a390230a177d438ae.gpickle
Found gpickle: dec7e65ffb903afa371323779b0b94f2cad50037cccf5470bb77211062791bb4.gpickle
Found gpickle: def7b4b7096d24c5cd4a25b22c27dc7bfd8e512ebcd60b2116d0e3e1acab848d.gpickle
Found gpickle: de7d7051d308ec1529bd410e40417f136cea855b78a43c0c80a14cdb9f165d49.gpickle
Found gpickle: debb605527ef51d54