Merge CSV

In [1]:
import pandas as pd
from collections import Counter

# 檔案路徑
files = [
    "/home/tommy/Projects/cross-architecture/datasets/csv/deduplicated/20250504_dedup_combined_file_features.csv",
    "/home/tommy/Projects/cross-architecture/datasets/csv/deduplicated/20250428-5_dedup_combined_file_features.csv",
    "/home/tommy/Projects/cross-architecture/datasets/csv/deduplicated/20250430-5_dedup_combined_file_features.csv",
    "/home/tommy/Projects/cross-architecture/datasets/csv/20250508_cleaned_all_malware_file_features.csv"
]

# 讀取並合併檔案
dfs = [pd.read_csv(f) for f in files]
combined = pd.concat(dfs, ignore_index=True)

# 檢查重複的檔名
filename_counts = Counter(combined["file_name"])
duplicates = {name: count for name, count in filename_counts.items() if count > 1}

# 顯示重複檔名數量
print(f"總行數: {len(combined)}")
print(f"重複檔名數量: {len(duplicates)}")

# 若有重複，顯示前5個
if duplicates:
    print("\n前5個重複檔名:")
    for name, count in list(duplicates.items())[:5]:
        print(f"{name}: 出現 {count} 次")

# 根據條件處理重複項
# 優先保留 CPU 和 label 欄位不為空的資料
def select_priority_row(group):
    # 檢查 CPU 和 label 欄位是否有值
    has_values = ~(group['CPU'].isna() | group['label'].isna())
    if has_values.any():
        # 返回有值的第一筆資料
        return group[has_values].iloc[0:1]
    else:
        # 若都沒有值，則保留第一筆
        return group.iloc[0:1]

# 根據 file_name 分組，並對每組應用選擇函數
deduped = combined.groupby('file_name').apply(select_priority_row).reset_index(drop=True)

# 儲存結果
deduped.to_csv("merged_deduped.csv", index=False)
print(f"\n移除重複後行數: {len(deduped)}")
print("已儲存至 merged_deduped.csv")

總行數: 33197
重複檔名數量: 266

前5個重複檔名:
7357069bf5306677330773a936078cb4840de493fd673ef11be8843a524bdb6d: 出現 2 次
73c581435280b4f823b5eaf90aea9cfe4999301e9852af7a1e95e9e77551434c: 出現 2 次
73fd677a62e465c6b6f028eeda0c5f79147eff45b98478cb97864835f58a553e: 出現 2 次
735ba90862662fd7cdc80d1ec0d440967b078fefc700e303ec083f0e57aafbe7: 出現 2 次
737ad21399108c65f7f8b3085410f5aa0957995df4c3dbfbc2fbc72f9bd81ac5: 出現 2 次


  deduped = combined.groupby('file_name').apply(select_priority_row).reset_index(drop=True)



移除重複後行數: 32931
已儲存至 merged_deduped.csv


In [3]:
# Check Merged File

merged_file = pd.read_csv("merged_deduped.csv")

# CPU by Label
cpu_label_counts = merged_file.groupby(['CPU', 'label']).size().reset_index(name='count')

print("\nCPU by Label Counts:") 
print(cpu_label_counts)



CPU by Label Counts:
                              CPU        label  count
0                         AArch64       gafgyt     16
1                         AArch64        kaiji     24
2                         AArch64  meterpreter     26
3                         AArch64        mirai    168
4                         AArch64     mobidash     13
5                         AArch64      tsunami     17
6                         AArch64        wroba     11
7                             ARM       benign   1922
8                             ARM       dofloo    182
9                             ARM       gafgyt   1249
10                            ARM       hajime     45
11                            ARM        kaiji     93
12                            ARM  meterpreter     70
13                            ARM        mirai   1706
14                            ARM     mobidash     68
15                            ARM      tsunami    809
16                            ARM        wroba     52
17  Ad

In [None]:
import pandas as pd

def split_cpu_data(csv_filename):
    # Read the CSV file
    df = pd.read_csv(csv_filename)
    train_cpus = ['ARM', 'Advanced Micro Devices X86-64']
    test_cpus = ['MIPS R3000']
    
    # Split data into train and test sets
    train_df = df[df['CPU'].isin(train_cpus)]
    test_df = df[df['CPU'].isin(test_cpus)]
    
    # Get base filename without extension
    base_filename = csv_filename.split('.')[0]
    
    # Save the train and test CSV files
    train_filename = f"{base_filename}_train.csv"
    test_filename = f"{base_filename}_test.csv"
    
    train_df.to_csv(train_filename, index=False)
    test_df.to_csv(test_filename, index=False)
    
    print(f"Train set: {len(train_df)} rows, saved as {train_filename}")
    print(f"Test set: {len(test_df)} rows, saved as {test_filename}")
    
    return train_df, test_df

# Usage
if __name__ == "__main__":
    train, test = split_cpu_data("merged_deduped.csv")

Train set: 20201 rows, saved as merged_deduped_train.csv
Test set: 12193 rows, saved as merged_deduped_test.csv


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd

# 參數設定
original_csv_path = "/home/tommy/Projects/cross-architecture/datasets/csv/Cross-arch_Dataset_with_Additional_20250428200830.csv"
output_csv_path = "/home/tommy/Projects/cross-architecture/datasets/csv/Additional_Files_20250506.csv"

# 讀取原始CSV檔案
df = pd.read_csv(original_csv_path)

# 篩選source欄位值為additional的資料
filtered_df = df[df['source'] == 'additional']

# Drop Source欄位
filtered_df = filtered_df.drop(columns=['source'])

# 顯示篩選結果數量
print(f"原始資料共 {len(df)} 筆")
print(f"標記為additional的資料共 {filtered_df.shape[0]} 筆")

# 儲存篩選後的資料到新CSV檔案
filtered_df.to_csv(output_csv_path, index=False)
print(f"已將篩選後的資料儲存至 {output_csv_path}")

原始資料共 77054 筆
標記為additional的資料共 2549 筆
已將篩選後的資料儲存至 /home/tommy/Projects/cross-architecture/datasets/csv/Additional_Files_20250506.csv


In [None]:
file_path="/home/tommy/datasets/202403_Malware(New).csv"

