In [None]:
import sys
from pathlib import Path
from tqdm import tqdm

# 假設 utils 模組中的函數能夠被正確導入
# 這需要您確保 utils.py 在 Python 的搜尋路徑中
from utils import iterate_json_files, extract_sentences_from_file, read_filenames_from_csv

# --- User-configurable constants ---
# CSV file handed to read_filenames_from_csv / iterate_json_files as `csv_path`.
CSV_PATH = Path("/home/tommy/Projects/PcodeBERT/dataset/csv/merged_adjusted_filtered.csv")
# Directory handed to iterate_json_files as `root_dir`.
RAW_DATA_PATH = Path("/home/tommy/Projects/PcodeBERT/reverse/new/results")
ERROR_LOG_PATH = Path("/home/tommy/Projects/PcodeBERT/outputs/preprocessed/error_log.txt") # assumed to be required by iterate_json_files — TODO confirm
# CPU name forwarded as `cpu_filter` to the utils helpers.
TARGET_CPU = "x86_64"
# Token searched for inside each extracted sentence.
TARGET_TOKEN = "INT_"

def find_raw_files_with_token(csv_path: Path, root_dir: Path, error_log_path: Path, cpu_to_process: str, target_token: str, limit: int = 10) -> list:
    """
    Walk the raw JSON files and collect the names of files that contain a token.

    Each item yielded by ``iterate_json_files`` is unpacked as
    ``(file_name, json_data_string)`` and passed to
    ``extract_sentences_from_file``. A file is recorded as soon as
    ``target_token in sentence`` is true for any of its sentences.

    NOTE(review): sentences are presumed to be lists of tokens. In that case
    the ``in`` test is an exact-element match, so a prefix such as ``"INT_"``
    does not match ``"INT_ADD"``. If sentences are plain strings it is a
    substring match instead. Confirm which one is intended.

    Args:
        csv_path: CSV file forwarded to ``iterate_json_files`` and
            ``read_filenames_from_csv``.
        root_dir: Directory forwarded to ``iterate_json_files``.
        error_log_path: Log path forwarded to ``iterate_json_files``.
        cpu_to_process: Value forwarded as ``cpu_filter``.
        target_token: Token to look for inside each sentence.
        limit: Maximum number of file names to return. A value of zero or
            less returns an empty list without scanning anything.

    Returns:
        The matching file names, in iteration order, at most ``limit`` long.
    """
    print(f"--- 開始尋找包含 '{target_token}' token 的原始檔案 (CPU: {cpu_to_process}) ---")

    found_filenames = []

    # Previously a non-positive limit still returned one match, because the
    # limit was only checked after the first append.
    if limit <= 0:
        return found_filenames

    # Iterator over the file data; each item is unpacked below as
    # (file_name, json_data_string).
    file_iterator = iterate_json_files(csv_path, root_dir, error_log_path, cpu_filter=cpu_to_process)

    # The total is only used to size the progress bar, so a failure here is
    # not fatal. Catch Exception (not a bare except) so Ctrl-C and SystemExit
    # still propagate, and report the reason.
    try:
        total_files = len(read_filenames_from_csv(csv_path, cpu_filter=cpu_to_process))
    except Exception as exc:
        total_files = None
        print(f"Warning: Could not get total file count for progress bar. ({exc})")

    # The context manager closes the bar even when the loop exits via break.
    with tqdm(file_iterator, total=total_files, desc="Searching files") as progress:
        for file_name, json_data_string in progress:
            sentences_from_file = extract_sentences_from_file((file_name, json_data_string))

            # Skip files for which nothing was extracted.
            if not sentences_from_file:
                continue

            # One matching sentence is enough to record the file.
            if any(target_token in sentence for sentence in sentences_from_file):
                found_filenames.append(file_name)

                # Stop once enough files have been collected.
                if len(found_filenames) >= limit:
                    break

    return found_filenames

if __name__ == "__main__":

    # Search the raw data for files containing the configured token.
    target_filenames = find_raw_files_with_token(
        csv_path=CSV_PATH,
        root_dir=RAW_DATA_PATH,
        error_log_path=ERROR_LOG_PATH,
        cpu_to_process=TARGET_CPU,
        target_token=TARGET_TOKEN,
        limit=10,
    )

    # Horizontal rule printed above and below the report.
    separator = "=" * 50

    print("\n" + separator)
    print(f"✅ 已找到包含 '{TARGET_TOKEN}' token 的原始檔案名 ({len(target_filenames)} 個):")

    if not target_filenames:
        print("   ❌ 未找到符合條件的檔案。")
    else:
        # Number the entries starting from 1.
        for position, filename in enumerate(target_filenames, start=1):
            print(f"   {position}. {filename}")
    print(separator)

In [None]:
import pickle
from typing import Any, List, Union

# Pickle file to inspect. It is read as a sequence of independently pickled
# objects ("batches") written back to back.
file = "/home/tommy/Projects/PcodeBERT/outputs/preprocessed/pcode_corpus_x86_64_new_data.pkl"

total_batches = 0
total_sentences_checked = 0

print(f"--- 檔案結構簡潔檢查: {file} ---")

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only run this on a file you produced yourself or otherwise trust.
with open(file, "rb") as f:
    while True:
        try:
            # Each successful load() returns one serialised object (a batch).
            # EOFError marks the end of the stream.
            corpus_batch: Any = pickle.load(f)
            total_batches += 1

            # Common prefix for this batch's report line: index, type, length.
            batch_type_str = f"Batch {total_batches} ({type(corpus_batch).__name__}, len={len(corpus_batch) if hasattr(corpus_batch, '__len__') else 'N/A'}): "

            if isinstance(corpus_batch, list) and corpus_batch:
                # The first element alone decides whether the batch is
                # reported as nested or flat.
                first_element = corpus_batch[0]

                if isinstance(first_element, list):
                    # Nested structure: List[List[Token]].
                    print(batch_type_str + "✅ 結構: List[List[Token]] (2D)")
                    # Each inner list is one sentence, so count the inner
                    # lists. Summing their lengths, as before, gave the token
                    # count and inflated the reported sentence total.
                    batch_sentences = sum(1 for sublist in corpus_batch if isinstance(sublist, list))
                    print(f"    - 句子範例: {first_element[:5]}...")
                else:
                    # Flat structure: List[Token]. Sentence boundaries are
                    # not recoverable here, so the element count is used.
                    print(batch_type_str + "❌ 結構: List[Token] (1D) - 警告：句子可能被展平。")
                    batch_sentences = len(corpus_batch)
                    print(f"    - 元素範例: {corpus_batch[:5]}...")
            else:
                # Not a list, or an empty list: nothing to count.
                print(batch_type_str + "❓ 結構: 非列表或為空。")
                batch_sentences = 0

            total_sentences_checked += batch_sentences

        except EOFError:
            # Normal termination: no more pickled objects in the file.
            break
        except Exception as e:
            # A corrupt or unexpected object stops the scan. The totals
            # printed below cover the batches read so far.
            print(f"Error reading batch {total_batches + 1}: {e}")
            break

print("\n--- 最終總結 ---")
print(f"檔案路徑: {file}")
print(f"找到的總 Batch (序列化物件) 數量: {total_batches}")
print(f"總共計算到的句子數量: {total_sentences_checked}")