In [8]:
# Get the sop_uid from each file name, look it up in the Excel sheet, and report how many of them match.

import logging
import pathlib
from typing import Union, List
import traceback

logger = logging.getLogger(__name__)

from enum import Enum

class ErrorCode(Enum):
    """Catalogue of status codes; each member pairs an integer code with a message."""
    SUCCESS = (0, "Success")
    JSON_FILE_NOT_FOUND = (-19, "JSON file not found")
    INVALID_JSON_FORMAT = (-2, "Invalid JSON format")
    IMAGE_FILE_NOT_FOUND = (-3, "Image file not found")
    INVALID_INPUT_DIRECTORY = (-4, "Invalid input directory")
    SPREADSHEET_ERROR = (-5, "Spreadsheet operation error")
    INVALID_SHAPE_DATA = (-6, "Invalid shape data in JSON")
    FILE_OPERATION_ERROR = (-7, "File operation failed")

    def __init__(self, code, message):
        # Enum unpacks each member's tuple value into these two arguments.
        self.code, self.message = code, message

    @classmethod
    def get_message(cls, code: int) -> str:
        """Return the message of the first member whose code equals ``code``.

        Falls back to "Unknown error" when no member carries that code.
        """
        return next(
            (member.message for member in cls if member.code == code),
            "Unknown error",
        )
        
def collect_and_save_filenames(
    target_dir: Union[str, pathlib.Path],
    suffix: str,
    output_file: Union[str, pathlib.Path]
) -> int:
    """
    Recursively collect filenames with given suffix and save stems to file.

    Every regular file below ``target_dir`` whose name ends with ``suffix`` is
    visited. The unique stems (``Path.stem``) are sorted and written to
    ``output_file``, one per line.

    Args:
        target_dir: Directory to search recursively
        suffix: File suffix to match (e.g. '.txt'); a leading dot is added
            when it is missing
        output_file: File to save filename stems; its parent directories are
            created when they do not exist

    Returns:
        An integer ``ErrorCode`` code:
        ``SUCCESS`` when the stems were written, or when no file matched
        (nothing is written in that case);
        ``INVALID_INPUT_DIRECTORY`` when ``target_dir`` is not an existing
        directory;
        ``FILE_OPERATION_ERROR`` on a permission problem or any other
        exception.
    """
    try:
        # Convert to Path objects
        target_path = pathlib.Path(target_dir).resolve()
        output_path = pathlib.Path(output_file).resolve()

        # Validate inputs
        if not target_path.is_dir():
            logger.error(f"Target directory not found: {target_path}")
            # A bad search root is a directory problem, not a missing image.
            return ErrorCode.INVALID_INPUT_DIRECTORY.code

        if not suffix.startswith("."):
            suffix = f".{suffix}"
            logger.info(f"Normalized suffix to: {suffix}")

        # Prepare output directory
        output_parent = output_path.parent
        output_parent.mkdir(parents=True, exist_ok=True)

        # Collect unique filename stems
        logger.info(f"Starting recursive search in {target_path} for *{suffix} files")
        file_stems = set()

        for file_path in target_path.rglob(f"*{suffix}"):
            # rglob can also yield directories whose name ends with the suffix.
            if file_path.is_file():
                file_stems.add(file_path.stem)
                logger.debug(f"Found matching file: {file_path}")

        # Write results
        if not file_stems:
            logger.warning("No matching files found")
            # Nothing to write is not a failure; keep the int return contract
            # instead of leaking a bool (True == 1 is not a known code).
            return ErrorCode.SUCCESS.code

        with output_path.open("w") as f:
            f.write("\n".join(sorted(file_stems)))

        logger.info(f"Successfully saved {len(file_stems)} stems to {output_path}")
        return ErrorCode.SUCCESS.code

    except PermissionError as e:
        logger.error(f"Permission denied: {e}")
        return ErrorCode.FILE_OPERATION_ERROR.code
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}\n{traceback.format_exc()}")
        return ErrorCode.FILE_OPERATION_ERROR.code



# Scan one case folder for *.json files and dump their stems to sop_id.txt.
suffix = '.json'
outfile = 'sop_id.txt'
homedir = (
    r'/mnt/f/241129-xin1zhipu-thyroid-datas/11-extracted-received-datas/301PX_250219-croppedMultiNodulesInOneCase/'
    '02.202201020172.01'
)
collect_and_save_filenames(homedir, suffix, outfile)

0

In [10]:
import logging
import pandas as pd
from pathlib import Path
from datetime import datetime
from typing import Union

logger = logging.getLogger(__name__)

def find_and_save_matches(
    text_file: Union[str, Path],
    excel_file: Union[str, Path],
    sheet_name: str,
    column_name: str,
    output_dir: Union[str, Path]
) -> bool:
    """
    Find matches between text file items and Excel column values

    Each non-blank line of ``text_file`` (stripped) is looked up among the
    stripped string forms of the values in ``column_name``. Matching items are
    written, in text-file order, to ``matched-<timestamp>.txt`` inside
    ``output_dir``. No file is written when nothing matches.

    Args:
        text_file: Path to text file with items to search
        excel_file: Path to Excel file to search in
        sheet_name: Worksheet name containing the data
        column_name: Column name to search for matches
        output_dir: Directory to save results (created if missing)

    Returns:
        True if operation succeeded (including "no items" and "no matches"),
        False when an input file is missing, the sheet/column cannot be read,
        or any other error occurs
    """
    try:
        # Convert to Path objects and validate inputs
        text_path = Path(text_file).resolve()
        excel_path = Path(excel_file).resolve()
        output_dir = Path(output_dir).resolve()

        # Validate input files
        if not text_path.exists():
            logger.error(f"Text file not found: {text_path}")
            return False

        if not excel_path.exists():
            logger.error(f"Excel file not found: {excel_path}")
            return False

        # Read text file items
        with text_path.open('r') as f:
            search_items = [line.strip() for line in f if line.strip()]

        if not search_items:
            logger.warning("No valid items found in text file")
            return True

        # Read Excel data
        try:
            df = pd.read_excel(excel_path, sheet_name=sheet_name, usecols=[column_name])
        except ValueError as e:
            logger.error(f"Column or sheet not found: {str(e)}")
            return False

        # Drop empty cells before the string conversion: astype(str) would turn
        # NaN into the literal "nan", which adds a phantom value to the lookup
        # set (and would match a text line that reads "nan").
        excel_values = set(df[column_name].dropna().astype(str).str.strip())

        # Find matches
        matches = [item for item in search_items if item in excel_values]

        # Create output filename with timestamp
        output_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        output_file = output_dir / f"matched-{timestamp}.txt"

        # Save matches if any found
        if matches:
            with output_file.open('w') as f:
                f.write('\n'.join(matches))
            logger.info(f"Saved {len(matches)} matches to {output_file}")
        else:
            logger.warning("No matches found between text file and Excel column")

        # Log statistics
        logger.debug(f"Search items: {len(search_items)}")
        logger.debug(f"Excel values: {len(excel_values)}")
        logger.debug(f"Matches found: {len(matches)}")

        return True

    except pd.errors.EmptyDataError:
        logger.error("Excel file contains no data in the specified sheet")
        return False
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}", exc_info=True)
        return False

# Example usage: look up every item listed in `outfile` in the "sop_uid"
# column of the "nodules" sheet and save the hits under the results directory.
excelfile = r"/mnt/f/241129-xin1zhipu-thyroid-datas/11-extracted-received-datas/301PX_250219-croppedMultiNodulesInOneCase/nodules_0125_receivedFrom250222.xlsx"
find_and_save_matches(
    text_file=outfile,
    excel_file=excelfile,
    sheet_name="nodules",
    column_name="sop_uid",
    output_dir="results",
)

True

In [7]:
from enum import Enum

class ErrorCode(Enum):
    """Status codes for this notebook; every member is an (integer code, message) pair."""
    SUCCESS = (0, "Success")
    JSON_FILE_NOT_FOUND = (-19, "JSON file not found")
    INVALID_JSON_FORMAT = (-2, "Invalid JSON format")
    IMAGE_FILE_NOT_FOUND = (-3, "Image file not found")
    INVALID_INPUT_DIRECTORY = (-4, "Invalid input directory")
    SPREADSHEET_ERROR = (-5, "Spreadsheet operation error")
    INVALID_SHAPE_DATA = (-6, "Invalid shape data in JSON")
    FILE_OPERATION_ERROR = (-7, "File operation failed")

    def __init__(self, code, message):
        # The two halves of the member's tuple value arrive as separate arguments.
        self.message = message
        self.code = code

    @classmethod
    def get_message(cls, code: int) -> str:
        """Look up the message for ``code``; "Unknown error" when no member has it."""
        hits = [entry.message for entry in cls if entry.code == code]
        return hits[0] if hits else "Unknown error"

# In parseXiaobaoJson function:
def parseXiaobaoJson(json_file:pathlib.Path, leastPointCount:int=4):
    """Run the early-exit checks of the JSON parser and return an ErrorCode integer.

    This is an excerpt: the rest of the body is elided ("... existing code ...")
    and only the validation checks are shown.

    Args:
        json_file: Path of the JSON file to check; it must be absolute and
            must point to an existing regular file.
        leastPointCount: Threshold for ``pointCntInShape``; the shape check
            fails when ``pointCntInShape`` is smaller than this value.

    Returns:
        The integer ``code`` of the first failing check. When every check
        passes, the visible code falls off the end and returns ``None``.
    """
    # ... existing code ...
    # A relative path is reported with the INVALID_JSON_FORMAT code.
    if not json_file.is_absolute():
        return ErrorCode.INVALID_JSON_FORMAT.code
    # The path must name an existing regular file (a directory fails here).
    if not json_file.is_file():
        return ErrorCode.JSON_FILE_NOT_FOUND.code
    # NOTE(review): image_file is not defined anywhere in this excerpt;
    # presumably the elided code sets it. Confirm, otherwise this raises NameError.
    if not image_file.is_file():
        return ErrorCode.IMAGE_FILE_NOT_FOUND.code
    # NOTE(review): pointCntInShape is likewise not defined in this excerpt; confirm.
    if pointCntInShape < leastPointCount:
        return ErrorCode.INVALID_SHAPE_DATA.code
# Smoke check: the argument is not a regular file, so the call stops at the
# is_file() check; the recorded cell output is -19 (JSON_FILE_NOT_FOUND).
parseXiaobaoJson(pathlib.Path("/tmp"))

-19

In [4]:
# NOTE(review): test001 is not defined in any visible cell, and the recorded
# output of this run is "AttributeError: 'tuple' object has no attribute 'code'".
# Confirm where it is defined and whether it still fails on a fresh kernel.
test001()

AttributeError: 'tuple' object has no attribute 'code'