In [1]:
import io
import logging
import os
import pymupdf4llm

In [2]:
# Configure the root logger once for the whole notebook: DEBUG verbosity with
# timestamped, level-tagged records. `logger` is the module-level logger that
# the cells below write to.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)

In [3]:
class ConvertResumesToText:
    """Convert the PDF/DOCX resumes found in one directory into text files.

    Every matching file in ``dir_orig`` is passed to
    ``pymupdf4llm.to_markdown`` and the returned text is written, UTF-8
    encoded, to ``<dir_dest>/<source stem>.txt``.

    NOTE(review): whether ``.docx`` input can actually be opened depends on the
    installed PyMuPDF build -- confirm. A file that cannot be converted is
    logged and skipped; it does not abort the run.
    """

    # Lower-case extensions that are picked up from ``dir_orig``.
    _EXTENSIONS = (".pdf", ".docx")

    def __init__(self, dir_orig, dir_dest):
        """Remember both directories and create whichever one is missing.

        Args:
            dir_orig: Directory that is scanned for resumes to convert.
            dir_dest: Directory that receives the converted ``.txt`` files.
        """
        self.dir_orig = dir_orig
        self.dir_dest = dir_dest

        # Create orig directory if it doesn't exist
        os.makedirs(self.dir_orig, exist_ok=True)

        # Create destination directory if it doesn't exist
        os.makedirs(self.dir_dest, exist_ok=True)

    def _get_paths(self):
        """Return the full paths of the convertible files in ``dir_orig``.

        Matching is case-insensitive (``CV.PDF`` counts), only regular files
        are returned, and the result is sorted by file name so that repeated
        runs process files in the same order.

        Returns:
            A list of paths; empty when nothing matches or when the directory
            has disappeared since construction (logged as an error).
        """
        try:
            names = sorted(os.listdir(self.dir_orig))
        except FileNotFoundError:
            logger.error(f"Directory not found: {self.dir_orig}")
            return []

        files = []
        for file in names:
            if not file.lower().endswith(self._EXTENSIONS):
                continue
            source_path = os.path.join(self.dir_orig, file)
            # A directory called e.g. "old.pdf" is not something to convert.
            if not os.path.isfile(source_path):
                continue
            logger.debug(f"Found file: {file}")
            files.append(source_path)
        logger.debug(f"Files found: {files}")
        return files

    def _dest_path(self, source_path, taken):
        """Choose the output path for ``source_path``.

        The default name is ``<stem>.txt``. If that name is already in
        ``taken`` (lower-cased names written earlier in this run), the source
        extension is appended to the stem (``cv.pdf`` -> ``cv_pdf.txt``), and a
        counter is added if even that is taken. ``taken`` is not modified.
        """
        stem, ext = os.path.splitext(os.path.basename(source_path))
        tag = ext.lstrip(".").lower()

        candidate = stem + ".txt"
        if candidate.lower() in taken:
            candidate = f"{stem}_{tag}.txt"
        counter = 2
        while candidate.lower() in taken:
            candidate = f"{stem}_{tag}_{counter}.txt"
            counter += 1
        return os.path.join(self.dir_dest, candidate)

    def convert_files(self):
        """Convert every file returned by ``_get_paths`` and write the results.

        Files left in ``dir_dest`` by an earlier run are overwritten, but two
        sources converted in the *same* run never share an output file: the
        later one gets a disambiguated name (see ``_dest_path``). A failure on
        one file is logged with its traceback and the loop moves on.

        Returns:
            None.
        """
        files = self._get_paths()

        if not files:
            logger.warning("No PDF or DOCX files found to process")
            return

        # Lower-cased output names written so far in this run. Compared
        # case-insensitively so the check also holds on filesystems that
        # treat "CV.txt" and "cv.txt" as the same file.
        written = set()

        for source_path in files:
            try:
                # Process the file
                logger.debug(f"Processing file: {source_path}")
                parsed_content = pymupdf4llm.to_markdown(source_path)

                # Create destination path with .txt extension
                dest_file = self._dest_path(source_path, written)
                logger.debug(f"Destination file: {dest_file}")

                # Write contents to dest_file
                with open(dest_file, "w", encoding='utf-8') as f:
                    f.write(parsed_content)

                # Claim the name only after a successful write, so a source
                # that failed to convert does not push a later one off its
                # default name.
                written.add(os.path.basename(dest_file).lower())
                logger.info(f"Successfully converted {source_path} to {dest_file}")

            except Exception as e:
                # Best effort: keep going, but keep the traceback in the log.
                logger.exception(f"Error processing {source_path}: {e}")


In [4]:
# import argparse

# def parse_arguments():
#     """Parse command line arguments."""
#     parser = argparse.ArgumentParser(
#         description='Run notebook code from command line'
#     )
#     parser.add_argument(
#         '--dir_orig',
#         type=str,
#         required=True,
#         help='Directory containing the PDF/DOCX resumes to convert'
#     )
#     parser.add_argument(
#         '--dir_dest',
#         type=str,
#         required=True,
#         help='Directory where the converted .txt files are written'
#     )
#     return parser.parse_args()

In [5]:
# def main():
#     """Main execution function."""
#     # Parse command line arguments
#     args = parse_arguments()
    
#     try:
#         # Initialize your class
#         instance = ConvertResumesToText(args.dir_orig, args.dir_dest)
        
#         # Run your main logic
#         instance.convert_files()
        
#         logger.info("Processing completed successfully")
        
#     except Exception as e:
#         logger.error(f"An error occurred: {str(e)}")
#         raise

# if __name__ == "__main__":
#     main()

In [None]:
# resumes = ConvertResumesToText(dir_orig="./resumes/to_proc/", dir_dest="./resumes/processed/")
# resumes.convert_files()