In [1]:
# Initial imports
import os
from pathlib import Path
import pandas as pd
import sys

In [2]:
# Path Setup
# Make the `src` package importable: the parent of the current working
# directory is treated as the project root and appended to sys.path once.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# This import must stay below the sys.path tweak above, otherwise `src`
# may not be resolvable.
from src.utils.project_path_utils import get_project_path

# Locations used by the rest of the notebook.
# NOTE(review): get_project_path presumably resolves its arguments relative to
# the project root, and create=True presumably creates the directory if it is
# missing -- confirm against src/utils/project_path_utils.
metadata_csv_path = get_project_path("metadata", "images_metadata.csv")
log_path = get_project_path('logs', create=True)
log_file = log_path / 'metadata_generator.log'

# Echo the resolved locations so a wrong working directory is spotted early.
print(f"Project root: {project_root}")
print(f"Metadata CSV path: {metadata_csv_path}")
print(f"Log file will be created at: {log_file}")

Project root: d:\UCSD_MJM_Data_Grooming
Metadata CSV path: d:\UCSD_MJM_Data_Grooming\metadata\images_metadata.csv
Log file will be created at: d:\UCSD_MJM_Data_Grooming\logs\metadata_generator.log


In [3]:
# Setup Logging and Run Generator
from src.preprocessing.metadata_generator import MetadataGenerator, setup_logging

# Initialize the logging configuration before the generator is created.
setup_logging()

# Run the generator to populate the metadata CSV.
# NOTE(review): process_directory() presumably scans the image directory
# (the log output shows one "Processed: <file>" line per image) and
# write_csv() presumably persists the collected records -- confirm in
# src/preprocessing/metadata_generator.
generator = MetadataGenerator()
generator.process_directory()
generator.write_csv()

2025-02-19 18:15:49,688 - INFO - Loaded 0 existing records
2025-02-19 18:15:49,692 - INFO - Processed: 1078.jpg
2025-02-19 18:15:49,695 - INFO - Processed: 1079.jpg
2025-02-19 18:15:49,695 - INFO - Processed: 1080.jpg
2025-02-19 18:15:49,702 - INFO - Processed: 1081.jpg
2025-02-19 18:15:49,706 - INFO - Processed: 1082.jpg
2025-02-19 18:15:49,709 - INFO - Processed: 1083.jpg
2025-02-19 18:15:49,711 - INFO - Processed: 1084.jpg
2025-02-19 18:15:49,712 - INFO - Processed: 1085.jpg
2025-02-19 18:15:49,715 - INFO - Processed: 1086.jpg
2025-02-19 18:15:49,718 - INFO - Processed: 1087.jpg
2025-02-19 18:15:49,720 - INFO - Processed: 1088.jpeg
2025-02-19 18:15:49,723 - INFO - Processed: 1089.jpeg
2025-02-19 18:15:49,726 - INFO - Processed: 1090.jpeg
2025-02-19 18:15:49,727 - INFO - Processed: 1092.jpg
2025-02-19 18:15:49,729 - INFO - Processed: 1093.jpg
2025-02-19 18:15:49,731 - INFO - Processed: 1094.jpg
2025-02-19 18:15:49,733 - INFO - Processed: 1095.jpg
2025-02-19 18:15:49,735 - INFO - Proc

In [4]:
# Verify Results
# Read the metadata CSV back from disk, then report how many records it holds
# and show the first rows as a quick sanity check.
updated_metadata_df = pd.read_csv(metadata_csv_path)
first_records = updated_metadata_df.head()

print(f"\nTotal records after processing: {len(updated_metadata_df)}")
print("\nFirst few records:")
print(first_records)


Total records after processing: 694

First few records:
  filename_original  invoice_number  is_standard_form  moved_to_standardized  \
0          1078.jpg            1078             False                  False   
1          1079.jpg            1079             False                  False   
2          1080.jpg            1080             False                  False   
3          1081.jpg            1081             False                  False   
4          1082.jpg            1082             False                  False   

   manual_review_notes  original_orientation  rotation_applied  wb_corrected  \
0                  NaN                   NaN                 0         False   
1                  NaN                   NaN                 0         False   
2                  NaN                   NaN                 0         False   
3                  NaN                   NaN                 0         False   
4                  NaN                   NaN                 0

In [5]:
# Explore this data update
# Boolean mask of rows whose invoice_number appears more than once;
# keep=False marks every occurrence, not only the second and later ones.
is_repeated_invoice = updated_metadata_df["invoice_number"].duplicated(keep=False)

# Keep only the invoice_number column for the flagged rows (original index preserved).
duplicate_rows = updated_metadata_df.loc[is_repeated_invoice, "invoice_number"]
print(duplicate_rows)

41     1126
42     1126
67     1158
122    1229
123    1229
       ... 
646    1024
654    1034
655    1034
656    1037
657    1037
Name: invoice_number, Length: 175, dtype: int64
