In [None]:
import os

import cv2
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm # for progress bars

# Root folder of the NIH X-ray dataset on the local machine
DATA_DIR = r'D:\NIH Xray Dataset'
# CSV file containing labels, expected directly under DATA_DIR
CSV_PATH = os.path.join(DATA_DIR, 'Data_Entry_2017.csv')

# Image folders: <DATA_DIR>/images_001/images ... <DATA_DIR>/images_012/images
IMAGE_SUBFOLDER_PREFIX = 'images_' # The prefix for the image folders
IMAGE_DIRS = [
    os.path.join(DATA_DIR, IMAGE_SUBFOLDER_PREFIX + format(folder_no, '03d'), 'images')
    for folder_no in range(1, 13)
]

# Load the CSV metadata into `df`; nothing below can run without it, so a
# missing file aborts the script.
try:
    # Keep only the call that can raise FileNotFoundError inside the try body.
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    print(f"Error: CSV file not found at {CSV_PATH}. Please check your DATA_DIR and CSV_PATH.")
    # Exit with a non-zero status so callers can detect the failure. The bare
    # exit() helper comes from the `site` module, is meant for interactive
    # sessions, and would report success (status 0) here.
    raise SystemExit(1) from None
else:
    print(f"CSV loaded successfully! Total entries: {len(df)}")
    print("First 5 rows of metadata:")
    print(df.head())

# --- 2. Create a mapping of image filenames to their full paths ---
# Scan every configured image folder and remember where each PNG lives, keyed
# by bare filename. If the same filename appears in more than one folder, the
# folder scanned last wins in the map, but the counter is still incremented.
print("\nBuilding image path map (this might take a moment for large datasets)...")
image_path_map = {}
found_image_count = 0
for img_dir in IMAGE_DIRS:
    if not os.path.exists(img_dir):
        print(f"Warning: Image directory not found: {img_dir}. Skipping.")
        continue

    # Progress bar label: the name of the folder that contains this 'images' dir
    parent_folder_name = os.path.basename(os.path.dirname(img_dir))
    for filename in tqdm(os.listdir(img_dir), desc=f"Scanning {parent_folder_name}"):
        if not filename.endswith('.png'): # Assuming images are PNGs
            continue
        image_path_map[filename] = os.path.join(img_dir, filename)
        found_image_count += 1

print(f"Image path map built. Found {found_image_count} image paths.")

# Resolve each row's 'Image Index' filename to its on-disk location via
# image_path_map and store it in a new 'Path' column; filenames absent from
# the map end up as missing values.
df['Path'] = df['Image Index'].map(image_path_map)

# Rows from the CSV whose image file was not found during the folder scan
missing_images_in_df = df[df['Path'].isnull()]
if missing_images_in_df.empty:
    print("\nAll images listed in CSV found on disk.")
else:
    print(f"\nWarning: {len(missing_images_in_df)} images listed in CSV were not found on disk.")
    # Drop the unresolved rows in place so every remaining row has a usable path
    df.dropna(subset=['Path'], inplace=True)
    print(f"Removed missing images. Remaining entries in DataFrame: {len(df)}")

# --- 3. Example: Load and display a sample image using its path ---
# Smoke test: show the first remaining row's image together with its labels.
if df.empty:
    print("DataFrame is empty after processing. No images to display.")
else:
    sample_row = df.iloc[0] # Get the first row
    sample_image_filename = sample_row['Image Index']
    sample_image_full_path = sample_row['Path']
    sample_finding_labels = sample_row['Finding Labels']

    print(f"\nAttempting to load sample image: {sample_image_filename}")
    print(f"Full path: {sample_image_full_path}")
    print(f"Labels: {sample_finding_labels}")

    try:
        image = cv2.imread(sample_image_full_path)
        if image is None:
            # The None result is treated as "file could not be read"
            print(f"Error: Could not read image file at {sample_image_full_path}. It might be corrupted or empty.")
        else:
            # OpenCV loads BGR, Matplotlib expects RGB
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            plt.figure(figsize=(8, 8))
            plt.imshow(image_rgb)
            plt.title(f"Image: {sample_image_filename}\nLabels: {sample_finding_labels}")
            plt.axis('off')
            plt.show()
    except Exception as e:
        # Best-effort demo: report any failure instead of crashing the script
        print(f"An unexpected error occurred while loading or displaying the image: {e}")
