In [1]:
import os
from PIL import Image
from PIL.ExifTags import TAGS
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import csv
from fractions import Fraction

In [2]:
def get_exif_data(image_path):
    """Extract EXIF data from an image and sanitize values.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        dict | None: Mapping of EXIF tag name (or the raw tag id when the id
        is not in ``TAGS``) to a string value. ``None`` values become ``""``
        and ``bytes`` values are decoded with replacement characters.
        Returns ``None`` when the image carries no EXIF data or when any
        error occurs while reading it (the error is printed, not raised).
    """
    try:
        # The context manager releases the file handle as soon as the metadata
        # has been read; this matters when thousands of files are scanned.
        with Image.open(image_path) as img:
            # `_getexif` is a private helper that not every image class
            # defines. Fall back to the public `getexif()` instead of failing
            # with AttributeError on those formats.
            # NOTE(review): the fallback presumably exposes only the tags that
            # `getexif()` returns directly - confirm sub-IFD coverage if
            # fields such as ExposureTime are needed for those formats.
            legacy_getter = getattr(img, "_getexif", None)
            if legacy_getter is not None:
                exif_data = legacy_getter()
            else:
                exif_data = dict(img.getexif())

        if not exif_data:
            return None

        exif = {}
        for tag_id, value in exif_data.items():
            tag_name = TAGS.get(tag_id, tag_id)
            # Sanitize data: convert all values to strings and handle None
            if isinstance(value, bytes):
                value = value.decode(errors='replace')  # Decode bytes
            exif[tag_name] = str(value) if value is not None else ""
        return exif
    except Exception as e:
        # Best-effort scan: report the problem and skip this file.
        print(f"Error processing {image_path}: {str(e)}")
        return None
    

In [4]:
# Parallel processing: walk the directory trees and read EXIF data via a thread pool

def scan_directories(root_dirs):
    """Collect EXIF records for every image found under the given directories.

    Each directory in ``root_dirs`` is walked recursively; files whose
    extension (case-insensitive) is .jpg, .jpeg, .tiff or .png are handed to
    ``get_exif_data_with_file`` on a thread pool. Results are gathered in
    submission order and falsy results are dropped.

    Args:
        root_dirs: Iterable of directory paths to scan.

    Returns:
        list: The truthy values returned by ``get_exif_data_with_file``.
    """
    image_suffixes = ('.jpg', '.jpeg', '.tiff', '.png')

    def _image_jobs():
        """Yield (full_path, filename, source_root) for each matching file."""
        for source_root in root_dirs:
            for folder, _subdirs, filenames in os.walk(source_root):
                for filename in filenames:
                    _stem, suffix = os.path.splitext(filename)
                    if suffix.lower() in image_suffixes:
                        yield os.path.join(folder, filename), filename, source_root

    records = []
    with ThreadPoolExecutor() as pool:
        # Arguments are bound at submit time, one task per image file.
        pending = [
            pool.submit(get_exif_data_with_file, full_path, filename, source_root)
            for full_path, filename, source_root in _image_jobs()
        ]

        for job in tqdm(pending, desc="Processing images"):
            record = job.result()
            if record:
                records.append(record)
    return records

In [5]:
def get_exif_data_with_file(image_path, file, root_dir):
    """Read EXIF data for one image and tag it with its origin.

    Args:
        image_path: Full path passed on to ``get_exif_data``.
        file: Filename stored under the 'Filename' key.
        root_dir: Scan root stored under the 'SourceDirectory' key.

    Returns:
        Whatever ``get_exif_data`` returned; when that value is truthy it
        additionally carries 'Filename', 'Filepath' and 'SourceDirectory'.
    """
    record = get_exif_data(image_path)
    if not record:
        # Nothing to annotate; hand the falsy result back unchanged.
        return record

    record.update({
        'Filename': file,
        'Filepath': image_path,
        'SourceDirectory': root_dir,  # which scan root this file came from
    })
    return record

In [14]:
def convert_exposure_time(exposure_str):
    """Convert any ExposureTime to strict '1/X' or 'X"' format.

    Args:
        exposure_str: The exposure value, normally a string such as
            "(1, 16000)", "10/13", "0.0000625", "6.25e-05" or "2".
            Non-string scalars (int, float, Fraction) are accepted and
            converted with ``str()`` before parsing.

    Returns:
        str: The formatted exposure, or "" when the input is empty/NaN or
        cannot be parsed (the failure is printed, not raised).
    """
    if not exposure_str or pd.isna(exposure_str):
        return ""

    try:
        # Coerce first so numeric inputs are parsed instead of failing on the
        # string methods below.
        text = str(exposure_str).strip()

        # Case 1: Tuple format like "(1, 16000)"
        if text.startswith('(') and text.endswith(')'):
            num_den = text[1:-1].replace(' ', '').split(',')
            numerator, denominator = map(int, num_den)
            return format_as_reciprocal(numerator, denominator)

        # Case 2: Fraction format like "10/13" or "625/0"
        if '/' in text:
            numerator, denominator = map(int, text.split('/'))
            return format_as_reciprocal(numerator, denominator)

        # Case 3: Decimal format like "0.0000625", including scientific
        # notation such as "1e-05" (which has no '.' and would otherwise be
        # truncated to 0 by the whole-number branch).
        if '.' in text or 'e' in text.lower():
            return format_decimal(float(text))

        # Case 4: Whole number like "2"
        return f"{int(float(text))}\""

    except Exception as e:
        print(f"Failed to convert {exposure_str}: {str(e)}")
        return ""

def format_as_reciprocal(numerator, denominator):
    """Format the rational ``numerator/denominator`` as '1/X' or 'X"'.

    The numerator is forced to 1 by taking the reciprocal
    (e.g. 10/13 -> 1/1.3, 5/16 -> 1/3.2, 2/4 -> 1/2).

    Args:
        numerator: Integer numerator of the exposure value.
        denominator: Integer denominator of the exposure value.

    Returns:
        str: '1/X' (X whole, or with one decimal place) or 'X"' when the
        denominator is 1 or the numerator is 0.
    """
    if denominator == 0:
        # Heuristic for zero-denominator encodings: the result is
        # 1 / 2**(bit_length(numerator) + 6).
        # NOTE(review): an earlier comment claimed 625/0 -> 1/16000, but this
        # formula yields 1/65536 for 625 (bit_length 10). Confirm the intended
        # mapping before relying on these values.
        return f"1/{int(2 ** (numerator.bit_length() + 6))}"
    if numerator == 1:
        return f"1/{denominator}"
    if denominator == 1:
        return f"{numerator}\""
    if numerator == 0:
        # A zero exposure has no reciprocal; mirror the 0/1 result instead of
        # raising ZeroDivisionError below.
        return "0\""

    # Convert to 1/X format (e.g., 10/13 -> 1/1.3, 5/16 -> 1/3.2)
    reciprocal = denominator / numerator
    # Decide on the rounded value so 30.01 prints as "1/30", not "1/30.0".
    rounded = round(reciprocal, 1)
    if rounded.is_integer():
        return f"1/{int(rounded)}"
    return f"1/{reciprocal:.1f}"

def format_decimal(exposure_float):
    """Convert decimal to '1/X' or 'X"'.

    Args:
        exposure_float: Exposure value as a number (e.g. 0.0000625 or 2.0).

    Returns:
        str: 'X"' for values >= 1 (whole values without a trailing ".0") and
        for 0; otherwise '1/X', where X is whole or keeps one decimal place
        (e.g. 0.004 -> '1/250', 0.625 -> '1/1.6').
    """
    if exposure_float >= 1:
        value = float(exposure_float)
        # Whole seconds print as 2" rather than 2.0".
        if value.is_integer():
            return f"{int(value)}\""
        return f"{exposure_float}\""
    if exposure_float == 0:
        # Zero has no reciprocal; avoid ZeroDivisionError.
        return "0\""

    reciprocal = 1 / exposure_float
    # Round to one decimal: absorbs float noise (15999.999... -> 16000) while
    # keeping slow speeds distinct (0.625 -> 1/1.6 instead of 1/2), in the
    # same '1/1.3' style used for rational inputs.
    rounded = round(reciprocal, 1)
    if rounded.is_integer():
        return f"1/{int(rounded)}"
    return f"1/{rounded:.1f}"

In [15]:
# Defaults used when main() is called without arguments.
DEFAULT_ROOT_DIRS = (
    "E:/Camera Master/Olympus EM10ii/",
    "E:/Camera Master/Olympus PEN-F/",
    "E:/Camera Master/Lumix G9ii/",
)
DEFAULT_OUTPUT_CSV = "exif_data.csv"

# Columns written to the CSV, in output order.
EXPORT_COLUMNS = [
    "Filename", "Make", "Model", "DateTime", "YResolution", "XResolution",
    "ExposureBiasValue", "MaxApertureValue", "Flash", "FocalLength",
    "ExifImageWidth", "ExifImageHeight", "FNumber", "ISOSpeedRatings",
    "ExposureTime", "ExposureFraction", "LensModel", "Filepath",
]


def main(root_dirs=None, output_csv=DEFAULT_OUTPUT_CSV):
    """Scan image directories and export selected EXIF fields to a CSV file.

    Args:
        root_dirs: Directory path, or iterable of directory paths, to scan.
            Defaults to ``DEFAULT_ROOT_DIRS``.
        output_csv: Destination CSV path. Defaults to "exif_data.csv".

    Returns:
        None. Writes ``output_csv`` when EXIF data was found and prints a
        status message either way.
    """
    if root_dirs is None:
        root_dirs = DEFAULT_ROOT_DIRS
    elif isinstance(root_dirs, (str, os.PathLike)):
        # A single path would otherwise be iterated character by character.
        root_dirs = [root_dirs]

    # Collect all EXIF data
    exif_data = scan_directories(list(root_dirs))

    if not exif_data:
        print("No EXIF data found.")
        return

    # Convert to DataFrame and trim whitespace in text columns
    df = pd.DataFrame(exif_data)
    df = df.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)

    # Add ExposureFraction column; tolerate scans where no file had ExposureTime
    if 'ExposureTime' in df.columns:
        df['ExposureFraction'] = df['ExposureTime'].apply(convert_exposure_time)
    else:
        df['ExposureFraction'] = ""

    # reindex (instead of df[[...]]) keeps the export from raising KeyError
    # when a tag is absent from every scanned file; such columns are written
    # as empty fields.
    missing_columns = [col for col in EXPORT_COLUMNS if col not in df.columns]
    if missing_columns:
        print(f"Columns not found in any image (exported empty): {missing_columns}")
    export_df = df.reindex(columns=EXPORT_COLUMNS)

    # Fix CSV escaping issues by quoting all fields and specifying an escape character
    export_df.to_csv(
        output_csv,
        index=False,
        quoting=csv.QUOTE_ALL,  # Enclose all fields in quotes
        escapechar='\\',         # Use backslash to escape special chars
    )
    print(f"Data saved to {output_csv}")
        
# Entry point: run the scan/export only when this code executes as the main
# module (script run or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Processing images: 100%|██████████| 35131/35131 [00:00<00:00, 123365.50it/s]


Data saved to exif_data.csv
