In [1]:
import os
import exiftool
from dataclasses import dataclass
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
PREFIX = "/home/andrei/Pictures/"
PATHS = ["Photo", "Archive", "Raw"]
images = []

In [3]:
for path in PATHS:
    for directory, _, files in os.walk(os.path.join(PREFIX, path)):
        for file in files:
            images.append(os.path.join(directory, file))
        # print(directory)

In [4]:
len(images)

48089

In [5]:
@dataclass
class MetaData:
    file: str
    filename: str
    directory: str
    brand: str
    model: str
    created: str
    exposure: float
    fnumber: float
    focallength: float
    focallength_35mm: float
    iso: int
    exp_compensation: float
    lens: str
    shutter_count: int
    mechanical_shutter_count: int
    flash: int
    af: str
    quality: str
    

In [6]:
def int_safe(x):
    try:
        return int(x)
    except:
        return None


def float_safe(x):
    try:
        return float(x)
    except:
        return float("inf")


def get_date_safe(s):
    try:
        return datetime.datetime.strptime(s, '%Y:%m:%d %H:%M:%S')
    except:
        return None


def strip_safe(s):
    if isinstance(s, str):
        return s.strip()
    elif s is not None:
        return str(s)
    return s        


def extract(metadata_dict):
    metadata = MetaData(
        file=metadata_dict.get("SourceFile", None),
        filename=metadata_dict.get("File:FileName", None),
        directory=metadata_dict.get("File:Directory", None),
        brand=metadata_dict.get("EXIF:Make", None),
        model=metadata_dict.get("EXIF:Model", None),
        created=get_date_safe(metadata_dict.get("EXIF:CreateDate", None)),
        exposure=1 / float_safe(metadata_dict.get("EXIF:ExposureTime", float("inf"))),
        fnumber=metadata_dict.get("EXIF:FNumber", None),
        focallength=metadata_dict.get("EXIF:FocalLength", None),
        focallength_35mm=metadata_dict.get("EXIF:FocalLengthIn35mmFormat", None),
        iso=int_safe(metadata_dict.get("EXIF:ISO", str(metadata_dict.get("MakerNotes:ISO", "")).split(" ")[-1])),
        exp_compensation=metadata_dict.get("EXIF:ExposureCompensation", None),
        lens=metadata_dict.get("EXIF:LensModel", metadata_dict.get("Composite:LensSpec", None)),
        shutter_count=metadata_dict.get("MakerNotes:ShutterCount", None),
        mechanical_shutter_count=metadata_dict.get("MakerNotes:MechanicalShutterCount", None),
        flash=metadata_dict.get("EXIF:Flash", None),
        af=strip_safe(metadata_dict.get("MakerNotes:FocusMode", None)),
        quality=strip_safe(metadata_dict.get("MakerNotes:Quality", None)),
    )
    return metadata

In [7]:
def process_images(images):
    with exiftool.ExifToolHelper() as et:
        metadata = et.get_metadata(images)
    metadata = [extract(meta) for meta in metadata]
    return metadata

In [8]:
batches = [list(batch) for batch in np.array_split(images, len(images) // 1000 + bool(len(images) % 1000))]

In [9]:
meta = []
for batch in tqdm(batches):
    meta.extend(process_images(batch))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49/49 [13:23<00:00, 16.40s/it]


In [10]:
len(meta)

48089

In [11]:
data = pd.DataFrame(meta)

In [12]:
data.head(2)

Unnamed: 0,file,filename,directory,brand,model,created,exposure,fnumber,focallength,focallength_35mm,iso,exp_compensation,lens,shutter_count,mechanical_shutter_count,flash,af,quality
0,/home/andrei/Pictures/Photo/2004/200406_Фотоох...,2004.06-2004.10-00560-022.jpg,/home/andrei/Pictures/Photo/2004/200406_Фотоох...,,,NaT,0.0,,,,,,,,,,,
1,/home/andrei/Pictures/Photo/2004/200406_Фотоох...,2004.06-2004.10-00560-002.jpg,/home/andrei/Pictures/Photo/2004/200406_Фотоох...,,,NaT,0.0,,,,,,,,,,,


In [13]:
for column in [
    "file",
    "filename",
    "directory",
    "brand",
    "model",
    "lens",
    "af",
    "quality",
]:
    data[column] = data[column].astype(str)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48089 entries, 0 to 48088
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   file                      48089 non-null  object        
 1   filename                  48089 non-null  object        
 2   directory                 48089 non-null  object        
 3   brand                     48089 non-null  object        
 4   model                     48089 non-null  object        
 5   created                   28249 non-null  datetime64[ns]
 6   exposure                  48089 non-null  float64       
 7   fnumber                   28019 non-null  float64       
 8   focallength               28018 non-null  float64       
 9   focallength_35mm          26757 non-null  float64       
 10  iso                       27813 non-null  float64       
 11  exp_compensation          27585 non-null  float64       
 12  lens              

In [15]:
data.to_parquet("photos.parquet")