In [58]:
import os, duckdb, csv, random
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image, ImageOps, ImageFile
from pprint import pprint

In [60]:
# Database location and the shared connection used by the query cells below.
DB_PATH = r"D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)

# Session settings, applied in this order: thread count, memory limit, and
# the preserve_insertion_order switch (turned off).
for setting_sql in (
    "PRAGMA threads=2;",
    "SET memory_limit='5GB';",
    "SET preserve_insertion_order=false;",
):
    con.execute(setting_sql)

print("\n Set up ready")


 Set up ready


In [9]:
# Extract all image filenames from the raw image folder into a one-column CSV.

root = "D:/IMAGES/posts_image/image"
out  = "D:/images_filenames_all.csv"
exts = (".jpg", ".jpeg", ".png")
BATCH_SIZE = 100000  # rows written (and progress reported) per batch

buffer = []
count  = 0

with open(out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["filename"])  # header row

    # os.scandir yields entries lazily, so the full directory listing is never
    # held in memory at once (os.listdir would build one huge list first).
    with os.scandir(root) as entries:
        for entry in entries:
            fn = entry.name
            # Keep regular files only: a sub-directory whose name happens to
            # end with an image extension must not be listed as an image.
            if fn.lower().endswith(exts) and entry.is_file():
                buffer.append([fn])
                count += 1

                if len(buffer) >= BATCH_SIZE:
                    w.writerows(buffer)
                    buffer.clear()
                    print(f"[progress] salvati {count:,d} file...")

    # Final flush of the last, partially filled batch.
    if buffer:
        w.writerows(buffer)
        buffer.clear()

print(f"End! Total saved files: {count:,d}")

[progress] salvati 100,000 file...
[progress] salvati 200,000 file...
[progress] salvati 300,000 file...
[progress] salvati 400,000 file...
[progress] salvati 500,000 file...
[progress] salvati 600,000 file...
[progress] salvati 700,000 file...
[progress] salvati 800,000 file...
[progress] salvati 900,000 file...
[progress] salvati 1,000,000 file...
[progress] salvati 1,100,000 file...
[progress] salvati 1,200,000 file...
[progress] salvati 1,300,000 file...
[progress] salvati 1,400,000 file...
[progress] salvati 1,500,000 file...
[progress] salvati 1,600,000 file...
[progress] salvati 1,700,000 file...
[progress] salvati 1,800,000 file...
[progress] salvati 1,900,000 file...
[progress] salvati 2,000,000 file...
[progress] salvati 2,100,000 file...
[progress] salvati 2,200,000 file...
[progress] salvati 2,300,000 file...
[progress] salvati 2,400,000 file...
[progress] salvati 2,500,000 file...
[progress] salvati 2,600,000 file...
[progress] salvati 2,700,000 file...
[progress] salvati 

In [23]:
# Expose the filename CSV written above as the view `all_image_files`.
# NOTE(review): this is a view over the file path, not a table — presumably
# the CSV is re-read whenever the view is queried; confirm if that matters.
con.execute("""CREATE OR REPLACE VIEW all_image_files AS
SELECT * 
FROM "D:/images_filenames_all.csv"
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1fbe14b7630>

In [12]:
# Duplicate check on the folder listing: total rows vs. distinct filenames.
# The recorded output shows there are NO duplicate files in the image folder.
print(con.sql("""
SELECT COUNT(*) AS total_images, COUNT(DISTINCT filename) AS distinct_files, COUNT(*) - COUNT(DISTINCT filename) AS duplicates 
FROM all_image_files
"""))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌──────────────┬────────────────┬────────────┐
│ total_images │ distinct_files │ duplicates │
│    int64     │     int64      │   int64    │
├──────────────┼────────────────┼────────────┤
│     12933406 │       12933406 │          0 │
└──────────────┴────────────────┴────────────┘



In [14]:
# In the manifest but not in the folder.
# The LEFT JOIN keeps every row of images_manifest; rows without a matching
# filename in the folder listing have f.filename NULL and are counted as
# not_in_folder, the others as found_in_folder.
print(con.sql("""
SELECT 
    COUNT(*) AS total_in_manifest,
    SUM(CASE WHEN f.filename IS NULL THEN 1 ELSE 0 END) AS not_in_folder,
    SUM(CASE WHEN f.filename IS NOT NULL THEN 1 ELSE 0 END) AS found_in_folder
FROM images_manifest AS m
LEFT JOIN all_image_files AS f
    ON m.full_image_file = f.filename;
"""))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌───────────────────┬───────────────┬─────────────────┐
│ total_in_manifest │ not_in_folder │ found_in_folder │
│       int64       │    int128     │     int128      │
├───────────────────┼───────────────┼─────────────────┤
│          12788311 │         12577 │        12775734 │
└───────────────────┴───────────────┴─────────────────┘



In [59]:
# In the manifest but not in the folder, for the 17-18 manifest.
# The LEFT JOIN keeps every row of images_manifest1718; rows without a
# matching filename in the folder listing have f.filename NULL and are
# counted as not_in_folder, the others as found_in_folder.
print(con.sql("""
SELECT 
    COUNT(*) AS total_in_manifest,
    SUM(CASE WHEN f.filename IS NULL THEN 1 ELSE 0 END) AS not_in_folder,
    SUM(CASE WHEN f.filename IS NOT NULL THEN 1 ELSE 0 END) AS found_in_folder
FROM images_manifest1718 AS m
LEFT JOIN all_image_files AS f
    ON m.full_image_file = f.filename;
"""))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌───────────────────┬───────────────┬─────────────────┐
│ total_in_manifest │ not_in_folder │ found_in_folder │
│       int64       │    int128     │     int128      │
├───────────────────┼───────────────┼─────────────────┤
│           8666392 │          1971 │         8664421 │
└───────────────────┴───────────────┴─────────────────┘



In [56]:
# In the folder but not in the manifest.
# Same comparison in the other direction: the LEFT JOIN keeps every listed
# file; files with no manifest row have m.full_image_file NULL and are
# counted as not_in_manifest, the others as found_in_manifest.
print(con.sql("""
SELECT 
    COUNT(*) AS total_in_folder,
    SUM(CASE WHEN m.full_image_file IS NULL THEN 1 ELSE 0 END) AS not_in_manifest,
    SUM(CASE WHEN m.full_image_file IS NOT NULL THEN 1 ELSE 0 END) AS found_in_manifest
FROM all_image_files AS f
LEFT JOIN images_manifest AS m
    ON f.filename = m.full_image_file;
"""))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────────────┬─────────────────┬───────────────────┐
│ total_in_folder │ not_in_manifest │ found_in_manifest │
│      int64      │     int128      │      int128       │
├─────────────────┼─────────────────┼───────────────────┤
│        12933406 │          157672 │          12775734 │
└─────────────────┴─────────────────┴───────────────────┘



In [16]:
# Build the cleaned manifest: keep only the manifest rows whose image file is
# present in the folder listing.
# EXISTS is used instead of an INNER JOIN so that each manifest row is emitted
# at most once, even if the filename listing ever contains a repeated name
# (a join would silently multiply such rows).
con.execute("""
CREATE OR REPLACE TABLE images_manifest_clean AS
SELECT m.*
FROM images_manifest AS m
WHERE EXISTS (
    SELECT 1
    FROM all_image_files AS f
    WHERE f.filename = m.full_image_file
);
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x250af7c5a30>

In [7]:
# Row count of the cleaned manifest table.
print(con.sql("""SELECT COUNT(*) AS images FROM images_manifest_clean"""))

┌──────────┐
│  images  │
│  int64   │
├──────────┤
│ 12775734 │
└──────────┘



In [17]:
# Same cleaning step for the 17-18 manifest: keep only the rows whose image
# file is present in the folder listing.
# EXISTS is used instead of an INNER JOIN so that each manifest row is emitted
# at most once, even if the filename listing ever contains a repeated name
# (a join would silently multiply such rows).
con.execute("""
CREATE OR REPLACE TABLE images_manifest1718_clean AS
SELECT m.*
FROM images_manifest1718 AS m
WHERE EXISTS (
    SELECT 1
    FROM all_image_files AS f
    WHERE f.filename = m.full_image_file
);
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x250af7c5a30>

In [9]:
# Row count of the cleaned 17-18 manifest table.
print(con.sql("""SELECT COUNT(*) AS images FROM images_manifest1718_clean"""))

┌─────────┐
│ images  │
│  int64  │
├─────────┤
│ 8664413 │
└─────────┘



In [15]:
# Let Pillow load image files whose data is truncated instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

DB_PATH       = "D:/db/meta.duckdb"
RAW_IMG_ROOT  = "D:/IMAGES/posts_image"
CLEAN_IMG_DIR = "D:/dataset/images_224_rgb"
os.makedirs(CLEAN_IMG_DIR, exist_ok=True)

TARGET_SIZE = (224, 224)
JPEG_KW = dict(quality=85, subsampling=2, optimize=True, progressive=True)
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to 1 so the multiplication cannot raise TypeError.
MAX_WORKERS = max(4, (os.cpu_count() or 1) * 2)
BATCH_SIZE  = 100_000


def src_path(t):
    """Return the path of raw image `t` inside RAW_IMG_ROOT/image.

    Surrounding whitespace in `t` is stripped before joining.
    """
    return os.path.join(RAW_IMG_ROOT, "image", t.strip())


def dst_path(t):
    """Return the path of the preprocessed copy of `t` inside CLEAN_IMG_DIR.

    Surrounding whitespace in `t` is stripped before joining.
    """
    return os.path.join(CLEAN_IMG_DIR, t.strip())

def process_one(t):
    """Preprocess one image and store the result in the clean folder.

    The source image is EXIF-transposed, converted to RGB, fitted to
    TARGET_SIZE with Lanczos resampling and saved as JPEG using JPEG_KW.
    The output is written to a ``.part`` file first and then moved over the
    final name with os.replace, so an interrupted save does not leave a
    half-written file under the destination name. The destination keeps the
    input filename `t`, whatever its extension, although the content is JPEG.

    Args:
        t: image filename, resolved through src_path() and dst_path().

    Returns:
        None if the destination already exists (nothing done),
        "missing" if the source file does not exist,
        "ok" if the image was converted and saved,
        "error" if any exception occurred while processing.
    """
    src = src_path(t)
    dst = dst_path(t)
    tmp = dst + ".part"
    try:
        if os.path.exists(dst):
            return None  # already in the destination folder
        if not os.path.exists(src):
            return "missing"  # not in the source folder
        os.makedirs(os.path.dirname(dst), exist_ok=True)

        with Image.open(src) as img:
            img = ImageOps.exif_transpose(img).convert("RGB")
            img = ImageOps.fit(img, TARGET_SIZE, method=Image.Resampling.LANCZOS)
            img.save(tmp, "JPEG", **JPEG_KW)
        os.replace(tmp, dst)
        return "ok"
    except Exception:
        # Best-effort removal of the partial file. Only filesystem errors are
        # ignored here; a bare `except` would also swallow KeyboardInterrupt
        # and SystemExit raised during the cleanup.
        try:
            if os.path.exists(tmp):
                os.remove(tmp)
        except OSError:
            pass
        return "error"

In [None]:
# PREPROCESSING
# A dedicated connection is opened here and always closed in `finally`, so the
# notebook-wide `con` is neither rebound nor closed and later cells that use
# it keep working.
prep_con = duckdb.connect(DB_PATH)
ok_count = miss_count = err_count = offset = 0

try:
    # Total number of rows to go through; compare with the offsets printed below.
    total = prep_con.sql("SELECT COUNT(*) FROM images_manifest1718_clean").fetchone()[0]

    # Run ONE query and stream it in batches. Paging with LIMIT/OFFSET and no
    # ORDER BY gives no guarantee that consecutive pages form a partition of
    # the table, so rows could be skipped or served twice.
    prep_con.execute("SELECT full_image_file FROM images_manifest1718_clean")

    # A single pool serves all batches instead of being rebuilt per batch.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        while True:
            batch = prep_con.fetchmany(BATCH_SIZE)
            if not batch:
                break

            # Skip images that already have a preprocessed copy.
            rows = [r[0] for r in batch]
            rows = [t for t in rows if not os.path.exists(dst_path(t))]

            if rows:
                # `chunksize` is omitted: it has no effect for thread pools.
                for res in ex.map(process_one, rows):
                    if res == "ok": ok_count += 1
                    elif res == "missing": miss_count += 1
                    elif res == "error": err_count += 1

                print(f"[BATCH offset={offset:,}] ok={ok_count:,} missing={miss_count:,} error={err_count:,}")

            offset += len(batch)
finally:
    prep_con.close()

print("\n[FINITO] Totale immagini preprocessate:", ok_count)


[BATCH offset=0] ok=100,000 missing=0 error=0
[BATCH offset=100,000] ok=200,000 missing=0 error=0
[BATCH offset=200,000] ok=300,000 missing=0 error=0
[BATCH offset=300,000] ok=400,000 missing=0 error=0
[BATCH offset=400,000] ok=500,000 missing=0 error=0
[BATCH offset=500,000] ok=600,000 missing=0 error=0
[BATCH offset=600,000] ok=700,000 missing=0 error=0
[BATCH offset=700,000] ok=800,000 missing=0 error=0
[BATCH offset=800,000] ok=900,000 missing=0 error=0
[BATCH offset=900,000] ok=1,000,000 missing=0 error=0
[BATCH offset=1,000,000] ok=1,100,000 missing=0 error=0
[BATCH offset=1,100,000] ok=1,200,000 missing=0 error=0
[BATCH offset=1,200,000] ok=1,300,000 missing=0 error=0
[BATCH offset=1,300,000] ok=1,400,000 missing=0 error=0
[BATCH offset=1,400,000] ok=1,500,000 missing=0 error=0
[BATCH offset=1,500,000] ok=1,600,000 missing=0 error=0
[BATCH offset=1,600,000] ok=1,700,000 missing=0 error=0
[BATCH offset=1,700,000] ok=1,800,000 missing=0 error=0
[BATCH offset=1,800,000] ok=1,900,00

In [18]:
# Extract the filenames of the preprocessed (clean) images into a one-column CSV.

root = "D:/dataset/images_224_rgb"
out  = "D:/clean_filenames.csv"
exts = (".jpg", ".jpeg", ".png")
BATCH_SIZE = 100000  # rows written (and progress reported) per batch

buffer = []
count  = 0

with open(out, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["filename"])  # header row

    # os.scandir yields entries lazily, so the full directory listing is never
    # held in memory at once (os.listdir would build one huge list first).
    with os.scandir(root) as entries:
        for entry in entries:
            fn = entry.name
            # Keep regular files only: a sub-directory whose name happens to
            # end with an image extension must not be listed as an image.
            if fn.lower().endswith(exts) and entry.is_file():
                buffer.append([fn])
                count += 1

                if len(buffer) >= BATCH_SIZE:
                    w.writerows(buffer)
                    buffer.clear()
                    print(f"[progress] salvati {count:,d} file...")

    # Final flush of the last, partially filled batch.
    if buffer:
        w.writerows(buffer)
        buffer.clear()

print(f"End! Total saved files: {count:,d}")

[progress] salvati 100,000 file...
[progress] salvati 200,000 file...
[progress] salvati 300,000 file...
[progress] salvati 400,000 file...
[progress] salvati 500,000 file...
[progress] salvati 600,000 file...
[progress] salvati 700,000 file...
[progress] salvati 800,000 file...
[progress] salvati 900,000 file...
[progress] salvati 1,000,000 file...
[progress] salvati 1,100,000 file...
[progress] salvati 1,200,000 file...
[progress] salvati 1,300,000 file...
[progress] salvati 1,400,000 file...
[progress] salvati 1,500,000 file...
[progress] salvati 1,600,000 file...
[progress] salvati 1,700,000 file...
[progress] salvati 1,800,000 file...
[progress] salvati 1,900,000 file...
[progress] salvati 2,000,000 file...
[progress] salvati 2,100,000 file...
[progress] salvati 2,200,000 file...
[progress] salvati 2,300,000 file...
[progress] salvati 2,400,000 file...
[progress] salvati 2,500,000 file...
[progress] salvati 2,600,000 file...
[progress] salvati 2,700,000 file...
[progress] salvati 

In [34]:
# Load the clean-filename CSV into the table `clean_files`
# (a table this time, not a view, so the rows are stored in the database).
con.execute("""
CREATE OR REPLACE TABLE clean_files AS
SELECT * FROM "D:/clean_filenames.csv"
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x1e697114a70>

In [62]:
# CHECK: compare the row count of the 17-18 clean manifest with the number of
# preprocessed files listed in clean_files.
# NOTE(review): equal counts do not by themselves prove that the two sets of
# filenames are identical — consider an anti-join if that guarantee is needed.
print(con.sql("""SELECT COUNT(*) AS rows_manifest FROM images_manifest1718_clean"""))
print(con.sql("""SELECT COUNT(*) AS rows_cleaned FROM clean_files"""))

┌───────────────┐
│ rows_manifest │
│     int64     │
├───────────────┤
│       8664413 │
└───────────────┘

┌──────────────┐
│ rows_cleaned │
│    int64     │
├──────────────┤
│      8664413 │
└──────────────┘



In [44]:
# Spot-check that the preprocessing changes have been applied: compare
# format, mode and size of a few source images with their preprocessed copies.
DB_PATH       = r"D:/db/meta.duckdb"
RAW_IMG_ROOT  = r"D:/IMAGES/posts_image"
SRC_SUBDIR    = "image"
OUT_DIR       = r"D:/dataset/images_224_rgb"


def src_path(t):
    """Return the path of raw image `t` inside RAW_IMG_ROOT/SRC_SUBDIR."""
    return os.path.join(RAW_IMG_ROOT, SRC_SUBDIR, t.strip())


def dst_path(t):
    """Return the path of the preprocessed copy of `t` inside OUT_DIR."""
    return os.path.join(OUT_DIR, t.strip())


# Use a dedicated, always-closed connection: rebinding and closing the
# notebook-wide `con` here would break every later cell that queries it.
check_con = duckdb.connect(DB_PATH)
try:
    targets = [
        r[0]
        for r in check_con.sql(
            "SELECT full_image_file FROM images_manifest1718_clean LIMIT 2000"
        ).fetchall()
    ]
finally:
    check_con.close()

# Only names present in both folders can be compared; sample up to 10 of them.
both_exist = [t for t in targets if os.path.exists(src_path(t)) and os.path.exists(dst_path(t))]
sample = random.sample(both_exist, min(10, len(both_exist)))

report = []
for t in sample:
    with Image.open(src_path(t)) as im_src, Image.open(dst_path(t)) as im_dst:
        report.append({
            "target_name": t,
            "src_format": im_src.format, "src_mode": im_src.mode, "src_size": im_src.size,
            "dst_format": im_dst.format, "dst_mode": im_dst.mode, "dst_size": im_dst.size,
        })

pprint(report)

[{'dst_format': 'JPEG',
  'dst_mode': 'RGB',
  'dst_size': (224, 224),
  'src_format': 'JPEG',
  'src_mode': 'RGB',
  'src_size': (256, 320),
  'target_name': 'aforteforfashion-1856117939247008270.jpg'},
 {'dst_format': 'JPEG',
  'dst_mode': 'RGB',
  'dst_size': (224, 224),
  'src_format': 'JPEG',
  'src_mode': 'RGB',
  'src_size': (256, 321),
  'target_name': 'afromamanyc-1549686774297503403.jpg'},
 {'dst_format': 'JPEG',
  'dst_mode': 'RGB',
  'dst_size': (224, 224),
  'src_format': 'JPEG',
  'src_mode': 'RGB',
  'src_size': (256, 306),
  'target_name': 'africasblog-1662866696798687219.jpg'},
 {'dst_format': 'JPEG',
  'dst_mode': 'RGB',
  'dst_size': (224, 224),
  'src_format': 'JPEG',
  'src_mode': 'RGB',
  'src_size': (256, 320),
  'target_name': 'afreen_rahman-1763264467610670797.jpg'},
 {'dst_format': 'JPEG',
  'dst_mode': 'RGB',
  'dst_size': (224, 224),
  'src_format': 'JPEG',
  'src_mode': 'RGB',
  'src_size': (256, 256),
  'target_name': 'africasblog-1818517615807802793.jpg'}

In [52]:
# List the tables known to the database (information_schema.tables) as a DataFrame.
con.sql("""SELECT * FROM information_schema.tables """).fetchdf()

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action,TABLE_COMMENT
0,meta,main,clean_files,BASE TABLE,,,,,,YES,NO,,
1,meta,main,design_train_balanced,BASE TABLE,,,,,,YES,NO,,
2,meta,main,images_manifest,BASE TABLE,,,,,,YES,NO,,
3,meta,main,images_manifest1718,BASE TABLE,,,,,,YES,NO,,
4,meta,main,images_manifest1718_clean,BASE TABLE,,,,,,YES,NO,,
5,meta,main,images_manifest_clean,BASE TABLE,,,,,,YES,NO,,
6,meta,main,img_to_train,BASE TABLE,,,,,,YES,NO,,
7,meta,main,influencers,BASE TABLE,,,,,,YES,NO,,
8,meta,main,metadata,BASE TABLE,,,,,,YES,NO,,
9,meta,main,metadata1718,BASE TABLE,,,,,,YES,NO,,


In [64]:
# Close the shared DuckDB connection at the end of the notebook.
con.close()