In [1]:
# Import
import os, sys, gc, math, json
import numpy as np
import pandas as pd
import duckdb
from tqdm import tqdm

import cv2
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from skimage.measure import shannon_entropy
from skimage.feature import local_binary_pattern, graycomatrix, graycoprops

In [6]:
# Configuration

DB_PATH       = r"D:/db/meta.duckdb"
CLEAN_IMG_DIR = r"D:/dataset/images_224_rgb"
OUT_DIR       = r"D:/dataset/img_features"

# DuckDB tables
METADATA_TABLE   = "metadata1718_ready" # metadata 
MANIFEST_TABLE   = "images_manifest1718_clean" # manifest
TRAIN_POSTS_TBL  = "train_balanced" # undersampled metadata for training

# Key column names
POST_ID_COL_METADATA  = "post_id" # metadata post id
POST_ID_COL_MANIFEST  = "post_id" # manifest post id
IMG_NAME_COL_MANIFEST = "full_image_file" # images filename column in the manifest
POST_ID_COL_TRAIN     = "post_id" # undersampled metadata for training post id

INCLUDE_LBP  = True
INCLUDE_GLCM = True

POOLING = "mean"
SAVE_FLOAT16 = True  
BATCH_IMG = 8192 # batch size

In [41]:
# Set up checks
def check_setup() -> None:
    ok = True
    for p in [os.path.dirname(DB_PATH), CLEAN_IMG_DIR, OUT_DIR]:
        if not os.path.exists(p):
            print(f"[MISSING] Non existing path: {p}")
            ok = False
        else:
            print(f"[OK] Path present: {p}")

    if not os.path.exists(DB_PATH):
        print(f"[MISSING] DuckDB not found: {DB_PATH}")
        ok = False
    else:
        print(f"[OK] DuckDB found: {DB_PATH}")

    # check tables and columns in duckdb tables
    try:
        con = duckdb.connect(DB_PATH, read_only=True)
        def table_has_cols(tbl, cols):
            info = con.execute(f"PRAGMA table_info('{tbl}')").fetchdf()
            have = set(info['name'].str.lower().tolist())
            need = set([c.lower() for c in cols])
            missing = [c for c in need if c not in have]
            return missing

        miss_meta = table_has_cols(METADATA_TABLE, [POST_ID_COL_METADATA, "split"])
        miss_man  = table_has_cols(MANIFEST_TABLE, [POST_ID_COL_MANIFEST, IMG_NAME_COL_MANIFEST])
        miss_train= table_has_cols(TRAIN_POSTS_TBL, [POST_ID_COL_TRAIN])

        if miss_meta:
            print(f"[MISSING] Missing columns in {METADATA_TABLE}: {miss_meta}"); ok = False
        else:
            print(f"[OK] {METADATA_TABLE} with requested columns")

        if miss_man:
            print(f"[MISSING] Missing columns in {MANIFEST_TABLE}: {miss_man}"); ok = False
        else:
            print(f"[OK] {MANIFEST_TABLE} with requested columns")

        if miss_train:
            print(f"[MISSING] Missing columns in {TRAIN_POSTS_TBL}: {miss_train}"); ok = False
        else:
            print(f"[OK] {TRAIN_POSTS_TBL} with requested columns")

        con.close()
    except Exception as e:
        print("[ERROR] Impossible to connect/read the duckdb schema:", e)
        ok = False

    if not ok:
        print("\n Review the missing and re-run the cell")
    else:
        print("\n Setup ok")

check_setup()

[OK] Path presente: D:/db
[OK] Path presente: D:/dataset/images_224_rgb
[OK] Path presente: D:/dataset/img_features
[OK] DuckDB trovato: D:/db/meta.duckdb
[OK] metadata1718_ready con colonne richieste
[OK] images_manifest1718_clean con colonne richieste
[OK] train_balanced con colonne richieste

 Setup ok


In [None]:
# Retrieve the set of images per split
CLEAN_FILES_TABLE   = "clean_files"
CLEAN_FILENAME_COL  = "filename"

# Training images view
VIEW_TRAIN_IMAGES            = "images_train"
VIEW_TRAIN_POST_ID_COL       = "post_id"
VIEW_TRAIN_IMG_NAME_COL      = "full_image_file"

def _relation_exists(con, name: str) -> bool:
    try:
        q = f"SELECT table_name FROM information_schema.tables WHERE lower(table_name)=lower('{name}')"
        return len(con.execute(q).fetchall()) > 0
    except Exception:
        return False

def get_images_for_split(split: str) -> pd.DataFrame:

    con = duckdb.connect(DB_PATH, read_only=True)

    if split == "train":
        # If the training set of images exists it is used
        if _relation_exists(con, VIEW_TRAIN_IMAGES):
            q = f"""
            SELECT
              {VIEW_TRAIN_POST_ID_COL} AS post_id,
              {VIEW_TRAIN_IMG_NAME_COL} AS image_rel
            FROM {VIEW_TRAIN_IMAGES}
            """
            df = con.execute(q).fetchdf()

        else:
            # otherwise the set of images to train it is defined
            # train = (train_balanced) + (manifest) + (clean_files)
            q = f"""
            WITH tr AS (
              SELECT {POST_ID_COL_TRAIN} AS post_id
              FROM {TRAIN_POSTS_TBL}
            ),
            join_im AS (
              SELECT tr.post_id,
                     im.{IMG_NAME_COL_MANIFEST} AS image_rel
              FROM tr
              JOIN {MANIFEST_TABLE} AS im
                ON im.{POST_ID_COL_MANIFEST} = tr.post_id
            ),
            ok AS (
              SELECT j.post_id, j.image_rel
              FROM join_im j
              JOIN {CLEAN_FILES_TABLE} cl
                ON cl.{CLEAN_FILENAME_COL} = j.image_rel
            )
            SELECT * FROM ok
            """
            df = con.execute(q).fetchdf()

    else:
        # validation/test: select the metadata posts for the split and the retrieve the images by merging the manifest and the clean files
        q = f"""
        WITH ids AS (
          SELECT {POST_ID_COL_METADATA} AS post_id
          FROM {METADATA_TABLE}
          WHERE split = '{split}'
        ),
        join_im AS (
          SELECT i.post_id, im.{IMG_NAME_COL_MANIFEST} AS image_rel
          FROM ids i
          JOIN {MANIFEST_TABLE} im
            ON im.{POST_ID_COL_MANIFEST} = i.post_id
        ),
        ok AS (
          SELECT j.post_id, j.image_rel
          FROM join_im j
          JOIN {CLEAN_FILES_TABLE} cl
            ON cl.{CLEAN_FILENAME_COL} = j.image_rel
        )
        SELECT * FROM ok
        """
        df = con.execute(q).fetchdf()

    con.close()

    # Build absolute path and additional cleaning if something went wrong
    df["image_rel"]  = df["image_rel"].astype(str)
    df["post_id"]    = df["post_id"].astype(str)
    df["image_path"] = df["image_rel"].map(lambda t: os.path.join(CLEAN_IMG_DIR, t))

    df = df.drop_duplicates(subset=["post_id","image_path"])
    df = df[df["image_path"].map(os.path.exists)].reset_index(drop=True)

    return df[["post_id","image_path"]]

# Some sanity checks
for sp in ("train","validation","test"):
    try:
        tmp = get_images_for_split(sp)
        print(f"[{sp}] righe: {len(tmp):,} | post unici: {tmp['post_id'].nunique():,}")
        print(tmp.head(3))
    except Exception as e:
        print(f"[{sp}] errore ->", e)

In [21]:
# Features extracted for each image
def colorfulness_hasler(img_bgr: np.ndarray) -> float:
    B, G, R = cv2.split(img_bgr.astype(np.float32))
    rg = np.abs(R - G)
    yb = np.abs(0.5*(R + G) - B)
    return float(np.sqrt(rg.std()**2 + yb.std()**2) + 0.3*np.sqrt(rg.mean()**2 + yb.mean()**2))

def hsv_stats_and_hists(img_bgr: np.ndarray, bins: int = 16):
    hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv)
    feats = [
        float(h.mean()), float(h.std()),
        float(s.mean()), float(s.std()),
        float(v.mean()), float(v.std()),
    ]
    for ch in (h, s, v):
        hist = cv2.calcHist([ch],[0],None,[bins],[0,256]).ravel()
        hist = hist / (hist.sum() + 1e-8)
        feats += hist.astype(np.float32).tolist()
    return feats  

def gray_histo_entropy_sharp_edges(img_bgr: np.ndarray, bins: int = 16):
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    # gray normalized histogram
    hist = cv2.calcHist([gray],[0],None,[bins],[0,256]).ravel()
    hist = hist / (hist.sum() + 1e-8)
    ent = float(shannon_entropy(gray))
    lap_var = float(cv2.Laplacian(gray, cv2.CV_64F).var())
    edges = cv2.Canny(gray, 100, 200)
    edge_density = float((edges > 0).mean())
    return hist.astype(np.float32).tolist(), ent, lap_var, edge_density, gray

def feat_lbp59(gray: np.ndarray) -> np.ndarray:
    lbp = local_binary_pattern(gray, P=8, R=1, method="uniform")
    hist, _ = np.histogram(lbp, bins=np.arange(0, 60), range=(0,59), density=True)
    return hist.astype(np.float32)  

def feat_glcm16(gray: np.ndarray) -> np.ndarray:
    g = gray if gray.dtype == np.uint8 else cv2.normalize(gray, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    gl = graycomatrix(g, distances=[1], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],
                      levels=256, symmetric=True, normed=True)
    vals = []
    for p in ["contrast","homogeneity","energy","correlation"]:
        vals.extend(graycoprops(gl, p).ravel().tolist())  
    return np.array(vals, dtype=np.float32)  

def feature_columns() -> list:
    cols = []
    cols += ["h_mean","h_std","s_mean","s_std","v_mean","v_std"]
    cols += [f"h_hist_{i:02d}" for i in range(16)]
    cols += [f"s_hist_{i:02d}" for i in range(16)]
    cols += [f"v_hist_{i:02d}" for i in range(16)]
    cols += [f"gray_hist_{i:02d}" for i in range(16)]
    cols += ["laplacian_var","edge_density","entropy_gray","colorfulness"]
    if INCLUDE_LBP:
        cols += [f"lbp_u59_{i:02d}" for i in range(59)]
    if INCLUDE_GLCM:
        for prop in ("contrast","homogeneity","energy","correlation"):
            for ang in ("0","45","90","135"):
                cols.append(f"glcm_{prop}_{ang}")
    return cols

def extract_handcrafted_one(img_path: str) -> np.ndarray | None:

    p = os.path.normpath(img_path)
    img_bgr = cv2.imread(p, cv2.IMREAD_COLOR)
    if img_bgr is None:
        return None
        
    f_hsv = hsv_stats_and_hists(img_bgr, bins=16)
    histg, ent, lap_var, edge_den, gray = gray_histo_entropy_sharp_edges(img_bgr, bins=16)
    cf = colorfulness_hasler(img_bgr) # 1
    feats = f_hsv + histg + [lap_var, edge_den, ent, cf]

    if INCLUDE_LBP:
        feats.extend(feat_lbp59(gray).tolist())
    if INCLUDE_GLCM:
        feats.extend(feat_glcm16(gray).tolist())
    
    return np.array(feats, dtype=np.float32)

print("Total number of features:", len(feature_columns()))

Dim. feature con i flag correnti: 152


In [25]:
# Extract the features in batches and shards, with the option to resume the work if something goes wrong

def compute_post_features_from_df(df_post_img: pd.DataFrame,
                                  batch_size: int = BATCH_IMG,
                                  cast_float16: bool = True,
                                  verbose: bool = True) -> pd.DataFrame:
    img_paths = df_post_img["image_path"].tolist()
    img_post  = df_post_img["post_id"].tolist()

    agg_mean: dict[str, np.ndarray] = {}
    agg_cnt:  dict[str, int]        = {}
    D: int | None = None
    n_img = len(img_paths)
    n_err = 0

    it = range(0, n_img, batch_size)
    if verbose:
        it = tqdm(it, total=math.ceil(n_img / batch_size), desc="batches")

    for i in it:
        for p, pid in zip(img_paths[i:i+batch_size], img_post[i:i+batch_size]):
            vec = extract_handcrafted_one(p)
            if vec is None:
                n_err += 1
                continue
            if D is None:
                D = int(vec.shape[0])
            if pid not in agg_mean:
                agg_mean[pid] = vec.astype(np.float32, copy=True)
                agg_cnt[pid]  = 1
            else:
                c = agg_cnt[pid] + 1
                agg_mean[pid] += (vec - agg_mean[pid]) / c  # running mean
                agg_cnt[pid]   = c

    if D is None:
        raise RuntimeError("No valid image: impossible to determine feature dimension")

    post_ids = list(agg_mean.keys())
    mat = np.vstack([agg_mean[pid] for pid in post_ids]).astype(np.float32)
    if cast_float16:
        try:
            mat = mat.astype(np.float16)
        except Exception:
            pass

    cols = feature_columns()
    if len(cols) != mat.shape[1]:
        cols = [f"f_{i:03d}" for i in range(mat.shape[1])]
    out = pd.DataFrame(mat, columns=cols)
    out.insert(0, "post_id", post_ids)
    out = out.sort_values("post_id").reset_index(drop=True)
    if verbose and n_err:
        print(f"Warning: {n_err} unreadable images (ignored)")
    return out

def compute_and_save_split_sharded_resume(split: str,
                                          n_shards: int = 8,
                                          batch_size: int = 2048,
                                          cast_float16: bool = True,
                                          out_dir: str = OUT_DIR,
                                          force: bool = False):
    os.makedirs(out_dir, exist_ok=True)
    df_all = get_images_for_split(split)
    if df_all.empty:
        raise RuntimeError(f"No image for split={split}")

    # Deterministic partitioning per post_id
    h = pd.util.hash_pandas_object(df_all["post_id"], index=False).astype("uint64")
    df_all = df_all.assign(_shard=(h % n_shards).astype(int))

    part_paths = []
    for k in range(n_shards):
        part_path = os.path.join(out_dir, f"{split}_handcrafted_post.part{k}.parquet")
        if os.path.exists(part_path) and not force:
            print(f"[{split}] shard {k+1}/{n_shards} già presente → skip")
            part_paths.append(part_path)
            continue

        df_k = df_all[df_all["_shard"] == k][["post_id","image_path"]].reset_index(drop=True)
        if df_k.empty:
            print(f"[{split}] shard {k+1}/{n_shards}: vuoto → skip")
            continue

        print(f"[{split}] shard {k+1}/{n_shards}: images={len(df_k):,}, post≈{df_k['post_id'].nunique():,}")
        feats_k = compute_post_features_from_df(df_k, batch_size=batch_size,
                                                cast_float16=cast_float16, verbose=True)
        try:
            feats_k.to_parquet(part_path, index=False)
        except Exception:
            num_cols = [c for c in feats_k.columns if c != "post_id"]
            feats_k[num_cols] = feats_k[num_cols].astype("float32")
            feats_k.to_parquet(part_path, index=False, compression="zstd")
        part_paths.append(part_path)
        print(f" Saved: {part_path}")

    # Merge all available shards
    parts = [p for p in part_paths if os.path.exists(p)]
    if not parts:
        raise RuntimeError("Nessuno shard disponibile da mergiare.")
    frames = [pd.read_parquet(p) for p in parts]
    full = pd.concat(frames, ignore_index=True)
    full = full.drop_duplicates(subset=["post_id"], keep="last").sort_values("post_id").reset_index(drop=True)

    out_path = os.path.join(out_dir, f"{split}_handcrafted_post.parquet")
    try:
        full.to_parquet(out_path, index=False)
    except Exception:
        num_cols = [c for c in full.columns if c != "post_id"]
        full[num_cols] = full[num_cols].astype("float32")
        full.to_parquet(out_path, index=False, compression="zstd")

    print(f"Merge: {out_path}  ({full.shape[0]} post, {full.shape[1]-1} dim)")
    print("The function can be relaunched: shards already present are skipped (resume)")

In [13]:
# Train
compute_and_save_split_sharded_resume("train", n_shards=12, batch_size=2048, cast_float16=True)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[train] shard 1/12 già presente → skip
[train] shard 2/12 già presente → skip
[train] shard 3/12 già presente → skip
[train] shard 4/12 già presente → skip
[train] shard 5/12 già presente → skip
[train] shard 6/12 già presente → skip
[train] shard 7/12 già presente → skip
[train] shard 8/12 già presente → skip
[train] shard 9/12 già presente → skip
[train] shard 10/12 già presente → skip
[train] shard 11/12 già presente → skip
[train] shard 12/12: images=78,673, post≈64,292


batches: 100%|██████████| 39/39 [2:09:08<00:00, 198.67s/it]  


 Saved: D:/dataset/img_features\train_handcrafted_post.part11.parquet
Merge: D:/dataset/img_features\train_handcrafted_post.parquet  (773307 post, 152 dim)
The function can be relaunched: shards already present are skipped (resume)


In [15]:
# Validation
compute_and_save_split_sharded_resume("validation", n_shards=16, batch_size=2048, cast_float16=True)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[validation] shard 1/16: images=85,514, post≈64,544


batches: 100%|██████████| 42/42 [2:24:33<00:00, 206.50s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part0.parquet
[validation] shard 2/16: images=86,609, post≈64,776


batches: 100%|██████████| 43/43 [2:35:32<00:00, 217.03s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part1.parquet
[validation] shard 3/16: images=86,364, post≈64,690


batches: 100%|██████████| 43/43 [2:35:17<00:00, 216.69s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part2.parquet
[validation] shard 4/16: images=86,834, post≈64,960


batches: 100%|██████████| 43/43 [2:37:23<00:00, 219.61s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part3.parquet
[validation] shard 5/16: images=86,555, post≈64,793


batches: 100%|██████████| 43/43 [2:35:53<00:00, 217.52s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part4.parquet
[validation] shard 6/16: images=86,526, post≈64,784


batches: 100%|██████████| 43/43 [2:35:08<00:00, 216.47s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part5.parquet
[validation] shard 7/16: images=87,177, post≈65,034


batches: 100%|██████████| 43/43 [2:34:06<00:00, 215.04s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part6.parquet
[validation] shard 8/16: images=86,066, post≈64,621


batches: 100%|██████████| 43/43 [2:31:59<00:00, 212.08s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part7.parquet
[validation] shard 9/16: images=86,382, post≈64,711


batches: 100%|██████████| 43/43 [2:32:09<00:00, 212.31s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part8.parquet
[validation] shard 10/16: images=86,148, post≈64,478


batches: 100%|██████████| 43/43 [2:31:27<00:00, 211.34s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part9.parquet
[validation] shard 11/16: images=86,721, post≈64,999


batches: 100%|██████████| 43/43 [2:33:51<00:00, 214.68s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part10.parquet
[validation] shard 12/16: images=86,933, post≈65,200


batches: 100%|██████████| 43/43 [2:34:07<00:00, 215.06s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part11.parquet
[validation] shard 13/16: images=85,346, post≈64,307


batches: 100%|██████████| 42/42 [2:30:52<00:00, 215.54s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part12.parquet
[validation] shard 14/16: images=87,678, post≈65,100


batches: 100%|██████████| 43/43 [2:34:44<00:00, 215.92s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part13.parquet
[validation] shard 15/16: images=85,966, post≈64,489


batches: 100%|██████████| 42/42 [2:31:01<00:00, 215.74s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part14.parquet
[validation] shard 16/16: images=86,187, post≈64,719


batches: 100%|██████████| 43/43 [2:30:47<00:00, 210.40s/it]  


 Saved: D:/dataset/img_features\validation_handcrafted_post.part15.parquet
Merge: D:/dataset/img_features\validation_handcrafted_post.parquet  (1036205 post, 152 dim)
The function can be relaunched: shards already present are skipped (resume)


In [28]:
# Test
compute_and_save_split_sharded_resume("test", n_shards=16, batch_size=2048, cast_float16=True)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[test] shard 1/16 già presente → skip
[test] shard 2/16 già presente → skip
[test] shard 3/16 già presente → skip
[test] shard 4/16 già presente → skip
[test] shard 5/16 già presente → skip
[test] shard 6/16 già presente → skip
[test] shard 7/16: images=90,882, post≈65,739


batches: 100%|██████████| 45/45 [2:38:43<00:00, 211.63s/it]  
  mat = mat.astype(np.float16)


 Saved: D:/dataset/img_features\test_handcrafted_post.part6.parquet
[test] shard 8/16: images=89,990, post≈65,462


batches: 100%|██████████| 44/44 [2:37:49<00:00, 215.22s/it]  


 Saved: D:/dataset/img_features\test_handcrafted_post.part7.parquet
[test] shard 9/16: images=89,799, post≈65,198


batches: 100%|██████████| 44/44 [2:36:09<00:00, 212.94s/it]  


 Saved: D:/dataset/img_features\test_handcrafted_post.part8.parquet
[test] shard 10/16: images=91,336, post≈65,661


batches: 100%|██████████| 45/45 [2:37:11<00:00, 209.58s/it]  


 Saved: D:/dataset/img_features\test_handcrafted_post.part9.parquet
[test] shard 11/16: images=90,555, post≈65,669


batches: 100%|██████████| 45/45 [2:30:10<00:00, 200.23s/it]  


 Saved: D:/dataset/img_features\test_handcrafted_post.part10.parquet
[test] shard 12/16: images=90,780, post≈65,844


batches: 100%|██████████| 45/45 [2:40:45<00:00, 214.35s/it]  


 Saved: D:/dataset/img_features\test_handcrafted_post.part11.parquet
[test] shard 13/16: images=90,502, post≈65,478


batches: 100%|██████████| 45/45 [2:37:30<00:00, 210.01s/it]  


 Saved: D:/dataset/img_features\test_handcrafted_post.part12.parquet
[test] shard 14/16: images=91,204, post≈65,959


batches: 100%|██████████| 45/45 [2:44:11<00:00, 218.92s/it]  


 Saved: D:/dataset/img_features\test_handcrafted_post.part13.parquet
[test] shard 15/16: images=89,996, post≈65,653


batches: 100%|██████████| 44/44 [2:45:54<00:00, 226.23s/it]  


 Saved: D:/dataset/img_features\test_handcrafted_post.part14.parquet
[test] shard 16/16: images=91,212, post≈65,993


batches: 100%|██████████| 45/45 [2:44:47<00:00, 219.73s/it]  


 Saved: D:/dataset/img_features\test_handcrafted_post.part15.parquet
Merge: D:/dataset/img_features\test_handcrafted_post.parquet  (1050569 post, 152 dim)
The function can be relaunched: shards already present are skipped (resume)


In [2]:
DB_PATH = r"D:/db/meta.duckdb"

con = duckdb.connect(DB_PATH)

In [None]:
# Create table with handcrafted features

con.execute("""
CREATE OR REPLACE TABLE features.img_handcrafted_post AS
SELECT *
FROM read_parquet([
  'D:/dataset/img_features/train_handcrafted_post.parquet',
  'D:/dataset/img_features/validation_handcrafted_post.parquet',
  'D:/dataset/img_features/test_handcrafted_post.parquet'
]);
""")

In [26]:
con.execute("""CREATE OR REPLACE TABLE md1718 AS
SELECT * FROM metadata1718_ready WHERE split = 'validation' OR split = 'test'
UNION ALL
SELECT * FROM train_balanced""")

con.sql("""SELECT COUNT(*) FROM md1718""").df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,count_star()
0,1609426


In [27]:
# Let's add the metadata split
con.execute("""
CREATE OR REPLACE TABLE features.img_handcrafted AS
SELECT
    i.*
    m.split                
FROM features.img_handcrafted_post i
JOIN md1718 m
  ON i.post_id = m.post_id;
""")

con.sql("SELECT COUNT(*) FROM features.img_handcrafted").df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,count_star()
0,1611159


In [31]:
con.sql("""SELECT COUNT(*) - COUNT(DISTINCT post_id) AS duplicates from features.img_handcrafted""").df()

Unnamed: 0,duplicates
0,1733


In [33]:
con.execute("""CREATE OR REPLACE TABLE features.img_handcrafted AS
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY post_id ORDER BY post_id) AS rn
    FROM features.img_handcrafted
)
WHERE rn = 1;
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x2a1b81260b0>

In [34]:
con.sql("""SELECT COUNT(*) - COUNT(DISTINCT post_id) AS duplicates from features.img_handcrafted""").df()

Unnamed: 0,duplicates
0,0


In [35]:
# Check
print(con.sql("""
SELECT split, COUNT(*) AS n
FROM features.img_handcrafted
GROUP BY split
ORDER BY 1;
"""))

┌────────────┬────────┐
│   split    │   n    │
│  varchar   │ int64  │
├────────────┼────────┤
│ test       │ 423604 │
│ train      │ 773497 │
│ validation │ 412325 │
└────────────┴────────┘



In [36]:
# Number of images from manifest (il numero è più alto perchè abbiamo poi una riga per ogni post dato che calcoliamo la running mean in caso di più immagini per post)
print(con.sql("""
SELECT split, COUNT(*) AS n_images
FROM md1718 m JOIN images_manifest1718_clean i ON m.post_id = i.post_id
GROUP BY split"""))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌────────────┬──────────┐
│   split    │ n_images │
│  varchar   │  int64   │
├────────────┼──────────┤
│ train      │   960048 │
│ test       │   588557 │
│ validation │   556982 │
└────────────┴──────────┘



In [38]:
splits = con.sql("""
    SELECT DISTINCT split FROM features.img_handcrafted
""").df()["split"]

for s in splits:
    con.sql(f"""
        COPY (
            SELECT *
            FROM features.img_handcrafted
            WHERE split = '{s}'
        ) TO 'D:/dataset/img_features_final/{s}_handcrafted_post.parquet' (FORMAT 'parquet');
    """)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [39]:
print(con.sql("""SELECT f.post_id, f.split 
FROM features.img_handcrafted f LEFT JOIN images_manifest1718_clean i ON f.post_id = i.post_id
WHERE i.post_id IS NULL
"""))

┌─────────┬─────────┐
│ post_id │  split  │
│ varchar │ varchar │
├─────────┴─────────┤
│      0 rows       │
└───────────────────┘



In [40]:
con.close()