# Download and filter Danbooru images using metadata

In [1]:
import os, traceback, json, pickle, shutil
from pathlib import Path
import pandas as pd

In [None]:
## Download danbooru metadata
# Check out the source page https://www.gwern.net/Danbooru2021#rsync to get the actual download URL
#!rsync --verbose rsync://link/to/danbooru2021/metadata.json.tar.xz ./danbooru/

In [None]:
# Untar jsons
#!tar -xvf ./danbooru/metadata.json.tar.xz --directory ./danbooru/raw_meta

In [None]:
# Danbooru tag IDs consumed by valid_meta() when filtering posts.
# A post must carry SIMPLE_BACKGROUND to be kept.
SIMPLE_BACKGROUND = 412368
# WHITE_BACKGROUND only rejects a post when one of sketch_tags is also present.
WHITE_BACKGROUND = 515193
sketch_tags = [513837, 1931] # grayscale, sketch
# Exactly one tag from each of the three lists below must be present.
include_tags = [470575, 540830] # 1girl, 1boy
# Presumably hair-related tag IDs, going by the variable name -- TODO confirm against the tag table.
hair_tags = [87788, 16867, 13200, 10953, 16442, 11429, 15425, 8388, 5403, 16581, 87676, 16580, 94007, 403081, 468534]
# Presumably eye-related tag IDs, going by the variable name -- TODO confirm against the tag table.
eye_tags = [10959, 8526, 16578, 10960, 15654, 89189, 16750, 13199, 89368, 95405, 89228, 390186]
# Any single one of these tags rejects the post (tag names are not recorded here).
blacklist_tags = [63, 4751, 12650, 172609, 555246, 513475]

In [2]:
def valid_meta(meta):
    """Decide whether a post's metadata record passes every content filter.

    Args:
        meta: One parsed metadata record. Reads ``meta["rating"]`` and
            ``meta["tags"]``, where each tag is a mapping with ``id`` and
            ``category`` entries.

    Returns:
        True only when all of the following hold, otherwise False:
        the rating is ``"s"``; no category-'0' tag is blacklisted; the post
        is not both sketch-like and white-background; it has the
        simple-background tag; and it carries exactly one tag from each of
        ``include_tags``, ``hair_tags`` and ``eye_tags``.
    """
    if meta["rating"] != "s":
        return False

    # Only category '0' tags take part in the filtering
    # (presumably Danbooru's "general" category -- confirm against the dump docs).
    general_ids = {int(t['id']) for t in meta['tags'] if t['category'] == '0'}

    if any(banned in general_ids for banned in blacklist_tags):
        return False

    sketch_like = len(general_ids.intersection(sketch_tags)) >= 1
    if sketch_like and WHITE_BACKGROUND in general_ids:
        return False

    if SIMPLE_BACKGROUND not in general_ids:
        return False

    # Each group must match exactly once: zero or multiple matches reject.
    for group in (include_tags, hair_tags, eye_tags):
        if len(general_ids.intersection(group)) != 1:
            return False

    return True

In [3]:
def proc_record(meta):
    """Flatten one metadata record into the fields of a CSV row.

    Args:
        meta: One parsed metadata record. Reads ``id``, ``image_width``,
            ``image_height`` and ``tags`` (mappings with ``name`` and
            ``category`` entries).

    Returns:
        A 6-tuple ``(id, filename, width, height, tags, character)`` where
        ``id`` is the post id as a string, ``filename`` is the relative path
        ``<id % 1000, zero-padded to 4 digits>/<id>.jpg``, ``width`` and
        ``height`` are passed through unchanged from ``meta``, ``tags`` is the
        space-separated list of distinct category-'0' tag names, and
        ``character`` is the name of the first category-'4' tag or ``""``.
    """
    file_id = int(meta['id'])
    filename = f'{file_id%1000:04d}/{file_id}.jpg'

    # dict.fromkeys de-duplicates while keeping first-seen order, so the tags
    # column is identical from run to run (a plain set is ordered by string
    # hash, which is randomised per interpreter process).
    general_names = dict.fromkeys(
        tag['name'] for tag in meta['tags'] if tag['category'] == '0'
    )
    tags = " ".join(general_names)

    # Only the first category-'4' tag is kept; posts without one get "".
    char = next(
        (tag['name'] for tag in meta['tags'] if tag['category'] == '4'),
        "",
    )
    return (str(file_id), filename, meta['image_width'], meta['image_height'], tags, char)

In [4]:
def filter_size(meta):
    """Check that an image is large enough and roughly square.

    Args:
        meta: One parsed metadata record. Reads ``image_width`` and
            ``image_height``, each convertible with ``int()``.

    Returns:
        False when either dimension is not positive, when both dimensions
        are below 512, or when the width/height ratio is not strictly
        between 3/4 and 4/3. True otherwise.
    """
    w, h = int(meta['image_width']), int(meta['image_height'])

    # A zero height would raise ZeroDivisionError in the ratio test below;
    # an image with a non-positive dimension can never satisfy it anyway.
    if w <= 0 or h <= 0:
        return False

    # Rejected only when *both* sides are under 512.
    if w < 512 and h < 512:
        return False

    return 3 / 4 < w / h < 4 / 3

In [5]:
def proc_file(fname):
    """Convert one JSON-lines metadata file into a filtered CSV.

    Each non-blank line of ``fname`` is parsed as JSON; records accepted by
    ``valid_meta`` are flattened with ``proc_record`` and written as one row
    of ``./danbooru/clean_meta/<stem>.csv``. The output directory is created
    if it does not exist yet.

    Args:
        fname: ``pathlib.Path`` of the input file (``.stem`` is used to name
            the output file).

    Returns:
        None. The result is the CSV file written to disk.
    """
    # NOTE(review): filter_size() is defined in this notebook but is not
    # applied here -- confirm whether the size filter is meant to be used.
    import csv  # function-scope import keeps this cell self-contained

    out_dir = Path("./danbooru/clean_meta/")
    out_dir.mkdir(parents=True, exist_ok=True)
    outname = out_dir / (fname.stem + ".csv")

    # Explicit UTF-8 on both sides so the result does not depend on the
    # platform locale; newline='' is what the csv module expects for writing.
    with open(fname, 'r', encoding='utf-8') as src, \
            open(outname, 'w', encoding='utf-8', newline='') as out:
        # csv.writer quotes fields containing commas or quotes, which a plain
        # ",".join() would silently turn into extra/broken columns.
        writer = csv.writer(out, lineterminator="\n")
        writer.writerow(["id", "fname", "width", "height", "tags", "character"])
        for line in src:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            meta = json.loads(line)
            if not valid_meta(meta):
                continue
            writer.writerow(proc_record(meta))

In [6]:
from fastcore.parallel import parallel

# Convert every raw metadata JSON file to a filtered CSV, one proc_file call
# per input file, fanned out through fastcore's parallel() with a progress bar.
# Inputs are processed in sorted path order.
parallel(
    proc_file,
    sorted(Path("./danbooru/raw_meta/").glob("*.json")),
    progress=True,
)

In [17]:
# Join CSVs
# Join the per-file CSVs into a single table indexed by post id.
# The frames are read first and concatenated once: calling pd.concat inside
# the loop re-copies the accumulated table on every iteration. Paths are
# sorted so the row order of the combined file is reproducible.
# If no CSV exists, pd.concat raises ValueError("No objects to concatenate")
# instead of the script failing later on a None value.
frames = [
    pd.read_csv(fname, index_col='id')
    for fname in sorted(Path("./danbooru/clean_meta").glob("*.csv"))
]
twsss = pd.concat(frames)
twsss.to_csv("./danbooru/clean_metadata.csv")

In [None]:
# Write the filtered files to include.txt
with open("./danbooru/include.txt", 'w') as f:
    # One relative image path per line, taken from the 'fname' column of the
    # joined table; no trailing newline after the last entry.
    include_listing = "\n".join(twsss["fname"])
    f.write(include_listing)

In [41]:
# Download ONLY the files listed in include.txt
# Check out the source page https://www.gwern.net/Danbooru2021#rsync to get the actual download URL
#!rsync --recursive --verbose --files-from=./danbooru/include.txt rsync://link/to/danbooru2021/512px/ ./danbooru/512px/ 