# Rai AI – Dataset Builder (Rice & Durian) 📸🌾

This notebook downloads up to 120 candidate images per tag for **9 tags** (rice & durian), removes undersized, blurry, and near-duplicate images, keeps at most 90 images per tag, and **zips one file per tag** for **Azure Custom Vision**.

### How to use (Google Colab recommended)
1. Run each cell from top to bottom.
2. Edit `TAGS` if needed. Keep labels **exactly** as in your app.
3. After the last step, download the **ZIP files** from the `/content/datasets_zips` folder.

> Legal note: by default, images are fetched with **bing-image-downloader**, which retrieves publicly available images. Use them for research/MVP work only; if you commercialize, prefer datasets with clear licenses.

In [1]:
!pip -q install bing-image-downloader pillow imagehash opencv-python-headless tqdm
print("✅ Dependencies installed")

In [2]:
from pathlib import Path
import shutil, os, re
from PIL import Image
import imagehash
import cv2
from tqdm import tqdm
import numpy as np

# Working folders: images are stored under OUT_ROOT (one sub-folder per tag)
# and the per-tag archives are written to ZIPS_ROOT.
OUT_ROOT = Path("/content/datasets")
ZIPS_ROOT = Path("/content/datasets_zips")
for _folder in (OUT_ROOT, ZIPS_ROOT):
    _folder.mkdir(parents=True, exist_ok=True)

# Each entry is (label, search query): the label names the tag folder and the
# zip file, the query is the text handed to the image downloader.
TAGS = [
    ("rice_brown_spot", "rice brown spot disease leaf"),
    ("rice_leaf_blast", "rice leaf blast disease leaf"),
    ("rice_n_deficiency", "rice nitrogen deficiency leaf"),
    ("rice_potassium_def", "rice potassium deficiency leaf"),
    ("rice_healthy", "healthy rice leaf"),
    ("durian_anthracnose", "durian anthracnose leaf"),
    ("durian_n_deficiency", "durian nitrogen deficiency leaf"),
    ("durian_mealybug", "durian mealybug leaf"),
    ("durian_healthy", "healthy durian leaf"),
]

# Number of images requested from the downloader for every tag.
TARGET_PER_TAG = 120
# Minimum accepted image width / height in pixels.
MIN_W = 224
MIN_H = 224
# Images whose variance-of-Laplacian falls below this value count as blurry.
BLUR_THRESHOLD = 50.0
# Largest perceptual-hash distance still treated as a near duplicate.
HAMMING_NEAR_DUP = 4
print("Config ready")

In [3]:
from bing_image_downloader import downloader

def _unique_destination(folder, filename):
    """Return a path for *filename* inside *folder* that does not exist yet.

    A numeric suffix is appended to the stem until the name is free, so a
    rerun of this cell never overwrites images collected earlier.
    """
    candidate = folder / filename
    stem, suffix = candidate.stem, candidate.suffix
    counter = 1
    while candidate.exists():
        candidate = folder / f"{stem}_{counter}{suffix}"
        counter += 1
    return candidate


# Download the images for every (label, query) pair and gather them in
# OUT_ROOT/<label>.
for label, query in TAGS:
    out_dir = OUT_ROOT / label
    out_dir.mkdir(exist_ok=True, parents=True)
    print(f"\n🔎 Downloading: {label} ← '{query}'")
    try:
        downloader.download(query,
                            limit=TARGET_PER_TAG,
                            output_dir=str(OUT_ROOT),
                            adult_filter_off=True,
                            force_replace=False,
                            timeout=60)
    except Exception as exc:
        # A failing query must not abort the remaining tags; whatever was
        # already saved for this query is still collected below.
        print(f"⚠️ Download failed for {label}: {exc}")

    # The downloader output is expected in a folder named after the query;
    # move its files into the label folder.
    qdir = OUT_ROOT / query
    if not qdir.is_dir():
        continue
    # Take a snapshot of the listing first: files are moved out of the tree
    # while we loop, which must not happen under a live directory iterator.
    downloaded = sorted(f for f in qdir.rglob('*') if f.is_file())
    for p in downloaded:
        dest = _unique_destination(out_dir, p.name)
        try:
            shutil.move(str(p), str(dest))
        except OSError as exc:
            # Best effort: report the file and carry on with the others.
            print(f"⚠️ Could not move {p.name}: {exc}")
    shutil.rmtree(qdir, ignore_errors=True)
print("\n✅ Download step finished")

In [4]:
def is_blurry(p, threshold=None):
    """Return True when the image at *p* is blurry or cannot be decoded.

    Sharpness is measured as the variance of the Laplacian of the grayscale
    image; a value below the threshold is reported as blurry.

    Args:
        p: Path of the image file (``str`` or ``Path``).
        threshold: Cutoff for the variance of the Laplacian. Defaults to the
            module-level ``BLUR_THRESHOLD`` when omitted.

    Returns:
        True if the file is empty, cannot be decoded as an image, or its
        sharpness score is below the threshold; False otherwise.
    """
    if threshold is None:
        threshold = BLUR_THRESHOLD
    # The raw bytes are read first and decoded from memory.
    data = np.fromfile(str(p), dtype=np.uint8)
    if data.size == 0:
        # An empty buffer cannot be decoded; treat the file as unusable.
        return True
    img = cv2.imdecode(data, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return True
    focus_measure = cv2.Laplacian(img, cv2.CV_64F).var()
    return bool(focus_measure < threshold)

def clean_dir(tag_dir: Path) -> None:
    """Delete unusable images from *tag_dir* in place.

    Only files directly inside *tag_dir* are considered. Three passes run in
    this order:

    1. Remove files Pillow cannot open and images smaller than
       ``MIN_W`` x ``MIN_H``.
    2. Remove images that ``is_blurry`` flags.
    3. Remove near duplicates: an image is dropped when its perceptual hash
       is within ``HAMMING_NEAR_DUP`` of an image that was already kept.

    Files are visited in sorted path order so that repeated runs keep the
    same file out of every group of duplicates.

    Args:
        tag_dir: Folder holding the images of one tag. A missing folder is
            ignored.
    """
    if not tag_dir.is_dir():
        return

    def _files():
        # Fresh, sorted listing for each pass (earlier passes delete files).
        return sorted(p for p in tag_dir.iterdir() if p.is_file())

    # Pass 1: drop non-images and images that are too small.
    for p in _files():
        try:
            with Image.open(p) as im:
                width, height = im.size
            unusable = width < MIN_W or height < MIN_H
        except Exception:
            # Pillow raises several unrelated exception types for broken
            # files; any failure to open means the file is not usable.
            unusable = True
        # Unlink only after the handle is closed (an open file cannot be
        # removed on Windows).
        if unusable:
            p.unlink(missing_ok=True)

    # Pass 2: drop blurry images.
    for p in _files():
        try:
            blurry = is_blurry(str(p))
        except Exception as exc:
            # Best effort: keep the file, but say that it was not checked.
            print(f"⚠️ Blur check failed for {p.name}, keeping it: {exc}")
            continue
        if blurry:
            p.unlink(missing_ok=True)

    # Pass 3: drop near duplicates by perceptual hash.
    kept_hashes = []
    for p in _files():
        try:
            with Image.open(p) as im:
                digest = imagehash.phash(im)
        except Exception:
            p.unlink(missing_ok=True)
            continue
        # Subtracting two hashes yields their Hamming distance.
        if any(digest - seen <= HAMMING_NEAR_DUP for seen in kept_hashes):
            p.unlink(missing_ok=True)
        else:
            kept_hashes.append(digest)

# Run the cleaning passes on the folder of every configured tag; the search
# query half of each TAGS entry is not needed here.
for tag_name, _query in TAGS:
    print(f"🧹 Cleaning {tag_name}")
    tag_folder = OUT_ROOT / tag_name
    clean_dir(tag_folder)
print("\n✅ Cleaning complete")

In [5]:
# Upper bound on the number of images kept for each tag.
MAX_KEEP = 90


def _natural_key(path):
    """Sort key that compares digit runs in the file name numerically.

    With it ``Image_2`` sorts before ``Image_10``, so the lowest-numbered
    files are the ones that survive the cut below.
    """
    # re.split with a capturing group alternates text (even positions) and
    # digit runs (odd positions), so keys always compare like with like.
    pieces = re.split(r"(\d+)", path.name)
    natural = [int(tok) if i % 2 else tok.lower() for i, tok in enumerate(pieces)]
    # The raw name breaks ties between names that differ only in case.
    return natural, path.name


# Trim every tag folder down to at most MAX_KEEP files.
for label, _ in TAGS:
    d = OUT_ROOT / label
    if not d.is_dir():
        print(f"⚠️ Skipping {label}: folder not found")
        continue
    files = sorted((p for p in d.iterdir() if p.is_file()), key=_natural_key)
    # Slicing past the end gives an empty list, so small folders are untouched.
    for p in files[MAX_KEEP:]:
        p.unlink(missing_ok=True)
print(f"✅ Downselected to ≤{MAX_KEEP} per tag")

In [6]:
import shutil
# Create one zip archive per tag in ZIPS_ROOT, named <label>.zip and
# containing the files of OUT_ROOT/<label>.
for label, _ in TAGS:
    src = OUT_ROOT / label
    zip_base = ZIPS_ROOT / label
    # An archive without images is useless for upload; say so instead of
    # silently producing an empty zip.
    if not src.is_dir() or not any(p.is_file() for p in src.iterdir()):
        print(f'⚠️ Skipping {label}: no images to zip')
        continue
    try:
        # make_archive appends the ".zip" extension to the base name itself.
        shutil.make_archive(str(zip_base), 'zip', root_dir=src)
        print('📦', str(zip_base) + '.zip')
    except Exception as e:
        # Report the failure and continue with the remaining tags.
        print('Zip error', label, e)
print(f"\n✅ All done. Download zips from {ZIPS_ROOT}")