###  **1. 데이터 웹 크롤링 코드**

In [None]:
import os
import time
import shutil
from PIL import Image, ImageStat
from icrawler.builtin import GoogleImageCrawler
import imagehash
import numpy as np
import cv2
from tqdm import tqdm
import matplotlib.pyplot as plt

# Target size in pixels that the crawl helpers below pass to resize_images_in_folder,
# which forwards it unchanged to PIL's Image.resize.
resize_size = (512, 512)

def is_blurry(image, threshold=100):
    """Report whether an image looks blurry, based on the variance of its Laplacian.

    Args:
        image: An RGB image that ``np.array`` can convert (the caller in this
            file passes a PIL image converted to "RGB").
        threshold: Laplacian variance below this value counts as blurry.

    Returns:
        True when the variance is below ``threshold``. False otherwise, and
        also False when the image cannot be analysed at all.
    """
    try:
        gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
        sharpness = cv2.Laplacian(gray, cv2.CV_64F).var()
    except Exception:
        # Best-effort filter: an image that cannot be analysed is kept rather
        # than discarded. Catching Exception (not a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate during long crawls.
        return False
    # Convert the NumPy boolean into a plain Python bool for callers.
    return bool(sharpness < threshold)

def is_mostly_black(image, threshold=15):
    """Tell whether an image is nearly black overall.

    The per-band means reported by ``ImageStat.Stat`` are averaged into one
    brightness figure, which is then compared against ``threshold``.

    Args:
        image: Image object accepted by ``ImageStat.Stat``.
        threshold: Average brightness below this value counts as black.

    Returns:
        True when the averaged band mean is below ``threshold``.
    """
    band_means = ImageStat.Stat(image).mean
    brightness = sum(band_means) / len(band_means)
    if brightness < threshold:
        return True
    return False

def resize_images_in_folder(folder_path, size):
    """Filter, resize and re-encode every image file directly inside a folder.

    Each file is opened and converted to RGB. Images judged blurry or mostly
    black are deleted. The rest are resized to ``size`` and saved as JPEG next
    to the original (same stem, ``.jpg`` extension); the original is removed
    when its name differs from the new one. Files that fail at any step are
    reported and deleted.

    Args:
        folder_path: Directory whose direct children are processed.
        size: Target size forwarded to ``Image.resize``.
    """
    for fname in os.listdir(folder_path):
        fpath = os.path.join(folder_path, fname)
        # Sub-directories are not images; trying to open and then delete them
        # would raise from the error handler and abort the whole run.
        if not os.path.isfile(fpath):
            continue
        try:
            # Close the source file before it is deleted or overwritten below;
            # the converted copy lives in memory and no longer needs it.
            with Image.open(fpath) as source:
                img = source.convert("RGB")
            if is_blurry(img) or is_mostly_black(img):
                os.remove(fpath)
                continue
            new_fpath = os.path.splitext(fpath)[0] + ".jpg"
            img.resize(size).save(new_fpath, "JPEG")
            if fpath != new_fpath:
                os.remove(fpath)
        except Exception as e:
            print(f"⚠️ 리사이징 실패: {fname} → {e}")
            try:
                os.remove(fpath)
            except OSError:
                # Clean-up is best effort: the file may already be gone or locked.
                pass

def remove_duplicate_images(folder_path, hash_size=16):
    """Delete perceptual-hash duplicates among the files directly inside a folder.

    Files are visited in sorted name order, so the first file of each group of
    identical hashes is the one kept. Files that cannot be hashed are reported
    and deleted.

    Args:
        folder_path: Directory whose direct children are examined.
        hash_size: ``hash_size`` forwarded to ``imagehash.phash``.
    """
    seen_hashes = set()
    # Sorting makes the surviving copy deterministic; os.listdir order is arbitrary.
    for fname in sorted(os.listdir(folder_path)):
        fpath = os.path.join(folder_path, fname)
        # Skip sub-directories: opening one fails, and deleting it from the
        # error handler would raise and stop the run.
        if not os.path.isfile(fpath):
            continue
        try:
            with Image.open(fpath) as img:
                img_hash = imagehash.phash(img, hash_size=hash_size)
        except Exception as e:
            print(f"⚠️ 해시 실패: {fname} → {e}")
            try:
                os.remove(fpath)
            except OSError:
                # Clean-up is best effort: the file may already be gone or locked.
                pass
            continue
        if img_hash in seen_hashes:
            os.remove(fpath)
            print(f"🗑️ 중복 제거: {fname}")
        else:
            seen_hashes.add(img_hash)

def compute_hashes_in_folder(folder_path, hash_size=16):
    """Collect the perceptual hashes, as strings, of the entries in a folder.

    Entries that cannot be opened or hashed are reported on stdout and left
    out of the result; nothing is deleted.

    Args:
        folder_path: Directory whose direct children are hashed.
        hash_size: ``hash_size`` forwarded to ``imagehash.phash``.

    Returns:
        A set containing ``str(phash)`` for every entry that hashed successfully.
    """
    digests = set()
    for entry in os.listdir(folder_path):
        target = os.path.join(folder_path, entry)
        try:
            with Image.open(target) as picture:
                digest = str(imagehash.phash(picture, hash_size=hash_size))
        except Exception as e:
            print(f"⚠️ 해시 계산 실패: {entry} → {e}")
        else:
            digests.add(digest)
    return digests

def _unique_destination(folder, fname):
    """Return a path inside ``folder`` for ``fname`` that does not exist yet."""
    stem, ext = os.path.splitext(fname)
    candidate = os.path.join(folder, fname)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f"{stem}_{counter}{ext}")
        counter += 1
    return candidate


def crawl_new_images_for_class(class_name, search_queries, main_folder, temp_folder, max_num=50, hash_size=16):
    """Crawl extra images for one class and merge the new ones into its folder.

    For each query, images are downloaded into a per-query sub-folder of
    ``temp_folder`` and cleaned with ``resize_images_in_folder``. Each one is
    then compared by perceptual hash against the images already in
    ``main_folder``: duplicates are deleted, new images are moved into
    ``main_folder``. A final ``remove_duplicate_images`` pass runs over
    ``main_folder``.

    Args:
        class_name: Label used only in progress messages.
        search_queries: Iterable of search keywords.
        main_folder: Destination folder for the class (created if missing).
        temp_folder: Scratch folder for downloads (created if missing).
        max_num: Maximum number of images requested per query.
        hash_size: ``hash_size`` used for every perceptual hash in this run.
    """
    os.makedirs(temp_folder, exist_ok=True)
    # The destination must exist before it is listed for hashes or moved into.
    os.makedirs(main_folder, exist_ok=True)
    existing_hashes = compute_hashes_in_folder(main_folder, hash_size=hash_size)

    for query in search_queries:
        query_folder = os.path.join(temp_folder, query.replace(" ", "_"))
        os.makedirs(query_folder, exist_ok=True)
        print(f"[{class_name}] 새로운 이미지 크롤링 중 - 검색어: '{query}'")
        crawler = GoogleImageCrawler(storage={"root_dir": query_folder})
        crawler.crawl(
            keyword=query,
            max_num=max_num,
            filters={"type": "photo", "size": "large"}
        )
        resize_images_in_folder(query_folder, resize_size)

        for fname in os.listdir(query_folder):
            fpath = os.path.join(query_folder, fname)
            try:
                with Image.open(fpath) as img:
                    h = str(imagehash.phash(img, hash_size=hash_size))
                if h in existing_hashes:
                    os.remove(fpath)
                    print(f"중복 이미지 제거됨: {fname}")
                else:
                    # Downloads from different queries can share a file name;
                    # pick a free name so an earlier image is never
                    # overwritten and the move never fails on an existing file.
                    dest_path = _unique_destination(main_folder, fname)
                    shutil.move(fpath, dest_path)
                    existing_hashes.add(h)
            except Exception as e:
                print(f"이미지 처리 오류 {fname}: {e}")

        if not os.listdir(query_folder):
            os.rmdir(query_folder)
        # Pause between queries before issuing the next crawl.
        time.sleep(5)

    remove_duplicate_images(main_folder, hash_size=hash_size)

In [None]:
# --- Search keywords to crawl, grouped by waste category ---
crawl_targets = {
    "wood": [
        "wood",
        "plywood pieces",
        "furniture wood pieces",
    ],
    "food_waste": [
        "egg shells",
        "duck egg shells",
        "quail egg shells",
        "ostrich egg shells",
        "walnut shells",
        "peanut shells",
        "chestnut shells",
        "acorn shells",
        "pineapple peels",
        "corn husks",
        "corn cobs",
        "green tea leaves",
        "herbal medicine residue",
        "pork bones",
        "beef bones",
        "chicken bones",
        "fish bones",
        "clam shells",
        "crab shells",
        "lobster shells",
        "rice husks",
        "food waste",
        "onion peels",
    ],
    "general_waste": [
        "ceramic dishes",
        "porcelain items",
    ],
    "paper": [
        "paper packs",
        "paper cups",
        "newspapers",
        "books",
        "notebooks",
        "cardboard boxes",
    ],
    "glass": [
        "glass bottles",
        "broken glass",
    ],
    "can": [
        "steel cans",
        "aluminum cans",
        "butane gas cans",
        "pesticide cans",
    ],
    "plastic": [
        "clear PET bottles",
        "colored PET bottles",
        "plastic bags",
    ],
    "styrofoam": [
        "styrofoam",
        "contaminated styrofoam",
    ],
    "battery": [
        "batteries",
        "AA batteries",
        "AAA batteries",
    ],
    "electronics": [
        "TVs",
        "refrigerators",
        "washing machines",
        "air conditioners",
        "computers",
        "mobile phones",
    ],
    "lighting": [
        "fluorescent lamps",
    ],
    "metal": [
        "scrap metal",
        "iron pipes",
    ],
    "clothing": [
        "clothes",
        "old clothes",
    ],
}

# Same categories, with " waste img" appended to every keyword.
modified_crawl_targets = {
    name: [f"{term} waste img" for term in terms]
    for name, terms in crawl_targets.items()
}

# Root folder that receives one sub-folder per category.
base_dir = r"C:\Users\Administrator\Downloads\end-to-end-image-scraper\downloaded_images"

In [None]:
def crawl_images_by_category(crawl_targets, base_dir, max_per_variant=100):
    """Download, clean and de-duplicate images for every query of every category.

    Each query gets its own sub-folder under ``base_dir/<category>/``. After a
    crawl finishes, that sub-folder is run through ``resize_images_in_folder``
    and ``remove_duplicate_images``.

    Args:
        crawl_targets: Mapping of category name to a list of search keywords.
        base_dir: Root folder under which category folders are created.
        max_per_variant: Maximum number of images requested per query.
    """
    for category in crawl_targets:
        keywords = crawl_targets[category]
        category_folder = os.path.join(base_dir, category)
        os.makedirs(category_folder, exist_ok=True)
        print(f"\n=== 카테고리: {category} ===")

        progress = tqdm(keywords, desc=f"Processing {category}", unit="query")
        for query in progress:
            folder_name = query.replace(" ", "_")
            sub_folder = os.path.join(category_folder, folder_name)
            os.makedirs(sub_folder, exist_ok=True)
            print(f"📦 크롤링: '{query}' → {sub_folder}")

            storage_cfg = {"root_dir": sub_folder}
            search_filters = {"type": "photo", "size": "large"}
            GoogleImageCrawler(storage=storage_cfg).crawl(
                keyword=query,
                max_num=max_per_variant,
                filters=search_filters,
            )

            # Post-process the download folder, then pause before the next query.
            resize_images_in_folder(sub_folder, resize_size)
            remove_duplicate_images(sub_folder)
            time.sleep(10)

In [None]:
def count_images_per_class(base_dir):
    """Count image files per category folder, including nested sub-folders.

    Args:
        base_dir: Root folder whose direct sub-directories are the categories.

    Returns:
        Dict mapping each category folder name to the number of files under it
        (recursively) whose name ends in .jpg, .jpeg or .png, case-insensitively.
        Plain files sitting directly in ``base_dir`` are not categories and are
        skipped, matching ``get_class_counts``.
    """
    image_exts = (".jpg", ".jpeg", ".png")
    counts = {}
    for category in os.listdir(base_dir):
        category_path = os.path.join(base_dir, category)
        # A stray file at the top level would otherwise appear as a category
        # with zero images.
        if not os.path.isdir(category_path):
            continue
        counts[category] = sum(
            1
            for _, _, files in os.walk(category_path)
            for f in files
            if f.lower().endswith(image_exts)
        )
    return counts

def visualize_image_counts(base_dir):
    """Show a bar chart of the per-category image counts found under ``base_dir``.

    Counts come from ``count_images_per_class``; the chart is displayed with
    ``plt.show()`` and nothing is returned.
    """
    per_class = count_images_per_class(base_dir)
    labels = list(per_class)
    heights = [per_class[label] for label in labels]

    plt.figure(figsize=(12, 6))
    plt.bar(labels, heights, color='skyblue')

    # Axis labels and title.
    plt.xlabel("카테고리")
    plt.ylabel("이미지 개수")
    plt.title("카테고리별 이미지 수")

    # Rotate the category names on the x axis.
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# 크롤링 실행 (필요 시 주석 해제)
# crawl_images_by_category(modified_crawl_targets, base_dir, max_per_variant=300)
# visualize_image_counts(base_dir)

### **2. 부족한 클래스 재크롤링**

In [None]:
import os
import time
from PIL import Image, ImageStat
import imagehash
import numpy as np
import cv2
from icrawler.builtin import GoogleImageCrawler

# resize_images_in_folder, is_blurry, is_mostly_black and remove_duplicate_images are defined in the cells above.
def get_existing_hashes(base_dir, hash_size=8):
    """Collect perceptual hashes of every readable image under ``base_dir``.

    The tree is walked recursively. Files that cannot be opened or hashed are
    skipped silently, because non-image files are expected in the tree.

    Args:
        base_dir: Root folder to walk.
        hash_size: ``hash_size`` forwarded to ``imagehash.phash``. The default
            of 8 reproduces the earlier behaviour, which relied on imagehash's
            own default.
            NOTE(review): ``remove_duplicate_with_existing`` hashes with
            ``hash_size=16`` by default, and hashes of different sizes do not
            appear to compare equal — pass the same size to both when the
            results are meant to be compared.

    Returns:
        A set of ``imagehash`` hash objects.
    """
    existing_hashes = set()
    for root, _, files in os.walk(base_dir):
        for fname in files:
            fpath = os.path.join(root, fname)
            try:
                with Image.open(fpath) as img:
                    existing_hashes.add(imagehash.phash(img, hash_size=hash_size))
            except Exception:
                # Deliberate best effort: unreadable or non-image files are ignored.
                continue
    return existing_hashes

def remove_duplicate_with_existing(folder_path, existing_hashes, hash_size=16):
    """Delete files in a folder whose perceptual hash is already known.

    Every file directly inside ``folder_path`` is hashed. A file whose hash is
    already in ``existing_hashes`` is deleted; otherwise its hash is added to
    ``existing_hashes``, which is updated in place so later calls see it.
    Files that cannot be hashed are reported and deleted.

    Args:
        folder_path: Directory whose direct children are examined.
        existing_hashes: Mutable set of ``imagehash`` hash objects; modified in place.
        hash_size: ``hash_size`` forwarded to ``imagehash.phash``.
    """
    # Sorting makes the surviving copy deterministic; os.listdir order is arbitrary.
    for fname in sorted(os.listdir(folder_path)):
        fpath = os.path.join(folder_path, fname)
        # Skip sub-directories: opening one fails, and deleting it from the
        # error handler would raise and stop the run.
        if not os.path.isfile(fpath):
            continue
        try:
            with Image.open(fpath) as img:
                img_hash = imagehash.phash(img, hash_size=hash_size)
        except Exception as e:
            print(f"⚠️ 해시 실패: {fname} → {e}")
            try:
                os.remove(fpath)
            except OSError:
                # Clean-up is best effort: the file may already be gone or locked.
                pass
            continue
        if img_hash in existing_hashes:
            os.remove(fpath)
            print(f"🗑️ 기존 중복 제거: {fname}")
        else:
            existing_hashes.add(img_hash)

def get_class_counts(base_dir):
    """Count the files sitting directly inside each category folder.

    Only the direct sub-directories of ``base_dir`` are treated as categories,
    and only regular files directly inside each one are counted: nested
    folders are not descended into, and no extension filter is applied.

    Returns:
        Dict mapping category folder name to its direct file count.
    """
    tally = {}
    for entry in os.listdir(base_dir):
        folder = os.path.join(base_dir, entry)
        if not os.path.isdir(folder):
            continue
        tally[entry] = sum(
            1
            for item in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, item))
        )
    return tally

def recrawl_category_with_modifier(category, modifier, save_path, max_per_variant=50, existing_hashes=None):
    """Crawl "<modifier> <category>" into a sub-folder and drop already-known images.

    Images are downloaded into ``save_path/<category>_<modifier>/`` (spaces
    replaced by underscores), cleaned with ``resize_images_in_folder`` and then
    filtered with ``remove_duplicate_with_existing``.

    Args:
        category: Category name, used in the search query and the folder name.
        modifier: Extra keyword placed before the category in the query.
        save_path: Folder under which the download sub-folder is created.
        max_per_variant: Maximum number of images requested.
        existing_hashes: Set of known perceptual hashes, updated in place.
            When omitted, the notebook-level ``existing_hashes`` variable is
            used if it exists (the earlier implicit behaviour); otherwise an
            empty set is used, so only duplicates within this download are
            removed.
    """
    if existing_hashes is None:
        # Backward-compatible fallback for callers that rely on the global
        # defined by the re-crawl cell.
        existing_hashes = globals().get("existing_hashes")
    if existing_hashes is None:
        existing_hashes = set()

    search_query = f"{modifier} {category}"
    sub_folder = f"{category.replace(' ', '_')}_{modifier.replace(' ', '_')}"
    output_path = os.path.join(save_path, sub_folder)
    os.makedirs(output_path, exist_ok=True)

    print(f"📦 재크롤링: {search_query} → {output_path}")
    crawler = GoogleImageCrawler(storage={"root_dir": output_path})
    crawler.crawl(
        keyword=search_query,
        max_num=max_per_variant,
        filters={"type": "photo", "size": "large"}
    )

    resize_images_in_folder(output_path, resize_size)
    remove_duplicate_with_existing(output_path, existing_hashes)
    # Pause before the caller issues the next crawl.
    time.sleep(10)


In [None]:
# Run the re-crawl.
# Start from an empty hash set and let remove_duplicate_with_existing fill it
# while it de-duplicates each category folder. Do not pre-seed the set with
# get_existing_hashes(base_dir): that call hashes the very files scanned
# below, so any matching entry would mark a file as a duplicate of itself and
# delete it.
existing_hashes = set()
for category in os.listdir(base_dir):
    category_path = os.path.join(base_dir, category)
    if os.path.isdir(category_path):
        remove_duplicate_with_existing(category_path, existing_hashes)

class_counts = get_class_counts(base_dir)
print("현재 클래스별 이미지 수:")
for cat, count in class_counts.items():
    print(f"{cat}: {count}")

# Re-crawl the classes that have too few images.
minimum_images = 200
extra_modifiers = ["recycled", "old", "broken", "dirty", "used", "disposed"]

for category, count in class_counts.items():
    if count < minimum_images:
        print(f"카테고리 '{category}'의 이미지가 부족합니다 ({count}개). 재크롤링을 시작합니다.")
        category_folder = os.path.join(base_dir, category)
        for modifier in extra_modifiers:
            recrawl_category_with_modifier(category, modifier, category_folder, max_per_variant=50)

# Print the final image counts.
# NOTE(review): get_class_counts only counts files directly inside each
# category folder, while recrawl_category_with_modifier saves into a
# sub-folder of it, so newly crawled images may not show up here — confirm.
final_counts = get_class_counts(base_dir)
print("최종 클래스별 이미지 수:")
for cat, count in final_counts.items():
    print(f"{cat}: {count}")

**데이터 크롤링은 저작권 문제로 인해 사용하지 않기로 결정**\
**-> 다양한 데이터셋을 이용하여 모델 학습 진행**

### **사전 학습 모델을 통해 데이터셋 분류**

In [None]:
import os
import torch
from torchvision import models, transforms
from PIL import Image
import pandas as pd
import json
import urllib.request

# Pretrained ResNet-50 in inference mode.
model = models.resnet50(pretrained=True)
model.eval()

# Resize, centre-crop to 224x224, convert to tensor and normalise per channel.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

# Fetch the ImageNet class-index file, reusing a local copy when one exists
# so that re-running the cell does not download it again.
url, filename = ("https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json", "imagenet_class_index.json")
if not os.path.exists(filename):
    urllib.request.urlretrieve(url, filename)
with open(filename, encoding="utf-8") as f:
    class_idx = json.load(f)
# Each JSON entry is a list; element 1 is used as the label.
idx2label = {int(k): v[1] for k, v in class_idx.items()}

# Folder of images to classify.
image_dir = r"C:\Users\Administrator\Downloads\capstone data\DATASET\TRAIN\R"
results = []

# Sorted so the rows of the saved predictions come out in a stable order.
for image_file in sorted(os.listdir(image_dir)):
    if not image_file.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue
    img_path = os.path.join(image_dir, image_file)
    try:
        # Close the file as soon as the RGB copy has been made.
        with Image.open(img_path) as raw:
            image = raw.convert('RGB')
    except OSError as e:
        # One unreadable image should not abort the whole run.
        print(f"⚠️ 이미지 로드 실패: {image_file} → {e}")
        continue
    input_tensor = transform(image).unsqueeze(0)

    with torch.no_grad():
        output = model(input_tensor)
    probs = torch.nn.functional.softmax(output[0], dim=0)
    confidence, pred_idx = torch.max(probs, dim=0)
    label = idx2label[pred_idx.item()]

    results.append({
        "filename": image_file,
        "predicted_label": label,
        "confidence": confidence.item()
    })

# Save the predictions as CSV and JSON.
pd.DataFrame(results).to_csv("predictions.csv", index=False)
with open("predictions.json", "w") as f:
    json.dump(results, f, indent=4)


**사전 학습 모델이 데이터셋 내의 이미지를 정확하게 분류하지 못함**\
**따라서 데이터를 직접 분류하여 모델의 정확도를 높이고자 함**

### **조사한 데이터셋 이미지 전처리 파이프라인**

In [None]:
import os
import hashlib
from PIL import Image, ImageOps

In [41]:
# Settings for the preprocessing cell below.
# DATA_DIR:    source tree; the preprocessing cell deletes files from it in place.
# CLEAN_DIR:   output root; the processed PNG files are written here.
# TARGET_SIZE: side length in pixels of the square output images.
# PADDING:     enables the padding branch of the preprocessing cell.
DATA_DIR    = r"C:\Users\Administrator\Desktop\data3\WashingMachine"
CLEAN_DIR   = r"C:\Users\Administrator\Desktop\WashingMachine_dataset"
TARGET_SIZE = 224
PADDING     = True

In [42]:
# --- 1. Readability check & exact-duplicate removal (MD5) ---
# NOTE: this step deletes files from DATA_DIR in place.
seen_hashes = set()
for root, _, files in os.walk(DATA_DIR):
    for fname in files:
        src_path = os.path.join(root, fname)
        try:
            # Close the handle before the file may be deleted below.
            with open(src_path, "rb") as fh:
                h = hashlib.md5(fh.read()).hexdigest()
        except OSError:
            # Unreadable file: drop it, but do not let a failed delete stop the run.
            try:
                os.remove(src_path)
            except OSError:
                pass
            continue
        if h in seen_hashes:
            os.remove(src_path)
        else:
            seen_hashes.add(h)

# --- 2. Extension check: delete everything that is not jpg/jpeg/png/bmp ---
for root, _, files in os.walk(DATA_DIR):
    for fname in files:
        if not fname.lower().endswith((".jpg", ".jpeg", ".png", ".bmp")):
            os.remove(os.path.join(root, fname))

# --- 3. Preprocess & save (PNG) ---
# The folder layout of DATA_DIR is mirrored under CLEAN_DIR.
# NOTE(review): files that differ only by extension (a.jpg / a.png) map to the
# same output name and the later one overwrites the earlier — confirm this
# cannot happen in the source data.
os.makedirs(CLEAN_DIR, exist_ok=True)
for root, _, files in os.walk(DATA_DIR):
    rel_dir = os.path.relpath(root, DATA_DIR)
    dst_dir = os.path.join(CLEAN_DIR, rel_dir)
    os.makedirs(dst_dir, exist_ok=True)

    for fname in files:
        src_p = os.path.join(root, fname)
        base, _ = os.path.splitext(fname)
        dst_p = os.path.join(dst_dir, base + ".png")

        try:
            # Close the source file once the RGB copy is in memory.
            with Image.open(src_p) as src_img:
                img = src_img.convert("RGB")
        except Exception:
            # Skip files that cannot be decoded as images.
            continue

        if PADDING:
            # Letterbox: scale the whole image to fit inside the target square
            # and fill the remainder with black. The earlier manual padding
            # ran after the short side had been scaled to TARGET_SIZE + 32,
            # so both sides were already larger than TARGET_SIZE, the pad
            # amounts were always zero, and every image was centre-cropped.
            img = ImageOps.pad(
                img,
                (TARGET_SIZE, TARGET_SIZE),
                method=Image.BILINEAR,
                color=(0, 0, 0),
                centering=(0.5, 0.5),
            )
        else:
            # Resize so the short side becomes TARGET_SIZE + 32, then
            # centre-crop to a TARGET_SIZE square.
            short = min(img.size)
            scale = (TARGET_SIZE + 32) / short
            img = img.resize((int(img.width * scale), int(img.height * scale)), Image.BILINEAR)
            img = ImageOps.fit(img, (TARGET_SIZE, TARGET_SIZE), method=Image.BILINEAR, centering=(0.5, 0.5))

        img.save(dst_p, format="PNG")

# --- 4. Show the result ---
print("⚙️ 전처리 완료:", CLEAN_DIR)
print("\n=== 샘플 파일 목록 (최대 10개) ===")
count = 0
for root, _, files in os.walk(CLEAN_DIR):
    for f in files:
        print(os.path.join(root, f))
        count += 1
        if count >= 10:
            break
    # Leave the outer walk as well once ten paths have been printed.
    if count >= 10:
        break


⚙️ 전처리 완료: C:\Users\Administrator\Desktop\WashingMachine_dataset

=== 샘플 파일 목록 (최대 10개) ===
C:\Users\Administrator\Desktop\WashingMachine_dataset\Washing_Machine_0.png
C:\Users\Administrator\Desktop\WashingMachine_dataset\Washing_Machine_1.png
C:\Users\Administrator\Desktop\WashingMachine_dataset\Washing_Machine_10.png
C:\Users\Administrator\Desktop\WashingMachine_dataset\Washing_Machine_100.png
C:\Users\Administrator\Desktop\WashingMachine_dataset\Washing_Machine_101.png
C:\Users\Administrator\Desktop\WashingMachine_dataset\Washing_Machine_102.png
C:\Users\Administrator\Desktop\WashingMachine_dataset\Washing_Machine_104.png
C:\Users\Administrator\Desktop\WashingMachine_dataset\Washing_Machine_107.png
C:\Users\Administrator\Desktop\WashingMachine_dataset\Washing_Machine_109.png
C:\Users\Administrator\Desktop\WashingMachine_dataset\Washing_Machine_11.png


### **ChromaDB 문서 임베딩 및 검색**

In [None]:
# 필요 패키지 설치 (주석 해제하여 최초 1회 실행)
# !pip install -U langchain-community
# !pip install langchain chromadb unstructured sentence-transformers docarray

import os
from langchain.document_loaders import UnstructuredWordDocumentLoader, PyPDFLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import BaseRetriever

# Create the local "chroma_db" folder up front; it matches the default
# persist_directory ("./chroma_db") of ingest_to_chromadb below.
os.makedirs("chroma_db", exist_ok=True)

In [None]:
def ingest_to_chromadb(
    source_path: str,
    persist_directory: str = "./chroma_db",
    collection_name: str = "waste_policy",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    embedding_model: str = "all-MiniLM-L6-v2"
) -> Chroma:
    """Load a .docx or .pdf file, split it into chunks and store them in Chroma.

    Args:
        source_path: Path to the document; the extension selects the loader.
        persist_directory: Folder in which the Chroma collection is stored.
        collection_name: Name of the Chroma collection.
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        embedding_model: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        The populated ``Chroma`` vector store.

    Raises:
        ValueError: If ``source_path`` does not end in .docx or .pdf.
    """
    # Pick the loader from the file extension.
    lowered = source_path.lower()
    if lowered.endswith(".docx"):
        loader = UnstructuredWordDocumentLoader(source_path)
    elif lowered.endswith(".pdf"):
        loader = PyPDFLoader(source_path)
    else:
        raise ValueError("지원하지 않는 파일 형식입니다. (.docx, .pdf 만 가능)")

    docs = loader.load()

    # Split into size-bounded chunks. MarkdownHeaderTextSplitter is not usable
    # here: it accepts neither a chunk size nor an overlap and offers no
    # split_documents(), and the loaded text is not Markdown. Imported locally
    # so this function works without changes to the notebook's import cell.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_documents(docs)

    # Embed the chunks and store them.
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    vectordb = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
        collection_name=collection_name
    )
    # NOTE(review): newer Chroma integrations appear to persist automatically
    # and may no longer expose persist(); call it only when it is available.
    if hasattr(vectordb, "persist"):
        vectordb.persist()
    return vectordb

def get_simple_retriever(vectordb: Chroma, k: int = 4) -> BaseRetriever:
    """Build a retriever from ``vectordb`` using similarity search.

    ``k`` is passed through to ``as_retriever`` inside ``search_kwargs``.
    """
    retriever_options = {
        "search_type": "similarity",
        "search_kwargs": {"k": k},
    }
    return vectordb.as_retriever(**retriever_options)

In [None]:
# Ingest the PDF with the default settings of ingest_to_chromadb, then wrap
# the resulting store in a similarity retriever (default k=4).
db = ingest_to_chromadb("document_data.pdf")
retriever = get_simple_retriever(db)