In [1]:
!pip install icrawler
!pip install ImageHash



In [2]:
from icrawler.builtin import BingImageCrawler
from pathlib import Path


# Report the installed PyTorch build and whether a CUDA device can be used
import torch

print(torch.__version__)

has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if has_cuda:
    # Name of device 0, i.e. the first visible GPU
    print(torch.cuda.get_device_name(0))
    

2.1.2
CUDA available: True
NVIDIA GeForce RTX 4070 Laptop GPU


# Prepare Data Folders
Create one directory per category (manga, manhwa, manhua) to hold the raw downloaded images.

In [3]:
import os

# All data for this experiment lives under ./v0.1, relative to the working directory
here = Path(os.getcwd(), "v0.1")
root = here.joinpath("data", "raw")

# One raw-image folder per category; safe to re-run because of exist_ok=True
for category in ("manga", "manhwa", "manhua"):
    root.joinpath(category).mkdir(parents=True, exist_ok=True)

# Define Search Queries
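Each category gets its own list of Bing search queries. The `-term` exclusions and `site:` filters are there to keep images from the other two categories out of the results.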

In [4]:
# Search strings per category. Keys double as the folder names under the raw-data
# root and therefore as the class labels; "-term" excludes a word from the results
# and "site:" restricts them to one domain.
queries = dict(
    manga=[
        "manga page -manhwa -manhua",
        "manga panel site:ja.wikipedia.org",
    ],
    manhwa=[
        "manhwa panel -manhua -manga",
        "webtoon panel -manga -manhua",
        "웹툰 패널 site:comic.naver.com",
        "webtoon panel site:webtoons.com -manga -manhua",
    ],
    manhua=[
        "manhua panel -manhwa -manga",
        "国漫 漫画 分镜 -webtoon",
        "中国 漫画 分镜 -webtoon",
        "manhua site:bilibili.com -webtoon -manhwa -manga",
    ],
)

# Image Crawling Function

In [5]:
def crawl_images(label: str, n_per_query: int) -> None:
    """
    Download Bing image results for one category into ``root / label``.

    Each query in ``queries[label]`` is crawled with its own
    ``BingImageCrawler`` writing into the same folder.

    The download is skipped when the category folder already contains at
    least one regular, non-hidden file. Hidden entries (e.g. ``.DS_Store``)
    and sub-directories are ignored, so they cannot make an empty folder
    look as if it had already been populated.

    Args:
        label: Key of ``queries``; also the sub-folder name under ``root``.
        n_per_query: Passed to the crawler as ``max_num`` for every query.

    Raises:
        KeyError: If the download is not skipped and ``label`` is not a key
            of ``queries``.
    """
    save_path = root / label

    already_downloaded = save_path.is_dir() and any(
        entry.is_file() and not entry.name.startswith(".")
        for entry in save_path.iterdir()
    )
    if already_downloaded:
        print(f"Images for '{label}' already exist. Skipping download.")
        return

    for q in queries[label]:
        crawler = BingImageCrawler(storage={'root_dir': str(save_path)})
        crawler.crawl(
            keyword=q,
            max_num=n_per_query,
            min_size=(256, 256),
            # NOTE(review): per the icrawler docs, 'auto' continues file numbering
            # after the files already in the folder, so later queries should not
            # overwrite earlier ones — confirm against the installed version.
            file_idx_offset='auto'
        )




#  Crawl Images for All Categories

In [6]:
# Number of images requested per search query (forwarded as n_per_query)
N_PER_QUERY = 150

# Iterating the dict yields its keys, i.e. the category labels
for label in queries.keys():
    crawl_images(label, n_per_query=N_PER_QUERY)

2025-08-11 22:12:54,378 - INFO - icrawler.crawler - start crawling...
2025-08-11 22:12:54,379 - INFO - icrawler.crawler - starting 1 feeder threads...
2025-08-11 22:12:54,380 - INFO - icrawler.crawler - starting 1 parser threads...
2025-08-11 22:12:54,381 - INFO - icrawler.crawler - starting 1 downloader threads...
2025-08-11 22:12:54,749 - INFO - parser - parsing result page https://www.bing.com/images/async?q=manga page -manhwa -manhua&first=0
2025-08-11 22:12:55,001 - INFO - downloader - image #1	https://cdn.mangaclash.com/manga_5f75a1cb8c75c/31beba2bd0516e94fccb9b147643fa5a/7.jpg
2025-08-11 22:12:55,389 - INFO - downloader - image #2	https://wallpapers.com/images/hd/manga-pages-yjxwq1kmwmmeg1d6.jpg
2025-08-11 22:12:55,485 - INFO - downloader - image #3	https://dthezntil550i.cloudfront.net/py/latest/py2104292348463810008012142/1280_960/3561bab5-cd12-4090-afec-14be051cd162.jpg
2025-08-11 22:12:55,710 - INFO - downloader - image #4	https://static.vecteezy.com/system/resources/previews

# Prepare Data for Training

In [7]:
from fastai.vision.all import *

# Describe how files on disk become (image, category) training pairs
comic_block = DataBlock(
    blocks=(ImageBlock, CategoryBlock),               # Images → Labels
    get_items=get_image_files,                        # Find image files
    get_y=parent_label,                               # Labels = folder names
    splitter=RandomSplitter(valid_pct=0.2, seed=42),  # 80/20 train/valid split, fixed seed
    item_tfms=Resize(460),                            # Resize before batching
    batch_tfms=[                                      # Augment & normalize
        *aug_transforms(size=224),
        Normalize.from_stats(*imagenet_stats),
    ],
)

# Build the train/valid loaders from the raw-image root
dls = comic_block.dataloaders(root, shuffle=True)

# Train Model

In [8]:
# Build a resnet34-based learner on the loaders, tracking accuracy on the validation set
learn = vision_learner(dls, arch=resnet34, metrics=accuracy)

# Fine-tune for 5 epochs
learn.fine_tune(epochs=5)

epoch,train_loss,valid_loss,accuracy,time
0,1.701097,1.662319,0.444444,00:31




epoch,train_loss,valid_loss,accuracy,time
0,1.073709,1.191181,0.555556,00:26
1,1.00528,1.0423,0.650794,00:26
2,0.858196,0.950072,0.674603,00:26
3,0.732771,0.937576,0.730159,00:27
4,0.647361,0.921252,0.738095,00:28


# Manual Testing

In [13]:
# Class names taken from the learner's vocab
classes = list(learn.dls.vocab)
print(classes)

# Load an image to test
im = PILImage.create("v0.1/data/test/test1_manga.jpg")

# predict() returns three values; the middle one is not used here
predicted_class, _, probs = learn.predict(im)
print(f"This is a: {predicted_class}.")

predicted_idx = classes.index(predicted_class)

# Print one probability per class, pairing classes[i] with probs[i]
for idx in range(len(classes)):
    print(f"Probability it's a {classes[idx]}: {probs[idx]:.4f}")

['manga', 'manhua', 'manhwa']


This is a: manga.
Probability it's a manga: 0.9895
Probability it's a manhua: 0.0105
Probability it's a manhwa: 0.0000
