In [None]:
import sys
!{sys.executable} -m pip install aiohttp
!{sys.executable} -m pip install aiofiles
!{sys.executable} -m pip install beautifulsoup4

In [None]:
import os
import json
import asyncio
import aiohttp
import aiofiles
import glob
from bs4 import BeautifulSoup

In [None]:
async def fetch(url: str) -> BeautifulSoup:
    """Download ``url`` and return its body parsed as HTML.

    A new ``aiohttp.ClientSession`` is opened for this single request and
    closed before returning. The HTTP status code is not inspected: whatever
    body the server sends is handed to BeautifulSoup.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response text parsed with the ``html.parser`` backend.
    """
    # NOTE: ssl=False disables TLS certificate verification for the request.
    connector = aiohttp.TCPConnector(ssl=False)
    # No explicit ``loop=`` argument: it is deprecated in aiohttp, and a
    # session created inside a coroutine binds to the running loop by itself.
    async with aiohttp.ClientSession(connector=connector) as session:
        print(f'fetch url contents: {url}')
        async with session.get(url=url) as resp:
            text = await resp.text()
            return BeautifulSoup(text, "html.parser")

In [None]:
async def download_image(url, save_path):
    """Stream the resource at ``url`` into the file ``save_path``.

    The body is written in 1024-byte chunks to ``<save_path>.part`` and only
    moved onto ``save_path`` once the whole response has been received, so an
    interrupted transfer never leaves a truncated file at the final location.
    A response whose status is not 200 is reported with ``print`` and nothing
    is written.

    Args:
        url: Address of the image to download.
        save_path: Destination file path; its directory must already exist.

    Returns:
        None.
    """
    # NOTE: ssl=False disables TLS certificate verification for the request.
    connector = aiohttp.TCPConnector(ssl=False)
    # ``loop=`` is deprecated in aiohttp; the session uses the running loop.
    async with aiohttp.ClientSession(connector=connector) as session:
        print(f'download image: {url}')
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Failed to download image: {response.status}")
                return

            partial_path = f'{save_path}.part'
            try:
                async with aiofiles.open(partial_path, mode='wb') as file:
                    while True:
                        chunk = await response.content.read(1024)
                        if not chunk:
                            break
                        await file.write(chunk)
                # Publish the file in one step once it is complete.
                os.replace(partial_path, save_path)
            finally:
                # After a successful replace the partial file is gone; this
                # only cleans up when the transfer or the write failed.
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            print(f"Image downloaded and saved to: {save_path}")

In [None]:
async def save_image(src: str, folder: str, name: str) -> str:
    """Make sure the image at ``src`` is stored under ``data/images/<folder>``.

    The target file is ``<name>.jpg`` (name stripped of surrounding
    whitespace, path separators replaced by ``_``) inside
    ``data/images/<folder>`` below the current working directory. The folder
    is created when missing. The image is downloaded only when ``src`` is set
    and the target file does not exist yet; network and file errors raised by
    the download are printed instead of propagated.

    Args:
        src: URL of the image, or ``None`` when the page had no image source.
        folder: Sub-folder of ``data/images``; may contain ``/``.
        name: Base name of the file, typically text scraped from the page.

    Returns:
        Path of the image file relative to the current working directory.
    """
    # A separator inside a scraped name would point into a sub-directory that
    # was never created (or outside ``folder``), so flatten it.
    safe_name = name.strip().replace('/', '_').replace(os.sep, '_')
    folder_path = os.path.join('data', 'images', folder)
    file_path = os.path.join(folder_path, f'{safe_name}.jpg')

    cwd = os.getcwd()
    os.makedirs(os.path.join(cwd, folder_path), exist_ok=True)
    output_path = os.path.join(cwd, file_path)

    if src and not os.path.exists(output_path):
        # Await the download directly: run_coroutine_threadsafe is meant for
        # calls from another thread, and its discarded future hid every error.
        try:
            await download_image(src, output_path)
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
            # Best effort: one broken image must not abort the whole crawl.
            print(f'Failed to download image {src}: {exc!r}')
    return file_path

In [None]:
async def get_categories():
    """Scrape the category index page into a list of category records.

    Every ``div.cat-item`` on ``https://www.midlibrary.io/categories``
    becomes one dict with the keys ``name``, ``cover`` (path returned by
    ``save_image`` for the category image) and ``link`` (the ``href`` of the
    item's anchor).

    Returns:
        List of dicts, in page order.
    """
    page = await fetch(url='https://www.midlibrary.io/categories')
    categories = []
    for card in page.find_all("div", attrs={"class": "cat-item"}):
        anchor = card.find("a")
        href = anchor.attrs.get("href")
        # Assumes the anchor's first child is the image and its second child
        # wraps the caption - TODO confirm against the live markup.
        parts = list(anchor.children)
        image_src = parts[0].attrs.get('src')
        title = parts[1].find("div").find("div").text
        cover_path = await save_image(src=image_src, folder='categories', name=title)
        categories.append({'name': title, 'cover': cover_path, 'link': href})
    return categories

In [None]:
async def get_category(link: str, suffix: str = ''):
    """Collect the style entries of one category, following pagination.

    Starting at ``https://www.midlibrary.io/<link><suffix>``, every list item
    inside the ``div.shelf-3`` container is turned into a dict with the keys
    ``name``, ``cover`` (path returned by ``save_image``), ``link`` and
    ``sub_category``. While the page has an ``a.w-pagination-next`` element,
    its ``href`` becomes the next suffix and the following page is processed
    the same way.

    Args:
        link: Category path relative to the site root.
        suffix: Text appended to the URL of the first page that is fetched.

    Returns:
        List of dicts for all visited pages, in page order.
    """
    styles = []
    while True:
        soup = await fetch(url=f'https://www.midlibrary.io/{link}{suffix}')
        next_page = soup.find("a", attrs={"class": "w-pagination-next"})
        shelf = soup.find("div", attrs={"class": "shelf-3"})

        cards = shelf.find_all("div", attrs={"role": "listitem", "class": "cat-item-2 w-dyn-item"})
        for card in cards:
            anchor = card.find("a", attrs={"class": "cat-link w-inline-block"})
            href = anchor.attrs.get("href")
            src = anchor.find("img").attrs.get("src")
            name = card.find("div", attrs={"fs-cmssort-field": "name"}).text
            section = card.find("div", attrs={"fs-cmsfilter-field": "sections"})
            sub_title = section.text if section else None

            # Report entries whose image has no source attribute.
            if src is None:
                print(name)

            cover_path = await save_image(src=src, folder=link, name=name)
            styles.append({
                'name': name,
                'cover': cover_path,
                'link': href,
                'sub_category': sub_title,
            })

        if not next_page:
            return styles
        suffix = next_page.attrs.get("href")

In [None]:
async def get_page(link: str):
    """Scrape a single style page.

    Fetches ``https://www.midlibrary.io/<link>`` and extracts the hero image
    (``div.hero-275``), the biography text (``div._3-bio``) and every
    ``div.sample`` block. Images are stored through ``save_image`` in the
    folder named after ``link``; each sample image is named after its prompt
    text.

    Args:
        link: Page path relative to the site root.

    Returns:
        Dict with the keys ``hero`` (image path or ``None``), ``bio`` (text
        or ``None``) and ``samples`` (list of ``{'cover', 'prompt'}`` dicts).
    """
    soup = await fetch(url=f'https://www.midlibrary.io/{link}')
    hero_block = soup.find("div", attrs={"class": "hero-275"})
    bio_block = soup.find("div", attrs={"class": "_3-bio"})
    sample_blocks = soup.find_all("div", attrs={"class": "sample"})

    bio_text = bio_block.text if bio_block else None

    hero_path = None
    if hero_block is not None:
        hero_src = hero_block.find("img").attrs.get("src")
        hero_path = await save_image(src=hero_src, folder=link, name='hero')

    samples = []
    for block in sample_blocks:
        image_src = block.find("img").attrs.get("src")
        # The prompt text doubles as the image's file name.
        prompt = block.find("div", attrs={"class": "copy3"}).text
        cover_path = await save_image(src=image_src, folder=link, name=prompt)
        samples.append({'cover': cover_path, 'prompt': prompt})

    return {'hero': hero_path, 'bio': bio_text, 'samples': samples}



file_pattern = 'data/json/categories/*.json'
data = []
for file_name in glob.glob(file_pattern):
    with open(file_name) as file:
        data += json.load(file)

folder_path = os.path.join(os.getcwd(), 'data', 'json', 'styles')
os.makedirs(folder_path, exist_ok=True)
count = len(data)
i = 0
for item in data:
    i+=1
    link = item.get("link").replace("/","",1)

    print(f'get link {link}, current: {i} / {count}')

    data = await get_page(link= link)
    json_data = json.dumps(data)
    name = link.split("/")[-1]
    async with aiofiles.open(os.path.join(folder_path,  f'{name}.json' ), mode='w') as f:
        await f.write(json_data)



#await get_page(link= 'styles/3d-graffiti')

In [None]:
async def main():
    data = await get_categories()
    json_data = json.dumps(data)
    folder_path = os.path.join(os.getcwd(), 'data', 'json')
    os.makedirs(folder_path, exist_ok=True)
    async with aiofiles.open(os.path.join(folder_path, 'categories.json' ), mode='w') as f:
        await f.write(json_data)  

await main()      

In [None]:
async def start_download_category(link: str):
    """Scrape one category and store its entries as ``data/json/<link>.json``.

    Args:
        link: Category path relative to the site root; may contain ``/``, in
            which case the JSON file lands in the matching sub-directory.
    """
    data = await get_category(link=link)
    json_data = json.dumps(data)
    output_path = os.path.join(os.getcwd(), 'data', 'json', f'{link}.json')
    # Create the directory the file is actually written to, instead of a
    # fixed "categories" folder that only matches links starting with it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    async with aiofiles.open(output_path, mode='w') as f:
        await f.write(json_data)

async def main():
    async with aiofiles.open(os.path.join(os.getcwd(), 'data', 'json', 'categories.json' ), mode='r') as f:
        contents = await f.read()
        categories = json.loads(contents)
        for categoriy in categories:
            link = categoriy.get("link").replace("/", "", 1)
            print(link)
            await start_download_category(link=link)

await main()     