# Obtaining Data from ChampKoi

In [1]:
# Imports

from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import uuid
import re
from html import unescape

In [2]:
# Define the website

BASE_URL = 'https://www.champkoi.com/collections/all-koi'
START_URL = 'https://www.champkoi.com/collections/all-koi?page={}'
website = requests.get(BASE_URL)

# Define the output directory

OUTPUT_DIR = 'champkoi_data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Check status

website.status_code

200

In [4]:
# Get the object

soup = BeautifulSoup(website.text, "html.parser")

In [5]:
# soup

In [6]:
product = soup.select("ul.products > li.product")

In [7]:
# product

In [8]:
data_collected = {}

BASE_URL    = "https://www.champkoi.com"
COLLECT_URL = BASE_URL + "/collections/all-koi/products.json"
PER_PAGE    = 250   # Shopify’s max per page

def scrape_page(page_num):
    params = {"limit": PER_PAGE, "page": page_num}
    resp = requests.get(COLLECT_URL, params=params)
    resp.raise_for_status()
    products = resp.json().get("products", [])
    if not products:
        return False

    for prod in products:
        sku    = str(prod["id"])
        raw_title = prod["title"].strip()
        handle = prod["handle"]
        link   = f"{BASE_URL}/products/{handle}"
        price  = prod["variants"][0]["price"]
        in_stock = prod["variants"][0]["available"]

        # — extract variety from the first option —
        options = prod.get("options", [])              # e.g. ["Variety","Size"]
        variant = prod["variants"][0]                  # take the first variant
        if "Variety" in options:
            idx     = options.index("Variety") + 1     # 1-based
            variety = variant.get(f"option{idx}", "").strip()
        else:
            # fallback: assume option1 is variety
            variety = variant.get("option1", "").strip()

        # build a title that **includes** the variety up front
        title = f"{variety} — {raw_title}" if variety else raw_title

        # strip HTML from description
        raw_html    = prod.get("body_html", "")
        description = unescape(re.sub(r'<[^>]+>', '', raw_html)).strip()

        category = prod.get("product_type", "")
        tags     = prod.get("tags", [])

        # first image
        imgs    = prod.get("images", [])
        img_url = imgs[0]["src"] if imgs else None

        # download image
        if img_url:
            ext      = os.path.splitext(img_url)[1].split("?")[0] or ".jpg"
            unit_id  = uuid.uuid4().hex[:6]
            safe     = re.sub(r'[\\/*?:"<>|]', "", title).replace(" ", "_")
            filename = f"{unit_id}_{safe}{ext}"
            path     = os.path.join(OUTPUT_DIR, filename)

            r_img = requests.get(img_url, stream=True)
            r_img.raise_for_status()
            with open(path, "wb") as f:
                for chunk in r_img.iter_content(8192):
                    if chunk:
                        f.write(chunk)
        else:
            path = None

        data_collected[unit_id] = {
            "sku":        sku,
            "variety":    variety,
            "title":      title,
            "price":      price,
            "in_stock":   in_stock,
            "link":       link,
            "description":description,
            "category":   category,
            "tags":       tags,
            "image_url":  img_url,
            "image_path": path
        }

    return True

In [9]:
if __name__ == "__main__":
    page = 1
    while scrape_page(page):
        print(f"✔ Page {page} done, total items: {len(data_collected)}")
        page += 1

    print(f"\n✅ Finished – collected {len(data_collected)} products.")

✔ Page 1 done, total items: 185

✅ Finished – collected 185 products.


In [10]:
df = pd.DataFrame(data_collected)

In [11]:
df = df.T

In [12]:
df

Unnamed: 0,sku,variety,title,price,in_stock,link,description,category,tags,image_url,image_path
87e22e,9689837273400,"A: 14"" (36cm) Female Showa [Isa]","A: 14"" (36cm) Female Showa [Isa] — 2025WINTER-012",450.00,False,https://www.champkoi.com/products/2025winter-012,,Group_Koi,[Showa],https://cdn.shopify.com/s/files/1/0749/6132/22...,champkoi_data/87e22e_A_14_(36cm)_Female_Showa_...
188b5a,9689846350136,"A: 14"" (36cm) Male Showa [Isa]","A: 14"" (36cm) Male Showa [Isa] — 2025WINTER-014",450.00,True,https://www.champkoi.com/products/2025winter-014,,Group_Koi,[Showa],https://cdn.shopify.com/s/files/1/0749/6132/22...,champkoi_data/188b5a_A_14_(36cm)_Male_Showa_[I...
911317,9689850118456,"A: 13"" (33cm) Male Showa [Isa]","A: 13"" (33cm) Male Showa [Isa] — 2025WINTER-015",450.00,True,https://www.champkoi.com/products/2025winter-015,,Group_Koi,[Showa],https://cdn.shopify.com/s/files/1/0749/6132/22...,champkoi_data/911317_A_13_(33cm)_Male_Showa_[I...
dd27e7,9689853329720,"A: 14"" (36cm) Female Showa [Isa]","A: 14"" (36cm) Female Showa [Isa] — 2025WINTER-016",450.00,False,https://www.champkoi.com/products/2025winter-016,,Group_Koi,[Showa],https://cdn.shopify.com/s/files/1/0749/6132/22...,champkoi_data/dd27e7_A_14_(36cm)_Female_Showa_...
de2cb4,9689970475320,"A: 14"" (36cm) Male Kohaku [Marusaka]","A: 14"" (36cm) Male Kohaku [Marusaka] — 2025WIN...",450.00,False,https://www.champkoi.com/products/2025winter-017,,Group_Koi,"[Kohaku, Shiro Utsuri, Shusui]",https://cdn.shopify.com/s/files/1/0749/6132/22...,champkoi_data/de2cb4_A_14_(36cm)_Male_Kohaku_[...
...,...,...,...,...,...,...,...,...,...,...,...
0db514,9876548682040,Default Title,Default Title — 25D-024 Kujaku,5800.00,True,https://www.champkoi.com/products/25d-024-kujaku,,Single_Koi,[Kujaku],https://cdn.shopify.com/s/files/1/0749/6132/22...,champkoi_data/0db514_Default_Title_—_25D-024_K...
79245b,9610889953592,Default Title,Default Title — 2024FALL-080 Mukashi Ogon,6800.00,True,https://www.champkoi.com/products/2024fall-080...,,Presale_Koi,[Mukashi Ogon],https://cdn.shopify.com/s/files/1/0749/6132/22...,champkoi_data/79245b_Default_Title_—_2024FALL-...
5a5805,9876550025528,Default Title,Default Title — 25D-031 Tancho Showa,6800.00,True,https://www.champkoi.com/products/25d-031-tanc...,,Single_Koi,[Tancho Showa],https://cdn.shopify.com/s/files/1/0749/6132/22...,champkoi_data/5a5805_Default_Title_—_25D-031_T...
18f755,8815742452024,"A: 32"" Female Kohaku [Nogami]","A: 32"" Female Kohaku [Nogami] — 2023FALL-109",14000.00,True,https://www.champkoi.com/products/2023fall-109,,Presale_Koi,[Kohaku],https://cdn.shopify.com/s/files/1/0749/6132/22...,champkoi_data/18f755_A_32_Female_Kohaku_[Nogam...


In [13]:
df.duplicated(subset='image_url').sum()

np.int64(0)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 185 entries, 87e22e to 52e490
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          185 non-null    object
 1   variety      185 non-null    object
 2   title        185 non-null    object
 3   price        185 non-null    object
 4   in_stock     185 non-null    object
 5   link         185 non-null    object
 6   description  185 non-null    object
 7   category     185 non-null    object
 8   tags         185 non-null    object
 9   image_url    185 non-null    object
 10  image_path   185 non-null    object
dtypes: object(11)
memory usage: 17.3+ KB


In [16]:
# df.to_csv('champkoi_data.csv', index=True)