# Obtaining Data from GC Koi

In [1]:
# Imports

from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import uuid
import re
from html import unescape

In [2]:
# # 250 is the maximum Shopify allows per request
# API_URL = "https://gckoi.com/collections/koi/products.json?limit=250"

# resp = requests.get(API_URL)
# resp.raise_for_status()
# data = resp.json()

# for prod in data["products"]:
#     title = prod["title"]
#     price = prod["variants"][0]["price"]
#     url   = prod["handle"]
#     print(f"{title} – ${price}  →  https://gckoi.com/products/{url}")

In [3]:
# Define the website

# Collection page fetched once below; note that BASE_URL is reassigned to the
# bare storefront root in the later CONFIG cell.
BASE_URL = 'https://gckoi.com/collections/koi'
# Paginated form of the same page; '{}' is the slot for the page number.
START_URL = 'https://gckoi.com/collections/koi?page={}'
# Always pass a timeout: without one, requests can wait forever on a stalled
# connection and the cell never finishes.
website = requests.get(BASE_URL, timeout=30)

# Define the output directory

OUTPUT_DIR = 'gckoi_data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
# Check status: show the HTTP status code of the collection-page response

website.status_code

200

In [5]:
# Get the object: parse the fetched page's HTML text with the "html.parser" backend

soup = BeautifulSoup(website.text, "html.parser")

In [6]:
# soup

##### What the HTML looks like (double-click to see)

In [7]:
# product = soup.select("article.card.epic-product-card")

In [8]:
# product

In [9]:
# ——— CONFIG ———
# Storefront root, and the JSON endpoint listing the products of the koi collection.
BASE_URL    = "https://gckoi.com"
COLLECT_URL = f"{BASE_URL}/collections/koi/products.json"
PER_PAGE    = 250        # Shopify max per request

# Make sure the image directory is present before anything gets downloaded.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Scraped records, keyed by the short id generated for each product.
data_collected = dict()

def _image_filename(img_url, title, unit_id):
    """Return the local file name ``<unit_id>_<sanitised title><ext>`` for an image URL."""
    # Cut the query string off *before* taking the extension, so a dotted query
    # value such as "?v=1.5" cannot be mistaken for the file extension.
    ext = os.path.splitext(img_url.split("?", 1)[0])[1] or ".jpg"
    # Strip characters that are problematic in file names, then tidy spaces
    # and en dashes.
    safe_title = re.sub(r'[\\/*?:"<>|]', "", title)
    safe_title = safe_title.replace(" ", "_").replace("–", "-")
    return f"{unit_id}_{safe_title}{ext}"


def _download_image(img_url, filepath, timeout):
    """Stream ``img_url`` to ``filepath``, closing the connection when done."""
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(img_url, stream=True, timeout=timeout) as r_img:
        r_img.raise_for_status()
        with open(filepath, "wb") as f:
            for chunk in r_img.iter_content(8192):
                if chunk:
                    f.write(chunk)


def scrape_page(page_num, timeout=30):
    """Fetch one page of the collection's product JSON and record every product on it.

    For each product a short random id is generated, the first image (if any)
    is downloaded into ``OUTPUT_DIR``, and a record is stored in the
    module-level ``data_collected`` dict under that id.

    Parameters
    ----------
    page_num : int
        Page number sent as the ``page`` query parameter.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 30).

    Returns
    -------
    bool
        ``True`` if the page contained products, ``False`` if it was empty
        (the caller uses this to stop paginating).

    Raises
    ------
    requests.HTTPError
        If the listing request or an image download returns an error status.
    """
    resp = requests.get(
        COLLECT_URL,
        params={"limit": PER_PAGE, "page": page_num},
        timeout=timeout,
    )
    resp.raise_for_status()

    products = resp.json().get("products", [])
    if not products:
        return False

    for prod in products:
        # 1) Basic fields (price / availability are read from the first variant)
        sku           = str(prod["id"])
        title         = prod["title"].strip()
        link          = f"{BASE_URL}/products/{prod['handle']}"
        first_variant = prod["variants"][0]
        price         = first_variant["price"]
        in_stock      = first_variant["available"]

        # 2) Description (strip HTML tags). `or ""` also covers a body_html
        #    that is present but null, which would otherwise crash re.sub.
        raw_html    = prod.get("body_html") or ""
        description = unescape(re.sub(r'<[^>]+>', '', raw_html)).strip()

        # 3) Category / tags
        #    Shopify product_type is often the “category”; you can also include tags
        category = prod.get("product_type", "")
        tags     = prod.get("tags") or []

        # 4) Main image URL
        images  = prod.get("images") or []
        img_url = images[0]["src"] if images else None

        # 5) Record key. Generated for EVERY product (not only those with an
        #    image) so an image-less product can neither raise NameError nor
        #    overwrite the previous product's record; regenerate on the rare
        #    collision of the 7-character id.
        unit_id = uuid.uuid4().hex[:7]
        while unit_id in data_collected:
            unit_id = uuid.uuid4().hex[:7]

        # 6) Download image
        if img_url:
            filepath = os.path.join(OUTPUT_DIR, _image_filename(img_url, title, unit_id))
            _download_image(img_url, filepath, timeout)
        else:
            filepath = None

        # 7) Store
        data_collected[unit_id] = {
            "sku":         sku,
            "title":       title,
            "price":       price,
            "in_stock":    in_stock,
            "link":        link,
            "description": description,
            "category":    category,
            "tags":        tags,
            "image_url":   img_url,
            "image_path":  filepath
        }

    return True

In [10]:
if __name__ == "__main__":
    # Start from an empty store: every record is keyed by a freshly generated
    # random id, so re-running this cell without clearing would append a
    # second copy of every product instead of replacing the first.
    data_collected.clear()

    # Walk the pages in order until scrape_page reports an empty page.
    page = 1
    while True:
        print(f"Scraping page {page}…")
        if not scrape_page(page):
            break
        page += 1

    # At the end, `data_collected` holds everything
    print(f"Done! Collected {len(data_collected)} products.")

Scraping page 1…
Scraping page 2…
Scraping page 3…
Done! Collected 408 products.


In [11]:
# Dict-of-dicts -> DataFrame: the outer keys (unit ids) become columns, the record fields become rows
df = pd.DataFrame(data_collected)

In [12]:
# One row per product, one column per field. Rebuilt from `data_collected`
# rather than transposing `df` in place, so re-running this cell cannot flip
# the frame back to the wrong orientation.
df = pd.DataFrame(data_collected).T

In [13]:
# Display the scraped table (pandas truncates the middle rows of a long frame)
df

Unnamed: 0,sku,title,price,in_stock,link,description,category,tags,image_url,image_path
9718985,8428729860265,"GENJIRO GOSANKE 7""-9""",95.00,True,https://gckoi.com/products/genjiro-gosanke-7-9,"Breeder: Genjiro Kohaku, Sanke and ShowaSize: ...",,"[genjiro, kohaku, koi, sanke, showa, tosai]",https://cdn.shopify.com/s/files/1/0565/8830/45...,gckoi_data/9718985_GENJIRO_GOSANKE_7-9.jpg
d9a7dbc,7856409346217,HIGH QUALITY SAKAI BLOODLINE TANCHO KOHAKU 24”...,3500.00,True,https://gckoi.com/products/high-quality-sakai-...,Breeder: MarukyuSize: 3sai 24”Sex: female,,"[kohaku, koi, Marukyu, nisai]",https://cdn.shopify.com/s/files/1/0565/8830/45...,gckoi_data/d9a7dbc_HIGH_QUALITY_SAKAI_BLOODLIN...
00d04b3,8291399860393,"HIROI DOITSU SANKE-SHOWA-TANCHO OCHIBA 14""-16""",600.00,False,https://gckoi.com/products/hiroi-doitsu-sanke-...,*SHIP END OF NOVEMBER*Breeder: HiroiSex: 4 fem...,Koi,"[fall 2024, HIROI, koi, nisai, ochiba, sanke, ...",https://cdn.shopify.com/s/files/1/0565/8830/45...,gckoi_data/00d04b3_HIROI_DOITSU_SANKE-SHOWA-TA...
564bc0a,8305083449513,IKARASHI MIX BOWL #1,600.00,False,https://gckoi.com/products/ikarashi-mix-bowl-1,*SHIP END OF NOVEMBER*Breeder: IkarashiSex: 5 ...,,"[Beni Kikokuryu, fall 2024, ikarashi, koi, KUJ...",https://cdn.shopify.com/s/files/1/0565/8830/45...,gckoi_data/564bc0a_IKARASHI_MIX_BOWL_#1.png
88ec869,7993174229161,ISA KOHAKU GUARANTEE FEMALE WITH CERTIFICATE 2...,2500.00,True,https://gckoi.com/products/isa-showa-grow-out-...,"Breeder: Isa with certificateSize: 21.5""Sex: f...",,"[isa, kohaku, koi, nisai]",https://cdn.shopify.com/s/files/1/0565/8830/45...,gckoi_data/88ec869_ISA_KOHAKU_GUARANTEE_FEMALE...
...,...,...,...,...,...,...,...,...,...,...
2fc5440,8479347048617,"TORAZO KOHAKU JUMBO TOSAI 9""-11"" WITH CERTIFIC...",400.00,False,https://gckoi.com/products/torazo-kohaku-jumbo...,Breeder: Torazo with certificate Sex: uncheckS...,,"[kohaku, koi, torazo, tosai]",https://cdn.shopify.com/s/files/1/0565/8830/45...,gckoi_data/2fc5440_TORAZO_KOHAKU_JUMBO_TOSAI_9...
7b86441,8479347081385,"TORAZO KOHAKU JUMBO TOSAI 9""-11"" WITH CERTIFIC...",400.00,False,https://gckoi.com/products/torazo-kohaku-jumbo...,Breeder: Torazo with certificate Sex: uncheckS...,,"[kohaku, koi, torazo, tosai]",https://cdn.shopify.com/s/files/1/0565/8830/45...,gckoi_data/7b86441_TORAZO_KOHAKU_JUMBO_TOSAI_9...
2e290d0,7864493572265,YAMAMATSU KOHAKU 17”-21” BOWL #1,450.00,False,https://gckoi.com/products/yamamatsu-sanke-17-...,"Breeder: Yamamatsu Size: 2sai 17”-21""Sex: uncheck",,"[fall 2024, kohaku, koi, nisai, Yamamatsu]",https://cdn.shopify.com/s/files/1/0565/8830/45...,gckoi_data/2e290d0_YAMAMATSU_KOHAKU_17”-21”_BO...
cd8689c,8307867680937,YAMAMATSU KOHAKU 18”-19” BOWL #2,450.00,False,https://gckoi.com/products/yamamatsu-kohaku-18...,"Breeder: Yamamatsu Size: 2sai 18”-19""Sex: uncheck",,"[fall 2024, kohaku, koi, nisai, Yamamatsu]",https://cdn.shopify.com/s/files/1/0565/8830/45...,gckoi_data/cd8689c_YAMAMATSU_KOHAKU_18”-19”_BO...


In [15]:
# Number of products whose first variant is flagged available (True values sum as 1)
df['in_stock'].sum()

159

In [16]:
# Number of rows whose image_url repeats that of an earlier row (0 means every image URL is unique)
df.duplicated(subset='image_url').sum()

np.int64(0)

In [17]:
# Summarise the index, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 408 entries, 9718985 to c4831bd
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          408 non-null    object
 1   title        408 non-null    object
 2   price        408 non-null    object
 3   in_stock     408 non-null    object
 4   link         408 non-null    object
 5   description  408 non-null    object
 6   category     408 non-null    object
 7   tags         408 non-null    object
 8   image_url    408 non-null    object
 9   image_path   408 non-null    object
dtypes: object(10)
memory usage: 35.1+ KB


In [18]:
# df.to_csv('gckoi_data.csv', index=True)