# Obtaining Data from Sacramento Koi

In [1]:
# Imports

import os
import re
import time
import uuid
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
# Define the website

BASE_URL = 'https://sacramentokoi.com/koi/'
# Paginated listing URL, derived from BASE_URL so the two cannot drift apart;
# '{}' is filled in with the page number.
START_URL = BASE_URL + 'page/{}'
# Seconds to wait for the server; requests applies no timeout unless one is given,
# so without this a stalled connection would hang the notebook.
REQUEST_TIMEOUT = 30
website = requests.get(BASE_URL, timeout=REQUEST_TIMEOUT)

# Define the output directory

OUTPUT_DIR = 'sacramentokoi_data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Check the HTTP status code of the BASE_URL request (200 = OK)

website.status_code

200

In [4]:
# Parse the fetched listing HTML into a BeautifulSoup tree using the "html.parser" backend

soup = BeautifulSoup(website.text, "html.parser")

##### What the HTML looks like (double-click to see)
<!-- <div class="product-small box">
<div class="box-image">
<div class="image-fade_in_back">
<a aria-label='Ai Goromo 20" - 240389171' href="https://sacramentokoi.com/ai-goromo-20-240389171/">
<img alt="" class="lazy-load attachment-woocommerce_thumbnail size-woocommerce_thumbnail" data-src="https://sacramentokoi.com/wp-content/uploads/2024/12/240389171-2.jpg" decoding="async" height="437" src="data:image/svg+xml,%3Csvg%20viewBox%3D%220%200%20247%20437%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%3E%3C%2Fsvg%3E" width="247"/> </a>
</div>
<div class="image-tools is-small top right show-on-hover">
</div>
<div class="image-tools is-small hide-for-small bottom left show-on-hover">
</div>
<div class="image-tools grid-tools text-center hide-for-small bottom hover-slide-in show-on-hover">
</div>
<div class="out-of-stock-label">Out of stock</div> </div>
<div class="box-text box-text-products">
<div class="title-wrapper"> <p class="category uppercase is-smaller no-text-overflow product-cat op-7">
			Female		</p>
<p class="name product-title woocommerce-loop-product__title"><a class="woocommerce-LoopProduct-link woocommerce-loop-product__link" href="https://sacramentokoi.com/ai-goromo-20-240389171/">Ai Goromo 20″ – 240389171</a></p></div><div class="price-wrapper">
<span class="price"><span class="woocommerce-Price-amount amount"><bdi><span class="woocommerce-Price-currencySymbol">$</span>1,200.00</bdi></span></span>
</div> </div>
</div>
<input class="wpmProductId" data-id="51605" type="hidden"/>
<script>
			(window.wpmDataLayer = window.wpmDataLayer || {}).products             = window.wpmDataLayer.products || {};
			window.wpmDataLayer.products[51605] = {"id":"51605","sku":"240389171","price":1200,"brand":"","quantity":1,"dyn_r_ids":{"post_id":"51605","sku":"240389171","gpf":"woocommerce_gpf_51605","gla":"gla_51605"},"is_variable":false,"type":"simple","name":"Ai Goromo 20\" - 240389171","category":["Female","Fukasawa","Goromo","Koi","Koi 2024"],"is_variation":false};
					window.pmw_product_position = window.pmw_product_position || 1;
		window.wpmDataLayer.products[51605]['position'] = window.pmw_product_position++;
				</script>
</div>
</div><div class="product-small col has-hover out-of-stock product type-product post-51608 status-publish last outofstock product_cat-female-sex product_cat-fukasawa product_cat-goromo-koi-type product_cat-koi product_cat-koi-2024 has-post-thumbnail virtual taxable purchasable product-type-simple">
<div class="col-inner">
<div class="badge-container absolute left top z-1">
</div> -->

In [5]:
# Select every product card: <div> elements carrying both the "product-small" and "box" classes
product = soup.select("div.product-small.box")

In [6]:
# Display the selected product cards to inspect their markup
product

[<div class="product-small box">
 <div class="box-image">
 <div class="image-fade_in_back">
 <a aria-label='Ai Goromo 10" - 240054222' href="https://sacramentokoi.com/ai-goromo-10-240054222/">
 <img alt="" class="lazy-load attachment-woocommerce_thumbnail size-woocommerce_thumbnail" data-src="https://sacramentokoi.com/wp-content/uploads/2024/12/240054222-1-247x437.jpg" data-srcset="https://sacramentokoi.com/wp-content/uploads/2024/12/240054222-1-247x437.jpg 247w, https://sacramentokoi.com/wp-content/uploads/2024/12/240054222-1-226x400.jpg 226w, https://sacramentokoi.com/wp-content/uploads/2024/12/240054222-1-452x800.jpg 452w, https://sacramentokoi.com/wp-content/uploads/2024/12/240054222-1-868x1536.jpg 868w, https://sacramentokoi.com/wp-content/uploads/2024/12/240054222-1-510x903.jpg 510w, https://sacramentokoi.com/wp-content/uploads/2024/12/240054222-1.jpg 1130w" decoding="async" fetchpriority="high" height="437" sizes="(max-width: 247px) 100vw, 247px" src="data:image/svg+xml,%3Csvg%2

In [7]:
# Scraped products, keyed by a generated 6-hex-digit unit id.
data_collected = {}


def _new_unit_id(taken):
    """Return a 6-hex-digit id that is not already a key of ``taken``."""
    while True:
        unit_id = uuid.uuid4().hex[:6]  # e.g. '9f1c2a'
        # Six hex digits can collide; a repeat would silently overwrite a record.
        if unit_id not in taken:
            return unit_id


def _download_image(img_url, filepath, timeout):
    """Stream ``img_url`` into ``filepath``; raise on an HTTP error status."""
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(img_url, stream=True, timeout=timeout) as img_resp:
        img_resp.raise_for_status()
        with open(filepath, "wb") as f:
            for chunk in img_resp.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)


def scrape_page(page_num, timeout=30):
    """Scrape one listing page, download each product image and record the products.

    Every product card found on the page is stored in the module-level
    ``data_collected`` dict under a freshly generated 6-hex-digit id.

    Args:
        page_num: Page number substituted into ``START_URL``.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        True if the page was fetched and contained at least one product card.
        False if the response status was not 200 or no cards were found; the
        caller treats this as the end of pagination.

    Raises:
        requests.RequestException: on a network failure or timeout, or when an
            image download returns an error status.
    """
    url = START_URL.format(page_num)
    resp = requests.get(url, timeout=timeout)
    if resp.status_code != 200:
        # Presumably a 404 marks the end of pagination (TODO confirm); anything
        # else is reported so a truncated scrape does not go unnoticed.
        if resp.status_code != 404:
            print(f"Page {page_num}: unexpected HTTP status {resp.status_code}; stopping.")
        return False    # no such page

    soup = BeautifulSoup(resp.text, "html.parser")

    products = soup.select("div.product-small.box")
    if not products:
        return False    # no products → end of pagination

    for prod in products:
        # 1. product page link
        a = prod.select_one("p.name a")
        link = a.get("href") if a is not None else None
        if not link:
            # Without a title link there is no link or title to record.
            print(f"Page {page_num}: skipping a product card without a title link.")
            continue

        # 2. title
        title = a.get_text(strip=True)

        # 3. category
        cat_tag = prod.select_one("p.category")
        category = cat_tag.get_text(strip=True) if cat_tag else None

        # 4. image URL
        img_tag = prod.find("img", class_="lazy-load")
        raw_src = None
        if img_tag is not None:
            raw_src = img_tag.get("data-src") or img_tag.get("src")
        # Lazy-loaded images carry an inline "data:" placeholder in src; without
        # a real data-src there is nothing that can be downloaded.
        if raw_src and not raw_src.startswith("data:"):
            img_url = urljoin(BASE_URL, raw_src)
        else:
            img_url = None

        # 5. price
        price_tag = prod.select_one("div.price-wrapper span.price bdi")
        price = price_tag.get_text(strip=True) if price_tag else "N/A"

        # 6. out‑of‑stock?
        out_of_stock = bool(prod.select_one("div.out-of-stock-label"))

        # 7. SKU: the digits after the en dash that ends the title
        m = re.search(r"–\s*(\d+)$", title)
        sku = m.group(1) if m else None

        # 8. Save the image as "<unit_id>_<sanitised title><ext>"
        unit_id = _new_unit_id(data_collected)
        filepath = None
        if img_url is not None:
            # Drop any query string so it cannot leak into the file extension.
            ext = os.path.splitext(raw_src.split("?", 1)[0])[1] or ".jpg"

            safe_title = re.sub(r'[\\/*?:"<>|]', "", title)      # remove illegal chars
            safe_title = safe_title.replace(" ", "_")            # make spaces underscores

            filename = f"{unit_id}_{safe_title}{ext}"
            filepath = os.path.join(OUTPUT_DIR, filename)

            # download in streaming mode
            _download_image(img_url, filepath, timeout)

        # 9. store the record
        data_collected[unit_id] = {
            "link":     link,
            "title":    title,
            "category":  category,
            "out_of_stock":  out_of_stock,
            "price":    price,
            "sku":      sku,
            "image":    filepath,      # path where the image was saved (None if no image)
            "image_url": img_url,      # original URL (None if no image)
        }
    return True

In [8]:
# Start from an empty result set so that re-running this cell does not append
# a second copy of every product to data_collected.
data_collected.clear()

# Walk the listing pages in order until scrape_page reports there is nothing left.
page = 1
while True:
    print(f"Scraping page {page}…")
    success = scrape_page(page)
    if not success:
        print("No more pages. Done.")
        break
    page += 1
    time.sleep(1)  # be polite

Scraping page 1…
Scraping page 2…
Scraping page 3…
Scraping page 4…
Scraping page 5…
Scraping page 6…
Scraping page 7…
Scraping page 8…
Scraping page 9…
No more pages. Done.


In [9]:
# Build a frame from the scraped records; here each unit id is a column and each field a row
df = pd.DataFrame(data_collected)

In [10]:
# One row per product. Rebuild from data_collected instead of transposing df in place,
# so re-running this cell cannot flip the frame back to its untransposed orientation.
df = pd.DataFrame(data_collected).T

In [11]:
# Preview the scraped table
df

Unnamed: 0,link,title,category,out_of_stock,price,sku,image,image_url
cd3cf5,https://sacramentokoi.com/ai-goromo-10-240054222/,Ai Goromo 10″ – 240054222,Goromo,True,$150.00,240054222,sacramentokoi_data/cd3cf5_Ai_Goromo_10″_–_2400...,https://sacramentokoi.com/wp-content/uploads/2...
617c1b,https://sacramentokoi.com/ai-goromo-20-240389171/,Ai Goromo 20″ – 240389171,Female,True,"$1,200.00",240389171,sacramentokoi_data/617c1b_Ai_Goromo_20″_–_2403...,https://sacramentokoi.com/wp-content/uploads/2...
5027b5,https://sacramentokoi.com/ai-goromo-23-240396257/,Ai Goromo 23″ – 240396257,Female,True,"$1,200.00",240396257,sacramentokoi_data/5027b5_Ai_Goromo_23″_–_2403...,https://sacramentokoi.com/wp-content/uploads/2...
ebda82,https://sacramentokoi.com/ai-goromo-24-231616208/,Ai Goromo 27″ – 231616208,Female,True,"$4,200.00",231616208,sacramentokoi_data/ebda82_Ai_Goromo_27″_–_2316...,https://sacramentokoi.com/wp-content/uploads/2...
10edc1,https://sacramentokoi.com/aka-matsuba-11-24005...,Aka Matsuba 11″ – 240054184,Koi,True,$175.00,240054184,sacramentokoi_data/10edc1_Aka_Matsuba_11″_–_24...,https://sacramentokoi.com/wp-content/uploads/2...
...,...,...,...,...,...,...,...,...
e04f2c,https://sacramentokoi.com/yellow-dragon-17-240...,Yellow Dragon 17″ – 240154405,Aokiya,True,$600.00,240154405,sacramentokoi_data/e04f2c_Yellow_Dragon_17″_–_...,https://sacramentokoi.com/wp-content/uploads/2...
754a14,https://sacramentokoi.com/yellow-ginga-14-2400...,Yellow Ginga 14″ – 240054090,Ginga,True,$375.00,240054090,sacramentokoi_data/754a14_Yellow_Ginga_14″_–_2...,https://sacramentokoi.com/wp-content/uploads/2...
1cb46d,https://sacramentokoi.com/yellow-ginga-19-2404...,Yellow Ginga 19″ – 240489285,Ginga,True,"$1,600.00",240489285,sacramentokoi_data/1cb46d_Yellow_Ginga_19″_–_2...,https://sacramentokoi.com/wp-content/uploads/2...
ba392c,https://sacramentokoi.com/yellow-ginga-20-2402...,Yellow Ginga 20″ – 240290281,Female,True,"$2,300.00",240290281,sacramentokoi_data/ba392c_Yellow_Ginga_20″_–_2...,https://sacramentokoi.com/wp-content/uploads/2...


In [12]:
# Count rows whose image_url repeats an earlier row's (0 means every image URL is unique)
df.duplicated(subset='image_url').sum()

np.int64(0)

In [13]:
# Summarise the columns: dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 423 entries, cd3cf5 to 9a51ec
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   link          423 non-null    object
 1   title         423 non-null    object
 2   category      423 non-null    object
 3   out_of_stock  423 non-null    object
 4   price         423 non-null    object
 5   sku           422 non-null    object
 6   image         423 non-null    object
 7   image_url     423 non-null    object
dtypes: object(8)
memory usage: 29.7+ KB


In [14]:
# Persist the table to CSV; index=True also writes the frame's row index
df.to_csv('sacramentokoi_data.csv', index=True)