# Obtaining Data from Grandkoi

In [1]:
# Imports

from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.parse import urljoin
import os
import uuid
import time


In [2]:
# Define the website

BASE_URL = 'https://www.grandkoi.com/shop/'
START_URL = 'https://www.grandkoi.com/shop/page/{}'
website = requests.get(BASE_URL)

# Define the output directory

OUTPUT_DIR = 'grandkoi_data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Check status

website.status_code

200

In [4]:
# Get the object

soup = BeautifulSoup(website.text, "html.parser")

In [5]:
product = soup.select("li.entry.product")

In [6]:
data_collected = {}

def scrape_page(page_num):
    url  = START_URL.format(page_num)
    resp = requests.get(url)
    if resp.status_code != 200:
        return False    # no such page
    soup = BeautifulSoup(resp.text, "html.parser")
    products = soup.select("li.entry.product")
    if not products:
        return False    # no products → end of pagination
    for prod in products:
        # 1. product page link
        a      = prod.select_one("a.woocommerce-LoopProduct-link")
        link   = a["href"]

        # 2. image URL (lazy or normal)
        img_tag = prod.select_one("img")
        raw_src = img_tag.get("data-lazy-src") or img_tag.get("src")
        img_url = urljoin(BASE_URL, raw_src)

        # 3. title
        title = prod.select_one("h2.woocommerce-loop-product__title") \
                    .get_text(strip=True)

        # 4. price
        price_tag = prod.select_one("span.price")
        price     = price_tag.get_text(strip=True) if price_tag else "N/A"

        # 5. SKU
        sku_text = prod.select_one("div.sku-no").get_text(strip=True)
        sku      = sku_text.replace("SKU:", "").strip()

        # 6. details
        detail_html = prod.select_one("div.excerpt").decode_contents()
        details     = [line.strip() for line in detail_html.split("<br/>")]


        # 7. save the image
        #    a) pick extension (fallback to .jpg)
        ext = os.path.splitext(raw_src)[1] or ".jpg"

        #    b) sanitize title for filename
        safe_title = "".join(c if c.isalnum() or c in "-_ " else "_" for c in title)
        # generate a short 6‑hex‑digit ID
        unit_id = uuid.uuid4().hex[:6]  # e.g. '9f1c2a'
        filename   = f"{unit_id}_{safe_title}{ext}"
        filepath   = os.path.join(OUTPUT_DIR, filename)

        #    c) download in streaming mode
        resp = requests.get(img_url, stream=True)
        resp.raise_for_status()
        with open(filepath, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                if not chunk:
                    continue
                f.write(chunk)

        # 8. output or store your data
        data_collected[unit_id] = {
            "link":     link,
            "title":    title,
            "price":    price,
            "sku":      sku,
            "details":  details,
            "image":    filepath,      # path where you saved it
            "image_url": img_url,      # original URL
        }
        pass
    return True

In [8]:
# page = 1
# while True:
#     print(f"Scraping page {page}…")
#     success = scrape_page(page)
#     if not success:
#         print("No more pages. Done.")
#         break
#     page += 1
#     time.sleep(1)  # be polite

In [9]:
df = pd.DataFrame(data_collected)

In [10]:
df = df.T

In [11]:
df

Unnamed: 0,link,title,price,sku,details,image,image_url
d7c302,https://www.grandkoi.com/product/high-quality-...,High Quality showa- SOLD,,8098,[Breeder: Sakai<br>Inches: 30</br>],grandkoi_data/d7c302_High Quality showa- SOLD....,https://www.grandkoi.com/wp-content/uploads/20...
12a49e,https://www.grandkoi.com/product/gin-rin-showa...,High Quality Gin Rin Showa- SOLD,"$34,999.00",8097,[Breeder: Sakai<br>Inches: 32</br>],grandkoi_data/12a49e_High Quality Gin Rin Show...,https://www.grandkoi.com/wp-content/uploads/20...
9ac3a8,https://www.grandkoi.com/product/high-quality-...,High Quality Gin Rin showa- SOLD,"$34,999.00",8096,[Breeder: Sakai<br>Inches: 32</br>],grandkoi_data/9ac3a8_High Quality Gin Rin show...,https://www.grandkoi.com/wp-content/uploads/20...
adf84d,https://www.grandkoi.com/product/karashigoi-8092/,Karashigoi,"$32,999.00",8092,"[Breeder: Dainichi, Inches: 36]",grandkoi_data/adf84d_Karashigoi.jpeg,https://www.grandkoi.com/wp-content/uploads/20...
a0b886,https://www.grandkoi.com/product/doitsu-kogane...,Doitsu Kogane Ochiba- SOLD,"$24,999.00",8091,"[Breeder: Marusei, Inches: 31]",grandkoi_data/a0b886_Doitsu Kogane Ochiba- SOL...,https://www.grandkoi.com/wp-content/uploads/20...
...,...,...,...,...,...,...,...
6d337b,https://www.grandkoi.com/product/gin-rin-goshi...,Gin Rin Goshiki- High Quality,"$4,449.00",6049,"[Breeder: Hiroi, Inches: 22.4]",grandkoi_data/6d337b_Gin Rin Goshiki- High Qua...,https://www.grandkoi.com/wp-content/uploads/20...
7f1f60,https://www.grandkoi.com/product/goshiki-47/,Goshiki High Quality,"$4,500.00",6038,"[Breeder: Hiroi, Inches: 25]",grandkoi_data/7f1f60_Goshiki High Quality.png,https://www.grandkoi.com/wp-content/uploads/20...
9d9f8e,https://www.grandkoi.com/product/gin-rin-sanke-8/,Gin Rin Sanke High Quality,"$4,500.00",6029,"[Breeder: Hiroi, Inches: 23]",grandkoi_data/9d9f8e_Gin Rin Sanke High Qualit...,https://www.grandkoi.com/wp-content/uploads/20...
c962ee,https://www.grandkoi.com/product/gin-rin-sanke-7/,Gin Rin Sanke High Quality,"$4,500.00",6028,"[Breeder: Hiroi, Inches: 26]",grandkoi_data/c962ee_Gin Rin Sanke High Qualit...,https://www.grandkoi.com/wp-content/uploads/20...


In [12]:
df.duplicated(subset='image_url').sum()

2

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 367 entries, d7c302 to 1f8f92
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   link       367 non-null    object
 1   title      367 non-null    object
 2   price      367 non-null    object
 3   sku        367 non-null    object
 4   details    367 non-null    object
 5   image      367 non-null    object
 6   image_url  367 non-null    object
dtypes: object(7)
memory usage: 22.9+ KB


In [15]:
df['details']

d7c302    [Breeder: Sakai<br>Inches: 30</br>]
12a49e    [Breeder: Sakai<br>Inches: 32</br>]
9ac3a8    [Breeder: Sakai<br>Inches: 32</br>]
adf84d        [Breeder: Dainichi, Inches: 36]
a0b886         [Breeder: Marusei, Inches: 31]
                         ...                 
6d337b         [Breeder: Hiroi, Inches: 22.4]
7f1f60           [Breeder: Hiroi, Inches: 25]
9d9f8e           [Breeder: Hiroi, Inches: 23]
c962ee           [Breeder: Hiroi, Inches: 26]
1f8f92           [Breeder: Hiroi, Inches: 23]
Name: details, Length: 367, dtype: object

In [16]:
df.to_csv('grandkoi_data/collected_data.csv', index=True)