In [3]:
"""
Notebook to explore reconstructing store inventory from myshopify.com/products.json

References:
https://practicaldatascience.co.uk/data-science/how-to-scrape-a-shopify-site-in-python-via-products-json
"""
import json
import pandas as pd
import requests

In [45]:
def to_df(products_json):
    """
    Convert products.json to a pandas DataFrame.
    Args:
        products_json (str): Raw products.json text from the store.
    Returns:
        df: Pandas DataFrame built from the top-level 'products' entry.
            When the input cannot be parsed, the error is printed and an
            empty DataFrame is returned, so callers can always take len()
            of the result.
    """

    try:
        products_dict = json.loads(products_json)
        return pd.DataFrame.from_dict(products_dict['products'])
    except (TypeError, ValueError, KeyError) as e:
        # TypeError: input is None / not text (e.g. the download failed).
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        # KeyError: valid JSON that has no 'products' entry.
        print(e)
        return pd.DataFrame()

In [58]:
def get_json(url, page, limit=250, timeout=5):
    """
    Get Shopify products.json from a store URL.
    Args:
        url (str): URL of the store. A trailing slash is tolerated.
        page (int): Page number of the products.json.
        limit (int): Number of products requested per page. Defaults to 250.
        timeout (float): Seconds passed to requests as the timeout. Defaults to 5.
    Returns:
        products_json: Response body as text, or None when the request
            fails (the error is printed instead of being raised).
    """

    # Strip a trailing slash so 'https://shop/' does not become '//products.json'.
    endpoint = f"{url.rstrip('/')}/products.json?limit={limit}&page={page}"

    try:
        response = requests.get(endpoint, timeout=timeout)
        # Check the status before touching the body; 4xx/5xx raise HTTPError.
        response.raise_for_status()
        return response.text

    except requests.exceptions.HTTPError as error_http:
        print("HTTP Error:", error_http)

    except requests.exceptions.ConnectionError as error_connection:
        print("Connection Error:", error_connection)

    except requests.exceptions.Timeout as error_timeout:
        print("Timeout Error:", error_timeout)

    except requests.exceptions.RequestException as error:
        print("Error: ", error)

    # Every handled failure falls through to here.
    return None

In [67]:
url = 'https://hfxgames.com'

result_df = pd.DataFrame()

# Reuse `url` so the store only has to be changed in one place.
products_json_1 = get_json(url, 1)

# get_json prints the error and yields None when the request fails; skip the
# rest of the cell in that case instead of crashing inside json.loads(None).
if products_json_1 is not None:
    # print pretty json
    print(json.dumps(json.loads(products_json_1), indent=4, sort_keys=True))

    df = to_df(products_json_1)
    print(df['product_type'].value_counts())


{
    "products": [
        {
            "body_html": "<p data-mce-fragment=\"1\"><strong data-mce-fragment=\"1\">Number of Players:</strong><span data-mce-fragment=\"1\">\u00a0</span>2+</p>\n<p data-mce-fragment=\"1\"><strong data-mce-fragment=\"1\">Playing Time:</strong><span data-mce-fragment=\"1\">\u00a0</span>20 Minutes</p>\n<p data-mce-fragment=\"1\"><strong data-mce-fragment=\"1\">Recommended Ages:</strong><span data-mce-fragment=\"1\">\u00a0</span>8+</p>\n<p data-mce-fragment=\"1\"><strong data-mce-fragment=\"1\"><br data-mce-fragment=\"1\">In This Game, Your Useless Knowledge Will Win You Useful Points</strong></p>\n<p data-mce-fragment=\"1\">Think fast, not hard in this merciless word-shouting board game. Start by drawing a category card. Your team will then shout words from that category that begin with the letter on the race track. Move along the track every time you get a word right and race against your opponents to win. This is a simple game for versatile occasions. Pla

In [84]:
def get_products(url, start_page=8, max_pages=6):
    """
    Get products from a store by walking consecutive products.json pages.
    Args:
        url (str): URL of the store. A trailing slash is tolerated.
        start_page (int): First products.json page to request. Defaults to 8.
        max_pages (int | None): Maximum number of pages to collect; None
            keeps going until a page comes back empty. Defaults to 6, so a
            plain get_products(url) call covers pages 8 through 13.
    Returns:
        df: Pandas DataFrame of the collected products with an added 'url'
            column ("<store>/products/<handle>"). An empty DataFrame is
            returned when the first requested page yields nothing.
    """

    base_url = url.rstrip('/')
    frames = []
    page = start_page

    while max_pages is None or len(frames) < max_pages:
        page_df = to_df(get_json(base_url, page))

        # A page with no rows (or no DataFrame at all) ends the scrape.
        if page_df is None or len(page_df) == 0:
            break

        frames.append(page_df)
        # Report the page that was just fetched, before moving on to the next.
        print(f"Page {page} scraped...")
        page += 1

    if not frames:
        # Nothing scraped: there is no 'handle' column to build URLs from.
        return pd.DataFrame()

    # Concatenate once instead of re-copying the growing frame on every page.
    df = pd.concat(frames, ignore_index=True)
    df['url'] = f"{base_url}/products/" + df['handle']
    return df

In [85]:
# Scrape the store and summarise what came back.
products_df = get_products('https://exorgames.com')
print(products_df['product_type'].value_counts())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line.
products_df.info()

# print the first image URL for up to ten products; products without any
# images are skipped instead of raising IndexError
for images in products_df['images'].head(10):
    if isinstance(images, list) and images:
        print(images[0]['src'])

# keep only the rows whose product_type is MTG Single
mtg_singles_df = products_df[products_df['product_type'] == 'MTG Single']
# convert to a list of per-product dicts (one record per row); the name
# result_df is kept because the scraper cell iterates over it
result_df = mtg_singles_df.to_dict('records')



Page 9 scraped...
Page 10 scraped...
Page 11 scraped...
Page 12 scraped...
Page 13 scraped...
Page 14 scraped...
product_type
MTG Single                628
One Piece Single          259
Flesh And Blood Single    236
Yugioh Single             147
Miniatures                127
Event Ticket               49
Supplies                   42
Pokemon Single              8
Board Games                 3
Model                       1
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1500 non-null   int64 
 1   title         1500 non-null   object
 2   handle        1500 non-null   object
 3   body_html     1500 non-null   object
 4   published_at  1500 non-null   object
 5   created_at    1500 non-null   object
 6   updated_at    1500 non-null   object
 7   vendor        1500 non-null   object
 8   product_type  15

In [86]:
# only a few small adjustments needed to scraper for it to pull from myshopify.com/products.json directly.

# First word of a variant title -> condition code. Words that are not listed
# here are passed through unchanged.
CONDITION_CODES = {
    "Lightly": "LP",
    "Near": "NM",
    "Moderately": "MP",
    "Heavily": "HP",
    "Damaged": "DMG",
}

scraper_results = []

for card in result_df:
    title_and_set = card['title']
    if 'Art Card' in title_and_set:
        continue

    # split the title and set: the set name is the text between the first
    # "[" and the following "]"
    title_part, *bracketed = title_and_set.split("[")
    # A title without "[...]" has no set name; use "" instead of raising IndexError.
    set_name = bracketed[0].split("]")[0].strip() if bracketed else ""

    # remove any excess tags inside () from the card name
    title = title_part.split("(")[0].strip()

    # image adjusted; products without images get "" instead of raising IndexError
    images = card['images']
    image = images[0]['src'] if images else ""
    link = card['url']

    for variant in card['variants']:
        # only variants flagged as available are listed
        if not variant['available']:
            continue

        variant_title = variant['title']
        first_word = variant_title.split(" ")[0].strip()

        scraper_results.append({
            'name': title,
            'link': link,
            'image': image,
            'set': set_name,
            'condition': CONDITION_CODES.get(first_word, first_word),
            'foil': "Foil" in variant_title,
            'price': variant['price'],
            'website': ""
        })

print(scraper_results)

[{'name': 'Oaken Brawler', 'link': 'https://exorgames.com/products/oaken-brawler-the-list', 'image': 'https://cdn.shopify.com/s/files/1/0467/3083/8169/products/3bcb0f29-5eee-5cd7-b7a1-0c23719b37e3.jpg?v=1689008128', 'set': 'The List', 'condition': 'NM', 'foil': False, 'price': '0.25', 'website': ''}, {'name': 'Rakdos Riteknife', 'link': 'https://exorgames.com/products/rakdos-riteknife-the-list', 'image': 'https://cdn.shopify.com/s/files/1/0467/3083/8169/products/1841c365-d76c-5cef-90ce-10dc403afa5e.jpg?v=1689008095', 'set': 'The List', 'condition': 'LP', 'foil': False, 'price': '0.90', 'website': ''}, {'name': 'Lobelia Sackville-Baggins', 'link': 'https://exorgames.com/products/lobelia-sackville-baggins-399-the-lord-of-the-rings-tales-of-middle-earth-prerelease-promos', 'image': 'https://cdn.shopify.com/s/files/1/0467/3083/8169/products/501093.jpg?v=1687376649', 'set': 'The Lord of the Rings: Tales of Middle-Earth Prerelease Promos', 'condition': 'NM', 'foil': True, 'price': '2.45', 'w