In [6]:
import sys
import json


# Load the JSON-LD document; later cells read it through `data`.
with open("../data/dog_breeds.jsonld", "r", encoding="utf-8") as f:
    data = json.load(f)

print("Type:", type(data))

# Show one representative item. Empty containers are reported instead of
# raising IndexError from a blind [0] lookup.
if isinstance(data, list):
    if data:
        print("First item:\n", data[0])
    else:
        print("Top-level list is empty.")
elif isinstance(data, dict):
    # some datasets are JSON-LD with '@graph'
    if "@graph" in data:
        graph = data["@graph"]
        if graph:
            print("First item in @graph:\n", graph[0])
        else:
            print("'@graph' is present but empty.")
    elif data:
        # dicts keep insertion order, so this is the first key in the file
        first_key = next(iter(data))
        print(f"First key: {first_key}\nValue:\n", data[first_key])
    else:
        print("Top-level dict is empty.")


Type: <class 'dict'>
First key: @context
Value:
 https://schema.org/


In [8]:

# The records would sit under '@graph' if this file used that JSON-LD layout.
if "@graph" not in data:
    # Nothing to preview here; list the top-level keys to see where to look next.
    print("No '@graph' key found. Top-level keys:", data.keys())
else:
    breeds = data["@graph"]
    print(f"Number of breeds: {len(breeds)}\n")

    # Pretty-print the first record; ensure_ascii=False leaves non-ASCII text unescaped.
    print("First breed entry:\n")
    print(json.dumps(breeds[0], indent=2, ensure_ascii=False))

    # Name only, as a quick sanity check.
    print("\nBreed name:", breeds[0].get("name"))


No '@graph' key found. Top-level keys: dict_keys(['@context', '@type', '@id', 'name', 'description', 'url', 'inLanguage', 'isAccessibleForFree', 'dateCreated', 'dateModified', 'version', 'keywords', 'license', 'temporalCoverage', 'spatialCoverage', 'creator', 'publisher', 'publishingPrinciples', 'distribution', 'mainEntity', 'additionalProperty'])


In [9]:
import pprint

# Report the container type of "mainEntity" before dumping a preview of it.
print("Type of mainEntity:", type(data.get("mainEntity")))
print("Preview of mainEntity:\n")

# depth=2 collapses deeper nesting to "..."; width=120 allows longer lines.
pprint.PrettyPrinter(depth=2, width=120).pprint(data.get("mainEntity"))


Type of mainEntity: <class 'list'>
Preview of mainEntity:

[{'@context': 'https://schema.org/',
  '@id': 'https://pawsomeauthority.com/dog-breeds/akita/profile/',
  '@type': 'Thing',
  'additionalProperty': [...],
  'additionalType': 'https://schema.org/Animal',
  'name': 'Akita',
  'url': 'https://pawsomeauthority.com/dog-breeds/akita/profile/'},
 {'@context': 'https://schema.org/',
  '@id': 'https://pawsomeauthority.com/dog-breeds/american-cocker-spaniel/profile/',
  '@type': 'Thing',
  'additionalProperty': [...],
  'additionalType': 'https://schema.org/Animal',
  'name': 'American Cocker Spaniel',
  'url': 'https://pawsomeauthority.com/dog-breeds/american-cocker-spaniel/profile/'},
 {'@context': 'https://schema.org/',
  '@id': 'https://pawsomeauthority.com/dog-breeds/australian-cattle-dog/profile/',
  '@type': 'Thing',
  'additionalProperty': [...],
  'additionalType': 'https://schema.org/Animal',
  'name': 'Australian Cattle Dog',
  'url': 'https://pawsomeauthority.com/dog-breeds/

In [10]:
import json
import re

# Breed labels to look up in the dataset. The labels mix underscores and a
# hyphen as word separators; the lookup in this cell compares normalized
# names, so case and separators do not matter.
breeds_list = [
    "Labrador_Retriever",
    "German_Shepherd",
    "Golden_Retriever",
    "Boxer",
    "Beagle",
    "Pomeranian",
    "Siberian_Husky",
    "Doberman",
    "Shih-Tzu",
    "Yorkshire_Terrier",
]

# Normalize: lowercase, remove spaces, underscores, hyphens
def normalize(name):
    """Return ``name`` lowercased with hyphens, underscores and whitespace removed.

    Two labels that differ only in case or in those separators
    (e.g. "Shih-Tzu" and "shih tzu") normalize to the same string.
    """
    lowered = name.lower()
    return re.sub(r"[-_\s]", "", lowered)

# Map each normalized target back to the label as written in breeds_list.
target = {normalize(b): b for b in breeds_list}

# Extract all breed names from JSON
json_breeds = [entry["name"] for entry in data["mainEntity"] if "name" in entry]

# Normalize the dataset names once. Set membership replaces re-normalizing
# every dataset name for every target; non-string names are skipped because
# normalize() needs a str.
json_breeds_norm = {normalize(name) for name in json_breeds if isinstance(name, str)}

found = []
missing = []
for breed_norm, orig in target.items():
    if breed_norm in json_breeds_norm:
        found.append(orig)
    else:
        missing.append(orig)

print("✅ Found in JSON:", found)
print("❌ Missing in JSON:", missing)


✅ Found in JSON: ['Labrador_Retriever', 'German_Shepherd', 'Golden_Retriever', 'Boxer', 'Beagle', 'Pomeranian', 'Siberian_Husky', 'Shih-Tzu', 'Yorkshire_Terrier']
❌ Missing in JSON: ['Doberman']


In [11]:
import json
import re
from difflib import get_close_matches


# Breed records are read from the top-level "mainEntity" key; the [] default
# means the loops below simply find nothing when that key is absent.
entries = data.get("mainEntity", [])

# ---- Helpers ----
def normalize_basic(s: str) -> str:
    """Lowercase ``s`` and keep only the letters a-z.

    Everything else is dropped: spaces, underscores, hyphens, punctuation,
    digits and any non-ASCII characters.
    """
    lowered = s.lower()
    return re.sub(r"[^a-z]", "", lowered)

def normalize_doberman(s: str) -> str:
    """Normalize ``s`` so common Doberman spellings collapse to "doberman".

    Steps, in order: lowercase; rewrite "dobermann" to "doberman"; remove the
    substring "pinscher"; finally drop every character that is not a-z.
    """
    text = s.lower()
    # Substitutions run before the letter-only filter, in this fixed order.
    for old, new in (("dobermann", "doberman"), ("pinscher", "")):
        text = text.replace(old, new)
    return re.sub(r"[^a-z]", "", text)

# Target variants we want to catch, written in normalized (lowercase, letters-only) form.
# NOTE(review): every variant contains "doberman", so a plain "doberman"
# substring test already matches anything these would match.
targets = {
    "doberman",
    "dobermanpinscher",   # matches the normalize_basic form; normalize_doberman normally removes "pinscher"
    "dobermann",
}

# ---- Exact-ish search with smart normalization ----
candidates = []
for record in entries:
    breed_name = record.get("name", "")
    link = record.get("url") or record.get("@id") or ""
    # Test both normalized spellings of the name against the literal
    # "doberman" and against every variant listed in `targets`.
    forms = (normalize_basic(breed_name), normalize_doberman(breed_name))
    if any(
        "doberman" in form or any(t in form for t in targets)
        for form in forms
    ):
        candidates.append((breed_name, link))

# ---- If nothing found with the above, do fuzzy search as fallback ----
if not candidates:
    # Group entries by normalized name so each fuzzy hit maps back to the
    # entries that actually produced it. Looking the hit up by its position in
    # a name-filtered list and then indexing the unfiltered `entries` selects
    # the wrong entry as soon as one entry has no "name", and list.index()
    # resolves duplicate names to the first occurrence only.
    entries_by_norm = {}
    for e in entries:
        name = e.get("name")
        if isinstance(name, str):
            entries_by_norm.setdefault(normalize_doberman(name), []).append(e)

    # fuzzy on normalized strings: at most 5 distinct names scoring >= 0.6
    close = get_close_matches("doberman", list(entries_by_norm), n=5, cutoff=0.6)
    for norm in close:
        for e in entries_by_norm[norm]:
            candidates.append((e.get("name", ""), e.get("url") or e.get("@id") or ""))

# ---- Also scan additionalProperty for synonyms/aliases, just in case ----
if not candidates:
    for e in entries:
        props = e.get("additionalProperty") or []
        if isinstance(props, dict):
            # a lone property object instead of a list of them
            props = [props]
        for p in props:
            if not isinstance(p, dict):
                continue
            val = p.get("value") or ""
            # Property values are not always plain strings: some are lists
            # (e.g. ['Large']) and some are nested objects. Check every string
            # that is directly visible and ignore the rest.
            if isinstance(val, str):
                texts = [val]
            elif isinstance(val, list):
                texts = [v for v in val if isinstance(v, str)]
            else:
                texts = []
            if any(
                "doberman" in normalize_basic(t) or "doberman" in normalize_doberman(t)
                for t in texts
            ):
                candidates.append((e.get("name", ""), e.get("url") or e.get("@id") or ""))
                # one hit is enough; do not add the same entry twice
                break

# ---- Print results ----
if not candidates:
    # Every search strategy above came back empty.
    print("No Doberman variants found. Consider opening the JSON and searching for 'dob' / 'pinscher'.")
else:
    print("Possible Doberman entries found:")
    # One line per (name, url) pair collected above.
    for name, url in candidates:
        print(f" - {name}  ({url})")


Possible Doberman entries found:
 - Doberman Pinscher  (https://pawsomeauthority.com/dog-breeds/doberman-pinscher/profile/)


In [12]:
import json
import pprint

# Breed records from the top-level "mainEntity" key ([] when the key is absent).
entries = data.get("mainEntity", [])

# Normalize names to match easily
def normalize(s):
    """Lowercase ``s``, turn hyphens and underscores into spaces, trim the ends.

    NOTE(review): this rebinds the name ``normalize`` defined in an earlier
    cell, which removes separators instead of replacing them with spaces.
    """
    text = s.lower()
    for separator in ("-", "_"):
        text = text.replace(separator, " ")
    return text.strip()

# Find Boxer entry
target_name = "boxer"
# First entry whose normalized name equals the target, or None when there is none.
boxer_entry = next(
    (entry for entry in entries if normalize(entry.get("name", "")) == target_name),
    None,
)

if not boxer_entry:
    print("Boxer not found in the dataset.")
else:
    print("✅ Boxer entry found. Keys available:")
    pprint.pprint(boxer_entry.keys())

    # depth=2 keeps the nested property objects collapsed in this preview.
    print("\n--- Full entry preview ---")
    pprint.pprint(boxer_entry, depth=2, width=120)

    # Optionally print 'additionalProperty' for extra attributes
    if "additionalProperty" in boxer_entry:
        print("\n--- Additional properties ---")
        for prop in boxer_entry["additionalProperty"]:
            pprint.pprint(prop)


✅ Boxer entry found. Keys available:
dict_keys(['@context', '@type', '@id', 'name', 'url', 'additionalType', 'additionalProperty'])

--- Full entry preview ---
{'@context': 'https://schema.org/',
 '@id': 'https://pawsomeauthority.com/dog-breeds/boxer/profile/',
 '@type': 'Thing',
 'additionalProperty': [{...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
                        {...},
      

In [13]:
# Print each additional property of the Boxer entry on its own line, numbered from 1.
for i, prop in enumerate(boxer_entry["additionalProperty"], start=1):
    print(f"{i}: {prop}")


1: {'@type': 'PropertyValue', 'name': 'breed', 'value': 'Boxer'}
2: {'@type': 'PropertyValue', 'name': 'pronunciation', 'value': 'Bok-ser'}
3: {'@type': 'PropertyValue', 'name': 'alternateName', 'value': 'Deutscher Boxer, German Boxer'}
4: {'@type': 'PropertyValue', 'name': 'nickname', 'value': '-'}
5: {'@type': 'PropertyValue', 'name': 'countryOfOrigin', 'value': 'Germany'}
6: {'@type': 'PropertyValue', 'name': 'category', 'value': 'Working'}
7: {'@type': 'PropertyValue', 'name': 'size', 'value': ['Large']}
8: {'@type': 'PropertyValue', 'name': 'heightMaleInches', 'value': {'@type': 'QuantitativeValue', 'minValue': 23, 'maxValue': 25, 'unitCode': 'INH'}}
9: {'@type': 'PropertyValue', 'name': 'heightMaleCentimeters', 'value': {'@type': 'QuantitativeValue', 'minValue': 58.5, 'maxValue': 63.5, 'unitCode': 'CMT'}}
10: {'@type': 'PropertyValue', 'name': 'heightFemaleInches', 'value': {'@type': 'QuantitativeValue', 'minValue': 21.5, 'maxValue': 23.5, 'unitCode': 'INH'}}
11: {'@type': 'Prope