# Rijksmuseum API

In [1]:
import requests
import re

SEARCH_URL = "https://data.rijksmuseum.nl/search/collection"

def search_portraits(title=None, creator=None, t='painting'):
    """Search the Rijksmuseum collection and return the decoded JSON response.

    Args:
        title: Title to search for, or None to leave the title unconstrained.
        creator: Creator name to search for, or None to leave it unconstrained.
        t: Object type sent as the ``type`` query parameter.

    Returns:
        The parsed JSON body of the search response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Only objects that have an image are requested ("imageAvailable": "true").
    query = dict(creator=creator, title=title, imageAvailable="true", type=t)

    response = requests.get(SEARCH_URL, params=query)
    response.raise_for_status()
    return response.json()

In [142]:
# Getty AAT identifier of the Dutch language. Dutch entries are preferred
# because they carry more information than the English ones.
_NL_LANGUAGE = "http://vocab.getty.edu/aat/300388256"

# Classification that marks a top-level ``referred_to_by`` entry as a
# material statement.
_MATERIAL_CLASS = "http://vocab.getty.edu/aat/300435429"

# Measurement-unit identifiers mapped to the label used in the output string.
_UNIT_LABELS = {
    "http://vocab.getty.edu/aat/300379098": "cm",
    "http://vocab.getty.edu/aat/300379226": "kg",
}

# Dimension classifications mapped to their Dutch label.
_DIMENSION_LABELS = {
    "https://id.rijksmuseum.nl/22011": "hoogte",
    "https://id.rijksmuseum.nl/22012": "breedte",
    "https://id.rijksmuseum.nl/220217": "gewicht",
}


def _is_dutch(node):
    """Return True if *node* lists the Dutch language in its ``language`` field."""
    return any(lang.get("id") == _NL_LANGUAGE for lang in node.get("language", []))


def _iter_names(data):
    """Yield every non-empty ``Name`` node nested two levels below ``subject_of``."""
    for entry in data.get("subject_of", []):
        for part in entry.get("part", []):
            for sub in part.get("part", []):
                if sub.get("type") == "Name" and sub.get("content"):
                    yield sub


def _extract_title(data):
    """Return the first Dutch title, else the first title in any language, else None."""
    names = list(_iter_names(data))
    for name in names:
        if _is_dutch(name):
            return name["content"]
    # Fallback: take any title if no Dutch one was found.
    return names[0]["content"] if names else None


def _extract_artist(production):
    """Return the maker label found in the production parts, or None."""
    artist = None
    for part in production.get("part", []):
        labels = [
            ref for ref in part.get("referred_to_by", [])
            if ref.get("type") == "LinguisticObject"
        ]
        # A Dutch label always overrides what earlier parts produced, so when
        # several parts carry one, the last part wins.
        dutch = next((ref for ref in labels if _is_dutch(ref)), None)
        if dutch is not None:
            artist = dutch.get("content")
        # Fallback: first label of this part regardless of language.
        if artist is None and labels:
            artist = labels[0].get("content")
    return artist


def _extract_year(production):
    """Return the textual date of the production, or the begin year, or None."""
    timespan = production.get("timespan")
    if not isinstance(timespan, dict):
        return None

    # Prefer the human-readable date (e.g. "c. 1660") when it contains digits.
    names = timespan.get("identified_by")
    if isinstance(names, list):
        for ident in names:
            text = ident.get("content")
            if text and any(ch.isdigit() for ch in text):
                return text

    # Fallback: first four characters of the machine-readable start timestamp.
    begin = timespan.get("begin_of_the_begin")
    return begin[:4] if begin else None


def _extract_description(data):
    """Join the de-duplicated Dutch ``subject_of`` texts into one string."""
    texts = []
    for entry in data.get("subject_of", []):
        if not _is_dutch(entry):
            continue
        # Level 1: content directly on the entry.
        if "content" in entry:
            texts.append(entry["content"])
        # Levels 2 and 3: content on parts and on their sub-parts.
        for part in entry.get("part", []):
            if "content" in part:
                texts.append(part["content"])
            texts.extend(
                sub["content"] for sub in part.get("part", []) if "content" in sub
            )
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return " ".join(dict.fromkeys(texts))


def _extract_location(data):
    """Return ``(location, room)`` read from ``current_location``."""
    place = data.get("current_location")
    if not isinstance(place, dict):
        return None, None

    location = None
    room = None
    for item in place.get("identified_by", []):
        kind = item.get("type")
        # The identifier is reported as the room code.
        if kind == "Identifier" and "content" in item:
            room = item["content"]
        # The Dutch name is assembled from the content of its parts.
        if kind == "Name" and _is_dutch(item):
            location = " ".join(
                p.get("content") for p in item.get("part", []) if p.get("content")
            )
    return location, room


def _extract_dimensions(data):
    """Return the known dimensions as ``"<label> <value> <unit>"`` joined by `` x ``."""
    entries = []
    for item in data.get("dimension", []):
        if item.get("type") != "Dimension":
            continue

        value = item.get("value")
        unit = _UNIT_LABELS.get((item.get("unit") or {}).get("id"), "")

        # The last recognised classification decides the label.
        label = None
        for cls in item.get("classified_as", []):
            label = _DIMENSION_LABELS.get(cls.get("id")) or label

        # The last Dutch annotation, if any, is appended in parentheses.
        annotation = None
        for ref in item.get("referred_to_by", []):
            if _is_dutch(ref):
                annotation = ref.get("content")

        if label and value and unit:
            suffix = f" ({annotation})" if annotation else ""
            entries.append(f"{label} {value} {unit}{suffix}")
    return " x ".join(entries)


def _extract_materials(data):
    """Return the de-duplicated Dutch material statements, in first-seen order."""
    found = []
    for item in data.get("referred_to_by", []):
        if item.get("type") != "LinguisticObject":
            continue
        if not _is_dutch(item):
            continue
        if not any(c.get("id") == _MATERIAL_CLASS for c in item.get("classified_as", [])):
            continue
        content = item.get("content")
        if content:
            found.append(content)
    return list(dict.fromkeys(found))


def parse_artwork_details(data: dict) -> dict:
    """
    Extracts useful structured fields from Rijksmuseum Linked.Art objects

    Dutch-language values are preferred throughout. A ``produced_by`` or
    ``current_location`` value that is not a dict is treated as missing
    instead of raising ``AttributeError``.

    Args:
        data: The decoded JSON-LD record of one artwork.

    Returns:
        A dict with the keys ``title``, ``artist``, ``year``, ``description``,
        ``location``, ``room``, ``dimension``, ``material`` (a list) and
        ``source`` (the record's ``id``). Fields that cannot be found are
        ``None``, an empty string or an empty list.
    """
    production = data.get("produced_by")
    if not isinstance(production, dict):
        production = {}

    location, room = _extract_location(data)

    return {
        "title": _extract_title(data),
        "artist": _extract_artist(production),
        "year": _extract_year(production),
        "description": _extract_description(data),
        "location": location,
        "room": room,
        "dimension": _extract_dimensions(data),
        "material": _extract_materials(data),
        "source": data.get("id"),
    }

### Retrieve metadata for selected artwork

In [143]:
# what to search
creator = 'Johannes Vermeer'
title = 'Milkmaid'

data = search_portraits(title=title, creator=creator)

# Fail with a readable message when the search has no hits, instead of a bare
# IndexError on [0]. LookupError is the common base of IndexError and KeyError.
hits = data.get("orderedItems", [])
if not hits:
    raise LookupError(f"No Rijksmuseum artwork found for title={title!r}, creator={creator!r}")
artwork_id = hits[0]['id']

# The search hit's id is used as the URL of the full record. Check the HTTP
# status so an error response is not parsed as if it were an artwork.
response = requests.get(artwork_id, headers={"Accept": "application/ld+json"})
response.raise_for_status()
extracted_info = response.json()

extracted_data = parse_artwork_details(extracted_info)


In [144]:
extracted_data  # what we extract

{'title': 'Het melkmeisje',
 'artist': 'schilder: Johannes Vermeer',
 'year': 'c. 1660',
 'description': 'Het melkmeisje Johannes Vermeer (1632–1675), olieverf op doek, ca. 1660 Geheel verdiept in haar werk schenkt een dienstmeisje melk in. Behalve de witte melkstraal lijkt niets te bewegen. Die alledaagse handeling balde Vermeer samen tot een indrukwekkend schilderij – als een beeld staat de figuur vrij in de lichte ruimte. Vermeer had oog voor hoe het licht in honderden kleurige puntjes over de voorwerpen speelt.',
 'location': 'Hoofdgebouw Eregalerij',
 'room': 'HG-2.30.3',
 'dimension': 'breedte 41 cm x hoogte 45.5 cm',
 'material': ['olieverf op doek'],
 'source': 'https://id.rijksmuseum.nl/200108369'}

In [145]:
print(f'Title: {extracted_data["title"]}')

Title: Het melkmeisje


### Retrieve metadata of other artworks of the same artist

In [146]:
# Collect the parsed details of every other artwork returned for this creator.
rel_artworks = []
data_artist = search_portraits(creator=creator)

# No length guard is needed: a single hit that differs from the selected
# artwork is still a related artwork, and an empty result simply skips the loop.
for item in data_artist.get('orderedItems', []):
    rel_art_id = item['id']
    if rel_art_id == artwork_id:
        continue  # skip the artwork selected above

    # Check the HTTP status so an error response is not parsed as an artwork.
    rel_art_response = requests.get(rel_art_id, headers={"Accept": "application/ld+json"})
    rel_art_response.raise_for_status()
    rel_art_extracted_info = rel_art_response.json()

    rel_art_extracted_data = parse_artwork_details(rel_art_extracted_info)
    rel_artworks.append(rel_art_extracted_data)

In [147]:
# List the titles of the related artworks collected in rel_artworks.
print(f'Other artworks of {creator}: \n')
for d in rel_artworks:
    print(d['title'])

Other artworks of Johannes Vermeer: 

Gezicht op huizen in Delft, bekend als ‘Het straatje’
Brieflezende vrouw
De liefdesbrief


In [148]:
# from openai import OpenAI
# client = OpenAI()

# def mt(text):
#     resp = client.chat.completions.create(
#         model="gpt-5.2",
#         messages=[
#             {"role": "system", "content": "Translate from Dutch to English using museum catalog terminology. Do NOT summarize or rewrite, only translate."},
#             {"role": "user", "content": text}
#         ],
#         temperature=0
#     )
#     return resp.choices[0].message.content.strip()


# Retrieve data from Wikidata and Wikipedia

In [149]:
def wikidata_search(title):
    """Search Wikidata entities by English label and return the list of hits.

    Args:
        title: Text passed as the ``search`` parameter of ``wbsearchentities``.

    Returns:
        The value of the ``search`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbsearchentities",
            "language": "en",
            "format": "json",
            "search": title,
        },
        headers={"User-Agent": "RijksmuseumRAGBot/1.0 (https://example.com; contact@example.com)"},
    )
    response.raise_for_status()
    payload = response.json()
    return payload["search"]

In [150]:
def wikidata_get(qid):
    """Download the Wikidata entity *qid* and return its JSON record.

    Args:
        qid: Wikidata entity id, used both in the URL and as the lookup key
            in the ``entities`` map of the response.

    Returns:
        The entry stored under ``entities[qid]`` in the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no entry under *qid*.
    """
    response = requests.get(
        f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json",
        headers={"User-Agent": "RijksmuseumRAGBot/1.0 (https://example.com; contact@example.com)"},
    )
    response.raise_for_status()
    entities = response.json()["entities"]
    return entities[qid]

In [151]:
def select_painting(results):
    """Return the id of the first search hit that is an instance of a painting.

    Each hit's entity is fetched with ``wikidata_get`` and its ``P31``
    (instance of) statements are compared against ``Q3305213``.

    Args:
        results: Iterable of search hits, each a dict with an ``id`` key.

    Returns:
        The matching entity id, or None when no hit matches.
    """
    for item in results:
        qid = item["id"]
        claims = wikidata_get(qid).get("claims", {})
        for statement in claims.get("P31", []):
            # Statements with a "novalue"/"somevalue" snak have no datavalue;
            # skip them instead of raising KeyError.
            value = statement.get("mainsnak", {}).get("datavalue", {}).get("value")
            if isinstance(value, dict) and value.get("id") == "Q3305213":
                return qid
    return None


In [152]:
# Search Wikidata with the same title used for the Rijksmuseum search and
# keep the first hit that select_painting accepts (None if there is none).
results = wikidata_search(title)
qid = select_painting(results)
print(qid)

Q167605


In [153]:
def wikidata_get_sitelink(qid, lang="en"):
    """Return the title of the ``<lang>wiki`` page linked from a Wikidata entity.

    Args:
        qid: Wikidata entity id.
        lang: Language code of the wiki; the sitelink key is ``f"{lang}wiki"``.

    Returns:
        The ``title`` of the matching sitelink.

    Raises:
        requests.HTTPError: If the entity download fails.
        KeyError: If the entity has no sitelink for the requested wiki.
    """
    # Reuse the shared downloader instead of repeating the request logic.
    entity = wikidata_get(qid)
    site = f"{lang}wiki"
    try:
        return entity["sitelinks"][site]["title"]
    except KeyError:
        # Same exception type as before, but say which entity and wiki failed.
        raise KeyError(f"Wikidata entity {qid} has no sitelink for {site}") from None

In [154]:
def wikipedia_content(title, lang="en"):
    """Fetch the plain-text extract of a Wikipedia page.

    Args:
        title: Page title passed as the ``titles`` parameter.
        lang: Language code used as the subdomain of the Wikipedia host.

    Returns:
        The ``extract`` of the first page in the response, or an empty
        string when that page has no extract.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        f"https://{lang}.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "prop": "extracts",
            "explaintext": True,    # plain text instead of HTML
            "format": "json",
            "titles": title,
        },
        headers={"User-Agent": "RijksmuseumRAGBot/1.0 (https://example.com; contact@example.com)"},
    )
    response.raise_for_status()

    # Only the first entry of the pages map is used.
    pages = response.json()["query"]["pages"]
    first_page = next(iter(pages.values()))
    return first_page.get("extract", "")


In [155]:
# Resolve the English Wikipedia page title (default lang="en") for the
# Wikidata item selected above.
wiki_title = wikidata_get_sitelink(qid)
print(wiki_title)

The Milkmaid (Vermeer)


In [156]:
# Download the plain-text article about the painting; only the first 1000
# characters are printed as a preview, the full text is kept in the variable.
wiki_artwork_content = wikipedia_content(wiki_title, lang="en")
print(wiki_artwork_content[:1000])

The Milkmaid (Dutch: De melkmeid or Het melkmeisje), sometimes called The Kitchen Maid (Dutch: De keukenmeid), is an oil-on-canvas painting of a "milkmaid", in fact, a domestic kitchen maid, by the Dutch artist Johannes Vermeer. It is in the Rijksmuseum in Amsterdam, the Netherlands, which regards it as "unquestionably one of the museum's finest attractions".
The exact year of the painting's completion is unknown, with estimates varying by source. The Rijksmuseum estimates it as circa 1658. According to the Metropolitan Museum of Art in New York City, it was painted in about 1657 or 1658. The "Essential Vermeer" website gives a broader range of 1658–1661.


== Descriptions and commentary ==
The painting shows a milkmaid, a woman who milks cows and makes dairy products like butter and cheese, in a plain room carefully pouring milk into a squat earthenware container on a table. Milkmaids began working solely in the stables before large houses hired them to do housework as well rather tha

In [157]:
def select_artist(results):
    """Return the id of the first search hit that is an instance of human (Q5).

    Each hit's entity is fetched with ``wikidata_get`` and its ``P31``
    (instance of) statements are compared against ``Q5``.

    Args:
        results: Iterable of search hits, each a dict with an ``id`` key.

    Returns:
        The matching entity id, or None when no hit matches.
    """
    for item in results:
        qid = item["id"]
        claims = wikidata_get(qid).get("claims", {})

        # check instance of = human (Q5)
        for statement in claims.get("P31", []):
            # Statements with a "novalue"/"somevalue" snak have no datavalue;
            # skip them instead of raising KeyError.
            value = statement.get("mainsnak", {}).get("datavalue", {}).get("value")
            if isinstance(value, dict) and value.get("id") == "Q5":
                return qid
    return None

In [158]:
# Same Wikidata -> Wikipedia chain as for the painting, now for the artist:
# search by creator name, keep the first human hit, resolve its English
# Wikipedia title and download the article text.
artist_results = wikidata_search(creator)
artist_qid = select_artist(artist_results)
artist_wiki_title = wikidata_get_sitelink(artist_qid, lang="en")
wiki_artist_bio = wikipedia_content(artist_wiki_title)

# Preview only; the full biography stays in wiki_artist_bio.
print(artist_wiki_title)
print(wiki_artist_bio[:1000])

Johannes Vermeer
Johannes Vermeer ( vər-MEER, vər-MAIR, Dutch: [joːˈɦɑnəs fərˈmeːr]; see below; also known as Jan Vermeer; October 1632 – 15 December 1675) was a Dutch painter who specialized in domestic interior scenes of middle-class life. He is considered one of the greatest painters of the Dutch Golden Age. During his lifetime, he was a moderately successful provincial genre painter, recognized in Delft and The Hague. He produced relatively few paintings, primarily earning his living as an art dealer. He was not wealthy; at his death, his wife was left in debt.
Vermeer worked slowly and with great care, and frequently used very expensive pigments. He is particularly renowned for making masterful use of light in his work. "Almost all his paintings", Hans Koningsberger wrote, "are apparently set in two smallish rooms in his house in Delft; they show the same furniture and decorations in various arrangements and they often portray the same people, mostly women."
The modest celebrity h

### Aggregate info

In [159]:
extracted_data

{'title': 'Het melkmeisje',
 'artist': 'schilder: Johannes Vermeer',
 'year': 'c. 1660',
 'description': 'Het melkmeisje Johannes Vermeer (1632–1675), olieverf op doek, ca. 1660 Geheel verdiept in haar werk schenkt een dienstmeisje melk in. Behalve de witte melkstraal lijkt niets te bewegen. Die alledaagse handeling balde Vermeer samen tot een indrukwekkend schilderij – als een beeld staat de figuur vrij in de lichte ruimte. Vermeer had oog voor hoe het licht in honderden kleurige puntjes over de voorwerpen speelt.',
 'location': 'Hoofdgebouw Eregalerij',
 'room': 'HG-2.30.3',
 'dimension': 'breedte 41 cm x hoogte 45.5 cm',
 'material': ['olieverf op doek'],
 'source': 'https://id.rijksmuseum.nl/200108369'}

In [160]:
# Start from a shallow copy of the parsed Rijksmuseum record, then attach the
# numeric id taken from the end of the source URL, both Wikipedia texts and
# the related artworks.
painting_data = dict(extracted_data)
painting_data.update({
    'id': re.search(r'/(\d+)(?:\?|$)', extracted_data['source']).group(1),
    'wiki_artwork': wiki_artwork_content,
    'wiki_artist': wiki_artist_bio,
    'artist_artworks': rel_artworks,
})

# RAG

In [162]:
painting_data.keys()

dict_keys(['title', 'artist', 'year', 'description', 'location', 'room', 'dimension', 'material', 'source', 'id', 'wiki_artwork', 'wiki_artist', 'artist_artworks'])

In [163]:
import textwrap

def chunk_text(text, size=800):
    """Split *text* into word-wrapped pieces of at most *size* characters.

    Newlines are replaced by spaces before wrapping, so the pieces never
    contain line breaks. An empty text yields an empty list.
    """
    flattened = " ".join(text.split("\n"))
    wrapper = textwrap.TextWrapper(width=size)
    return wrapper.wrap(flattened)

In [164]:
def prepare_chunks(painting):
    """Build the list of text chunks to index for one painting.

    The first chunk is a metadata summary; it is followed by the chunked
    curator description, Wikipedia painting article and Wikipedia artist
    article, in that order.

    Args:
        painting: Dict with the keys ``title``, ``artist``, ``year``, ``room``,
            ``location``, ``material``, ``dimension``, ``description``,
            ``wiki_artwork`` and ``wiki_artist``.

    Returns:
        A list of ``{"type": ..., "text": ...}`` dicts.
    """
    # metadata chunk
    meta = f"""
    Title: {painting['title']}
    Artist: {painting['artist']}
    Year: {painting['year']}
    Room: {painting['room']}
    Location: {painting['location']}
    Material: {painting['material']}
    Dimensions: {painting['dimension']}
    """
    chunks = [{"type": "metadata", "text": meta}]

    # (chunk type, painting field) pairs, in output order
    sections = (
        ("curatorial", "description"),
        ("wiki_painting", "wiki_artwork"),
        ("wiki_artist", "wiki_artist"),
    )
    for chunk_type, field in sections:
        for piece in chunk_text(painting[field]):
            chunks.append({"type": chunk_type, "text": piece})

    return chunks


In [165]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [166]:
import chromadb
from openai import OpenAI
# OpenAI client; the API key is read from the OPENAI_API_KEY environment
# variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# NOTE(review): chromadb.Client() is presumably the in-memory client, so this
# collection would not outlive the session — confirm against the Chroma docs.
chroma = chromadb.Client()
collection = chroma.get_or_create_collection("rijks_art")

In [167]:
def embed(text):
    """Return the embedding of *text* from the ``text-embedding-3-large`` model.

    Only the first item of the response data is used.
    """
    response = client.embeddings.create(input=text, model="text-embedding-3-large")
    first_item = response.data[0]
    return first_item.embedding

In [168]:
def index_painting(painting):
    """Embed every chunk of *painting* and add it to the Chroma collection.

    Chunk ids have the form ``"<painting id>_<chunk index>"``. Each record
    stores the chunk text as the document and the painting id, title, artist
    and chunk type as metadata.
    """
    chunks = prepare_chunks(painting)

    for position, chunk in enumerate(chunks):
        text = chunk["text"]
        metadata = {
            "painting_id": painting["id"],
            "title": painting["title"],
            "artist": painting["artist"],
            "type": chunk["type"],
        }
        # One add call (and one embedding request) per chunk.
        collection.add(
            ids=[f"{painting['id']}_{position}"],
            embeddings=[embed(text)],
            metadatas=[metadata],
            documents=[text],
        )

In [169]:
index_painting(painting_data)

In [170]:
painting_data.keys()

dict_keys(['title', 'artist', 'year', 'description', 'location', 'room', 'dimension', 'material', 'source', 'id', 'wiki_artwork', 'wiki_artist', 'artist_artworks'])

In [97]:
def retrieve(query, k=5):
    """Return the *k* nearest records in the collection for *query*.

    The query is embedded with ``embed`` and the raw result of
    ``collection.query`` is returned unchanged.
    """
    return collection.query(query_embeddings=[embed(query)], n_results=k)

In [184]:
def answer(query):
    """Answer *query* with the chat model, grounded in retrieved chunks.

    The six nearest chunks are joined into the context. The prompt also
    embeds the module-level ``title`` and ``creator`` to restrict the scope
    of the assistant.

    Returns:
        The text content of the first completion choice.
    """
    hits = retrieve(query, k=6)
    # hits["documents"][0] holds the documents for the single query sent.
    context = "\n\n".join(hits["documents"][0])

    prompt = f"""
    You are an expert Rijksmuseum art assistant. Suppose that when the user asks you a question, he is already in the Rijksmuseum. You can answer questions ONLY about the artwork: {title} and the creator {creator}.
    
    User question:
    {query}
    
    Context:
    {context}
    
    Answer using ONLY the context above. If not answerable, say "I don't know from available information."
    If it is irrelevant to the artwork and the creator, you will politely respond that your purpose is to provide information only about the painting and the artist.

    """

    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model="gpt-4.1", messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [180]:
print(answer("Where can I see The milkmaid, in which room?"))

You can see Het melkmeisje (The Milkmaid) in the Rijksmuseum's main building, in the Eregalerij (Gallery of Honour), room HG-2.30.3.


In [100]:
print(answer("Describe Vermeer's artistic style."))

Vermeer’s artistic style is characterized by extraordinary precision, meticulous technique, and a masterful use of light. He often painted domestic interior scenes of middle-class life, with his subjects providing a cross-section of seventeenth-century Dutch society. His works are largely genre pieces and portraits, with few cityscapes and allegories. 

Vermeer may have begun his paintings tonally, using monochrome shades of grey ("grisaille") or a limited palette of browns and greys ("dead coloring"), later applying more saturated colors in transparent glazes. He is particularly noted for his frequent and lavish use of the expensive pigment ultramarine (made from natural lapis lazuli), often used in areas not naturally of that color, as well as his use of lead-tin-yellow for exceptional luminosity. Vermeer’s palette is considered more brilliant and less somber than that of his contemporaries.

He worked slowly and carefully, perhaps producing few paintings because of his precision. Ve

In [105]:
print(answer("Which football club is the best in the Netherlands?"))

My purpose is to provide information only about the artwork "Milkmaid" (Het melkmeisje) and its creator, Johannes Vermeer. I cannot answer questions about football clubs in the Netherlands. If you have any questions about the painting or the artist, I would be happy to help!


In [185]:
print(answer("Which are other paintings by Vermeer in the Rijksmuseum? show me the rooms of each"))

- Gezicht op huizen in Delft, bekend als ‘Het straatje’ (ca. 1658), Room: HG-2.30.3
- Brieflezende vrouw (c. 1663), Room: HG-2.30.3
- De liefdesbrief (c. 1669 - c. 1670), Room: HG-2.30.3
Besides Het melkmeisje, other paintings by Vermeer in the Rijksmuseum are:

1. Gezicht op huizen in Delft, bekend als ‘Het straatje’ (ca. 1658)  
   Room: HG-2.30.3

2. Brieflezende vrouw (c. 1663)  
   Room: HG-2.30.3

3. De liefdesbrief (c. 1669 - c. 1670)  
   Room: HG-2.30.3

All these works can be found in Room HG-2.30.3.


In [108]:
# collection.query(
#     query_embeddings=[query_emb],
#     n_results=5,
#     where={"artist": "Johannes Vermeer"}
# )

### Save

In [187]:
import chromadb

# Keep a handle on the collection that was populated above. Rebinding
# `collection` to a fresh persistent collection alone would save nothing.
source_collection = collection

chroma = chromadb.PersistentClient(path="./rijks_db")
collection = chroma.get_or_create_collection("rijks_art")

# Copy every indexed record into the on-disk collection. upsert keeps the
# cell safe to re-run: existing ids are overwritten rather than duplicated.
snapshot = source_collection.get(include=["embeddings", "metadatas", "documents"])
if len(snapshot["ids"]) > 0:
    collection.upsert(
        ids=snapshot["ids"],
        embeddings=snapshot["embeddings"],
        metadatas=snapshot["metadatas"],
        documents=snapshot["documents"],
    )


### Load

In [188]:
import chromadb

# Reopen the on-disk database written under ./rijks_db and rebind the
# module-level `chroma` and `collection` used by retrieve()/index_painting().
# NOTE(review): get_collection (unlike get_or_create_collection above)
# presumably fails when "rijks_art" does not exist yet — confirm.
chroma = chromadb.PersistentClient(path="./rijks_db")
collection = chroma.get_collection("rijks_art")


#### TODO:

Integrate information about the artist's other relevant artworks into the Chroma DB.

In [None]:
# def index_painting(painting):
#     chunks = prepare_chunks(painting)

#     # store related artworks for lookup (NOT for embedding)
#     related_artworks[painting['id']] = painting.get("artist_artworks", [])

#     for i, chunk in enumerate(chunks):
#         chunk_id = f"{painting['id']}_{i}"
#         collection.add(
#             ids=[chunk_id],
#             embeddings=[embed(chunk["text"])],
#             metadatas=[{
#                 "painting_id": painting["id"],
#                 "title": painting["title"],
#                 "artist": painting["artist"],
#                 "type": chunk["type"]
#             }],
#             documents=[chunk["text"]]
#         )

In [None]:

# def answer(query, painting_id):
#     # retrieve semantic context
#     results = retrieve(query, k=6)
#     context = "\n\n".join(results["documents"][0])

#     # fetch related artworks for this painting
#     related = related_artworks.get(painting_id, [])

#     # turn related artworks into structured text
#     related_str = "\n".join([
#         f"- {p['title']} ({p.get('year', 'unknown')}), located in {p.get('room', 'unknown room')}"
#         for p in related
#     ])

#     prompt = f"""
#     You are a Rijksmuseum assistant. The visitor is standing in front of painting ID {painting_id}.

#     User question:
#     {query}

#     Context about the artwork:
#     {context}

#     Related artworks by the same artist currently in the Rijksmuseum:
#     {related_str}

#     Instructions:
#     - If the user asks about the artwork or artist, answer using only the context.
#     - If the user asks for recommendations, use ONLY the related artworks section.
#     - If not answerable, say "I don't know from available information."
#     - Do not hallucinate artworks not in related list.
#     """

#     completion = client.chat.completions.create(
#         model="gpt-4.1",
#         messages=[{"role": "user", "content": prompt}]
#     )

#     return completion.choices[0].message.content