# Loading datasets

## Metropolitan Museum of Art Collection

In [2]:
import asyncio
import json
import warnings

import aiohttp
import pandas as pd
from tqdm import tqdm

In [3]:
# Root URL of the Met collection API; the fetch helpers append endpoint paths to it.
MET_BASE = "https://collectionapi.metmuseum.org/public/collection/v1"

In [4]:
async def fetch_all_object_ids(session):
    """Request the ``/objects`` endpoint and return its ``objectIDs`` entry.

    Args:
        session: Object whose ``get(url)`` yields an async context manager
            for the response (an ``aiohttp.ClientSession`` in this notebook).

    Returns:
        Whatever the decoded JSON payload stores under ``"objectIDs"``.

    Raises:
        Anything ``raise_for_status()`` raises for an error response, and
        ``KeyError`` when the payload has no ``"objectIDs"`` key.
    """
    async with session.get(f"{MET_BASE}/objects") as response:
        response.raise_for_status()
        payload = await response.json()
    # The body is fully decoded at this point, so the lookup can happen
    # after the response has been released.
    return payload["objectIDs"]

async def fetch_object_metadata(session, object_id):
    """Request ``/objects/<object_id>`` and return the decoded JSON body.

    Args:
        session: Object whose ``get(url)`` yields an async context manager
            for the response (an ``aiohttp.ClientSession`` in this notebook).
        object_id: Identifier interpolated into the endpoint path.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        Anything ``raise_for_status()`` raises for an error response.
    """
    endpoint = f"{MET_BASE}/objects/{object_id}"
    async with session.get(endpoint) as response:
        response.raise_for_status()
        metadata = await response.json()
        return metadata

In [5]:
async def load_data():
    """Fetch the object ID list and return the metadata of its first entry.

    Prints how many IDs were returned and which ID is being inspected.

    Returns:
        The metadata of the first object ID, or ``None`` when the ID list
        is empty.
    """
    async with aiohttp.ClientSession() as http:
        ids = await fetch_all_object_ids(http)
        print(f"Total object IDs fetched: {len(ids)}")

        # Nothing to inspect without at least one ID.
        if not ids:
            return None

        first_id = ids[0]
        print(f"Inspecting object ID: {first_id}")
        return await fetch_object_metadata(http, first_id)

In [6]:
# Smoke test: fetch one object and pretty-print its raw JSON to see which fields exist.
res = await load_data()
print(json.dumps(res, indent=2))

Total object IDs fetched: 498452
Inspecting object ID: 1
{
  "objectID": 1,
  "isHighlight": false,
  "accessionNumber": "1979.486.1",
  "accessionYear": "1979",
  "isPublicDomain": false,
  "primaryImage": "",
  "primaryImageSmall": "",
  "additionalImages": [],
  "constituents": [
    {
      "constituentID": 164292,
      "role": "Maker",
      "name": "James Barton Longacre",
      "constituentULAN_URL": "http://vocab.getty.edu/page/ulan/500011409",
      "constituentWikidata_URL": "https://www.wikidata.org/wiki/Q3806459",
      "gender": ""
    }
  ],
  "department": "The American Wing",
  "objectName": "Coin",
  "title": "One-dollar Liberty Head Coin",
  "culture": "",
  "period": "",
  "dynasty": "",
  "reign": "",
  "portfolio": "",
  "artistRole": "Maker",
  "artistPrefix": "",
  "artistDisplayName": "James Barton Longacre",
  "artistDisplayBio": "American, Delaware County, Pennsylvania 1794\u20131869 Philadelphia, Pennsylvania",
  "artistSuffix": "",
  "artistAlphaSort": "Lon

In [7]:
def extract_search_fields(obj_data):
    """Reduce a raw object record to the fields kept for artwork search.

    Keys missing from ``obj_data`` come back as ``None``, except
    ``additionalImages``, which falls back to an empty list. ``tags`` is
    flattened to the tag terms joined by ``', '``, or ``None`` when the
    record has no tags.

    Args:
        obj_data: Mapping with a ``get`` method holding one object's metadata.

    Returns:
        A new dict whose keys follow the fixed order listed below.
    """
    # Output key order; a DataFrame built from these records inherits it.
    field_order = (
        'objectID', 'title', 'objectName',
        'primaryImage', 'primaryImageSmall', 'additionalImages',
        'artistDisplayName', 'artistDisplayBio', 'artistNationality',
        'artistBeginDate', 'artistEndDate', 'artistRole',
        'department', 'culture', 'period',
        'objectDate', 'objectBeginDate', 'objectEndDate',
        'medium', 'classification', 'tags',
        'isPublicDomain', 'isHighlight', 'objectURL', 'accessionNumber',
    )

    tag_entries = obj_data.get('tags')
    if tag_entries:
        tag_text = ', '.join(entry.get('term', '') for entry in tag_entries)
    else:
        tag_text = None

    # The two fields that are not a plain ``get`` with a ``None`` default.
    special = {
        'additionalImages': obj_data.get('additionalImages', []),
        'tags': tag_text,
    }

    return {
        name: special[name] if name in special else obj_data.get(name)
        for name in field_order
    }

In [8]:

async def fetch_objects_batch(session, object_ids, semaphore, progress_bar):
    """Fetch metadata for many object IDs concurrently, skipping failures.

    Each ID is requested through ``fetch_object_metadata`` while holding
    ``semaphore``, so no more requests run at once than the semaphore
    allows. A request that raises is left out of the result instead of
    aborting the whole batch; all failures are then summarised in a single
    ``RuntimeWarning`` so that a short result is never silent.

    Args:
        session: Session object handed through to ``fetch_object_metadata``.
        object_ids: Iterable of object IDs to request.
        semaphore: Async context manager bounding the concurrent requests.
        progress_bar: Object with an ``update(n)`` method; advanced by one
            for every finished request, successful or not.

    Returns:
        List with the metadata of every successful request, in the order of
        ``object_ids``.
    """
    failures = []  # (object_id, exception) pairs, in completion order

    async def fetch_with_semaphore(obj_id):
        async with semaphore:
            try:
                return await fetch_object_metadata(session, obj_id)
            except Exception as exc:
                # Best effort: remember the failure and keep the batch going.
                failures.append((obj_id, exc))
                return None
            finally:
                progress_bar.update(1)

    results = await asyncio.gather(
        *(fetch_with_semaphore(obj_id) for obj_id in object_ids)
    )

    if failures:
        sample_id, sample_error = failures[0]
        warnings.warn(
            f"{len(failures)} of {len(results)} object requests failed and were "
            f"skipped (e.g. object {sample_id}: {sample_error!r})",
            RuntimeWarning,
            stacklevel=2,
        )

    return [r for r in results if r is not None]

async def load_all_data(max_objects=None, max_concurrent=50):
    """
    Load object metadata from the collection API as a list of dictionaries.

    The ID list comes from ``fetch_all_object_ids``; the metadata for the
    selected IDs is fetched by ``fetch_objects_batch``, which leaves out
    requests that fail, so the result may be shorter than the selection.

    Args:
        max_objects: Limit number of objects to fetch, counted from the start
            of the ID list (None for all, 0 for none)
        max_concurrent: Maximum number of concurrent requests

    Returns:
        The list of metadata dictionaries returned by ``fetch_objects_batch``.

    Raises:
        ValueError: If ``max_objects`` is negative.
    """
    # A negative limit would silently slice from the end of the ID list.
    if max_objects is not None and max_objects < 0:
        raise ValueError("max_objects must be None or a non-negative integer")

    async with aiohttp.ClientSession() as session:
        # Normalise a missing ID list so len() and slicing below cannot fail.
        object_ids = await fetch_all_object_ids(session) or []
        total_ids = len(object_ids)

        # Compare against None explicitly: a limit of 0 means "fetch nothing",
        # not "fetch everything".
        if max_objects is not None:
            object_ids = object_ids[:max_objects]
            print(f"Fetching metadata for {len(object_ids)} objects (out of {total_ids} total)")
        else:
            print(f"Fetching metadata for all {total_ids} objects")

        semaphore = asyncio.Semaphore(max_concurrent)
        progress_bar = tqdm(total=len(object_ids), desc="Fetching objects")
        try:
            all_data = await fetch_objects_batch(session, object_ids, semaphore, progress_bar)
        finally:
            # Close the bar even when the batch is cancelled or raises.
            progress_bar.close()

        print(f"Successfully fetched {len(all_data)} objects")
        return all_data

In [9]:
# Fetch metadata for the first 1000 object IDs, with at most 50 requests in flight.
print("Loading data...")
all_objects = await load_all_data(max_objects=1000, max_concurrent=50)

# Keep only the search-relevant fields and tabulate them, one row per fetched object.
search_data = [extract_search_fields(obj) for obj in all_objects]
df = pd.DataFrame(search_data)

# NOTE(review): the recorded run of this cell produced 175 rows for 1000 requested
# IDs. Requests that raise are dropped inside fetch_objects_batch, so find out why
# they failed (rate limiting is one possibility — TODO confirm) before scaling up.
print(f"\nDataFrame shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nDataFrame info:")
df.info()
print(f"\nFirst few rows:")
df.head()

Loading data...
Fetching metadata for 1000 objects (out of 498452 total)


Fetching objects: 100%|██████████| 1000/1000 [00:03<00:00, 281.79it/s]


Successfully fetched 175 objects

DataFrame shape: (175, 25)

Columns: ['objectID', 'title', 'objectName', 'primaryImage', 'primaryImageSmall', 'additionalImages', 'artistDisplayName', 'artistDisplayBio', 'artistNationality', 'artistBeginDate', 'artistEndDate', 'artistRole', 'department', 'culture', 'period', 'objectDate', 'objectBeginDate', 'objectEndDate', 'medium', 'classification', 'tags', 'isPublicDomain', 'isHighlight', 'objectURL', 'accessionNumber']

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   objectID           175 non-null    int64 
 1   title              175 non-null    object
 2   objectName         175 non-null    object
 3   primaryImage       175 non-null    object
 4   primaryImageSmall  175 non-null    object
 5   additionalImages   175 non-null    object
 6   artistDisplayName  175 non-null    

Unnamed: 0,objectID,title,objectName,primaryImage,primaryImageSmall,additionalImages,artistDisplayName,artistDisplayBio,artistNationality,artistBeginDate,...,objectDate,objectBeginDate,objectEndDate,medium,classification,tags,isPublicDomain,isHighlight,objectURL,accessionNumber
0,1,One-dollar Liberty Head Coin,Coin,,,[],James Barton Longacre,"American, Delaware County, Pennsylvania 1794–1...",American,1794.0,...,1853,1853,1853,Gold,,,False,False,https://www.metmuseum.org/art/collection/search/1,1979.486.1
1,2,Ten-dollar Liberty Head Coin,Coin,,,[],Christian Gobrecht,1785–1844,American,1785.0,...,1901,1901,1901,Gold,,,False,False,https://www.metmuseum.org/art/collection/search/2,1980.264.5
2,3,Two-and-a-Half Dollar Coin,Coin,,,[],,,,,...,1909–27,1909,1927,Gold,,,False,False,https://www.metmuseum.org/art/collection/search/3,67.265.9
3,4,Two-and-a-Half Dollar Coin,Coin,,,[],,,,,...,1909–27,1909,1927,Gold,,,False,False,https://www.metmuseum.org/art/collection/search/4,67.265.10
4,5,Two-and-a-Half Dollar Coin,Coin,,,[],,,,,...,1909–27,1909,1927,Gold,,,False,False,https://www.metmuseum.org/art/collection/search/5,67.265.11
