# Download Metadata/Images

De várias fontes...

## [Museu Paulista](https://pt.wikipedia.org/wiki/Wikipédia:GLAM/Museu_Paulista)

Queries:
- negative: https://w.wiki/C$kL
- positive: https://w.wiki/C$kK

In [None]:
import json
import requests

from hashlib import md5
from io import BytesIO
from os import path, makedirs
from PIL import Image as PImage
from SPARQLWrapper import SPARQLWrapper

In [None]:
# Directory that holds the MuseuPaulista metadata JSON, and the info.json file inside it.
WIKI_DATA_PATH = "./metadata/json/MuseuPaulista"
WIKI_INFO_FILEPATH = path.join(WIKI_DATA_PATH, "info.json")

# Create the metadata directory (parents included) up front; exist_ok makes this
# a no-op when the directory is already there.
makedirs(WIKI_DATA_PATH, exist_ok=True)

In [None]:
# SPARQL endpoint passed to get_results() together with a query from prep_query().
WIKI_QUERY_URL = "https://query.wikidata.org/sparql"
# wbgetclaims API call restricted to property P18; qid_to_img_url() appends the
# entity id to the end of this URL.
WIKI_CLAIM_URL = "https://www.wikidata.org/w/api.php?action=wbgetclaims&format=json&property=P18&entity="
# Base URL that qid_to_img_url() extends with /<h>/<hh>/<file name>.
WIKI_MEDIA_URL = "https://upload.wikimedia.org/wikipedia/commons"

# Client identification: used as the SPARQLWrapper agent in get_results() and as
# the User-Agent header of the image download request.
USER_AGENT = "Acervos-Digitais/0.1 (https://www.acervosdigitais.fau.usp.br/; acervosdigitais@usp.br)"

# Object-type label -> Q-code. prep_query() looks a label up here and interpolates
# the code as `wd:<code>` into the wdt:P31 triple of its query.
OBJ_QCODES = {
  "photograph": "Q125191",
  "floor plan": "Q18965",
  "postcard": "Q192425",
  "map": "Q4006",
  "painting": "Q3305213",
  "print": "Q11060274",
  "topographic map": "Q216526",
  "printed matter": "Q1261026",
  "ornament": "Q335261",
  "negative": "Q595597",
  "toy": "Q11422",
  "plate": "Q57216",
  "doll": "Q168658",
  "doll clothes": "Q44201312",
  "vase": "Q191851",
  "towel": "Q131696",
  "saucer": "Q1422576",
  "equipment": "Q10273457",
  "photograph album": "Q488053",
  "tin": "Q15706035",
  "teacup": "Q81707",
  "pillowcase": "Q1094401",
  "drawing": "Q93184",
  "light fixture": "Q815738",
  "furniture": "Q14745",
  "lantern": "Q862454",
  "teapot": "Q245005",
  "chair": "Q15026",
  "illustration": "Q178659",
  "product packaging": "Q207822",
  "sculpture": "Q860861",
  "statue": "Q179700",
}

def prep_query(object_label):
  """Build the SPARQL query text for one object type.

  Args:
    object_label: a key of OBJ_QCODES (e.g. "painting"); its Q-code becomes the
      required wdt:P31 value in the query.

  Returns:
    The query string. It selects ?item, ?itemLabel, ?qid, ?image, ?creatorLabel
    and ?date for items that have wdt:P276 wd:Q371803, a wdt:P18 image and the
    requested wdt:P31 value; creator (P170) and date (P571) are OPTIONAL.

  Raises:
    KeyError: object_label is not a key of OBJ_QCODES.
  """
  # Resolve the label first so an unknown label fails before any text is built.
  qcode = OBJ_QCODES[object_label]
  return f"""#defaultView:Table
    SELECT DISTINCT ?item ?itemLabel ?qid ?image ?creatorLabel ?date WHERE {{
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    ?item wdt:P276 wd:Q371803.
    ?item wdt:P18 ?image.

    ?item wdt:P31 wd:{qcode}.

    BIND(STRAFTER(STR(?item), STR(wd:)) AS ?qid).

    OPTIONAL {{ ?item wdt:P170 ?creator. }}
    ### GET DEPICTS P180
    OPTIONAL {{ ?item wdt:P571 ?date. }}
    OPTIONAL {{ ?item wdt:P31 ?object. }}
  }}
  """

In [None]:
def get_results(endpoint_url, query):
  """Run `query` against the SPARQL endpoint and return the converted JSON result.

  Args:
    endpoint_url: URL of the SPARQL endpoint.
    query: SPARQL query text.

  Returns:
    Whatever SPARQLWrapper's convert() yields for the "json" return format.
  """
  client = SPARQLWrapper(endpoint_url, agent=USER_AGENT)
  client.setQuery(query)
  client.setReturnFormat("json")
  raw_response = client.query()
  return raw_response.convert()

In [None]:
# Build the query for the "painting" object type, run it, and keep only the
# rows found under "results" -> "bindings" in the returned JSON.
pQuery = prep_query("painting")
results = get_results(WIKI_QUERY_URL, pQuery)["results"]["bindings"]

In [None]:
# Metadata collected so far, keyed by item id; stays empty on a first run.
wiki_data = {}

# Resume from the saved info.json when it exists. The check and the open() must
# target the file itself: WIKI_DATA_PATH is the directory created by makedirs(),
# so path.isfile() on it is never true and the saved data would never be loaded.
if path.isfile(WIKI_INFO_FILEPATH):
  with open(WIKI_INFO_FILEPATH, "r", encoding="utf-8") as ifp:
    wiki_data = json.load(ifp)

### Prep Metadata

In [None]:
# Stand-in binding for optional query fields that are absent from a result row.
defval = {"value": "unknown"}

for cnt, result in enumerate(results):
  # Progress indicator: print the row index every tenth row.
  if not cnt % 10:
    print(cnt)

  id = result["qid"]["value"]

  # Only items not already present in wiki_data are added; existing entries
  # are left untouched.
  if id not in wiki_data:
    img_url = result["image"]["value"]

    wiki_data[id] = {
      "id": id,
      "title": result["itemLabel"]["value"],
      "date": result.get("date", defval)["value"],
      "creator": result.get("creatorLabel", defval)["value"],
      "image": img_url,
    }

### Download Images

In [None]:
# Root directory for the downloaded MuseuPaulista images.
IMG_PATH = "../../imgs/MuseuPaulista"

# "full" receives the image as downloaded (converted to RGB); "500" receives the
# copy reduced with thumbnail([500, 500]) by the download cell.
IMG_PATH_FULL = path.join(IMG_PATH, "full")
IMG_PATH_500 = path.join(IMG_PATH, "500")

# Create both output directories up front; exist_ok makes this a no-op when they exist.
makedirs(IMG_PATH_FULL, exist_ok=True)
makedirs(IMG_PATH_500, exist_ok=True)

In [None]:
def qid_to_img_url(qid):
  """Return the upload URL of the first P18 image claimed by entity `qid`.

  Requests the entity's P18 claims through WIKI_CLAIM_URL, takes the file name
  of the first claim (spaces replaced by underscores) and builds
  ``WIKI_MEDIA_URL/<h>/<hh>/<file name>``, where ``<h>`` and ``<hh>`` are the
  first one and first two hex digits of the MD5 of that file name.

  Args:
    qid: entity id appended to WIKI_CLAIM_URL (e.g. a "Q..." string).

  Returns:
    The image URL as a string.

  Raises:
    requests.HTTPError: the API answered with an error status.
    requests.RequestException: connection failure or timeout.
    KeyError, IndexError: the response carries no P18 claim for the entity.
  """
  # Identify the client the same way the SPARQL and image requests do, and
  # bound the wait so a stalled connection cannot hang the notebook.
  response = requests.get(
    f"{WIKI_CLAIM_URL}{qid}",
    headers={"User-Agent": USER_AGENT},
    timeout=30,
  )
  response.raise_for_status()

  claims = response.json()["claims"]
  fname = claims["P18"][0]["mainsnak"]["datavalue"]["value"].replace(" ", "_")

  # The two directory levels come from the MD5 of the underscore form of the name.
  md5_hex = md5(fname.encode()).hexdigest()
  return f"{WIKI_MEDIA_URL}/{md5_hex[0]}/{md5_hex[:2]}/{fname}"


In [None]:
# Download every catalogued image that is still missing on disk.
#
# The body needs an enclosing loop (it uses `continue`), so it iterates over
# wiki_data, which stores each item's id together with its "image" URL. For each
# item it writes <id>.jpg into IMG_PATH_FULL and a copy reduced with
# thumbnail([500, 500]) into IMG_PATH_500, skipping files that already exist.
for qid, entry in wiki_data.items():
  img_url = entry["image"]

  img_path_full = path.join(IMG_PATH_FULL, f"{qid}.jpg")
  img_path_500 = path.join(IMG_PATH_500, f"{qid}.jpg")

  # Decide once which outputs are missing; nothing to fetch when both exist.
  need_full = not path.isfile(img_path_full)
  need_500 = not path.isfile(img_path_500)
  if not (need_full or need_500):
    continue

  try:
    response = requests.get(img_url, headers={"User-Agent": USER_AGENT}, timeout=60)
    response.raise_for_status()
    pimg = PImage.open(BytesIO(response.content)).convert("RGB")
  except Exception as err:
    # Best effort: report the failing item and move on to the next one.
    # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working,
    # and printing the error avoids touching `response`, which is unbound or
    # stale when requests.get() itself raised.
    print(qid)
    print(img_url)
    print(err)
    continue

  if need_full:
    pimg.save(img_path_full)

  if need_500:
    # thumbnail() shrinks pimg in place, so it must run after the full-size save.
    pimg.thumbnail([500, 500])
    pimg.save(img_path_500)
