##### Script for scraping charity data from Wikidata Public API
##### Author: Matthew Albert

In [None]:
# Imports for workflow
import requests
import copy
import json

In [None]:
# Charities whose Wikidata pages will be scraped
charities = ["UNHCR", "UNICEF", "World Food Programme"]

# Wikidata property ID behind each attribute label
label_to_prop = dict(
  org_type="P31",
  logo_img="P154",
  org_img="P18",
  established="P571",
  abbreviation="P1813",
  parent_org="P749",
  headquarters="P159",
  awards_received="P166",
  website="P856",
)

# Blank record that is deep-copied and filled in for every charity.
# "description" comes first, followed by one slot per label in label_to_prop
# (same order); the two multi-valued labels start as empty lists, the rest
# as empty strings.
charity_instance = dict(
  name="",
  id="",
  attributes={
    "description": "",
    **{
      label: ([] if label in ("org_type", "awards_received") else "")
      for label in label_to_prop
    },
  },
)

# Endpoint of the Wikidata API
URL = "https://www.wikidata.org/w/api.php"

##### Utility Functions

In [None]:
# Define necessary functions to parse/fetch data from Wikidata

# Returns page ID of the Wikidata page from a string name
def get_page_id(name):
  """Look up the Wikidata entity ID for a search string.

  Sends a `wbsearchentities` request (English) to URL and takes the first
  search hit.

  Args:
    name: Text to search for.

  Returns:
    The "id" of the first search result, or "" if the request fails or the
    response does not contain a usable first result.
  """
  params = {
    "action": "wbsearchentities",
    "format": "json",
    "search": name,
    "language": "en",
  }
  try:
    response = requests.get(URL, params=params)
    return response.json()["search"][0]["id"]
  except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
    # Only network, JSON-decoding and response-shape errors are treated as a
    # failed lookup; KeyboardInterrupt/SystemExit are no longer swallowed.
    print("Request to WikiData failed")
    return ""

# Returns string title of Wikidata page from page ID
def get_page_title(page_id):
  """Return the English label of a Wikidata entity.

  Sends a `wbgetentities` request restricted to English labels.

  Args:
    page_id: Entity ID whose label is wanted.

  Returns:
    The English label text, or "" if the request fails or the response has
    no English label for page_id.
  """
  params = {
    "action": "wbgetentities",
    "format": "json",
    "ids": page_id,
    "props": "labels",
    "languages": "en",
  }
  try:
    response = requests.get(URL, params=params)
    return response.json()["entities"][page_id]["labels"]["en"]["value"]
  except (requests.RequestException, ValueError, KeyError, TypeError):
    # Only network, JSON-decoding and response-shape errors are treated as a
    # failed lookup; KeyboardInterrupt/SystemExit are no longer swallowed.
    print("Request to WikiData failed")
    return ""
  
# Return the json object of data relating to the Wikidata page
def get_page_data(page_id):
  """Fetch the `wbgetentities` JSON for a Wikidata entity.

  Args:
    page_id: Entity ID to fetch.

  Returns:
    The decoded JSON response, or an empty dict if the request fails or the
    body cannot be decoded as JSON.
  """
  params = {
    "action": "wbgetentities",
    # NOTE(review): "|P159" also requests the P159 property entity itself;
    # the callers visible here only read entities[page_id] - confirm whether
    # it is still needed.
    "ids": f"{page_id}|P159",
    "format": "json",
    "languages": "en",
  }
  try:
    response = requests.get(URL, params=params)
    return response.json()
  except (requests.RequestException, ValueError):
    # Previously a failed request fell through to `response.json()` with
    # `response` unbound, raising UnboundLocalError after the message.
    print("Request to WikiData failed")
    return {}

# Return the URL of an image from a file name
def get_img_url(file_name):
  """Resolve an image file name to its URL.

  Queries the English Wikipedia API (`prop=imageinfo`, `iiprop=url`) for
  "File:<file_name>", with spaces replaced by underscores.

  Args:
    file_name: File name without the "File:" prefix.

  Returns:
    The URL from the first imageinfo entry, or "" if the request fails or the
    response contains no image info.
  """
  params = {
    "action": "query",
    "titles": f"File:{file_name.replace(' ', '_')}",
    "prop": "imageinfo",
    "iiprop": "url",
    "format": "json",
  }
  try:
    # Search for image using Wikipedia image API
    response = requests.get("https://en.wikipedia.org/w/api.php", params=params)
    pages = response.json()["query"]["pages"]
    # Exactly one title is requested, so take whichever page comes back.
    # Its key is the page ID and is "-1" only when the file has no local
    # page on this wiki, so hard-coding "-1" misses files that do.
    page = next(iter(pages.values()))
    return page["imageinfo"][0]["url"]
  except (requests.RequestException, ValueError, KeyError, IndexError,
          TypeError, StopIteration):
    # Only network, JSON-decoding and response-shape errors are treated as a
    # failed lookup; KeyboardInterrupt/SystemExit are no longer swallowed.
    print("Request to WikiData failed")
    return ""

# Ad-hoc check of the helpers above: looks up "UNHCR" and fetches its entity data.
# The return value is not stored; presumably it is only meant to be shown as notebook cell output.
get_page_data(get_page_id("UNHCR"))

In [None]:
# Returns the English text of an entity's "labels"/"descriptions" section, or "" if absent
def _english_text(entity, section):
  try:
    return entity[section]["en"]["value"]
  except (KeyError, TypeError):
    return ""


# Returns the value stored in a claim's main snak, or None if the claim has none
def _snak_value(claim):
  try:
    return claim["mainsnak"]["datavalue"]["value"]
  except (KeyError, TypeError):
    return None


# Returns the main-snak value of the first claim recorded for a label, or None
def _first_claim_value(claims, label):
  statements = claims.get(label_to_prop[label])
  if not statements:
    return None
  return _snak_value(statements[0])


# Returns the title of the entity a claim value points to, or None if it has no "id"
def _linked_title(value):
  if isinstance(value, dict) and "id" in value:
    return get_page_title(value["id"])
  return None


# Returns the titles of all entities referenced by the claims for a label
def _linked_titles(claims, label):
  titles = []
  for claim in claims.get(label_to_prop[label], []):
    # Skip claims without a usable entity reference instead of abandoning
    # the rest of the list.
    title = _linked_title(_snak_value(claim))
    if title is not None:
      titles.append(title)
  return titles


# Main function that will be used to parse the json data and populate the charity data fields
def populate_fields(json_response, page_id):
  """Build a charity record from a `wbgetentities` response.

  Starts from a deep copy of `charity_instance` and fills in every field for
  which the entity has data; fields without data keep their empty default.

  Args:
    json_response: Decoded response containing `entities[page_id]`.
    page_id: ID of the entity to read from the response.

  Returns:
    A dict shaped like `charity_instance`.

  Raises:
    KeyError: If json_response has no `entities[page_id]` entry.
  """
  entity = json_response["entities"][page_id]
  # Create deep copy of charity_instance so the shared template is never mutated
  data = copy.deepcopy(charity_instance)
  attributes = data["attributes"]

  # Populate fields with data (or empty string if the entity has no English text)
  data["name"] = _english_text(entity, "labels")
  data["id"] = page_id
  attributes["description"] = _english_text(entity, "descriptions")

  # Treat a missing or non-mapping "claims" section as "no claims"
  claims = entity.get("claims")
  if not isinstance(claims, dict):
    claims = {}

  # Scrape organization type(s)
  attributes["org_type"].extend(_linked_titles(claims, "org_type"))

  # Scrape logo image and an image related to the organization
  for label in ("logo_img", "org_img"):
    file_name = _first_claim_value(claims, label)
    if isinstance(file_name, str):
      attributes[label] = get_img_url(file_name)

  # Scrape the time the organization was established
  established = _first_claim_value(claims, "established")
  if isinstance(established, dict) and "time" in established:
    attributes["established"] = established["time"]

  # Scrape the abbreviation of the organization
  abbreviation = _first_claim_value(claims, "abbreviation")
  if isinstance(abbreviation, dict) and "text" in abbreviation:
    attributes["abbreviation"] = abbreviation["text"]

  # Scrape the parent organization and the headquarters (both are entity references)
  for label in ("parent_org", "headquarters"):
    title = _linked_title(_first_claim_value(claims, label))
    if title is not None:
      attributes[label] = title

  # Scrape awards received by the organization
  attributes["awards_received"].extend(_linked_titles(claims, "awards_received"))

  # Scrape the website of the organization
  website = _first_claim_value(claims, "website")
  if website is not None:
    attributes["website"] = website

  return data

# Used for incremental testing
# populate_fields(get_page_data(get_page_id("UNHCR")), get_page_id("UNHCR"))
  

In [None]:
# Scrape data for each charity in the master list
charity_instances = []
for charity in charities:
  page_id = get_page_id(charity)
  if not page_id:
    # get_page_id returns "" when the lookup fails; without an ID there is
    # nothing to fetch, so skip this charity instead of aborting the run.
    print(f"Skipping {charity}: no Wikidata ID found")
    continue
  charity_instances.append(populate_fields(get_page_data(page_id), page_id))

# Write data to json file
json_file_path = "./models_data/charity_db.json"
with open(json_file_path, "w", encoding="utf-8") as json_file:
  json.dump(charity_instances, json_file, indent=2)