##### Script for scraping charity data from the Wikidata and ReliefWeb public APIs
##### Author: Matthew Albert

In [29]:
# Imports for workflow
import requests
import copy
import json
from wikidata_utilities import *
from reliefweb_utilities import get_relevant_countries

In [30]:
# Charities whose data will be scraped
charities = [
  "UNHCR",
  "UNICEF",
  "World Food Programme",
]

# Wikidata property ID for each attribute label that is read from a page's claims
label_to_prop = dict(
  org_type="P31",
  logo_img="P154",
  org_img="P18",
  established="P571",
  abbreviation="P1813",
  parent_org="P749",
  headquarters="P159",
  awards_received="P166",
  website="P856",
)

# Template of the record produced for one charity. Every field starts out
# empty; scrape_charity deep-copies this template before filling it in.
charity_instance = dict(
  name="",
  id="",
  attributes=dict(
    description="",
    org_type=[],
    logo_img="",
    org_img="",
    established="",
    short_name="",
    long_name="",
    parent_org="",
    headquarters="",
    hq_country="",
    awards_received=[],
    website="",
    relief_web_id="",
    relief_provided=[],
    relevant_countries=[],
  ),
)

# API endpoints
WIKIDATA_URL = "https://www.wikidata.org/w/api.php"
RELIF_WEB_URL = "https://api.reliefweb.int/v1/sources"

##### Utility Functions

In [31]:
# Lookup failures that mean "this piece of data is not present on the page"
_MISSING_DATA = (KeyError, IndexError, TypeError)

# Qualifier property that is read as the date of an award claim
_AWARD_DATE_PROP = "P585"


# Helper used to read the first claim recorded for a labelled property
def _first_claim_value(claims, prop_label):
  """Return the datavalue "value" of the first claim for ``prop_label``.

  ``prop_label`` is a key of ``label_to_prop``. Raises KeyError, IndexError or
  TypeError when the property, its first claim or its datavalue is absent.
  """
  return claims[label_to_prop[prop_label]][0]["mainsnak"]["datavalue"]["value"]


# Helper used to fill a single attribute from a single claim, best-effort
def _set_from_claim(attributes, claims, attr_name, prop_label, convert):
  """Set ``attributes[attr_name]`` to ``convert(first claim value)``.

  Leaves the attribute untouched when the claim is missing or the conversion
  fails. ``Exception`` is caught (rather than only lookup errors) because some
  converters call helpers from wikidata_utilities whose failure modes are not
  visible here; KeyboardInterrupt/SystemExit are no longer swallowed.
  """
  try:
    attributes[attr_name] = convert(_first_claim_value(claims, prop_label))
  except Exception:
    pass


# Function used to scrape wikidata and populate relevant fields
def populate_wikidata_fields(json_response, page_id, data):
  """Fill ``data`` in place from a Wikidata entity response.

  Args:
    json_response: parsed API response; the entity is read from
      ``json_response["entities"][page_id]``.
    page_id: ID of the entity; also stored as ``data["id"]``.
    data: record shaped like ``charity_instance``. Fields that already hold a
      value for description, org_type, short_name and website are kept (they
      are only filled when still empty); the other fields are overwritten
      whenever the page provides them.

  Returns:
    None. ``data`` is mutated.

  Raises:
    KeyError: if the entity or its English label is missing.
  """
  entity = json_response["entities"][page_id]
  attributes = data["attributes"]

  data["name"] = entity["labels"]["en"]["value"]
  data["id"] = page_id

  if attributes["description"] == "":
    # A page without an English description leaves the field empty instead of
    # aborting the whole scrape
    try:
      attributes["description"] = entity["descriptions"]["en"]["value"]
    except _MISSING_DATA:
      pass

  # A page without claims simply leaves every claim-based field untouched
  claims = entity.get("claims", {})

  if attributes["org_type"] == []:
    # Try to scrape organization type(s)
    try:
      for org in claims[label_to_prop["org_type"]]:
        attributes["org_type"].append(get_page_title(org["mainsnak"]["datavalue"]["value"]["id"]))
    except Exception:
      pass

  # Logo image and image related to the organization
  _set_from_claim(attributes, claims, "logo_img", "logo_img", lambda value: get_img_url(value))
  _set_from_claim(attributes, claims, "org_img", "org_img", lambda value: get_img_url(value))

  # Date the organization was established; the first character of the time
  # string is dropped (presumably the leading sign, cf. '1961-12-19T00:00:00Z')
  _set_from_claim(attributes, claims, "established", "established", lambda value: value["time"][1:])

  if attributes["short_name"] == "":
    # The abbreviation is stored under "short_name", the field defined by the
    # record template and the one checked above
    _set_from_claim(attributes, claims, "short_name", "abbreviation", lambda value: value["text"])

  # Parent organization and headquarters are stored as page titles
  _set_from_claim(attributes, claims, "parent_org", "parent_org", lambda value: get_page_title(value["id"]))
  _set_from_claim(attributes, claims, "headquarters", "headquarters", lambda value: get_page_title(value["id"]))

  # Awards received by the organization. Each award is handled on its own so
  # that one incomplete claim does not discard the remaining awards.
  try:
    award_claims = list(claims[label_to_prop["awards_received"]])
  except _MISSING_DATA:
    award_claims = []
  for award in award_claims:
    try:
      award_name = get_page_title(award["mainsnak"]["datavalue"]["value"]["id"])
    except Exception:
      # Without a resolvable name the award entry would be meaningless
      continue
    try:
      award_date = award["qualifiers"][_AWARD_DATE_PROP][0]["datavalue"]["value"]["time"][1:]
    except _MISSING_DATA:
      # Keep the award even when no date qualifier is recorded
      award_date = ""
    attributes["awards_received"].append({
      "award_name": award_name,
      "award_date": award_date
    })

  if attributes["website"] == "":
    # Try to scrape the website of the organization
    _set_from_claim(attributes, claims, "website", "website", lambda value: value)

# Attribute name -> path (keys / list indexes) inside the ReliefWeb source
# object for every field that is copied over verbatim
_RELIEF_WEB_FIELD_PATHS = (
  ("relief_web_id", ("id",)),
  ("description", ("fields", "description")),
  ("short_name", ("fields", "shortname")),
  ("long_name", ("fields", "longname")),
  ("website", ("fields", "homepage")),
  ("hq_country", ("fields", "country", 0, "name")),
  ("logo_img", ("fields", "logo", "url")),
)


# Helper used to walk a nested JSON structure
def _dig(obj, path):
  """Follow ``path`` into ``obj`` one key/index at a time.

  Raises KeyError, IndexError or TypeError when a step does not exist.
  """
  for step in path:
    obj = obj[step]
  return obj


# Function used to scrape data from ReliefWeb API
def populate_relief_web_fields(charity, data, timeout=30):
  """Fill ``data`` in place with the first ReliefWeb source matching ``charity``.

  Args:
    charity: search text sent as the ``query[value]`` request parameter.
    data: record shaped like ``charity_instance``; matching fields under
      ``data["attributes"]`` are overwritten and the source type name is
      appended to ``org_type``. Fields absent from the response are left as is.
    timeout: seconds to wait for the HTTP request before giving up
      (the request previously had no timeout and could block forever).

  Returns:
    None. Prints a failure message and leaves ``data`` untouched when the
    request fails, the body is not JSON, or no source is found.
  """
  # Define the parameters for the API request
  params = {
    "appname": "syrianrefugeecrisis",
    "profile": "full",
    "limit": 1,
    "query[value]": charity
  }

  # Make API request to get organization data; the body is parsed only once
  org_data = None
  try:
    response = requests.get(RELIF_WEB_URL, params=params, timeout=timeout)
    if response.status_code == 200:
      payload = response.json()
      if payload["count"] > 0:
        org_data = payload["data"][0]
  except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
    # Network failure, non-JSON body or unexpected response layout: all are
    # reported through the same failure message below
    org_data = None

  # Verify the response was a success
  if org_data is None:
    print(f"ReliefWeb API Request for {charity} failed")
    return

  # Copy each field that exists for the organization; missing ones are skipped
  for attr_name, path in _RELIEF_WEB_FIELD_PATHS:
    try:
      data["attributes"][attr_name] = _dig(org_data, path)
    except (KeyError, IndexError, TypeError):
      pass

  # The organization type is appended rather than assigned
  try:
    data["attributes"]["org_type"].append(org_data["fields"]["type"]["name"])
  except (KeyError, IndexError, TypeError, AttributeError):
    pass
  

In [32]:
# Function used to scrape data for a single charity/organization
def scrape_charity(charity):
  """Build and return the record for one charity.

  Starts from a deep copy of ``charity_instance`` so the template itself is
  never modified, fills it from ReliefWeb first and Wikidata second, and
  finally stores the result of ``get_relevant_countries`` under
  ``relevant_countries``.
  """
  record = copy.deepcopy(charity_instance)

  # ReliefWeb goes first; it provides the relief_web_id that is used below
  populate_relief_web_fields(charity, record)

  # Wikidata: resolve the page ID, fetch the page and fill the record from it
  wikidata_id = get_page_id(charity)
  wikidata_page = get_page_data(wikidata_id)
  populate_wikidata_fields(wikidata_page, wikidata_id, record)

  # Relevant countries are looked up with the charity name and ReliefWeb ID
  attrs = record["attributes"]
  attrs["relevant_countries"] = get_relevant_countries(charity, attrs["relief_web_id"])
  return record

# Example run: scrape a single charity (this issues live API requests) and
# show the resulting record as the cell output
scrape_charity("World Food Programme")

{'name': 'World Food Programme',
 'id': 'Q204344',
 'attributes': {'description': 'United Nations branch related to food-assistance',
  'org_type': ['International Organization'],
  'logo_img': 'https://upload.wikimedia.org/wikipedia/commons/5/59/World_Food_Programme_Logo_Simple.svg',
  'org_img': 'https://upload.wikimedia.org/wikipedia/commons/5/5b/World_Food_Programme.jpg',
  'established': '1961-12-19T00:00:00Z',
  'short_name': 'WFP',
  'long_name': 'United Nations World Food Programme',
  'parent_org': 'United Nations',
  'headquarters': 'Rome',
  'hq_country': 'Italy',
  'awards_received': [{'award_name': 'Nobel Peace Prize',
    'award_date': '2020-00-00T00:00:00Z'}],
  'website': 'http://www.wfp.org',
  'relief_web_id': '1741',
  'relief_provided': [],
  'relevant_countries': {'primary_countries': ['ARM',
    'LBN',
    'SYR',
    'IRQ',
    'JOR',
    'EGY',
    'TUR'],
   'secondary_countries': ['JPN',
    'NLD',
    'NOR',
    'IRL',
    'DEU',
    'RUS',
    'FRA',
    'USA

In [33]:

# Load charities.json and report the number of top-level entries.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default.
with open('charities.json', 'r', encoding='utf-8') as f:
  data = json.load(f)
print(len(data))

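# Scrape data for each charity in the master list (currently disabled).
# NOTE(review): `populate_fields` below is not defined in the visible cells of
# this notebook; `scrape_charity(charity)` looks like the current equivalent --
# confirm before re-enabling.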
# charity_instances = []
# for charity in charities:
#   page_id = get_page_id(charity)
#   charity_instances.append(populate_fields(get_page_data(page_id), page_id))

# # Write data to json file
# json_file_path = "./models_data/charity_db.json"
# with open(json_file_path, "w") as json_file:
#   json.dump(charity_instances, json_file, indent=2)

47
