##### Description: Scrape the UNHCR API for data on Syrian refugees in specific countries of asylum
##### Author: Matthew Albert

In [40]:
# Import necessary packages and modules
import requests
import json
import copy
import time
from dateutil.parser import isoparse
from maps_utilities import get_map_info
from wikidata_utilities import get_page_data, get_page_id, get_page_title, get_img_url

In [41]:
# Define specific parameters for the scrape

syria_iso = "SYR"                              # ISO3 code for Syria; sent as the country-of-origin ("coo") filter
endpoints = ["population", "asylum-decisions"] # UNHCR API endpoints queried for every country
num_instances = 50                             # Max items per request (sent as the "limit" query parameter)
year_from = 2010                               # Fetch results starting from this year (inclusive)
year_to = 2023                                 # Fetch results ending at this year (inclusive)

# Destination file for the scraped records (opened for writing by write_to_json)
json_file_path = "./models_data/country_db.json"

In [42]:
# Define JSON structure that will be used to store the scraped data.
# This dict is a template: scrape_country_data deep-copies it for each country
# and fills in the fields below, so the template itself is never modified.
country_instance = {
  "country_name" : "",            # "coa_name" from the population endpoint
  "id" : "",                      # Set to the country's ISO3 code
  "attributes" : {
    "country_iso3" : "",          # Same value as "id"
    "flag_url" : "",              # Image URL resolved by get_img_url
    "map_info" : "",              # Return value of get_map_info(country_name)
    "capital" : "",               # Replaced by a *list* of capital names when the lookup succeeds
    "num_refugees" : 0,           # "refugees" from the last population item
    "num_asylum_decisions" : 0,   # "dec_total" from the selected asylum-decisions item
    "year_of_decisions" : 0,      # "year" of the selected asylum-decisions item
    "num_recognized" : 0,         # "dec_recognized"
    "num_other" : 0,              # "dec_other"
    "num_apps_rejected" : 0,      # "dec_rejected"
    "num_closed" : 0              # "dec_closed"
  }
}

In [43]:
# Miscellaneous functions

# Extract the current capital(s) of a country from a list of previous capital cities
def extract_curr_capital(capital_data):
  """Return the page titles of a country's current capital city (or cities).

  Args:
    capital_data: List of Wikidata claim dicts for the "capital" property.
      Each claim must provide ["mainsnak"]["datavalue"]["value"]["id"].

  Returns:
    A list of titles (as returned by get_page_title), de-duplicated and kept
    in the order the claims appear. A single claim is always treated as
    current; otherwise only claims without an "end time" (P582) qualifier
    are kept.

  Raises:
    KeyError: If a selected claim lacks the mainsnak/datavalue structure.
  """
  # Property ID for the "end time" qualifier
  end_time_property = "P582"
  if len(capital_data) == 1:
    # Only one capital city in the list, nothing to filter
    current_entries = capital_data
  else:
    # A claim that has no qualifiers may not carry a "qualifiers" key at all;
    # such a claim has no end time and is therefore a current capital.
    current_entries = [
      entry for entry in capital_data
      if end_time_property not in entry.get("qualifiers", {})
    ]
  titles = [
    get_page_title(entry["mainsnak"]["datavalue"]["value"]["id"])
    for entry in current_entries
  ]
  # dict.fromkeys removes duplicates while preserving claim order
  return list(dict.fromkeys(titles))

# Extract the current flag of a country from a list of previous flags
def extract_curr_flag(flag_data):
  """Return the image file name of a country's most recent flag.

  Args:
    flag_data: List of Wikidata claim dicts for the "flag image" property.

  Returns:
    The ["mainsnak"]["datavalue"]["value"] of the claim with the latest
    "start time" (P580) qualifier. If there is only one claim, or the start
    times cannot be read or parsed for every claim, the first claim's value
    is returned instead.

  Raises:
    IndexError: If flag_data is empty.
    KeyError: If the fallback claim lacks the mainsnak/datavalue structure.
  """
  if len(flag_data) == 1:
    # Only one flag in list, return
    return flag_data[0]["mainsnak"]["datavalue"]["value"]
  # Property ID for the "start time" qualifier
  start_time = "P580"

  def flag_start_date(flag_entry):
    raw_time = flag_entry["qualifiers"][start_time][0]["datavalue"]["value"]["time"]
    # Drop the first character (the sign prefix, e.g. "+2011-...") before parsing
    return isoparse(raw_time[1:])

  try:
    # max() returns the first entry with the latest start date, which is the
    # same entry a descending stable sort would place at the front
    return max(flag_data, key=flag_start_date)["mainsnak"]["datavalue"]["value"]
  except (KeyError, IndexError, TypeError, ValueError):
    # Missing qualifier or unparsable date: fall back to the first flag.
    # Only data-shape/parse errors are handled so that KeyboardInterrupt and
    # genuine programming errors are not silently swallowed.
    return flag_data[0]["mainsnak"]["datavalue"]["value"]

# Function to scrape country data that is only available through Wikidata API
def get_wikidata_fields(country_name, country_data):
  """Fill in the capital and flag URL of a country record from Wikidata.

  Both lookups are best-effort: if one fails, a message is printed and the
  corresponding field in country_data is left untouched.

  Args:
    country_name: Name used to look up the country's Wikidata page.
    country_data: Country record (same layout as country_instance); its
      ["attributes"]["capital"] and ["attributes"]["flag_url"] entries are
      updated in place.

  Returns:
    None.
  """
  # Retrieve page ID of country page
  page_id = get_page_id(country_name)
  # Retrieve Wikidata data for country page
  json_response = get_page_data(page_id)
  # Attempt to scrape capital city data
  try:
    capital_property = "P36"
    # Extract property info from JSON response
    capital_entries = json_response["entities"][page_id]["claims"][capital_property]
    # Store list with capital city (or cities)
    country_data["attributes"]["capital"] = extract_curr_capital(capital_entries)
  except Exception:
    # "Exception" (not a bare except) so Ctrl-C / SystemExit still stop the scrape
    print(f"Failed to retrieve capital city data for {country_name}")
  # Attempt to scrape country flag image url
  try:
    flag_property = "P41"
    # Get image file name from page data
    file_name = extract_curr_flag(json_response["entities"][page_id]["claims"][flag_property])
    # Resolve the file name to an image URL and store it in the record
    country_data["attributes"]["flag_url"] = get_img_url(file_name)
  except Exception:
    print(f"Failed to retrieve flag image url for {country_name}")
    
# Function to select the index with the most recent and populous asylum decisions data
def filter_asylum_data(asylum_data):
  """Pick the index of the latest-year item with the largest "dec_total".

  Only the trailing run of items that share the last item's "year" is
  examined. The result is -1 (i.e. the last item) unless an earlier item in
  that run has a strictly larger "dec_total".
  """
  # The final item defines which year counts as the most recent
  latest_year = asylum_data[-1]["year"]
  best_index = -1
  position = len(asylum_data) - 1
  # Walk backwards while the items still belong to the most recent year
  while position >= 0 and asylum_data[position]["year"] == latest_year:
    # Keep this position only if it strictly beats the current best
    if asylum_data[position]["dec_total"] > asylum_data[best_index]["dec_total"]:
      best_index = position
    position -= 1
  return best_index


In [44]:
# Function to scrape data for a single country (using a country's ISO3 code)
def scrape_country_data(country_code, timeout=30):
  """Scrape UNHCR and Wikidata/map data for one country of asylum.

  Args:
    country_code: ISO3 code of the country of asylum.
    timeout: Seconds to wait for each UNHCR API response before giving up.

  Returns:
    A filled-in deep copy of country_instance, or None if any endpoint
    returned no items for this country.

  Raises:
    SystemExit: If the API answers with a status code other than 200 (the
      request details are printed first).
    requests.exceptions.Timeout: If the API does not answer within `timeout`.

  NOTE(review): at most `num_instances` items are requested and the last item
  is treated as the most recent year -- confirm the API returns rows in
  ascending year order and that no country exceeds the limit.
  """
  # Retrieve deep copy of country_instance dict to store country data
  data = copy.deepcopy(country_instance)
  data["id"] = data["attributes"]["country_iso3"] = country_code
  attributes = data["attributes"]
  # The query is identical for every endpoint, so build it once
  params = {
    "limit" : num_instances,
    "yearFrom" : year_from,
    "yearTo" : year_to,
    "coo" : syria_iso,
    "coa" : country_code,
    "cf_type" : "ISO"       # Search for countries using ISO3 country codes
  }
  # Iterate through each endpoint to scrape necessary attributes
  for endpoint in endpoints:
    # Make request to UNHCR API (with a timeout so a stalled connection cannot hang the scrape)
    response = requests.get(
      f"https://api.unhcr.org/population/v1/{endpoint}/",
      params=params,
      timeout=timeout
    )
    if response.status_code != 200:
      # Error, print status code
      print(f"Request Error: {response.status_code}")
      print(f"Request URL: {response.url}")
      print(f"Country: {country_code}")
      # Raise explicitly: the exit() helper is meant for interactive use and
      # is not guaranteed to stop execution inside a notebook kernel
      raise SystemExit(-1)
    items = response.json()["items"]
    if len(items) == 0:
      # No data available for this country, skip country
      return None
    if endpoint == "population":
      # Select item at end of list (b/c it is the most recent year)
      latest = items[-1]
      attributes["num_refugees"] = latest["refugees"]
      data["country_name"] = latest["coa_name"]
    else:
      # Select data item with most recent year and most asylum decisions
      decisions = items[filter_asylum_data(items)]
      attributes["num_asylum_decisions"] = decisions["dec_total"]
      attributes["year_of_decisions"] = decisions["year"]
      attributes["num_recognized"] = decisions["dec_recognized"]
      attributes["num_apps_rejected"] = decisions["dec_rejected"]
      attributes["num_other"] = decisions["dec_other"]
      attributes["num_closed"] = decisions["dec_closed"]
  # Scrape remaining data fields (flag image url, capital city (or cities), map info)
  attributes["map_info"] = get_map_info(data["country_name"])
  get_wikidata_fields(data["country_name"], data)
  return data

# Function to scrape the name and ISO3 code for all countries in UNHCR database
def get_all_countries(timeout=30):
  """Fetch the name and ISO3 code of every country listed by the UNHCR API.

  Args:
    timeout: Seconds to wait for the API response before giving up.

  Returns:
    A list of {"name": ..., "iso3": ...} dicts, one per item returned by the
    countries endpoint, in response order.

  Raises:
    SystemExit: If the API answers with a status code other than 200 (the
      request details are printed first).
    requests.exceptions.Timeout: If the API does not answer within `timeout`.
  """
  # Set request parameters
  params = {
    "limit" : 600
  }
  # Make request to UNHCR API (with a timeout so a stalled connection cannot hang the scrape)
  response = requests.get(
    "https://api.unhcr.org/population/v1/countries/",
    params=params,
    timeout=timeout
  )
  if response.status_code != 200:
    # Error, print status code
    print(f"Request Error: {response.status_code}")
    print(f"Request URL: {response.url}")
    # Raise explicitly: the exit() helper is meant for interactive use and
    # is not guaranteed to stop execution inside a notebook kernel
    raise SystemExit(-1)
  # Success: keep only the country name and ISO3 code of each item
  return [
    {"name" : country["name"], "iso3" : country["iso"]}
    for country in response.json()["items"]
  ]

In [45]:
# Function used to scrape data for all countries in UNHCR database
def scrape_all_countries(delay_seconds=5):
  """Scrape every country returned by get_all_countries.

  Args:
    delay_seconds: Pause after each scraped country, to space out requests.

  Returns:
    A list with the record of every country for which scrape_country_data
    returned data; countries without an ISO3 code or without data are skipped.
  """
  country_data = []
  # Retrieve list of countries
  countries = get_all_countries()
  # Iterate through each country and scrape data
  for count, country in enumerate(countries, start=1):
    print(f"Scraping data for {country['name']}...{count}/{len(countries)}")
    country_code = country["iso3"]
    if country_code is None:
      # Nothing to query for this entry, so no request is made and no pause is needed
      continue
    data = scrape_country_data(country_code)
    if data is not None:
      # Data was successfully scraped, add to list
      country_data.append(data)
    time.sleep(delay_seconds)
  return country_data
  
# Run the full scrape (one pass over every country, with a pause between
# countries); `data` is the list of records for countries that were not skipped
data = scrape_all_countries()

Scraping data for Afghanistan...1/230
Scraping data for Albania...2/230
Scraping data for Algeria...3/230
Scraping data for Andorra...4/230
Scraping data for Angola...5/230
Scraping data for Anguilla...6/230
Scraping data for Antigua and Barbuda...7/230
Scraping data for Argentina...8/230
Scraping data for Armenia...9/230
Scraping data for Aruba...10/230
Scraping data for Australia...11/230
Scraping data for Austria...12/230
Scraping data for Azerbaijan...13/230
Scraping data for Bahamas...14/230
Scraping data for Bahrain...15/230
Scraping data for Bangladesh...16/230
Scraping data for Barbados...17/230
Scraping data for Belarus...18/230
Scraping data for Belgium...19/230
Scraping data for Belize...20/230
Scraping data for Benin...21/230
Failed to retrieve capital city data for Benin
Scraping data for Bermuda...22/230
Scraping data for Bhutan...23/230
Scraping data for Bolivia (Plurinational State of)...24/230
Scraping data for Bonaire, Saint Eustatius and Saba...25/230
Scraping data f

In [47]:
# Function to write scraped data to JSON file
def write_to_json(data, path=None):
  """Write the scraped data to a JSON file (indented by 2 spaces).

  Args:
    data: JSON-serializable object to store.
    path: Destination file; defaults to the module-level json_file_path.
      Missing parent directories are created and an existing file is
      overwritten.
  """
  import os  # local import: only needed to prepare the output directory

  if path is None:
    path = json_file_path
  parent_dir = os.path.dirname(path)
  if parent_dir:
    # open() does not create directories, so make sure the target folder exists
    os.makedirs(parent_dir, exist_ok=True)
  # Write scraped data to JSON file
  with open(path, "w", encoding="utf-8") as outfile:
    json.dump(data, outfile, indent=2)

# Sanity check: number of country records collected by the scrape.
# NOTE(review): write_to_json is defined above but not called in the visible
# cells -- confirm the data is actually written to json_file_path somewhere.
print(len(data))

127
