WorldPop data source: https://www.worldpop.org/geodata/listing?id=64

In [26]:
import os
import copy
import time
import datetime
import warnings
import requests
import pandas as pd

In [27]:
def download_file(url, local_filename, timeout=60):
    """Download a file from url to local_filename.

    The response is streamed in 1 MiB chunks into a temporary
    ``<local_filename>.part`` file, which is moved onto ``local_filename``
    only after the whole body has been written. A failed or interrupted
    transfer therefore never leaves a truncated file at ``local_filename``
    (and never destroys a previously downloaded copy).

    Args:
        url: Address of the file to fetch.
        local_filename: Destination path on disk.
        timeout: Value passed to ``requests.get(timeout=...)`` so a stalled
            server raises instead of blocking forever. ``None`` disables it.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Any exception raised by ``requests`` or by the file system while
        transferring; the temporary file is removed before re-raising.
    """
    partial_filename = str(local_filename) + ".part"
    try:
        with requests.get(url, stream=True, verify=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024*1024):
                    f.write(chunk)
        # Only a fully written file is promoted to the final name
        os.replace(partial_filename, local_filename)
    finally:
        # After a successful replace the temp file no longer exists, so this
        # only removes leftovers from a failed transfer
        try:
            os.remove(partial_filename)
        except OSError:
            pass

In [28]:
def file_exists(path):
    """Return True when ``path`` points at an existing regular file.

    Directories and missing paths give False.
    """
    is_regular_file = os.path.isfile(path)
    return is_regular_file

In [29]:
def get_current_timestamp(format_str=None):
    """Format the current time (truncated to whole seconds) as a string.

    Args:
        format_str: ``strftime`` pattern to apply. When None, the pattern
            '%Y_%m_%d_%H_%M' is used.

    Returns:
        The formatted timestamp string.
    """
    pattern = '%Y_%m_%d_%H_%M' if format_str is None else format_str
    whole_seconds = int(time.time())
    moment = datetime.datetime.fromtimestamp(whole_seconds)
    return moment.strftime(pattern)

In [30]:
def read_csv(path):
    """Read a UTF-8 encoded CSV file into a pandas DataFrame.

    The default pandas NA markers are switched off (``keep_default_na=False``)
    and only the empty string is registered as a missing value.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame.
    """
    parse_options = {
        "quotechar": '\"',
        "na_values": '',
        "keep_default_na": False,
        "encoding": 'utf-8',
    }
    return pd.read_csv(path, **parse_options)

In [31]:
def task_wrapper(func, args):
    """Call ``func(*args)`` and report the outcome as a tuple instead of raising.

    Args:
        func: Callable to execute.
        args: Iterable of positional arguments for ``func``.

    Returns:
        ``(0, "Success", args, result)`` when the call returns normally, or
        ``(1, repr(exception), args, None)`` when it raises an ``Exception``.
    """
    try:
        outcome = func(*args)
    except Exception as err:
        return (1, repr(err), args, None)
    else:
        return (0, "Success", args, outcome)

In [32]:
def run_tasks(func, flist, parallel, max_workers=None, chunksize=1):
    """Run ``func`` once per argument tuple in ``flist``, serially or in a pool.

    Every call is routed through ``task_wrapper``, so a failing task is
    reported in its result entry rather than aborting the whole run.

    Args:
        func: Callable executed for each task.
        flist: Iterable of argument tuples; each is unpacked into ``func``.
        parallel: When true, use ``mpi4py.futures.MPIPoolExecutor`` if mpi4py
            can be imported, otherwise ``concurrent.futures.ProcessPoolExecutor``.
            When false, run the tasks one after another in this process.
        max_workers: Pool size for the parallel case. If None, it defaults to
            the ``OMPI_UNIVERSE_SIZE`` environment variable (mpi4py) or the
            CPU count (no mpi4py), and a warning is emitted.
        chunksize: Passed through to the executor's ``map``/``starmap``.

    Returns:
        A list with one ``task_wrapper`` result per entry of ``flist``.

    Raises:
        ValueError: If mpi4py is used, ``max_workers`` is None and
            ``OMPI_UNIVERSE_SIZE`` is missing (or is not an integer).
    """
    # run all downloads (parallel and serial options)
    wrapper_list = [(func, i) for i in flist]
    if parallel:
        # see: https://mpi4py.readthedocs.io/en/stable/mpi4py.futures.html
        # and: https://docs.python.org/3/library/concurrent.futures.html
        try:
            from mpi4py.futures import MPIPoolExecutor
            mpi = True
        except ImportError:
            # mpi4py is optional; only a failed import triggers the fallback,
            # so KeyboardInterrupt and unrelated errors are no longer swallowed
            from concurrent.futures import ProcessPoolExecutor
            mpi = False
        if max_workers is None:
            if mpi:
                if "OMPI_UNIVERSE_SIZE" not in os.environ:
                    raise ValueError("Parallel set to True and mpi4py is installed but max_workers not specified and OMPI_UNIVERSE_SIZE env var not found")
                # Environment values are strings; normalise to int so both
                # branches hand the executor the same type
                max_workers = int(os.environ["OMPI_UNIVERSE_SIZE"])
                warnings.warn(f"Parallel set to True (mpi4py is installed) but max_workers not specified. Defaulting to OMPI_UNIVERSE_SIZE env var value ({max_workers})")
            else:
                import multiprocessing
                max_workers = multiprocessing.cpu_count()
                warnings.warn(f"Parallel set to True (mpi4py is not installed) but max_workers not specified. Defaulting to CPU count ({max_workers})")
        # Results are collected while the pool is still open rather than
        # after the executor has been shut down
        if mpi:
            with MPIPoolExecutor(max_workers=max_workers) as executor:
                results = list(executor.starmap(task_wrapper, wrapper_list, chunksize=chunksize))
        else:
            with ProcessPoolExecutor(max_workers=max_workers) as executor:
                results = list(executor.map(task_wrapper, *zip(*wrapper_list), chunksize=chunksize))
    else:
        results = [task_wrapper(*i) for i in wrapper_list]
    return results

In [33]:
# URL pattern for the files to fetch; each {YEAR} placeholder is filled in per requested year
template_url = "https://data.worldpop.org/GIS/Population/Global_2000_2020/{YEAR}/0_Mosaicked/ppp_{YEAR}_1km_Aggregated.tif"
# Taken once per run (minute resolution) so everything written later can share the same stamp
timestamp = get_current_timestamp(format_str='%Y_%m_%d_%H_%M')

Edit the variables in the cell below to suit your setup.

In [34]:
# Directory that receives the downloaded files (created further down if it does not exist)
output_dir = "/home/jovyan/data/population/"
# Years to fetch; each value replaces the {YEAR} placeholder in template_url
year_list = [2001, 2011]
# True: hand the downloads to a worker pool via run_tasks; False: run them one at a time
run_parallel = True
# Pool size passed to run_tasks (only relevant when run_parallel is True)
max_workers = 3

In [35]:
def manage_download(url, local_filename, overwrite=True, max_attempts=5):
    """Download ``url`` to ``local_filename``, retrying on failure.

    Args:
        url: Address of the file to fetch.
        local_filename: Destination path on disk.
        overwrite: When False, an already existing ``local_filename`` is left
            untouched and no request is made. Defaults to True.
        max_attempts: Total number of download attempts before giving up.
            Must be at least 1. Defaults to 5.

    Returns:
        ``(0, "Exists", url)`` if the download was skipped, otherwise
        ``(0, "Downloaded", url)``.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        Exception: Whatever the final failed attempt raised.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")
    if not overwrite and file_exists(local_filename):
        return (0, "Exists", url)
    for attempt in range(1, max_attempts + 1):
        try:
            download_file(url, local_filename)
        except Exception:
            # Earlier failures are retried; only the last one is surfaced
            if attempt == max_attempts:
                raise
        else:
            return (0, "Downloaded", url)

In [36]:
# Connectivity check before any download is started: an HTTP error status raises here.
# The timeout keeps an unresponsive server from blocking the notebook indefinitely.
test_request = requests.get("https://data.worldpop.org/GIS/", verify=True, timeout=30)
test_request.raise_for_status()

Preparing download

In [37]:
# One download URL per requested year, in the order given by year_list
year_file_list = [
    template_url.replace("{YEAR}", str(year))
    for year in year_list
]

In [38]:
# Each URL is stored under output_dir using the file name taken from the URL itself
output_paths = [
    os.path.join(output_dir, os.path.basename(raw_url))
    for raw_url in year_file_list
]
# Task table: one row per file with its source URL and local destination
df = pd.DataFrame({"raw_url": year_file_list, "output": output_paths})

In [39]:
# Make sure the destination directory is there before anything is written
os.makedirs(output_dir, exist_ok=True)
# Argument tuples for the download tasks: (source URL, local path)
flist = [
    (raw_url, output_path)
    for raw_url, output_path in zip(df["raw_url"], df["output"])
]

Running data download

In [40]:
# One manage_download call per (url, local path) pair; failures come back as result entries
results = run_tasks(
    manage_download,
    flist,
    run_parallel,
    max_workers=max_workers,
    chunksize=1,
)

In [41]:
# Column present in both `df` and the results table; used as the merge key
results_join_field_name = "raw_url"
# Index of that value inside the tuple returned by manage_download
results_join_field_loc = 2

In [42]:
# Each entry of `results` is (status, message, args, result); the last column
# temporarily holds the raw result before being reduced to the join key.
results_df = pd.DataFrame(results, columns=["status", "message", "args", results_join_field_name])
# A failed task has result None, so indexing it would raise TypeError and hide
# the very errors this table is meant to report. For those rows the URL is
# taken from the task arguments instead (args[0] is the URL, see flist).
results_df[results_join_field_name] = [
    task_args[0] if task_result is None else task_result[results_join_field_loc]
    for task_args, task_result in zip(results_df["args"], results_df[results_join_field_name])
]
# Left join keeps every planned download, even one without a result row
output_df = df.merge(results_df, on=results_join_field_name, how="left")

In [43]:
# Any status other than 0 marks a task that did not finish successfully
errors_df = output_df.loc[output_df["status"] != 0]
print(f"{len(errors_df)} errors found out of {len(output_df)} tasks")

0 errors found out of 2 tasks


In [44]:
# Per-run report goes into a "results" folder inside the download directory
results_dir = os.path.join(output_dir, "results")
os.makedirs(results_dir, exist_ok=True)
# The run timestamp in the file name keeps reports from different runs apart
output_path = os.path.join(output_dir, "results", f"data_download_{timestamp}.csv")
output_df.to_csv(output_path, index=False)