In [8]:
import requests
import json
from pathlib import Path

In [11]:
# --- Configuration -----------------------------------------------------------
# Base address of the World Bank search API (the trailing "?" is part of the value).
WORLD_BANK_API = "https://search.worldbank.org/api/v2/wds?"
# JSON file the retrieved metadata is written to.
output_file = Path("data") / "digital_agriculture_projects.json"

# What is retrieved:
#   * project documents matching one of the search terms in `related_terms`,
#   * filed under the sector "Agriculture, Fishing and Forestry" or
#     "Agriculture, fishing and forestry" (yes, there are two different spellings),
#   * written in English.
#
# Fields requested for each document:
#   docdt          date of document
#   authr          author
#   count          country
#   abstracts      abstract
#   display_title  title
#   majtheme       major theme
#   keywd          keywords
#   envcat         environmental category
#   projectid      project id
#   subsc          subsector
#   subtopic       subtopic
#   theme          theme
#   url            url
#   txturl         text url

rows = 100           # number of rows read from the API per request
verbose = True       # print progress information
test_size = 100_000  # only process the first n projects (useful for debugging)

# Search terms; each one is queried separately.
related_terms = [
    "Agrifood",
    "digital agriculture",
    "precision agriculture",
    "farm management software",
    "remote sensing in agriculture",
    "earth observation in agriculture",
    "Internet of Things in agriculture",
    "big data in agriculture",
    "artificial intelligence in agriculture",
    "AI in agriculture",
]
def transform_term(term):
    """Turn a search phrase into a parenthesised, URL-ready AND expression.

    Every single space in ``term`` becomes ``%20AND%20`` and the result is
    wrapped in parentheses, e.g. ``"digital agriculture"`` ->
    ``"(digital%20AND%20agriculture)"``.
    """
    pieces = term.split(" ")
    joined = "%20AND%20".join(pieces)
    return "(" + joined + ")"



def get_projects_metadata():
    """Download document metadata from the World Bank API for every search term.

    Each entry of ``related_terms`` is queried separately and paged through
    ``rows`` documents at a time, until all matches for that term -- or at
    most ``test_size`` of them -- have been requested.

    Returns:
        dict: the ``documents`` mappings of all responses merged into one
        dictionary. A document returned for several terms is stored once
        (later responses overwrite earlier ones under the same key).

    Notes:
        A response with a status other than 200 stops paging for the current
        term; a message is printed and the documents collected so far are
        kept. Network errors and timeouts raised by ``requests`` propagate.
    """
    # WORLD_BANK_API already ends in "?"; strip it (and any "/") so that the
    # query string starts with "format=json" instead of "/format=json".
    base_url = WORLD_BANK_API.rstrip("?/")
    projects_metadata = {}

    for q in related_terms:
        print(f"Retrieving {q}")
        PARAMS="format=json"\
             "&majdocty_exact=Project%20Documents"\
             "&sectr_exact=Agriculture,%20Fishing%20and%20Forestry"\
             "&sectr_exact=Agriculture,%20fishing%20and%20forestry"\
             f"&qterm={transform_term(q)}"\
             "&lang=English"\
             "&fl=docdt,authr,count,abstracts,display_title,majtheme,keywd,envcat,projectid,subsc,subtopic,theme,url,txturl"
        offset = 0
        while True:
            URL = f"{base_url}?{PARAMS}&rows={rows}&os={offset}"
            # A timeout keeps a stalled connection from blocking the cell forever.
            response = requests.get(URL, timeout=60)
            if response.status_code != 200:
                # Best effort: report the failure instead of stopping silently.
                print(f"Request for {q} failed with HTTP status {response.status_code} "
                      f"at offset {offset}; keeping what was retrieved so far.")
                break
            data = response.json()

            projects_metadata.update(data["documents"]) # will overwrite if duplicate which is fine

            offset += rows
            # ">=": once the offset reaches the total there is nothing left to
            # fetch, so do not issue one more (empty) request.
            if offset >= min(data["total"], test_size):
                break

            if verbose:
                print(f"Retrieved {offset} of {data['total']} for {q}", end="\r")
    return projects_metadata

# Retrieve the projects metadata from the API.
projects_metadata = get_projects_metadata()
if verbose:
    print(f"Retrieved {len(projects_metadata)} projects.")

# Save to file. Create the target directory first so the download is not lost
# to a FileNotFoundError when the directory does not exist yet.
output_file.parent.mkdir(parents=True, exist_ok=True)
with open(output_file, "w") as f:
    json.dump(projects_metadata, f)

Retrieving Agrifood
Retrieving digital agriculture
Retrieving precision agriculture agriculture
Retrieving farm management software
Retrieving remote sensing in agriculture software
Retrieving earth observation in agricultureagriculture
Retrieving Internet of Things in agriculture
Retrieving big data in agriculture
Retrieving artificial intelligence in agriculture
Retrieving AI in agriculture
Retrieved 1050 projects.


In [12]:
# Process the retrieved projects metadata.
# `verbose` switches the progress messages printed by the processing loop below.
verbose = True

def clean_text(text):
    """Normalise a text field for output.

    Removes single quotes, double quotes and square brackets, then collapses
    every run of whitespace (including newlines) into one space and trims the
    ends.

    Args:
        text: the value to clean. ``None`` is treated as an empty string and
            any other non-string value is converted with ``str()`` first.

    Returns:
        str: the cleaned text.
    """
    if text is None:
        return ""
    out = text if isinstance(text, str) else str(text)
    # Delete the unwanted characters in a single pass.
    out = out.translate(str.maketrans("", "", "'\"[]"))
    # split()/join also takes care of newlines and leading/trailing blanks.
    return " ".join(out.split())


def process_project(project_metadata):
    """Convert one raw document record into a flat project dictionary.

    Args:
        project_metadata (dict): the raw record of one document. It is only
            read; missing fields are defaulted locally and the caller's
            dictionary is left unchanged.

    Returns:
        dict: ``{"skip": "yes"}`` when the record has no abstract (key missing
        or empty string). Otherwise a dictionary with the keys

        * ``title``    -- cleaned ``display_title``
        * ``url``      -- ``url`` ("" if missing)
        * ``txturl``   -- ``txturl`` ("" if missing)
        * ``ids``      -- ``projectid`` split on ","
        * ``date``     -- ``docdt``
        * ``keywords`` -- comma-separated string of the cleaned, non-empty
          values of ``keywd`` (first entry), ``majtheme``, ``theme``,
          ``subsc``, ``envcat`` and ``subtopic``
        * ``abstract`` -- cleaned abstract text
        * ``authors``  -- list of cleaned author names
    """
    abstracts = project_metadata.get("abstracts", "")
    if abstracts == "":
        return {"skip":"yes"}

    # The abstract text is read from the "cdata!" entry; a plain value is
    # accepted as well so an unexpected shape does not abort the whole run.
    if isinstance(abstracts, dict):
        abstract_text = abstracts.get("cdata!", "")
    else:
        abstract_text = str(abstracts)

    # Only the first keyword entry ("0") is used.
    # NOTE(review): assumes "keywd" looks like {"0": {"keywd": <text>}} -- confirm against the API.
    keywd = project_metadata.get("keywd", {"0": {"keywd": ""}})
    keyword_parts = [clean_text(keywd["0"]["keywd"])]
    keyword_parts += [
        clean_text(project_metadata[field])
        for field in ("majtheme", "theme", "subsc", "envcat", "subtopic")
        if field in project_metadata
    ]
    # Join with "," so the individual fields stay distinguishable; empty
    # fields are left out and "subtopic" appears only once.
    keywords = ",".join(part for part in keyword_parts if part)

    # A missing author list and the placeholder {'author': ''} are both
    # treated as one empty author.
    authors_field = project_metadata.get("authors", {'author': ''})
    if authors_field == {'author': ''}:
        authors_field = {'0': {'author': ''}}
    authors = [clean_text(entry["author"]) for entry in authors_field.values()]

    project = {
        "title":    clean_text(project_metadata.get("display_title", "")),
        "url":      project_metadata.get("url", ""),
        "txturl":   project_metadata.get("txturl", ""),
        "ids":      project_metadata.get("projectid", "").split(","),
        "date":     project_metadata.get("docdt", ""),
        "keywords": keywords,
        "abstract": clean_text(abstract_text),
        "authors":  authors
    }
    return project


# Convert every raw record into a flat project, leaving out empty records and
# records that process_project marks with {"skip":"yes"}.
projects = []
i = 0  # number of projects kept so far
for pid, project_metadata in projects_metadata.items():
    if verbose:
        print(f"Processing project {i} of {len(projects_metadata)}", end="\r")
    if project_metadata != {}:
        project = process_project(project_metadata)
        if project != {"skip":"yes"}:
            projects.append(project)
            i += 1

            if verbose:
                print(f"Processed project {i} of {len(projects_metadata)}", end="\r")


# Write the processed projects as indented JSON.
# NOTE(review): this is the same path as `output_file`, so the raw metadata
# dump written earlier is overwritten -- confirm that this is intended.
with open("data/digital_agriculture_projects.json", "w") as file:
    json.dump(projects, file, indent=4)

Processed project 649 of 10500