In [None]:
import requests
import json
import csv

# URL of the SPARQL endpoint that queries are POSTed to
endpoint = 'http://sparql.hegroup.org/sparql'
# Media type sent in the Accept request header; the response is parsed as JSON
accept_media_type = 'application/json'
# Replace this value with your own user agent header string
user_agent_header = 'twdgCvTool/0.1 (mailto:steve.baskauf@vanderbilt.edu)'

# The column headers of the output are written in the order given by fieldnames
def write_dicts_to_csv(table, filename, fieldnames):
    """Write an iterable of dictionaries to a UTF-8 encoded CSV file.

    table: iterable of dicts, one per output row
    filename: path of the file to write (opened in 'w' mode, so existing content is replaced)
    fieldnames: column names; they form the header row and fix the column order
    """
    with open(filename, mode='w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)


def generate_header_dictionary(accept_media_type, user_agent_header):
    """Build the HTTP request headers for a SPARQL query sent as URL-encoded form data.

    accept_media_type: value for the Accept header
    user_agent_header: value for the User-Agent header

    Returns a dict with the keys 'Accept', 'Content-Type' and 'User-Agent'.
    """
    # The Content-Type is fixed to the form-encoded variant; sending the bare
    # query as the request body would need 'application/sparql-query' instead.
    return {
        'Accept': accept_media_type,
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': user_agent_header,
    }

# Build the request headers once at module level; send_sparql_query() below reads them as a global
sparql_request_header = generate_header_dictionary(accept_media_type,user_agent_header)
# The query passed to send_sparql_query() must be a valid SPARQL query string

# Sends a query to the query service endpoint.
# NOTE: sparql_request_header and endpoint are global variables defined earlier in the script
def send_sparql_query(query_string, timeout=120):
    """Send a SPARQL query to the endpoint and return the result bindings.

    query_string: a valid SPARQL query string
    timeout: seconds to wait for the server before giving up; pass None to wait indefinitely

    Returns the list stored under ['results']['bindings'] in the JSON response.
    Raises requests.HTTPError when the endpoint answers with an error status and
    requests.Timeout when no answer arrives within the timeout.
    """
    # The progress messages let the user know what is going on during large/long queries.
    print('querying SPARQL endpoint to acquire item metadata')
    # The query is sent URL-encoded as the form field "query"
    response = requests.post(
        endpoint,
        data={'query': query_string},
        headers=sparql_request_header,
        timeout=timeout,
    )
    # print(response.text)  # uncomment to view the raw response, e.g. if you are getting an error
    # Report a failed request by its HTTP status instead of letting it surface
    # later as a JSON decoding error or a KeyError.
    response.raise_for_status()
    data = response.json()

    # Extract the values from the response JSON
    results = data['results']['bindings']

    print('done retrieving data')
    return results

# SPARQL query: selects every distinct class that is a direct or indirect subclass
# (rdfs:subClassOf+) of the root class ENVO_00000428 and that has an rdfs:label.
# Only IRIs containing "ENVO" are kept and classes flagged owl:deprecated true are excluded.
# ?plainLabel is the label converted to a plain, lower-cased string.
# NOTE(review): ENVO_00000428 is presumably the ENVO "biome" class - confirm.
query_string = '''prefix xsd: <http://www.w3.org/2001/XMLSchema#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix owl: <http://www.w3.org/2002/07/owl#> 
SELECT distinct ?subclass ?plainLabel
WHERE { 
bind(<http://purl.obolibrary.org/obo/ENVO_00000428> as ?rootIri)
?subclass rdfs:subClassOf+ ?rootIri.
?subclass rdfs:label ?label.
filter(contains(str(?subclass), "ENVO"))
bind(lcase(str(?label)) as ?plainLabel)
minus {?subclass owl:deprecated "true"^^xsd:boolean.}
}'''



In [None]:
# Run the query; result holds the list of bindings returned by send_sparql_query()
result = send_sparql_query(query_string)


In [None]:
# Extract IRI and generate lower camelCase controlled value string
def _label_to_cv_string(plain_label):
    """Turn a lower-case label into a lower camelCase controlled value string.

    plain_label: the label text, words separated by whitespace

    A final "biome" word is dropped, unless it is the only word of the label.
    Returns an empty string when the label contains no words.
    """
    # split() without an argument ignores leading, trailing and repeated whitespace,
    # so stray spaces neither create empty pieces nor hide a trailing "biome"
    pieces = plain_label.split()
    # Remove final "biome" if it is there. pop() takes the last word itself;
    # list.remove() would delete the first matching word instead.
    if len(pieces) > 1 and pieces[-1] == 'biome':
        pieces.pop()
    if not pieces:
        return ''
    # First piece remains lower case, subsequent pieces are capitalized
    return pieces[0] + ''.join(piece.title() for piece in pieces[1:])


processed_list = []
unique_value_test_list = []
for value in result:
    iri = value['subclass']['value']
    cv_string = _label_to_cv_string(value['plainLabel']['value'])
    processed_list.append({'iri': iri, 'cv_string': cv_string})
    unique_value_test_list.append(cv_string)

# Keep the list sorted so it can be compared directly with a sorted, de-duplicated copy
unique_value_test_list.sort()


In [None]:
# Verify that every controlled value string is unique:
# a sorted copy with duplicates removed must equal the (already sorted) full list
no_duplicates = sorted(set(unique_value_test_list))

print('No duplicates:', no_duplicates == unique_value_test_list)

In [None]:
# Write results to CSV
# The column names are the keys of each processed_list row. They are listed explicitly
# so that an empty result still produces a header-only file instead of failing
# on processed_list[0].
fieldnames = ['iri', 'cv_string']
filename = 'biome_cv_values.csv'
write_dicts_to_csv(processed_list, filename, fieldnames)
print('done')