# Sample pipeline for PIC-SURE and CCD API
Sample PYTHON session which demonstrates a simple pipeline for retrieving SCC data using Harvards PIC-SURE API and performing a causal analysis using CCD API. You must do the following:

1. Change the api key, get this from user profile in PIC-SURE tranSMART
2. Get a JSON web token, you must have a user account in CCD-Web. 

for full CCD API usage examples, see the follow  https://github.com/bd2kccd/causal-rest-api#api-usage-and-examples

In [1]:
import requests
import json
import pandas as pd
import tempfile as tf
import uuid

# Get data from PIC-SURE

In [2]:
apikey = {"key":"<CHANGE ME>"}    # CHANGE ME: get this api-key from user profile in tranSMART

In [3]:
url = 'https://ssc.hms.harvard.edu/rest/v1/'
auth_url = url + 'securityService/startSession'
qry_url = url + 'queryService/runQuery'
result_url = url + 'resultService/result/'

In [4]:
s = requests.Session()
s.get(auth_url, params=apikey)

<Response [200]>

# Build select/where clause in the i2b2 path style

In [5]:
criteria = '{ "where": [{"field": {"pui": "/ssc/Demo/SFARI_Simplex_Collection_v15/SFARI_Simplex_Collection_v15/Clinical/Demographics/ROLE/p1/","dataType": "STRING"}, "predicate": "CONTAINS", "fields": {"ENOUNTER": "NO"}}],"select": [{"field": {"pui": "/ssc/Demo/SFARI_Simplex_Collection_v15/SFARI_Simplex_Collection_v15/Clinical/Demographics/AGE_IN_YEARS_NUM/","dataType": "STRING"},"alias": "AGE"},{"field": {"pui": "/ssc/Demo/SFARI_Simplex_Collection_v15/SFARI_Simplex_Collection_v15/Clinical/ssc_commonly_used/head_circumference/","dataType": "STRING"},"alias": "HEAD_CIRC"},{"field": {"pui": "/ssc/Demo/SFARI_Simplex_Collection_v15/SFARI_Simplex_Collection_v15/Clinical/vineland_ii/composite_standard_score/","dataType": "STRING"},"alias": "COMP_SCORE"},{"field": {"pui": "/ssc/Demo/SFARI_Simplex_Collection_v15/SFARI_Simplex_Collection_v15/Clinical/ssc_diagnosis/nonverbal_iq/","dataType": "STRING"},"alias": "NONV_IQ"},{"field": {"pui": "/ssc/Demo/SFARI_Simplex_Collection_v15/SFARI_Simplex_Collection_v15/Clinical/ssc_diagnosis/vma/","dataType": "STRING"},"alias": "VMA"},{"field": {"pui": "/ssc/Demo/SFARI_Simplex_Collection_v15/SFARI_Simplex_Collection_v15/Clinical/ssc_diagnosis/verbal_iq/","dataType": "STRING"},"alias": "VERBAL_IQ"}]}'

In [6]:
r = s.post(qry_url, data=criteria, headers={'Content-Type': 'application/json'})
print(r.text)
# parse out the resultId to use
p_resultId = json.loads(r.text)
rid = p_resultId['resultId']

{"resultId":79183}


# Check the job status

In [7]:
# this will keep looping until we get back an available status
status = 'UNAVAILABLE'
while status == 'UNAVAILABLE':
    r = s.get('https://ssc.hms.harvard.edu/rest/v1/resultService/resultStatus/' + str(rid))
    try:
        p_resultId = json.loads(r.text)
        status = p_resultId['status']   # get the status
        print (p_resultId)
    except:
        status = 'UNAVAILABLE'

print (r.text)

{'status': 'AVAILABLE', 'resultId': 79183}
{"resultId":79183,"status":"AVAILABLE"}


# Get the results

In [8]:
# show the data
rurl = result_url + str(rid) + '/CSV';
print (rurl)
resp = s.get(rurl)
print (resp.text)

https://ssc.hms.harvard.edu/rest/v1/resultService/result/79183/CSV
PATIENT_NUM,NONV_IQ,AGE,COMP_SCORE,VERBAL_IQ,VMA,HEAD_CIRC
218362,88,5.3,80,98,61,53.65
218363,93,8.1,94,86,82,53.9
218360,67,,67,48,33,
218361,138,7.5,73,91,76,53
218748,84,5.1,88,85,48,55.5
219879,51,16.1,58,32,61,52.07
218749,96,5.3,91,104,68,52.5
218746,106,6.4,86,84,58,54
218747,82,8,66,54,52,53
218368,128,6.8,91,134,128,55.6
218369,78,8.3,68,44,39,50.5
218366,115,11.9,84,123,206,56
218367,94,10.6,79,81,99,54.4
218364,117,13.1,76,106,182,55.5
218365,110,10.1,74,110,150,52.5
219374,91,10.8,73,89,107,57.5
219375,99,16,68,102,192,60
219880,79,8.6,75,80,79,52.5
219372,65,6.5,77,69,48,51.7
219881,56,17.9,43,29,61,55
219373,93,9.7,83,92,106,56
219370,110,9.7,78,93,113,54.5
219371,57,8.8,78,38,33,53
219886,105,12.8,64,103,162,56.5
218756,96,6.5,110,80,56,55.5
219887,129,4.5,77,103,54,51
218755,110,8.4,93,117,137,53.7
219888,76,10.7,76,80,99,54
218754,94,13.6,93,98,159,56
219889,80,10.2,71,70,82,55.5
218753,89,8.8,95,116,1

In [9]:
# Save results to a CSV file
outfile = tf.gettempdir() + '\picsure-' + str(uuid.uuid4()) + '.csv'
print (outfile)
f = open(outfile, 'wb')
f.write(str.encode(resp.text))   # convert to a byte to correct problem with blank rows
f.close()

C:\Users\midavis\AppData\Local\Temp\picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv


In [10]:
# scrub the data, just throw out rows with missing values for now
df = pd.read_csv(outfile)
no_missing = df.dropna()
no_missing.to_csv(outfile, index=False)

# Process results from PIC-SURE using CCD API

In [11]:
baseurl = 'https://cloud.ccd.pitt.edu/ccd-api'   # AWS url;  CHANGE HERE TO match your username in CCDWeb
#baseurl = 'https://ccd1.vm.bridges.psc.edu/ccd-api' # PSC url

# change the Basic authorization for your username/password (you can use Postman to generate) 
basicheader = {
    'authorization': "Basic <CHANGE ME>==",    # CHANGE ME:  
    'accept': "application/json"
    }

# endpoints
jwturl = baseurl + '/jwt'
dataurl = baseurl + '/data'

In [12]:
# get JSON Web token and userId
r = requests.get(jwturl, headers=basicheader)
jwtresults = json.loads(r.text)

userId = jwtresults['userId']
jwt = jwtresults['jwt']

print (jwtresults)

# construct a header with the JSON web token
jtoken = "Bearer " + jwt
bearheader = {
    'authorization': jtoken,
    'accept': "application/json"    
    }
print (bearheader)

{'lifetime': 3600, 'issuedTime': 1477676836833, 'jwt': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJodHRwczovL2Nsb3VkLmNjZC5waXR0LmVkdS8iLCJ1aWQiOjMsImV4cCI6MTQ3NzY4MDQzNjgzMywiaWF0IjoxNDc3Njc2ODM2ODMzfQ.zPhlGSIgFuogE_J83grm2aIiG2wi03hJkCVACl9r9_I', 'expireTime': 1477680436833, 'userId': 3}
{'accept': 'application/json', 'authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJodHRwczovL2Nsb3VkLmNjZC5waXR0LmVkdS8iLCJ1aWQiOjMsImV4cCI6MTQ3NzY4MDQzNjgzMywiaWF0IjoxNDc3Njc2ODM2ODMzfQ.zPhlGSIgFuogE_J83grm2aIiG2wi03hJkCVACl9r9_I'}


# Upload results file to CCD API

In [13]:
# read in the temporary csv file of downloaded data
fo = open(outfile, 'rb')
files = {'file': fo }

In [14]:
# post the newly saved data file from PIC-SURE query to your user directory in CCD
url = baseurl + '/' + str(userId) + '/dataset/upload'
r = requests.post(url, headers=bearheader, files=files)
print (r.text)

{"id":48,"name":"picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv","creationTime":1477676845000,"lastModifiedTime":1477676845000,"fileSize":104056,"md5checkSum":"cc04101a3c39c23e321303078c391284","fileSummary":{"variableType":null,"fileDelimiter":null,"numOfRows":null,"numOfColumns":null}}


In [15]:
fo.close()
print (json.dumps(json.loads(r.text), indent=4))  # pretty print the results
p_resultId = json.loads(r.text)  #  parse and get the job id
fileId = p_resultId['id']

{
    "id": 48,
    "creationTime": 1477676845000,
    "md5checkSum": "cc04101a3c39c23e321303078c391284",
    "name": "picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv",
    "fileSize": 104056,
    "fileSummary": {
        "fileDelimiter": null,
        "numOfRows": null,
        "numOfColumns": null,
        "variableType": null
    },
    "lastModifiedTime": 1477676845000
}


# Summarize the data file

variableType = {discrete or continuous}

fileDelimiter = {tab or comma}


In [16]:
payload = "{\n    \"id\": " + str(fileId) + ",\n    \"variableType\": \"continuous\",\n    \"fileDelimiter\": \"comma\"\n}"
bearheader.update({'content-type': "application/json"})    

In [17]:
url = baseurl + '/' + str(userId) + '/dataset/summarize'
# post a request to summarize the data file
r = requests.post(url, data=payload, headers=bearheader)
print (r.status_code)
print (r.text)

200
{"id":48,"name":"picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv","creationTime":1477676845000,"lastModifiedTime":1477676845000,"fileSize":104056,"md5checkSum":"cc04101a3c39c23e321303078c391284","fileSummary":{"variableType":"continuous","fileDelimiter":"comma","numOfRows":2720,"numOfColumns":7}}


# Run algorithm

Available alogrithms:
"name": "fgsc", "description": "FGS continuous"
"name": "fgsd", "description": "FGS discrete"
"name": "gfcic","description": "GFCI continuous"

In [18]:
payload = "{\n    \"datasetFileId\": " + str(fileId)+ ",\n    \"dataValidation\": {\n      \"uniqueVarName\": true,\n      \"limitNumOfCategory\": false\n    },\n    \"algorithmParameters\": {\n      \"depth\": 3,\n      \"structurePrior\": 1.0,\n      \"samplePrior\": 1.0\n    },\n    \"jvmOptions\": {\n      \"maxHeapSize\": 100\n    }\n}"

In [19]:
# use the alogrithm name as endpoint
# url = baseurl + '/' + str(userId) + '/jobs/gfcic'  # GFCI continuous
# url = baseurl + '/' + str(userId) + '/jobs/fgsd'  # FGS discrete
url = baseurl + '/' + str(userId) + '/jobs/fgsc'   # FGS Continuous
print (url)
r = requests.post(url, data=payload, headers=bearheader)
print (r.text)
p = json.loads(r.text)  

https://cloud.ccd.pitt.edu/ccd-api/3/jobs/fgsc
{"id":78,"algorithmName":"fgsc","status":0,"addedTime":1477676865305,"resultFileName":"fgsc_picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv_1477676865303.txt","resultJsonFileName":"fgsc_picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv_1477676865303.json","errorResultFileName":"error_fgsc_picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv_1477676865303.txt"}


In [20]:
#  parse and get the job id, and expected result and error name 
jobId = p['id']
resultFilename = p['resultFileName']
errorFilename = p['errorResultFileName']
print (r.text)

{"id":78,"algorithmName":"fgsc","status":0,"addedTime":1477676865305,"resultFileName":"fgsc_picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv_1477676865303.txt","resultJsonFileName":"fgsc_picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv_1477676865303.json","errorResultFileName":"error_fgsc_picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv_1477676865303.txt"}


# Show the job status

Note: an error of 'Not Found' may just mean that the job was processed and no longer in the job queue

In [21]:
url = baseurl + '/' + str(userId) + '/jobs'
r = requests.get(url + '/' + str(jobId), headers=bearheader)
print (r.text)

{"timestamp":1477676877201,"status":404,"error":"Not Found","message":"Unable to find job with ID 78 for user with ID: 3","path":"/3/jobs/78"}


# Show the algorithm result

In [22]:
# shows a report-based summary, non-graphical view
url = baseurl + '/' + str(userId) + '/results'
r = requests.get(url + '/' + resultFilename, headers=bearheader)
# if the result file is not found check the error file
if (r.status_code == 404):
    r = requests.get(url + '/' + errorFilename, headers=bearheader)
print (r.text) 

FGS Continuous (Fri, October 28, 2016 05:47:46 PM)

Runtime Parameters:
verbose = false
number of threads = 1

Dataset:
file = picsure-45b86de0-b123-4cbf-9785-ed8a03b412d0.csv
delimiter = comma
cases read in = 2719
variables read in = 7

Algorithm Parameters:
penalty discount = 0.000000
max degree = 3
faithfulness assumed = false

Data Validations:
ensure variable names are unique = true
ensure variables have non-zero variance = false


Graph Nodes:
PATIENT_NUM,NONV_IQ,AGE,COMP_SCORE,VERBAL_IQ,VMA,HEAD_CIRC

Graph Edges:
1. AGE --> COMP_SCORE
2. AGE --- HEAD_CIRC
3. AGE --> VMA
4. HEAD_CIRC --> PATIENT_NUM
5. HEAD_CIRC --> VMA
6. NONV_IQ --> COMP_SCORE
7. NONV_IQ --> PATIENT_NUM
8. VERBAL_IQ --> COMP_SCORE
9. VERBAL_IQ --- NONV_IQ
10. VERBAL_IQ --> VMA



# clean up 
import os 
os.remove(outfile)