In [1]:
import json
import requests
import pandas as pd
import re

### Useful docs
# Labmatrix API
#   https://fr-s-ccr-chm-t.nci.nih.gov/Help/doc/chameleon-api/rest/  
# 'requests' module docs
#   https://requests.readthedocs.io/en/latest/api/
# json module docs
#   https://docs.python.org/3/library/json.html

# Root URL of the Labmatrix server. Every API helper in this notebook builds its
# request URL by appending an endpoint path to this value, so it has no trailing slash.
base_url="https://labmatrix.nci.nih.gov"  ## Production server
# base_url="https://fr-s-ccr-chm-t.nci.nih.gov"  ## Test server

## I think username will only be required if we have accounts on multiple instances of labmatrix?
## From the API docs:
##   "If the specified LDAP username is associated with multiple Labmatrix accounts,
##    the Authorization header must be accompanied by a X-BF-Username-Disambiguation header
##    specifying the Labmatrix username of the account to login with."
# username="tandonm" ## I guess we don't need this for now?

## I got this key from the example JSON in the Labmatrix API docs (so it is associated with my account).
## mykey="<redacted -- never commit real credentials; load them from the environment or a prompt>"
### Right now, an Authorization key is hardcoded into each function below.
### It would be better to pass the key into each function so that different users can supply their own.

In [2]:
def get_study_metadata(base_url, study_name, auth_token=None):
    """Fetch a study record from Labmatrix by study name.

    Sends ``POST <base_url>/api/StudyData/read/`` with the JSON body
    ``{"identifier": {"name": study_name}}``.

    Args:
        base_url: Root URL of the Labmatrix server, without a trailing slash.
        study_name: Study name placed in the request's ``identifier``.
        auth_token: Optional HTTP Basic credential (the text that follows
            ``"Basic "`` in the Authorization header). When omitted, the
            credential historically embedded in this notebook is used.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            message includes the response text, which holds the error details.
    """
    if auth_token is None:
        # SECURITY: legacy embedded credential, kept only so existing calls keep
        # working. Prefer passing auth_token; this value should be rotated and removed.
        auth_token = "amFpbmExMzphSjExMTkqKg=="
    endpoint = base_url + "/api/StudyData/read/"
    headers = {"Authorization": "Basic " + auth_token}
    # NOTE(review): these look like jQuery.ajax settings rather than API query
    # parameters. They are still sent in the query string exactly as before --
    # confirm whether the server needs them.
    query = {"xhrFields": {"withCredentials": "true"},
             "dataType": "json",
             "contentType": "application/json"}
    payload = {"identifier": {"name": study_name}}
    response = requests.request("POST", endpoint, headers=headers, params=query, json=payload)
    if not response.ok:
        # On failure the body describes the error, not the study; surface it
        # instead of returning it as if it were study data.
        raise requests.HTTPError(
            "Labmatrix request failed: POST {} returned {} {}: {}".format(
                endpoint, response.status_code, response.reason, response.text),
            response=response)
    return json.loads(response.content)

def get_biomaterial_details(base_url, bm_id, auth_token=None):
    """Fetch one biomaterial record from Labmatrix by its ID.

    Sends ``GET <base_url>/api/Biomaterial/<bm_id>``.

    Args:
        base_url: Root URL of the Labmatrix server, without a trailing slash.
        bm_id: Biomaterial ID appended to the endpoint path. Converted with
            ``str()``, so integer IDs are accepted as well as strings.
        auth_token: Optional HTTP Basic credential (the text that follows
            ``"Basic "`` in the Authorization header). When omitted, the
            credential historically embedded in this notebook is used.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            message includes the response text, which holds the error details.
    """
    if auth_token is None:
        # SECURITY: legacy embedded credential, kept only so existing calls keep
        # working. Prefer passing auth_token; this value should be rotated and removed.
        auth_token = "amFpbmExMzphSjExMTkqKg=="
    # str() lets callers pass the ID as an int; string concatenation would raise TypeError.
    endpoint = base_url + "/api/Biomaterial/" + str(bm_id)
    headers = {"Authorization": "Basic " + auth_token}
    # NOTE(review): these look like jQuery.ajax settings rather than API query
    # parameters. They are still sent in the query string exactly as before --
    # confirm whether the server needs them.
    query = {"xhrFields": {"withCredentials": "true"},
             "dataType": "json",
             "contentType": "application/json"}
    response = requests.request("GET", endpoint, headers=headers, params=query)
    if not response.ok:
        # On failure the body describes the error, not the biomaterial; surface
        # it instead of returning it as if it were record data.
        raise requests.HTTPError(
            "Labmatrix request failed: GET {} returned {} {}: {}".format(
                endpoint, response.status_code, response.reason, response.text),
            response=response)
    return json.loads(response.content)

# def create_subject(base_url, info_dict):
#     tool_url = "/api/Subject"
#     request_type = "PUT"
#     myurl = base_url + tool_url
#     myheaders = { "Authorization": "Basic amFpbmExMzphSjExMTkqKg=="}
#     myparams = {"xhrFields": {"withCredentials": "true"},
#                 "dataType": "json",
#                 "contentType": "application/json"}
#     myrequest=requests.request(request_type, myurl, headers=myheaders, params=myparams, json=info_dict)
#     ## Prob needs error checking here to see if response was sucessful
#     ## if (myrequest.ok): ...  ### See docs on Response object: https://requests.readthedocs.io/en/latest/api/#requests.Response
#     ## Cuz 'content' will contain error details instead of study info
#     return(json.loads(myrequest.content))
# def update_subject(base_url, subject_id, info_dict):
#     tool_url = "/api/Subject/"
#     request_type = "POST"
#     myurl = base_url + tool_url + subject_id
#     myheaders = { "Authorization": "Basic amFpbmExMzphSjExMTkqKg=="}
#     myparams = {"xhrFields": {"withCredentials": "true"},
#                 "dataType": "json",
#                 "contentType": "application/json"}
#     # mydata = { "identifier": { "name": study_name } }
#     myrequest=requests.request(request_type, myurl, headers=myheaders, params=myparams, json=info_dict)
#     ## Prob needs error checking here to see if response was sucessful
#     ## if (myrequest.ok): ...  ### See docs on Response object: https://requests.readthedocs.io/en/latest/api/#requests.Response
#     ## Cuz 'content' will contain error details instead of study info
#     return(json.loads(myrequest.content))

In [None]:
# Example calls: look up one study by name and one biomaterial by ID,
# printing each decoded response.
my_study = "20C0006"
print("Fetching data for Study {}".format(my_study))
studymeta = get_study_metadata(base_url, my_study)
print(studymeta)

bm_id = "2744744"
print("Fetching data for Biomaterial {}".format(bm_id))
bm_data = get_biomaterial_details(base_url, bm_id)
print(bm_data)

In [3]:
#https://ccrod.cancer.gov/confluence/display/CCRCRO/Clinical+Informatics
def get_query_results(base_url, info_dict, auth_token=None):
    """Run a Qiagram query on Labmatrix and return its decoded result.

    Sends ``POST <base_url>/api/qiagram/queryResults`` with ``info_dict`` as
    the JSON body.

    Args:
        base_url: Root URL of the Labmatrix server, without a trailing slash.
        info_dict: JSON-serialisable request body; this notebook passes
            ``{"id": <query id>}``.
        auth_token: Optional HTTP Basic credential (the text that follows
            ``"Basic "`` in the Authorization header). When omitted, the
            credential historically embedded in this notebook is used.

    Returns:
        The response body decoded from JSON. Callers in this notebook read
        its ``"data"`` and ``"columns"`` entries.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            message includes the response text, which holds the error details.
    """
    if auth_token is None:
        # SECURITY: legacy embedded credential, kept only so existing calls keep
        # working. Prefer passing auth_token; this value should be rotated and removed.
        auth_token = "amFpbmExMzphSjExMTkqKg=="
    endpoint = base_url + "/api/qiagram/queryResults"
    headers = {"Authorization": "Basic " + auth_token}
    # NOTE(review): these look like jQuery.ajax settings rather than API query
    # parameters. They are still sent in the query string exactly as before --
    # confirm whether the server needs them.
    query = {"xhrFields": {"withCredentials": "true"},
             "dataType": "json",
             "contentType": "application/json"}
    response = requests.request("POST", endpoint, headers=headers, params=query, json=info_dict)
    if not response.ok:
        # On failure the body describes the error, not the query result; surface
        # it instead of letting callers index into an error payload.
        raise requests.HTTPError(
            "Labmatrix request failed: POST {} returned {} {}: {}".format(
                endpoint, response.status_code, response.reason, response.text),
            response=response)
    return json.loads(response.content)

def read_biomaterial_id(base_url, biomaterial_id, auth_token=None):
    """Read a biomaterial record from Labmatrix, identified by its ID.

    Sends ``POST <base_url>/api/Biomaterial/read`` with the JSON body
    ``{"identifier": {"id": biomaterial_id}}``.

    Args:
        base_url: Root URL of the Labmatrix server, without a trailing slash.
        biomaterial_id: Value placed under ``identifier.id`` in the request body.
        auth_token: Optional HTTP Basic credential (the text that follows
            ``"Basic "`` in the Authorization header). When omitted, the
            credential historically embedded in this notebook is used.

    Returns:
        The response body decoded from JSON. Callers in this notebook read
        its ``"name"`` and ``"forms"`` entries.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            message includes the response text, which holds the error details.
    """
    if auth_token is None:
        # SECURITY: legacy embedded credential, kept only so existing calls keep
        # working. Prefer passing auth_token; this value should be rotated and removed.
        auth_token = "amFpbmExMzphSjExMTkqKg=="
    endpoint = base_url + "/api/Biomaterial/read"
    headers = {"Authorization": "Basic " + auth_token}
    # NOTE(review): these look like jQuery.ajax settings rather than API query
    # parameters. They are still sent in the query string exactly as before --
    # confirm whether the server needs them.
    query = {"xhrFields": {"withCredentials": "true"},
             "dataType": "json",
             "contentType": "application/json"}
    payload = {"identifier": {"id": biomaterial_id}}
    response = requests.request("POST", endpoint, headers=headers, params=query, json=payload)
    if not response.ok:
        # On failure the body describes the error, not the biomaterial; surface
        # it instead of letting callers index into an error payload.
        raise requests.HTTPError(
            "Labmatrix request failed: POST {} returned {} {}: {}".format(
                endpoint, response.status_code, response.reason, response.text),
            response=response)
    return json.loads(response.content)

def read_subject_code(base_url, subject_code, auth_token=None):
    """Read a subject record from Labmatrix, identified by its subject code.

    Sends ``POST <base_url>/api/Subject/read`` with the JSON body
    ``{"identifier": {"code": subject_code}}``.

    Args:
        base_url: Root URL of the Labmatrix server, without a trailing slash.
        subject_code: Value placed under ``identifier.code`` in the request
            body (this notebook passes codes such as ``"RT00014"``).
        auth_token: Optional HTTP Basic credential (the text that follows
            ``"Basic "`` in the Authorization header). When omitted, the
            credential historically embedded in this notebook is used.

    Returns:
        The response body decoded from JSON. Callers in this notebook read
        its ``"forms"`` entry.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            message includes the response text, which holds the error details.
    """
    if auth_token is None:
        # SECURITY: legacy embedded credential, kept only so existing calls keep
        # working. Prefer passing auth_token; this value should be rotated and removed.
        auth_token = "amFpbmExMzphSjExMTkqKg=="
    endpoint = base_url + "/api/Subject/read"
    headers = {"Authorization": "Basic " + auth_token}
    # NOTE(review): these look like jQuery.ajax settings rather than API query
    # parameters. They are still sent in the query string exactly as before --
    # confirm whether the server needs them.
    query = {"xhrFields": {"withCredentials": "true"},
             "dataType": "json",
             "contentType": "application/json"}
    payload = {"identifier": {"code": subject_code}}
    response = requests.request("POST", endpoint, headers=headers, params=query, json=payload)
    if not response.ok:
        # On failure the body describes the error, not the subject; surface it
        # instead of letting callers index into an error payload.
        raise requests.HTTPError(
            "Labmatrix request failed: POST {} returned {} {}: {}".format(
                endpoint, response.status_code, response.reason, response.text),
            response=response)
    return json.loads(response.content)

def read_subject_id(base_url, subject_id, auth_token=None):
    """Read a subject record from Labmatrix, identified by its subject ID.

    Sends ``POST <base_url>/api/Subject/read`` with the JSON body
    ``{"identifier": {"id": subject_id}}``.

    Args:
        base_url: Root URL of the Labmatrix server, without a trailing slash.
        subject_id: Value placed under ``identifier.id`` in the request body.
        auth_token: Optional HTTP Basic credential (the text that follows
            ``"Basic "`` in the Authorization header). When omitted, the
            credential historically embedded in this notebook is used.

    Returns:
        The response body decoded from JSON. Callers in this notebook read
        its ``"forms"`` entry.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            message includes the response text, which holds the error details.
    """
    if auth_token is None:
        # SECURITY: legacy embedded credential, kept only so existing calls keep
        # working. Prefer passing auth_token; this value should be rotated and removed.
        auth_token = "amFpbmExMzphSjExMTkqKg=="
    endpoint = base_url + "/api/Subject/read"
    headers = {"Authorization": "Basic " + auth_token}
    # NOTE(review): these look like jQuery.ajax settings rather than API query
    # parameters. They are still sent in the query string exactly as before --
    # confirm whether the server needs them.
    query = {"xhrFields": {"withCredentials": "true"},
             "dataType": "json",
             "contentType": "application/json"}
    payload = {"identifier": {"id": subject_id}}
    response = requests.request("POST", endpoint, headers=headers, params=query, json=payload)
    if not response.ok:
        # On failure the body describes the error, not the subject; surface it
        # instead of letting callers index into an error payload.
        raise requests.HTTPError(
            "Labmatrix request failed: POST {} returned {} {}: {}".format(
                endpoint, response.status_code, response.reason, response.text),
            response=response)
    return json.loads(response.content)

In [163]:
## Run Qiagram query 5317 and keep only the rows whose biomaterial type is "tumor RNA".
features = pd.read_csv("~/myPART/features.txt", sep="\t")
patientsCodes = features["RTNo"].tolist()

info_dict = {"id": 5317}
t = get_query_results(base_url, info_dict)

# Column labels are the keys of the response's "columns" mapping.
table_data = pd.DataFrame(t["data"], columns=list(t["columns"]))

# Alternative filters used during exploration:
#   table_data['Biomaterial ID - Biomaterials'].isin(['3249387'])
#   table_data['Subject Code - Biomaterials'].isin(patientsCodes)
is_tumor_rna = table_data['Type - Biomaterials'].isin(['tumor RNA'])
t1 = table_data[is_tumor_rna]

In [164]:
### Extract the pathology ID and anatomical site from each source biomaterial's forms.
# One Labmatrix request is made per row of t1 (the filtered Qiagram result).
biomaterial_records = []
for _, row in t1.iterrows():
    data = read_biomaterial_id(base_url, row["Source Biomaterial ID - Biomaterials"])
    biomaterial_records.append({
        'Patient': row["Subject Code - *Subject Data"],
        # The pathology ID is the third "- "-separated piece of the biomaterial name.
        'Pathology_Id': data['name'].split("- ")[2].strip(),
        'Anatomical_Site': data['forms']['Biomaterial Details'][0]['fields']['Anatomy/Cell Type'],
    })

# Build the frame once. DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every iteration. dtype=object keeps values exactly as received.
biomaterialDF = pd.DataFrame(
    biomaterial_records,
    columns=['Patient', 'Pathology_Id', 'Anatomical_Site'],
    dtype=object,
)

In [166]:
# Reduce each pathology ID to the text before its first space, then preview the table.
first_tokens = []
for raw_id in biomaterialDF['Pathology_Id'].tolist():
    first_tokens.append(raw_id.split(" ")[0].strip())
biomaterialDF['Pathology_Id'] = first_tokens
biomaterialDF.head()

Unnamed: 0,Patient,Pathology_Id,Anatomical_Site
0,RT00001,OM-18-211,"arm, left, metastasis"
1,RT00002,SS-19-898,"testicle and spermatic cord, right, metastasis"
2,RT00003,OM-19-58,"adrenal gland, right"
3,RT00006,MD-19-2216,"kidney/adrenal, left"
4,RT00007,GS-19-2048-B1,clivus


In [68]:
# Fetch a single biomaterial record by ID and keep the decoded response in bm_data.
bm_id = "3343160"
print("Fetching data for Biomaterial {}".format(bm_id))
bm_data = get_biomaterial_details(base_url, bm_id)

Fetching data for Biomaterial 3343160


In [85]:
# Rebinds my_study (previously "20C0006") to a study name.
# NOTE(review): no later cell in the visible notebook reads my_study -- confirm it is still needed.
my_study = "POB Rare Solid Tumor NatHx"

In [None]:
### Extract tumor pathology entries (source, pathology ID, anatomical site) from the Subject forms.
tissueLocation={}  # not read anywhere in the visible notebook; kept so the global still exists
features = pd.read_csv("~/myPART/features.txt",sep="\t")
patientsCodes = features["RTNo"].tolist()

pathology_records = []
# str(x) != 'nan' drops blank RTNo cells, which pandas reads as NaN.
for subjectCode in [x for x in patientsCodes if str(x) != 'nan']:
    studymeta = read_subject_code(base_url, subjectCode)
    # A subject without the 'MyPART Tumor Pathology' form contributes no rows
    # instead of aborting the whole loop with a KeyError.
    for pathReport in studymeta['forms'].get('MyPART Tumor Pathology', []):
        fields = pathReport['fields']
        if 'pathology ID' in fields:
            # Keep the first space-separated token; for "prefix:ID" keep the part after the colon.
            pathID = fields['pathology ID'].strip().split(" ")[0]
            pathID = pathID if len(pathID.split(":")) < 2 else pathID.split(":")[1].strip()
        else:
            pathID = None
        pathology_records.append({
            'Patient': subjectCode,
            'Source': fields.get('current path report source'),
            'Pathology_Id': pathID,
            # Optional like the other fields: missing values become None.
            'Anatomical_Site': fields.get('anatomical location'),
        })

# Build the frame once. DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every iteration. dtype=object keeps values exactly as received.
df = pd.DataFrame(
    pathology_records,
    columns=['Patient', 'Source', 'Pathology_Id', 'Anatomical_Site'],
    dtype=object,
)

In [37]:
### Extract the TSO500 "Tumor %" value from the Subject forms.
tissueLocation={}  # not read anywhere in the visible notebook; kept so the global still exists
features = pd.read_csv("~/myPART/WGSData/featuresWGS.txt",sep="\t")
patientsCodes = features["RTNo"].tolist()
# Debugging aid: uncomment to restrict the run to a single subject. Leaving this
# active discarded the list read from the features file above.
# patientsCodes = ["RT00190"]

purity_records = []
# str(x) != 'nan' drops blank RTNo cells, which pandas reads as NaN.
for subjectCode in [x for x in patientsCodes if str(x) != 'nan']:
    studymeta = read_subject_code(base_url, subjectCode)
    print(subjectCode)
    if 'MyPART TSO 500' in studymeta['forms']:
        # One row per TSO 500 form entry; entries without a 'Tumor %' field get None.
        for pathReport in studymeta['forms']['MyPART TSO 500']:
            tumorPurity = pathReport['fields'].get('Tumor %')
            purity_records.append({'Patient': subjectCode, 'Tumor %': tumorPurity})
    else:
        # Subjects without the form still get a placeholder row.
        purity_records.append({'Patient': subjectCode, 'Tumor %': None})

# Build the frame once. DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every iteration. dtype=object keeps values exactly as received
# (no int -> float coercion when some rows are None).
df = pd.DataFrame(purity_records, columns=['Patient', 'Tumor %'], dtype=object)

RT00190
20
RT00022
90
RT00032
70
RT00036
75
RT00043
90
RT00050
70
RT00064
70
RT00088
90
80
RT00106
50
RT00114
60
RT00127
85
RT00133
80
RT00137
80
RT00138
90
RT00160
90
RT00162
70
RT00183
70
60
RT00053
80
RT00142
70
RT00014
60
RT00038
60
RT00161
35
RT00024
80
RT00033
90
RT00039
80
RT00045
85
RT00056
80
RT00060
70
RT00062
90
RT00063
70
RT00066
80
80
RT00095
80
RT00105
85
RT00143
70
RT00148
100
RT00172
80
90
RT00179
60
None
RT00159
90
RT00166
80
RT00101
60
RT00115
80
RT00128
90
RT00132
60
RT00171
RT00094
60
RT00041
65
RT00076
80
RT00083
80
RT00084
50
RT00085
80
RT00086
90
RT00108
60
RT00110
40
RT00113
80
RT00118
RT00175
20
RT00178
60
RT00182
85
RT00185
70
RT00047
70
RT00147
60
RT00144
80
RT00091
80
RT00116
60
RT00164
65
RT00163
80
RT00130
70
RT00125
70
RT00006
70


In [29]:
### Print the CLIA germline mutations recorded on each subject's Genetic Consult form.
# This cell only prints; it deliberately does not rebind `df`. The earlier
# copy-pasted `df = pd.DataFrame(columns=['Patient', 'Tumor %'])` replaced the
# tumor-purity table with an empty frame before a later cell wrote it to CSV.
tissueLocation={}  # not read anywhere in the visible notebook; kept so the global still exists
features = pd.read_csv("~/myPART/WGSData/featuresWGS.txt",sep="\t")
patientsCodes = features["RTNo"].tolist()
# Debugging aid: uncomment to restrict the run to a single subject.
# patientsCodes = ["RT00190"]

# str(x) != 'nan' drops blank RTNo cells, which pandas reads as NaN.
for subjectCode in [x for x in patientsCodes if str(x) != 'nan']:
    studymeta = read_subject_code(base_url, subjectCode)
    print(subjectCode)
    if 'MyPART Genetic Consult' in studymeta['forms']:
        for pathReport in studymeta['forms']['MyPART Genetic Consult']:
            # Entries without this child field print nothing.
            if 'patient CLIA germline mutations' in pathReport['childFields']:
                print(pathReport['childFields']['patient CLIA germline mutations'])
    else:
        print("No Report")

RT00190
No Report
RT00022
No Report
RT00032
[]
RT00036
[{'id': 8, 'fields': {'gene': 'NTHL1', 'mode of inheritance': 'autosomal dominant', 'variant': 'c.856G>A', 'classification': 'VUS'}}]
RT00043
No Report
RT00050
No Report
RT00064
[{'id': 21, 'fields': {'gene': 'SDHB', 'disease': 'PGL/PHEO', 'mode of inheritance': 'autosomal dominant', 'variant': 'c.380C>T', 'zygosity': 'heterozygous', 'inherited from': 'father', 'classification': 'pathogenic'}}]
RT00088
No Report
RT00106
No Report
RT00114
[]
RT00127
No Report
RT00133
No Report
RT00137
No Report
RT00138
No Report
RT00160
No Report
RT00162
No Report
RT00183
[{'id': 42, 'fields': {'gene': 'APC', 'disease': 'FAP', 'mode of inheritance': 'autosomal dominant', 'classification': 'pathogenic'}}]
RT00053
No Report
RT00142
No Report
RT00014
[{'id': 41, 'fields': {'gene': 'FBN2', 'disease': 'contractural arachnodactyly', 'mode of inheritance': 'autosomal dominant', 'variant': 'c.6948 C>G', 'zygosity': 'heterozygous', 'classification': 'VUS'}}]

In [None]:
# Match subject pathology entries to biomaterials on their pathology ID and save the result.
# NOTE(review): `df` must be the pathology table (the one with a 'Pathology_Id'
# column). Several other cells rebind `df`, so run this right after the cell that
# builds it from the 'MyPART Tumor Pathology' forms.
# The merge is computed once and reused; previously it ran twice and the first
# result was discarded.
pathology_biomaterial_match = pd.merge(df, biomaterialDF, on=["Pathology_Id"])
pathology_biomaterial_match.to_csv('~/patientBiomaterialIDMatch.csv', index=False,sep=",")
pathology_biomaterial_match.head()

In [None]:
# List the subject pathology IDs that contain the first biomaterial pathology ID.
# re.search() scans a single string, so each ID is tested individually; passing
# the whole list raised TypeError. re.escape() treats the ID as literal text, and
# non-string entries (None/NaN for reports without an ID) are skipped.
# Set-intersection alternative for exact matches:
#   list(set(biomaterialDF['Pathology_Id'].tolist()) & set(df['Pathology_Id'].tolist()))
first_biomaterial_id = biomaterialDF['Pathology_Id'].tolist()[0]
[path_id for path_id in df['Pathology_Id'].tolist()
 if isinstance(path_id, str) and re.search(re.escape(first_biomaterial_id), path_id)]

In [27]:
# Fetch the record for subject code "RT00014" so its forms can be inspected in the next cell.
studymeta = read_subject_code(base_url, "RT00014")

In [28]:
# Display the child fields of the first 'MyPART Genetic Consult' form entry of the subject in studymeta.
studymeta['forms']['MyPART Genetic Consult'][0]['childFields']

{'patient phenotypes': [{'id': 17, 'fields': {'phenotype': 'Arachnodactyly'}},
  {'id': 18, 'fields': {'phenotype': 'Translucent Skin'}},
  {'id': 19, 'fields': {'phenotype': 'Short philtrum'}},
  {'id': 20, 'fields': {'phenotype': 'Clinodactyly of the 5th finger'}},
  {'id': 21, 'fields': {'phenotype': 'Hyperflexibility'}},
  {'id': 22, 'fields': {'phenotype': 'Pectus excavatum'}},
  {'id': 23, 'fields': {'phenotype': 'Low set ears'}},
  {'id': 24, 'fields': {'phenotype': 'Macrocephaly'}},
  {'id': 25, 'fields': {'phenotype': 'OC=10'}}],
 'patient conditions': [],
 'patient relative phenotypes': [],
 'patient relative conditions': [],
 'patient CLIA germline mutations': [{'id': 41,
   'fields': {'gene': 'FBN2',
    'disease': 'contractural arachnodactyly',
    'mode of inheritance': 'autosomal dominant',
    'variant': 'c.6948 C>G',
    'zygosity': 'heterozygous',
    'classification': 'VUS'}}],
 'additional patient research germline mutations': []}

In [108]:
# Fetch biomaterial "1957756" and display the decoded response.
# NOTE(review): the saved output for this cell is a single string, while this
# cell displays the whole `data` object -- the output looks stale; presumably it
# came from an earlier version of the cell that indexed into `data`.
data=read_biomaterial_id(base_url,"1957756")
data#'MyPART Biomaterial'

'kidney/adrenal, left'

In [43]:
# Join the per-subject table in `df` to the features table (Patient == RTNo) and save it as CSV.
purity_with_features = pd.merge(
    df,
    features,
    left_on=["Patient"],
    right_on=["RTNo"],
)
purity_with_features.to_csv(
    '~/myPART/WGSData/sampleTumorPurityTS500.csv',
    index=False,
    sep=",",
)

In [206]:
## Extract physicals of MyPART patients: run Qiagram query 6067 and keep subjects aged 20 or younger.
info_dict = {"id": 6067}
t = get_query_results(base_url, info_dict)

# Column labels are the keys of the response's "columns" mapping.
table_data = pd.DataFrame(t["data"], columns=list(t["columns"]))

# Age is converted to numbers before it is compared.
table_data['Age'] = pd.to_numeric(table_data['Age'])
age_at_most_20 = table_data['Age'] <= 20
t1 = table_data[age_at_most_20]

In [None]:
# Collect age, height and weight from every 'MyPART Physical Exam' form entry
# of each subject in t1, together with the subject's code and sex from t1.
exam_records = []
patientsCodes = t1["Subject ID"].tolist()
# str(x) != 'nan' drops missing subject IDs.
for subjectCode in [x for x in patientsCodes if str(x) != 'nan']:
    studymeta = read_subject_id(base_url, subjectCode)
    # Despite its name, `gender` holds the full t1 row(s) for this subject; the
    # name is kept because a later cell displays it.
    gender = t1[t1['Subject ID'] == subjectCode]
    print(subjectCode)
    if 'MyPART Physical Exam' in studymeta['forms']:
        for pathReport in studymeta['forms']['MyPART Physical Exam']:
            fields = pathReport['fields']
            exam_records.append({
                'Patient': gender['Subject Code'].tolist()[0],
                'Gender': gender['Sex'].tolist()[0],
                # Form fields may be absent (other cells guard for this);
                # missing values become None instead of raising KeyError.
                'Age': fields.get('age'),
                'Height': fields.get('height'),
                'Weight': fields.get('body weight'),
            })

# Build the frame once. DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every iteration. dtype=object keeps values exactly as received.
df = pd.DataFrame(
    exam_records,
    columns=['Patient', 'Gender', 'Age', 'Height', 'Weight'],
    dtype=object,
)

In [261]:
# Save the current `df` as CSV, without the index column.
# NOTE(review): `df` is rebound by several cells; presumably this expects the
# physical-exam table (Patient/Gender/Age/Height/Weight) -- run it right after that cell.
df.to_csv('~/myPART/patientsPhysiology.csv', index=False)

In [258]:
# Display `gender`: the physical-exam loop assigns it the t1 row(s) of the subject
# being processed, so here it holds whatever the last loop iteration left behind.
gender

Unnamed: 0,Subject ID,Subject Code,Age,Date of Birth,Date of Death,Race(s),Sex,Ethnicity,Study Code,Family Code,Family Name,Family ID
