In [4]:
import os
import subprocess
import requests
import json
import sys
import pandas as pd

import ssl
# Unless PYTHONHTTPSVERIFY is set to a non-empty value, replace ssl's default
# HTTPS context factory with the unverified one (when this Python provides it).
# NOTE(review): this presumably switches off certificate verification for
# stdlib HTTPS clients process-wide -- confirm it is still required, since the
# transfers in this notebook shell out to curl.
if not os.environ.get('PYTHONHTTPSVERIFY', ''):
    if getattr(ssl, '_create_unverified_context', None):
        ssl._create_default_https_context = ssl._create_unverified_context





#from tcga downloader import *
#ids=process_manifests(manifest_file)

'''
Match tumor and normal samples to a patient through the GDC API. The query returns metadata
for the associated files, including sample type (tumor/normal), patient ID (TCGA barcode), etc.
The general query structure will look like this:
curl "https://gdc-api.nci.nih.gov/files?size=100000&format=tsv&filters=XXXX&fields=YYYY"
filters= :
The XXXX will be replaced with a URL-encoded JSON query that will filter a list of files. The JSON query can be generated yourself and then URL encoded, or the filtering can be performed on the GDC Data Portal (https://gdc-portal.nci.nih.gov/search/s?facetTab=files). You can copy and paste the string that appears where the * are displayed in the following:
https://gdc-portal.nci.nih.gov/search/f?filters=**********&facetTab=cases
'''

def prepare_payload(ids,data_type=None,payloadfile='payloadv3.txt'):
    """Write the JSON request body for a GDC ``/files`` metadata query.

    Parameters
    ----------
    ids : sequence of str
        File UUIDs. Entries may be bare or already wrapped in double quotes
        (the form returned by ``get_ids``); surrounding quotes are removed
        before serialisation so each id is quoted exactly once.
    data_type : str or None
        If given, only files whose ``files.data_type`` equals this value are
        requested (e.g. ``'Gene Expression Quantification'``). If ``None``
        the data-type clause is left out instead of filtering on the literal
        string ``"None"``.
    payloadfile : str
        Path the payload is written to.

    Returns
    -------
    str
        ``payloadfile``.
    """
    # Other facets the original author noted but never applied as filters:
    #   Workflow Type = HTSeq-Counts, Data Category = Transcriptome Profiling,
    #   Experimental Strategy = RNA-Seq.

    file_ids=[str(i).strip().strip('"') for i in ids]

    conditions=[{
        "op":"in",
        "content":{"field":"files.file_id","value":file_ids}
    }]
    if data_type is not None:
        conditions.append({
            "op":"=",
            "content":{"field":"files.data_type","value":data_type}
        })

    payload={
        "filters":{"op":"and","content":conditions},
        "format":"TSV",
        "fields":"file_id,file_name,cases.submitter_id,cases.disease_type,cases.case_id,data_category,data_type,cases.samples.tumor_descriptor,cases.samples.tissue_type,cases.samples.sample_type,cases.samples.submitter_id,cases.samples.sample_id,cases.samples.portions.analytes.aliquots.aliquot_id,cases.samples.portions.analytes.aliquots.submitter_id",
        # size is sent as a string, as in the original hand-written payload
        "size":"%d"%len(file_ids)
    }

    # json.dump takes care of quoting/escaping that the %-formatted template could not
    with open(payloadfile,'w') as output_:
        json.dump(payload,output_,indent=4)
    return payloadfile


def get_ids(manifest):
    """Read the file ids from the first column of a tab-separated manifest.

    The first line is treated as a header and dropped. Lines whose first
    column is empty (for example blank lines) are ignored.

    Parameters
    ----------
    manifest : str
        Path to the manifest file.

    Returns
    -------
    list of str or None
        Each id wrapped in double quotes, in file order. ``None`` if the
        manifest could not be opened or decoded; the reason is written to
        stderr rather than being silently discarded.
    """
    try:
        with open(manifest,'r') as input_:
            next(input_,None)  # drop the header row
            first_columns=[line.rstrip('\n').split('\t')[0].strip() for line in input_]
    except (IOError,OSError,UnicodeError) as ex:
        sys.stderr.write('Could not read manifest %s: %s\n'%(manifest,ex))
        return None

    return ['"%s"'%file_id for file_id in first_columns if file_id]


def get_metadata(payloadfile,metadata='Metadata.tsv'):
    """POST a payload file to the GDC ``/files`` endpoint and save the reply.

    Parameters
    ----------
    payloadfile : str
        Path of the JSON request body (as written by ``prepare_payload``).
    metadata : str
        Path the response body is written to.

    Returns
    -------
    str
        ``metadata``.

    Raises
    ------
    RuntimeError
        If curl exits with a non-zero status, so a failed request is not
        mistaken for a valid metadata table.
    OSError
        If curl cannot be started or ``metadata`` cannot be opened.
    """
    webaddr='https://api.gdc.cancer.gov/files'
    # Argument list, no shell: paths with spaces or shell metacharacters are safe
    args=['curl','--request','POST','--header','Content-Type: application/json',
          '--data','@%s'%payloadfile,webaddr]

    # Echo the equivalent shell command, as the notebook has always done
    print('curl --request POST --header "Content-Type: application/json" --data @%s \'%s\' > %s'
          %(payloadfile,webaddr,metadata))

    with open(metadata,'wb') as sink:
        status=subprocess.call(args,stdout=sink)
    if status!=0:
        raise RuntimeError('curl exited with status %d while querying %s'%(status,webaddr))
    return metadata


#def get_metadatada():
 #   ids=process_manifest()
  #  if ids==None:
   #     print('Error encountered\nPlease ensure that you are using the correct manifest file')
   # else:
   #     payloadfile=prepare_payload(ids)
   #     download_data(payloadfile)

def download_data(metadatafile,sep='\t',outdir='downloads'):
    """Download every file listed in a GDC metadata table, grouped by sample type.

    Parameters
    ----------
    metadatafile : str
        Delimited table containing at least the ``file_id`` and
        ``cases.0.samples.0.sample_type`` columns.
    sep : str
        Column separator of ``metadatafile``.
    outdir : str
        Root output directory; one sub-directory per sample type is created
        beneath it.

    Notes
    -----
    Rows are grouped by the exact sample-type value, so a type that is a
    substring of another no longer pulls in the other type's files. Rows with
    a missing sample type are skipped. Each file is fetched with
    ``curl --remote-name --remote-header-name`` run inside its sample-type
    directory; the process working directory is never changed. Failed
    downloads are listed on stderr and do not stop the remaining ones.
    """
    sample_col='cases.0.samples.0.sample_type'
    data_df=pd.read_csv(metadatafile,sep=sep)

    if os.path.exists(outdir):
        print('output directory exists\ndata may be overwritten')
    else:
        os.makedirs(outdir)

    failed=[]
    # sort=False keeps the sample types in order of first appearance
    for sampletype,sel in data_df.groupby(sample_col,sort=False):
        sampledir=os.path.join(outdir,str(sampletype))

        if os.path.exists(sampledir):
            print('sample type directory exists\ndata may be overwritten')
        else:
            os.mkdir(sampledir)

        for file_id in sel['file_id']:
            print('downloading %s'%file_id)
            cmd=['curl','--remote-name','--remote-header-name',
                 'https://api.gdc.cancer.gov/data/%s'%file_id]
            # cwd= makes curl write into sampledir without an os.chdir that
            # would be left dangling if anything raised mid-loop
            if subprocess.call(cmd,cwd=sampledir)!=0:
                failed.append(str(file_id))

    if failed:
        sys.stderr.write('%d download(s) failed: %s\n'%(len(failed),', '.join(failed)))
    print('Download complete\nAll data has been downloaded to ------------->%s'%outdir)
        
        
    

In [None]:
# Driver cell: manifest -> file ids -> GDC metadata query -> per-sample-type downloads.
ids = get_ids("data/manifest.txt")
#ids = ['"0389c08d-927d-4254-8d7f-f44963594db3"', '"048f2967-e1e6-43c4-831f-178df5ddf660"']
# get_ids returns None when the manifest cannot be read; stop with a clear
# message instead of an opaque TypeError from len(None).
if ids is None:
    raise RuntimeError('could not read manifest data/manifest.txt')
print(len(ids))
payload = prepare_payload(ids, data_type = 'Gene Expression Quantification')
metadata = get_metadata(payload)
download_data(metadata,sep='\t',outdir='BRCA')

1222
curl --request POST --header "Content-Type: application/json" --data @payloadv3.txt 'https://api.gdc.cancer.gov/files' > Metadata.tsv
downloading 7b49680e-d7a7-4b6c-8763-5d0d26c8bc61
downloading cefaebf8-5419-4ddd-8dc1-e2867d584ce0
downloading 0c9bc998-17a7-42cf-b4d2-b8a5df20ce8b
downloading 078f7608-b8f7-4827-b752-adfed6064922
downloading f821249b-0738-49a8-89e6-4756e894d000
downloading a93597df-f552-4b43-9985-3a0955b03a1a
downloading 96463113-d20c-405e-b2e8-56eed3faf201
downloading 298e02ea-bc72-48a5-8bfa-af556b7f7487
downloading 12f2970e-36d3-483d-9cd0-cc34df10701f
downloading d86f17a9-0ee2-4bd1-993d-f23ba7fe3a26
downloading 63434097-5ed0-435b-9df5-940154e4a72a
downloading 808080d5-ed28-4542-a30a-fe0169de2ef3
downloading 64d56e28-7e24-4fcc-8515-5808bf042d9d
downloading 6253a9c3-4b64-4339-b40b-43e40a511299
downloading 426bae83-15fe-4b2b-9e29-5d36c09a4361
downloading ecc6b7a3-d7af-4bac-a380-62475c1a9f2e
downloading e47ded12-ac91-4443-981a-94832d0b3593
downloading 628287b4-2a1e-47