In [None]:
import os
import json
import synapseclient
import pandas
import requests
import boto3
import nda_aws_token_generator
import logging

# Notebook-wide logger; later cells log per-file progress through it.
logger = logging.getLogger("main")
logger.setLevel(logging.DEBUG)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# The handler must be attached to the logger, otherwise the DEBUG messages
# emitted later in the notebook have no handler to go through.
# logging.getLogger returns the same object on every call, so drop any plain
# console handler left over from a previous run of this cell first; otherwise
# each re-run would add one more copy of every message.
for _stale_handler in [h for h in logger.handlers if type(h) is logging.StreamHandler]:
    logger.removeHandler(_stale_handler)
logger.addHandler(ch)

# ---------------------------------------------------------------------------
# NDA configuration
# ---------------------------------------------------------------------------

# GUID whose records are requested from the NDA API in the cells below.
REFERENCE_GUID = 'NDAR_INVRT663MBL'

# This is an old genomics subject
EXCLUDE_GENOMICS_SUBJECTS = ('92027', )

# Experiment ids to leave out of the samples table.
# Previously: EXCLUDE_EXPERIMENTS = ('534', '535')
EXCLUDE_EXPERIMENTS = ()

# Columns (and their order) kept in the final metadata table.
metadata_columns = [
    'src_subject_id',
    'experiment_id',
    'subjectkey',
    'sample_id_original',
    'sample_id_biorepository',
    'subject_sample_id_original',
    'biorepository',
    'subject_biorepository',
    'sample_description',
    'species',
    'site',
    'sex',
    'sample_amount',
    'phenotype',
    'comments_misc',
]

# S3 bucket that holds the data files and their .manifest listings.
nda_bucket_name = 'nda-bsmn'

# ---------------------------------------------------------------------------
# Synapse configuration
# ---------------------------------------------------------------------------

# Synapse folder that receives the File entities.
synapse_data_folder = 'syn7872188'
# Same id without the 'syn' prefix, as an integer.
synapse_data_folder_id = int(synapse_data_folder.replace('syn', ''))
# Storage location id sent with every S3 file handle that is created.
storage_location_id = '9209'

# Content type reported for a file handle, keyed by file extension.
content_type_dict = {
    '.gz': 'application/x-gzip',
    '.bam': 'application/octet-stream',
    '.zip': 'application/zip',
}

# Credential configuration for NDA

In [None]:
# Read the JSON configuration object from S3 using the default AWS credentials.
s3 = boto3.resource("s3")
obj = s3.Object('kdaily-lambda-creds.sagebase.org', 'ndalogs_config.json')
config_body = obj.get()['Body']
config = json.loads(config_body.read())

# The 'nda' section supplies the username/password used for the token request
# here and for the HTTP basic auth calls in later cells.
ndaconfig = config['nda']

# Exchange the NDA username/password for a token carrying AWS credentials.
tokengenerator = nda_aws_token_generator.NDATokenGenerator()
mytoken = tokengenerator.generate_token(ndaconfig['username'], ndaconfig['password'])

# Build a separate boto3 session from the token's credentials.
nda_credentials = {
    'aws_access_key_id': mytoken.access_key,
    'aws_secret_access_key': mytoken.secret_key,
    'aws_session_token': mytoken.session,
}
session = boto3.Session(**nda_credentials)

# S3 resource bound to the token-based session; used to read the NDA bucket.
s3_nda = session.resource("s3")

# Get Samples

Use the NDA api to get the `genomics_sample03` records for this GUID.

In [None]:
# Request the genomics_sample03 records for the reference GUID as JSON.
r = requests.get("https://ndar.nih.gov/api/guid/{}/data?short_name=genomics_sample03".format(REFERENCE_GUID),
                 auth=requests.auth.HTTPBasicAuth(ndaconfig['username'], ndaconfig['password']),
                 headers={'Accept': 'application/json'})

# Stop here on an HTTP error (for example rejected credentials) instead of
# failing later with a confusing KeyError while walking the payload.
r.raise_for_status()

guid_data = r.json()

In [None]:
# Get data files from samples. There are currently up to two files per row.


def _clean_data_file(value):
    """Normalise one raw data_file value so it matches the manifest paths.

    Drops the first character of the value (described in the original notes
    as "remove initial slash to match what is in manifest file"), then strips
    the CDATA wrapper fragments that are not part of the S3 path.
    """
    return str(value[1:]).replace("![CDATA[", "").replace("]]>", "")


# Each response row is a list of {'name': ..., 'value': ...} elements; turn
# every row into a single {name: value} record.
tmp = [{col['name']: col['value'] for col in row['dataElement']}
       for row in guid_data['age'][0]['dataStructureRow']]

samples = pandas.io.json.json_normalize(tmp)

# Lower-case the column names. A real list is built on purpose: map() is lazy
# on Python 3.
colnames_lower = [name.lower() for name in samples.columns.tolist()]
samples.columns = colnames_lower

# exclude some experiments
samples = samples[~samples.experiment_id.isin(EXCLUDE_EXPERIMENTS)]

# assign() returns a new frame, so the filtered selection above is never
# written to in place.
samples = samples.assign(species=samples.organism.replace(['Homo Sapiens'], ['Human']))

# Columns shared by both per-file views of a sample row.
shared_columns = ['src_subject_id', 'experiment_id', 'subjectkey', 'sample_id_original',
                  'sample_id_biorepository', 'organism', 'species', 'sample_amount',
                  'sample_unit', 'biorepository', 'comments_misc', 'site']

# One view per file slot, renamed to a common (data_file, fileFormat) pair.
# rename() without inplace avoids mutating a slice of `samples`.
samples1 = (samples[shared_columns + ['data_file1', 'data_file1_type']]
            .rename(columns={'data_file1': 'data_file', 'data_file1_type': 'fileFormat'}))

samples2 = (samples[shared_columns + ['data_file2', 'data_file2_type']]
            .rename(columns={'data_file2': 'data_file', 'data_file2_type': 'fileFormat'}))

samples3 = pandas.concat([samples1, samples2], ignore_index=True)

# Keep only rows that actually reference a file. The previous
# `samples3.filter(...)` call filtered column labels and discarded its result,
# so empty rows were only removed afterwards by comparing str(value) to 'nan',
# which let None through as the string 'None'.
samples3 = samples3[samples3.data_file.notnull()]

# Normalise the file format labels and the file paths in one step; assign()
# avoids chained in-place modification of a column.
samples3 = samples3.assign(
    fileFormat=samples3['fileFormat'].replace(['BAM', 'FASTQ'], ['bam', 'fastq']),
    data_file=samples3['data_file'].map(_clean_data_file))

samples3.to_csv("./samples3.csv")

# Get Subjects

Use the NDA api to get the `genomics_subject02` records for this GUID.

In [None]:
# Request the genomics_subject02 records for the reference GUID as JSON.
r = requests.get("https://ndar.nih.gov/api/guid/{}/data?short_name=genomics_subject02".format(REFERENCE_GUID),
                 auth=requests.auth.HTTPBasicAuth(ndaconfig['username'], ndaconfig['password']),
                 headers={'Accept': 'application/json'})

# Stop here on an HTTP error (for example rejected credentials) instead of
# failing later with a confusing KeyError while walking the payload.
r.raise_for_status()

subject_guid_data = r.json()

In [None]:
# Turn every response row (a list of name/value elements) into one flat record.
tmp_subject = [{col['name']: col['value'] for col in row['dataElement']}
               for row in subject_guid_data['age'][0]['dataStructureRow']]

subjects = pandas.io.json.json_normalize(tmp_subject)

# Drop excluded subjects; done before lower-casing, so the column is still
# addressed by its upper-case name here.
keep_subject = ~subjects.GENOMICS_SUBJECT02_ID.isin(EXCLUDE_GENOMICS_SUBJECTS)
subjects = subjects[keep_subject]

colnames_lower = [name.lower() for name in subjects.columns.tolist()]
subjects.columns = colnames_lower

# Add a spelled-out sex column plus subject_-prefixed copies of the sample id
# and biorepository, keep the subject-level columns, and collapse repeats.
subjects = (
    subjects
    .assign(sex=subjects.gender.replace(['M', 'F'], ['male', 'female']),
            subject_sample_id_original=subjects.sample_id_original,
            subject_biorepository=subjects.biorepository)
    [['src_subject_id', 'subjectkey', 'gender', 'race', 'phenotype',
      'subject_sample_id_original', 'sample_description', 'subject_biorepository', 'sex']]
    .drop_duplicates()
)

subjects.to_csv("./subjects.csv")

# Get Tissues

Use the NDA API to get the `nichd_btb02` records for this GUID.

In [None]:
# Request the nichd_btb02 (tissue) records for the reference GUID as JSON.
r = requests.get("https://ndar.nih.gov/api/guid/{}/data?short_name=nichd_btb02".format(REFERENCE_GUID),
                 auth=requests.auth.HTTPBasicAuth(ndaconfig['username'], ndaconfig['password']),
                 headers={'Accept': 'application/json'})

# Stop here on an HTTP error (for example rejected credentials) instead of
# failing below with a confusing KeyError while walking the payload.
r.raise_for_status()

btb_guid_data = r.json()

# Turn every response row (a list of name/value elements) into one flat record.
tmp_btb = [{col['name']: col['value'] for col in row['dataElement']}
           for row in btb_guid_data['age'][0]['dataStructureRow']]

btb = pandas.io.json.json_normalize(tmp_btb)

colnames_lower = [name.lower() for name in btb.columns.tolist()]
btb.columns = colnames_lower

# Remove the nichd_btb02_id column before de-duplicating the remaining columns.
btb = btb.drop('nichd_btb02_id', axis=1).drop_duplicates()

btb.to_csv('./btb.csv')

# Get Manifests

Get list of `.manifest` files from the NDA-BSMN bucket. Read them in and concatenate them, under the assumption that the files listed in the manifest are in the same directory as the manifest file itself.

In [None]:
# Walk every object in the NDA bucket and keep those whose key contains
# '.manifest' anywhere in it.
bucket = s3_nda.Bucket(nda_bucket_name)
manifests = [entry for entry in bucket.objects.all() if '.manifest' in entry.key]

In [None]:
# Read every manifest (tab-separated: filename, md5, size; no header row) and
# rewrite each filename as a full s3:// URL located next to the manifest file.
manifest_columns = ['filename', 'md5', 'size']
manifest_parts = []

for m in manifests:
    folder = os.path.split(m.key)[0]

    # A manifest stored at the bucket root has an empty folder; skip the extra
    # separator there so the URL does not contain a double slash.
    prefix = "s3://%s/" % (nda_bucket_name,)
    if folder:
        prefix += folder + "/"

    manifest_part = pandas.read_csv(m.get()['Body'], delimiter="\t", header=None)
    manifest_part.columns = manifest_columns
    manifest_part.filename = prefix + manifest_part.filename.map(str)
    manifest_parts.append(manifest_part)

# Concatenate once at the end instead of growing the frame inside the loop,
# which copies all accumulated rows on every iteration.
if manifest_parts:
    manifest = pandas.concat(manifest_parts, ignore_index=True)
else:
    # pandas.concat rejects an empty list; fall back to an empty table.
    manifest = pandas.DataFrame(columns=manifest_columns)

manifest.to_csv('./manifest.csv')

Merge together the tissue file and the subjects file.

In [None]:
# Left-join the subject records onto the tissue records using the
# identifying columns the two tables share.
subject_keys = ["src_subject_id", "subjectkey", "race", "gender"]
btb_subjects = btb.merge(subjects, how="left", on=subject_keys)

# Copy the tissue table's sample_id_original into sample_id_biorepository,
# the key used when joining against the samples table.
btb_subjects = btb_subjects.assign(sample_id_biorepository=btb_subjects.sample_id_original)

# Drop this as it will come back from the samples
btb_subjects = btb_subjects.drop('sample_id_original', axis=1)

btb_subjects.to_csv('btb_subjects.csv')

Merge the tissue/subject with the samples to make a complete metadata table.

In [None]:
# Left-join the tissue/subject table onto the per-file samples table.
sample_keys = ["src_subject_id", "subjectkey", "sample_id_biorepository"]
metadata = samples3.merge(btb_subjects, how="left", on=sample_keys)

# Index the rows by data_file while keeping data_file as a column, so the
# de-duplication below still distinguishes rows that belong to different files.
metadata = metadata.set_index('data_file', drop=False)
metadata = metadata.drop_duplicates()

# Keep only the annotation columns, in their configured order.
metadata = metadata[metadata_columns]

metadata.to_csv("./metadata.csv")

# Synapse

Using the concatenated manifests as the master list of files to store, create file handles and entities in Synapse.

Use the metadata table to get the appropriate tissue/subject/sample annotations to set on each File entity.

In [None]:
# For every file listed in the concatenated manifests: reuse or create an S3
# file handle in Synapse, then store a File entity in the data folder
# annotated with that file's row from the metadata table.
syn = synapseclient.login(silent=True)

# NOTE(review): this flag is not read anywhere in this cell.
update = True

for n, x in manifest.iterrows():
    s3Key = x['filename'].replace("s3://%s/" % nda_bucket_name, "")
    s3FilePath = os.path.split(s3Key)[-1]
    contentSize = x['size']
    contentMd5 = x['md5']

    logger.debug("%s - %s" % (s3Key, s3FilePath))

    # Check if it exists in Synapse
    res = syn.restGET("/entity/md5/%s" % (contentMd5, ))['results']

    # Only consider matches whose benefactor is the target data folder. A list
    # comprehension (instead of filter + lambda) yields a real list, so len()
    # works on Python 3, and it no longer shadows the loop variable `x`.
    res = [er for er in res if er['benefactorId'] == synapse_data_folder_id]

    if len(res) > 0:
        # Reuse the file handle of the first matching entity version; only
        # that one is needed, so the remaining matches are not fetched.
        fh_response = syn.restGET("/entity/%(id)s/version/%(versionNumber)s/filehandles" % res[0])
        fileHandleObj = syn._getFileHandle(fh_response['list'][0]['id'])

    else:
        # Unknown extensions fall back to a generic binary content type.
        contentType = content_type_dict.get(os.path.splitext(x['filename'])[-1],
                                            'application/octet-stream')

        fileHandle = {'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle',
                      'fileName'    : s3FilePath,
                      'contentSize' : contentSize,
                      'contentType' : contentType,
                      'contentMd5' :  contentMd5,
                      'bucketName' : nda_bucket_name,
                      'key'        : s3Key,
                      'storageLocationId' : storage_location_id}

        fileHandleObj = syn.restPOST('/externalFileHandle/s3',
                                     json.dumps(fileHandle),
                                     endpoint=syn.fileHandleEndpoint)

    # Look up this file's annotations. Only the lookup is guarded, so a
    # KeyError raised while building or storing the entity is not mistaken
    # for missing metadata.
    try:
        metadata_row = metadata.loc[x['filename']]
    except KeyError:
        logger.warning("Error getting metadata to annotation dictionary %s" % (x['filename'], ))
        continue

    # Several metadata rows for one file make .loc return a DataFrame, whose
    # to_dict() is a nested mapping rather than flat annotations. Skip the
    # file instead of storing ambiguous annotations.
    if isinstance(metadata_row, pandas.DataFrame):
        logger.warning("Multiple metadata rows for %s; not storing" % (x['filename'], ))
        continue

    a = metadata_row.to_dict()

    logger.debug("filename = %s, annotations = %s" % (x['filename'], a))

    f = synapseclient.File(parentId=synapse_data_folder,
                           name=s3FilePath,
                           dataFileHandleId = fileHandleObj['id'])
    f.annotations = a

    f = syn.store(f, forceVersion=False)
