In [1]:
import os
import json
import synapseclient
import pandas
import requests
import boto3
import nda_aws_token_generator
import logging

pandas.options.display.max_rows = None
pandas.options.display.max_columns = None
pandas.options.display.max_colwidth = 1000

logger = logging.getLogger("main")
logger.setLevel(logging.DEBUG)
#create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# NDA Configuration
REFERENCE_GUID = 'NDAR_INVRT663MBL'

# This is an old genomics subject
EXCLUDE_GENOMICS_SUBJECTS = ('92027', )
# EXCLUDE_EXPERIMENTS = ('534', '535')
EXCLUDE_EXPERIMENTS = ()

metadata_columns = ['src_subject_id', 'experiment_id', 'subjectkey', 'sample_id_original', 
                    'sample_id_biorepository', 'subject_sample_id_original', 'biorepository', 
                    'subject_biorepository', 'sample_description', 'species', 'site', 'sex',
                    'sample_amount', 'phenotype', 'comments_misc', 'sample_unit', 'fileFormat']

sample_columns = ['src_subject_id', 'experiment_id', 'subjectkey', 'sample_id_original', 
                  'sample_id_biorepository', 'organism', 'species', 'sample_amount', 'sample_unit', 
                  'biorepository', 'comments_misc', 'site']

subject_columns = ['src_subject_id', 'subjectkey', 'gender', 'race', 'phenotype',
                   'subject_sample_id_original', 'sample_description', 'subject_biorepository',
                   'sex']

nda_bucket_name = 'nda-bsmn'

# Synapse configuration
synapse_data_folder = 'syn7872188'
synapse_data_folder_id = int(synapse_data_folder.replace('syn', ''))
storage_location_id = '9209'

content_type_dict = {'.gz': 'application/x-gzip', '.bam': 'application/octet-stream', 
                     '.zip': 'application/zip'}

# Credential configuration for NDA

In [2]:
s3 = boto3.resource("s3")
obj = s3.Object('kdaily-lambda-creds.sagebase.org', 'ndalogs_config.json')

config = json.loads(obj.get()['Body'].read())

ndaconfig = config['nda']

tokengenerator = nda_aws_token_generator.NDATokenGenerator()
mytoken = tokengenerator.generate_token(ndaconfig['username'],
                                        ndaconfig['password'])

session = boto3.Session(
    aws_access_key_id=mytoken.access_key,
    aws_secret_access_key=mytoken.secret_key,
    aws_session_token=mytoken.session
)

s3_nda = session.resource("s3")

# Get Samples

Use the NDA api to get the `genomics_sample03` records for this GUID.

In [3]:
r = requests.get("https://ndar.nih.gov/api/guid/{}/data?short_name=genomics_sample03".format(REFERENCE_GUID), 
                 auth=requests.auth.HTTPBasicAuth(ndaconfig['username'], 
                                                  ndaconfig['password']),
                 headers={'Accept': 'application/json'})

guid_data = json.loads(r.text)

Get data files from samples. There are currently up to two files per row.

In [4]:
tmp = [{col['name']: col['value'] for col in row['dataElement']} 
       for row in guid_data['age'][0]['dataStructureRow']]
    
samples = pandas.io.json.json_normalize(tmp)

colnames_lower = map(lambda x: x.lower(), samples.columns.tolist())
samples.columns = colnames_lower

# exclude some experiments
samples = samples[~samples.experiment_id.isin(EXCLUDE_EXPERIMENTS)]

samples['species'] = samples.organism.replace(['Homo Sapiens'], ['Human'])

samples1 = samples[sample_columns + ['data_file1', 'data_file1_type']]

samples1.rename(columns={'data_file1': 'data_file', 'data_file1_type': 'fileFormat'}, 
                inplace=True)

samples2 = samples[sample_columns + ['data_file2', 'data_file2_type']]

samples2.rename(columns={'data_file2': 'data_file', 'data_file2_type': 'fileFormat'}, 
                inplace=True)

samples3 = pandas.concat([samples1, samples2], ignore_index=True)
samples3.filter(~samples3.data_file.isnull())
samples3['fileFormat'].replace(['BAM', 'FASTQ', 'bam_index'], ['bam', 'fastq', 'bai'], inplace=True)

# Remove initial slash to match what is in manifest file
samples3.data_file = samples3['data_file'].apply(lambda value: value[1:] if not pandas.isnull(value) else value)

# Remove stuff that isn't part of s3 path
samples3.data_file = map(lambda x: str(x).replace("![CDATA[", "").replace("]]>", ""), 
                         samples3.data_file.tolist())

samples3 = samples3[samples3.data_file != 'nan']

samples3.to_csv("./samples3.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [5]:
samples3

Unnamed: 0,src_subject_id,experiment_id,subjectkey,sample_id_original,sample_id_biorepository,organism,species,sample_amount,sample_unit,biorepository,comments_misc,site,data_file,fileFormat
0,5154,535,NDAR_INVRT663MBL,BSMN_REF_NeuN+_A1,BSMN_REF_NeuN+_A1,Homo Sapiens,Human,17.39,ug - micrograms,LIBD,Yale/Mayo;brain;dura;NeuN+;single nucleus;MDA;WGS;QC,Yale/Mayo,s3://nda-bsmn/abyzova_1481392262177/Volumes/Promise Pegasus/data/s203958.TS_mosaicism/Project_Lt76/Sample_E_A1/A1_S1_L008_R1_001.fastq.gz,fastq
1,5154,533,NDAR_INVRT663MBL,5154_fibroblast,5154_fibroblast,Human,Human,379.43,gigabases,LIBD,dural fibroblast for common experiment,U01MH106892,s3://nda-bsmn/seancho_1492630428786/KKI_U01MH_JP-DuralFibroblast.zip,zip
2,5154,641,NDAR_INVRT663MBL,71938,5154_fibroblast,,,1.0,,LIBD,,,s3://nda-bsmn/jmkidd_1491406438314/71938.lanes-merged.sort.RG.mkdups.realign.recal.bam,bam
3,5154,643,NDAR_INVRT663MBL,BSM-Cells,5154_fibroblast,Human,Human,50.0,ng,LIBD,,,s3://nda-bsmn/reneegeorge_1490833811716/BSM-Cells.final.bam,bam
4,5154,643,NDAR_INVRT663MBL,BSM-Br-3,5154_brain,Human,Human,50.0,ng,LIBD,,,s3://nda-bsmn/reneegeorge_1490833811716/BSM-Br-3.final.bam,bam
5,5154,643,NDAR_INVRT663MBL,BSM-Br-2,5154_brain,Human,Human,50.0,ng,LIBD,,,s3://nda-bsmn/reneegeorge_1490833811716/BSM-Br-2.final.bam,bam
6,5154,643,NDAR_INVRT663MBL,BSM-Br-1,5154_brain,Human,Human,50.0,ng,LIBD,,,s3://nda-bsmn/reneegeorge_1490833811716/BSM-Br-1.final.bam,bam
7,5154,534,NDAR_INVRT663MBL,BSMN_REF_fibroblasts,BSMN_REF_fibroblasts,Homo Sapiens,Human,124240.0,ug - micrograms,LIBD,Yale/Mayo;fibroblast;WGS,Yale/Mayo,s3://nda-bsmn/abyzova_1481392262177/Volumes/Promise Pegasus/data/s203958.TS_mosaicism/1608UNHX-0002/fibroblasts/fibroblasts_sorted.bam,bam
8,5154,534,NDAR_INVRT663MBL,BSMN_REF_brain,BSMN_REF_brain,Homo Sapiens,Human,21220.0,ug - micrograms,LIBD,Yale/Mayo;brain;dura;WGS,Yale/Mayo,s3://nda-bsmn/abyzova_1481392262177/Volumes/Promise Pegasus/data/s203958.TS_mosaicism/1608UNHX-0002/bulk/bulk_sorted.bam,bam
9,5154,534,NDAR_INVRT663MBL,BSMN_REF_NeuN+,BSMN_REF_NeuN+,Homo Sapiens,Human,3980.0,ug - micrograms,LIBD,Yale/Mayo;brain;dura;NeuN+;WGS,Yale/Mayo,s3://nda-bsmn/abyzova_1481392262177/Volumes/Promise Pegasus/data/s203958.TS_mosaicism/1608UNHX-0001/Neun-Plus/Neun-Plus_sorted.bam,bam


# Get Subjects

Use the NDA API to get the `genomics_subject02` records for this GUID.

In [19]:
r = requests.get("https://ndar.nih.gov/api/guid/{}/data?short_name=genomics_subject02".format(REFERENCE_GUID), 
                 auth=requests.auth.HTTPBasicAuth(ndaconfig['username'], 
                                                  ndaconfig['password']),
                 headers={'Accept': 'application/json'})

subject_guid_data = json.loads(r.text)

In [20]:
tmp_subject = []
for row in subject_guid_data['age'][0]['dataStructureRow']:
    foo = {col['name']: col['value'] for col in row['dataElement']}
    tmp_subject.append(foo)
    
subjects = pandas.io.json.json_normalize(tmp_subject)

subjects = subjects[~subjects.GENOMICS_SUBJECT02_ID.isin(EXCLUDE_GENOMICS_SUBJECTS)]

colnames_lower = map(lambda x: x.lower(), subjects.columns.tolist())
subjects.columns = colnames_lower

subjects = subjects.assign(sex=subjects.gender.replace(['M', 'F'], ['male', 'female']),
                           subject_sample_id_original=subjects.sample_id_original,
                           subject_biorepository=subjects.biorepository)

subjects = subjects[subject_columns]

subjects = subjects.drop_duplicates()

subjects.to_csv("./subjects.csv")

# Get Tissues

Use the NDA api to get the `ncihd_btb02` records for this GUID.

In [21]:
r = requests.get("https://ndar.nih.gov/api/guid/{}/data?short_name=nichd_btb02".format(REFERENCE_GUID),
                 auth=requests.auth.HTTPBasicAuth(ndaconfig['username'], 
                                                  ndaconfig['password']),
                 headers={'Accept': 'application/json'})

btb_guid_data = json.loads(r.text)

tmp_btb = []
for row in btb_guid_data['age'][0]['dataStructureRow']:
    foo = {col['name']: col['value'] for col in row['dataElement']}
    tmp_btb.append(foo)
    
btb = pandas.io.json.json_normalize(tmp_btb)

colnames_lower = map(lambda x: x.lower(), btb.columns.tolist())
btb.columns = colnames_lower

# This makes them non-unique, so drop them
btb.drop('nichd_btb02_id', axis=1, inplace=True)

btb = btb.drop_duplicates()

btb.to_csv('./btb.csv')

In [22]:
btb

Unnamed: 0,bmi,br_reg,cdeathoff,celltype,death027,gender,grade_highed,race,rindlpfc,s166,s196,sample_id_original,src_subject_id,subjectkey
0,25.8,frontal cortex,Multiple injuries,NeuN-,3,M,8,White,8,1.0,,5154_NeuN_negative,5154,NDAR_INVRT663MBL
2,25.8,frontal cortex,Multiple injuries,NeuN+,3,M,8,White,8,1.0,,5154_NeuN_positive,5154,NDAR_INVRT663MBL
3,25.8,frontal cortex,Multiple injuries,,3,M,8,White,8,1.0,,5154_brain,5154,NDAR_INVRT663MBL
4,25.8,dura mater,Multiple injuries,fibroblast,3,M,8,White,8,,1.0,5154_fibroblast,5154,NDAR_INVRT663MBL


Merge together the tissue file and the subjects file.

We instituted a standard to use `sample_id_biorepository` in the `genomics_sample03` file to map to `sample_id_original` in the `nichd_btb02` file.

In [23]:
btb_subjects = btb.merge(subjects, how="left",
                         left_on=["src_subject_id", "subjectkey", "race", "gender"],
                         right_on=["src_subject_id", "subjectkey", "race", "gender"])

# Rename this column to simplify merging with the sample table
btb_subjects = btb_subjects.assign(sample_id_biorepository=btb_subjects.sample_id_original)

# Drop this as it will come back from the samples
btb_subjects.drop('sample_id_original', axis=1, inplace=True)

btb_subjects.to_csv('btb_subjects.csv')

Merge the tissue/subject with the samples to make a complete metadata table.

In [24]:
metadata = samples3.merge(btb_subjects, how="left",
                          left_on=["src_subject_id", "subjectkey", "sample_id_biorepository"],
                          right_on=["src_subject_id", "subjectkey", "sample_id_biorepository"])

# metadata.index = metadata.data_file

# metadata = metadata[metadata_columns]

metadata = metadata.drop_duplicates()

# Get Manifests

Get list of `.manifest` files from the NDA-BSMN bucket. Read them in and concatenate them, under the assumption that the files listed in the manifest are in the same directory as the manifest file itself.

In [25]:
bucket = s3_nda.Bucket(nda_bucket_name)
manifests = [x for x in bucket.objects.all() if x.key.find('.manifest') >=0]

In [26]:
manifest = pandas.DataFrame()

for m in manifests:
    folder = os.path.split(m.key)[0]
    tmp = pandas.read_csv(m.get()['Body'], delimiter="\t", header=None)
    tmp.columns = ('filename', 'md5', 'size')
    tmp.filename = "s3://%s/%s/" % (nda_bucket_name, folder,) + tmp.filename.map(str)
    manifest = pandas.concat([manifest, tmp])

manifest.reset_index(drop=True, inplace=True)

# Only keep the files that are in the metadata table
manifest = manifest[manifest.filename.isin(metadata.data_file)]

manifest.to_csv('./manifest.csv')

In [27]:
metadata_manifest = manifest.merge(metadata, how="left",
                                   left_on="filename",
                                   right_on="data_file")

metadata_manifest = metadata_manifest.drop_duplicates()

metadata_manifest.to_csv('./metadata_manifest.csv')

# Synapse

Using the concatenated manifests as the master list of files to store, create file handles and entities in Synapse.

Use the metadata table to get the appropriate tissue/subject/sample annotations to set on each File entity.

In [28]:
syn = synapseclient.login(silent=True)

dry_run = False

for n, x in metadata_manifest.iterrows():
    s3Key = x['filename'].replace("s3://%s/" % nda_bucket_name, "")
    s3FilePath = os.path.split(s3Key)[-1]
    contentSize = x['size']
    contentMd5 = x['md5']
    
    logger.debug("%s - %s" % (s3Key, s3FilePath))

    # Check if it exists in Synapse
    res = syn.restGET("/entity/md5/%s" % (contentMd5, ))['results']
    
    res = filter(lambda x: x['benefactorId'] == synapse_data_folder_id, res)
    
    if len(res) > 0:        
        fhs = [syn.restGET("/entity/%(id)s/version/%(versionNumber)s/filehandles" % er) for er in res]    
        fileHandleObj = syn._getFileHandle(fhs[0]['list'][0]['id'])
    else:       
        contentType = content_type_dict.get(os.path.splitext(x['filename'])[-1],
                                            'application/octet-stream')
        
        fileHandle = {'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle',
                      'fileName'    : s3FilePath,
                      'contentSize' : contentSize,
                      'contentType' : contentType,
                      'contentMd5' :  contentMd5,
                      'bucketName' : nda_bucket_name,
                      'key'        : s3Key,
                      'storageLocationId' : storage_location_id}

        fileHandleObj = syn.restPOST('/externalFileHandle/s3', 
                                     json.dumps(fileHandle), 
                                     endpoint=syn.fileHandleEndpoint)
    
    a = x[metadata_columns]
    a = a.to_dict()
    
    logger.debug("filename = %s, annotations = %s" % (x['filename'], a))
    
    if not dry_run:
        f = synapseclient.File(parentId=synapse_data_folder, 
                               name=s3FilePath, 
                               dataFileHandleId = fileHandleObj['id'])
        f.annotations = a

        f = syn.store(f, forceVersion=False)


DEBUG:main:abyzova_1481392262177/Volumes/Promise Pegasus/data/s203958.TS_mosaicism/Project_Lt76/Sample_E_A1/A1_S1_L008_R1_001.fastq.gz - A1_S1_L008_R1_001.fastq.gz
DEBUG:main:filename = s3://nda-bsmn/abyzova_1481392262177/Volumes/Promise Pegasus/data/s203958.TS_mosaicism/Project_Lt76/Sample_E_A1/A1_S1_L008_R1_001.fastq.gz, annotations = {'experiment_id': u'535', 'comments_misc': u'Yale/Mayo;brain;dura;NeuN+;single nucleus;MDA;WGS;QC', 'sample_description': nan, 'sex': nan, 'site': u'Yale/Mayo', 'sample_id_original': u'BSMN_REF_NeuN+_A1', 'biorepository': u'LIBD', 'fileFormat': 'fastq', 'phenotype': nan, 'sample_id_biorepository': u'BSMN_REF_NeuN+_A1', 'subjectkey': u'NDAR_INVRT663MBL', 'species': 'Human', 'src_subject_id': u'5154', 'sample_amount': u'17.39', 'subject_sample_id_original': nan, 'subject_biorepository': nan, 'sample_unit': u'ug - micrograms'}
DEBUG:main:abyzova_1481392262177/Volumes/Promise Pegasus/data/s203958.TS_mosaicism/Project_Lt76/Sample_E_A1/A1_S1_L008_R2_001.fastq