In [82]:
import os
import json
import synapseclient
import pandas
import requests
import boto3
import nda_aws_token_generator
import logging

# Show DataFrames in full when displayed: no row/column truncation and wide
# cell contents.
pandas.options.display.max_rows = None
pandas.options.display.max_columns = None
pandas.options.display.max_colwidth = 1000

# Notebook logger that emits DEBUG and above to the console.
logger = logging.getLogger("main")
logger.setLevel(logging.DEBUG)
# Create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# Attach the handler so records logged through `logger` actually reach the
# console. Handlers left over from a previous run of this cell are removed
# first, so re-running the cell does not duplicate every log line.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(ch)

# NDA Configuration
# GUID used to look up the genomics_sample03 records.
REFERENCE_GUID = 'NDAR_INVRT663MBL'

# This is an old genomics subject
EXCLUDE_GENOMICS_SUBJECTS = ('92027', )
# EXCLUDE_EXPERIMENTS = ('534', '535')
EXCLUDE_EXPERIMENTS = ()

# NOTE(review): presumably the S3 bucket holding the NDA data - confirm against later cells.
nda_bucket_name = 'nda-bsmn'

# Synapse configuration
synapse_data_folder = 'syn7872188'
# Numeric form of the Synapse folder ID (the 'syn' prefix stripped).
synapse_data_folder_id = int(synapse_data_folder.replace('syn', ''))
storage_location_id = '9209'

def flattenjson( b, delim ):
    """Flatten a nested dict into a single-level dict.

    Keys of nested dicts are joined to their parent key with ``delim``
    (e.g. ``{'a': {'b': 1}}`` with ``delim='.'`` becomes ``{'a.b': 1}``).
    Values that are not dicts (including lists) are copied through unchanged.

    Parameters
    ----------
    b : dict
        The (possibly nested) dict to flatten. Keys at every nesting level
        must support ``+`` with ``delim`` (i.e. be strings when ``delim`` is).
    delim : str
        Separator placed between a parent key and a child key.

    Returns
    -------
    dict
        A new flat dict; ``b`` itself is not modified.
    """
    flat = {}
    for key, value in b.items():
        # Leaf values are stored under their own key as-is.
        if not isinstance( value, dict ):
            flat[key] = value
            continue

        # Recurse into the nested dict and prefix each of its flattened keys.
        for sub_key, sub_value in flattenjson( value, delim ).items():
            flat[ key + delim + sub_key ] = sub_value

    return flat

In [83]:
# Read the JSON configuration object 'ndalogs_config.json' from the
# 'kdaily-lambda-creds.sagebase.org' S3 bucket via boto3's default session.
s3 = boto3.resource("s3")
obj = s3.Object('kdaily-lambda-creds.sagebase.org', 'ndalogs_config.json')

config = json.loads(obj.get()['Body'].read())

# The 'nda' section supplies the 'username' and 'password' entries used for
# token generation here and for HTTP basic auth on the NDA API requests.
ndaconfig = config['nda']

# Exchange the NDA username/password for an AWS token; the returned object
# exposes access_key, secret_key and session attributes.
tokengenerator = nda_aws_token_generator.NDATokenGenerator()
mytoken = tokengenerator.generate_token(ndaconfig['username'],
                                        ndaconfig['password'])

# boto3 session authenticated with the NDA-issued token.
session = boto3.Session(
    aws_access_key_id=mytoken.access_key,
    aws_secret_access_key=mytoken.secret_key,
    aws_session_token=mytoken.session
)

# S3 resource bound to the NDA-authenticated session.
# NOTE(review): presumably used to access the NDA bucket (nda_bucket_name) - confirm against later cells.
s3_nda = session.resource("s3")

In [84]:
# Fetch the genomics_sample03 records associated with the reference GUID.
r = requests.get("https://ndar.nih.gov/api/guid/{}/data?short_name=genomics_sample03".format(REFERENCE_GUID),
                 auth=requests.auth.HTTPBasicAuth(ndaconfig['username'],
                                                  ndaconfig['password']),
                 headers={'Accept': 'application/json'})
# Fail fast on an HTTP error (e.g. rejected credentials) instead of letting it
# surface below as a confusing JSON decode error or KeyError.
r.raise_for_status()

guid_data = json.loads(r.text)

# One flat {element name: element value} dict per data-structure row.
sample_rows = [{col['name']: col['value'] for col in row['dataElement']}
               for row in guid_data['age'][0]['dataStructureRow']]

# json_normalize lives at the top level of pandas since 1.0; fall back to the
# legacy pandas.io.json location so the cell runs on old and new pandas alike.
_json_normalize = getattr(pandas, 'json_normalize', None) or pandas.io.json.json_normalize
samples = _json_normalize(sample_rows)

In [85]:
# Distinct experiment IDs referenced by the sample records, in order of first appearance.
experiment_ids = samples["EXPERIMENT_ID"].unique().tolist()

In [90]:
# Build one flat metadata record per experiment, then create the DataFrame in
# a single step (appending to a DataFrame inside the loop copies the whole
# frame on every iteration and DataFrame.append is gone from current pandas).
experiment_records = []

# Flattened keys whose values are lists of {'vendorName': ..., 'value': ...}
# entries; each list is collapsed to a comma-separated "vendor value" string.
fix_keys = ['processing.processingKits.processingKit',
            'additionalinformation.equipment.equipmentName',
            'extraction.extractionKits.extractionKit',
            'additionalinformation.analysisSoftware.software']

protocol_key = 'processing.processingProtocols.processingProtocol'
protocol_name_key = 'extraction.extractionProtocols.protocolName'

for experiment_id in experiment_ids:
    r = requests.get("https://ndar.nih.gov/api/experiment/{}".format(experiment_id),
                     auth=requests.auth.HTTPBasicAuth(ndaconfig['username'],
                                                      ndaconfig['password']),
                     headers={'Accept': 'application/json'})
    # Fail fast on an HTTP error instead of a confusing JSON/KeyError below.
    r.raise_for_status()

    guid_data = json.loads(r.text)
    guid_data_flat = flattenjson(guid_data[u'omicsOrFMRIOrEEG']['sections'], '.')

    for key in fix_keys:
        guid_data_flat[key] = ",".join(
            "%s %s" % (entry['vendorName'], entry['value'])
            for entry in guid_data_flat[key])

    # Processing protocols are lists of {'technologyName': ..., 'value': ...}.
    guid_data_flat[protocol_key] = ",".join(
        "%s: %s" % (entry['technologyName'], entry['value'])
        for entry in guid_data_flat[protocol_key])

    # Extraction protocol names are a plain list of strings.
    guid_data_flat[protocol_name_key] = ",".join(guid_data_flat[protocol_name_key])

    guid_data_flat['experiment_id'] = experiment_id

    experiment_records.append(guid_data_flat)

# Sort the columns alphabetically so the table layout matches the recorded
# output regardless of pandas version or key insertion order.
df = pandas.DataFrame(experiment_records).sort_index(axis=1)

In [91]:
# Display the assembled per-experiment metadata table.
df

Unnamed: 0,additionalinformation.analysisSoftware.software,additionalinformation.equipment.equipmentName,experiment_id,experimentparameters.molecule.moleculeName,experimentparameters.molecule.moleculeSubType,experimentparameters.platform.platformName,experimentparameters.platform.platformSubType,experimentparameters.platform.vendorName,experimentparameters.technology.applicationName,experimentparameters.technology.applicationSubType,experimentparameters.technology.moleculeName,experimentparameters.technology.technologyName,experimentparameters.technology.technologySubType,extraction.extractionKits.extractionKit,extraction.extractionKits.moleculeName,extraction.extractionProtocols.moleculeName,extraction.extractionProtocols.protocolName,processing.processingKits.moleculeName,processing.processingKits.processingKit,processing.processingProtocols.moleculeName,processing.processingProtocols.processingProtocol
0,Custom Custom scripts,Illumina HiSeq 4000,535,DNA,gDNA,Illumina Sequencing,Illumina Sequencing,Illumina,Next Generation Sequencing,Whole genome sequencing,DNA,sequencing,sequencing,"Qiagen DNeasy Blood & Tissue Kit,Qiagen Qiagen REPLI-g Single Cell Kit (MDA)",DNA,DNA,Genomic DNA purification,DNA,Illumina TruSeq DNA PCR-Free 350 bp,DNA,"sequencing: sequencing,sequencing: workflow analysis,sequencing: library preparation"
1,IrysView IrysView,BioNano IrysView,533,DNA,gDNA,Irys,Irys,BioNano Genomics,Whole genome optical imaging,Optical genome imaging,DNA,Genome imaging,,Custom Proteinase K digestion followed by NaCl salting-out and chloroform extraction,DNA,DNA,Genomic DNA purification,DNA,Custom processing kit Processing,DNA,Genome imaging: DNA isolation and processing
2,Illumina Genome Analyzer Pipeline,"Illumina MiSeq,Illumina HiSeq 4000",641,DNA,gDNA,SeqCap EZ Exome v2.0,NGS,Nimblegen,Next Generation Sequencing,Exome sequencing,DNA,sequencing,Sequence capture,Not Applicable Not Applicable,DNA,DNA,Genomic DNA purification,DNA,Illumina Paired-End DNA Sample Prep Kit,DNA,sequencing: sequencing
3,Illumina Genome Analyzer Pipeline,Illumina HiSeq 2500,643,DNA,gDNA,Illumina Sequencing,Illumina Sequencing,Illumina,Next Generation Sequencing,Exome sequencing,DNA,sequencing,sequencing,Qiagen Gentra Puregen kit,DNA,DNA,Genomic DNA purification,DNA,Agilent Technologies SureSelectXT2 Human All Exon V4+UTRs,DNA,sequencing: Exome Capture
4,Illumina Isaac,Illumina HiSeq X Ten,534,DNA,gDNA,Illumina Sequencing,Illumina Sequencing,Illumina,Next Generation Sequencing,Whole genome sequencing,DNA,sequencing,sequencing,Qiagen DNeasy Blood & Tissue Kit,DNA,DNA,Genomic DNA purification,DNA,Illumina TruSeq DNA PCR-Free 350 bp,DNA,"sequencing: sequencing,sequencing: workflow analysis,sequencing: library preparation"
