In [1]:
import os
import json
import synapseclient
import pandas
import requests
import boto3
import nda_aws_token_generator
import logging

# Show complete frames when displaying: no row or column truncation, and
# allow up to 1000 characters per cell before the value is elided.
pandas.set_option("display.max_rows", None)
pandas.set_option("display.max_columns", None)
pandas.set_option("display.max_colwidth", 1000)

# Notebook-wide logger; DEBUG and above are passed on to its handlers.
logger = logging.getLogger("main")
logger.setLevel(logging.DEBUG)

# Console handler that emits everything from DEBUG upwards.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# The handler only takes effect once it is attached to the logger. Handlers
# left over from earlier runs of this cell are removed first so that
# re-running the cell does not print every message multiple times.
for old_handler in list(logger.handlers):
    logger.removeHandler(old_handler)
logger.addHandler(ch)

# ---- NDA configuration ----
# GUID substituted into the NDA guid API URL when the sample records are fetched.
REFERENCE_GUID = 'NDAR_INVRT663MBL'

# This is an old genomics subject
EXCLUDE_GENOMICS_SUBJECTS = ('92027',)
# No experiments are excluded at the moment; a previous value was ('534', '535').
EXCLUDE_EXPERIMENTS = tuple()

nda_bucket_name = 'nda-bsmn'

# ---- Synapse configuration ----
synapse_data_folder = 'syn7872188'
# Integer form of the folder ID: the same ID with the 'syn' text removed.
synapse_data_folder_id = int(
    synapse_data_folder.replace('syn', '')
)
storage_location_id = '9209'

def flattenjson( b, delim ):
    """Collapse a nested dict into a single-level dict.

    Keys of nested dicts are joined to their parent key with ``delim``, so
    ``{'a': {'b': 1}}`` becomes ``{'a.b': 1}`` when ``delim`` is ``'.'``.
    Only ``dict`` values are descended into; every other value (lists
    included) is copied across unchanged. A nested dict without entries
    contributes no keys to the result.

    Args:
        b: The (possibly nested) dict to flatten.
        delim: String placed between a parent key and a child key.

    Returns:
        A new flat dict; ``b`` itself is not modified.
    """
    flat = {}
    for key, value in b.items():
        # Leaf value: keep it under its own key.
        if not isinstance(value, dict):
            flat[key] = value
            continue

        # Nested dict: flatten it first, then prefix each of its keys.
        for sub_key, sub_value in flattenjson(value, delim).items():
            flat[key + delim + sub_key] = sub_value

    return flat

In [2]:
# Read the JSON configuration object from S3 through the default boto3
# credentials, and keep its 'nda' section (the NDA username and password).
s3 = boto3.resource("s3")
obj = s3.Object('kdaily-lambda-creds.sagebase.org', 'ndalogs_config.json')
config_body = obj.get()['Body']
config = json.loads(config_body.read())
ndaconfig = config['nda']

# Exchange the NDA username/password for AWS credentials
# (access key, secret key and session token).
tokengenerator = nda_aws_token_generator.NDATokenGenerator()
mytoken = tokengenerator.generate_token(ndaconfig['username'], ndaconfig['password'])

# Open a separate boto3 session, and an S3 resource on it, that use the
# NDA-issued credentials instead of the default ones.
nda_session_credentials = {
    'aws_access_key_id': mytoken.access_key,
    'aws_secret_access_key': mytoken.secret_key,
    'aws_session_token': mytoken.session,
}
session = boto3.Session(**nda_session_credentials)
s3_nda = session.resource("s3")

In [3]:
# Fetch the genomics_sample03 records stored for the reference GUID.
r = requests.get("https://ndar.nih.gov/api/guid/{}/data?short_name=genomics_sample03".format(REFERENCE_GUID), 
                 auth=requests.auth.HTTPBasicAuth(ndaconfig['username'], 
                                                  ndaconfig['password']),
                 headers={'Accept': 'application/json'})
# Stop here with the HTTP status on a failed request (bad credentials,
# unknown GUID, server error) rather than with an unrelated KeyError or
# JSON decoding error further down.
r.raise_for_status()

guid_data = r.json()

# Every row is a list of {'name': ..., 'value': ...} elements; turn each row
# into a single {name: value} dict.
tmp = [{col['name']: col['value'] for col in row['dataElement']} 
       for row in guid_data['age'][0]['dataStructureRow']]

# pandas.io.json.json_normalize has been deprecated since pandas 1.0 in
# favour of the top-level pandas.json_normalize; use the top-level function
# when it exists and fall back to the old location on older pandas.
_json_normalize = getattr(pandas, "json_normalize", None)
if _json_normalize is None:
    _json_normalize = pandas.io.json.json_normalize

samples = _json_normalize(tmp)

In [4]:
# Distinct experiment IDs referenced by the samples, in order of first appearance.
experiment_ids = samples["EXPERIMENT_ID"].unique().tolist()

In [5]:
# Flattened keys whose values are lists of records carrying 'vendorName' and
# 'value'; each list is collapsed into one comma-separated string.
fix_keys = ['processing.processingKits.processingKit',
            'additionalinformation.equipment.equipmentName',
            'extraction.extractionKits.extractionKit',
            'additionalinformation.analysisSoftware.software']

# Flattened key whose value is a list of records carrying 'technologyName'
# and 'value'.
processing_protocol_key = 'processing.processingProtocols.processingProtocol'

# Flattened key whose value is an iterable of strings (presumably a list of
# protocol names).
extraction_protocol_key = 'extraction.extractionProtocols.protocolName'

# One flattened metadata dict per experiment. The frame is built once after
# the loop: appending to a DataFrame row by row copies the whole frame on
# every iteration, and DataFrame.append no longer exists in pandas 2.x.
experiment_records = []

for experiment_id in experiment_ids:
    r = requests.get("https://ndar.nih.gov/api/experiment/{}".format(experiment_id), 
                     auth=requests.auth.HTTPBasicAuth(ndaconfig['username'], 
                                                      ndaconfig['password']),
                     headers={'Accept': 'application/json'})
    # Report a failed request as an HTTP error instead of a KeyError below.
    r.raise_for_status()

    guid_data = r.json()
    guid_data_flat = flattenjson(guid_data[u'omicsOrFMRIOrEEG']['sections'], '.')

    for key in fix_keys:
        guid_data_flat[key] = ",".join(
            "%s %s" % (entry['vendorName'], entry['value'])
            for entry in guid_data_flat[key])

    guid_data_flat[processing_protocol_key] = ",".join(
        "%s: %s" % (entry['technologyName'], entry['value'])
        for entry in guid_data_flat[processing_protocol_key])

    guid_data_flat[extraction_protocol_key] = ",".join(
        guid_data_flat[extraction_protocol_key])

    # Remember which experiment this row describes.
    guid_data_flat['experiment_id'] = experiment_id

    experiment_records.append(guid_data_flat)

# One row per experiment, one column per flattened key.
df = pandas.DataFrame(experiment_records)

In [6]:
# Flattened experiment keys to keep, mapped to the shorter column name each
# one is given when the selected columns are renamed.
keep_cols = dict([
    ('additionalinformation.analysisSoftware.software', 'analysisSoftwareName'),
    ('additionalinformation.equipment.equipmentName', 'equipmentName'),
    ('experimentparameters.molecule.moleculeName', 'moleculeName'),
    ('experimentparameters.platform.platformName', 'platformName'),
    ('experimentparameters.platform.platformSubType', 'platformSubType'),
    ('experimentparameters.platform.vendorName', 'vendorName'),
    ('experimentparameters.technology.applicationName', 'applicationName'),
    ('experimentparameters.technology.applicationSubType', 'applicationSubType'),
    ('extraction.extractionProtocols.protocolName', 'extractionProtocolName'),
    ('extraction.extractionKits.extractionKit', 'extractionKit'),
    ('processing.processingKits.processingKit', 'processingKit'),
])

In [7]:
# Keep only the columns listed in keep_cols, in the mapping's own order.
df_change = df[list(keep_cols)]

In [8]:
# Swap the dotted keys for their short names; rename returns a new frame.
df_change = df_change.rename(columns=keep_cols)

In [9]:
# Put the renamed columns next to the full set of original columns, so the
# result holds both the dotted keys and their short-name copies.
df2 = pandas.concat([df, df_change], axis="columns")

In [14]:
# Replace every '.' in a column name with '_'
# (presumably to make the names acceptable as table column names - TODO confirm).
df2 = df2.rename(columns=lambda column_name: column_name.replace(".", "_"))


In [19]:
# Log in to Synapse (no credentials are passed explicitly here).
syn = synapseclient.login()

# Describe a table named "Foo" under syn5902559 whose columns are derived from
# df2, then store that schema in Synapse.
# NOTE(review): "Foo" looks like a placeholder table name - confirm before reuse.
table_schema = synapseclient.Schema(
    "Foo",
    parent='syn5902559',
    columns=synapseclient.table.as_table_columns(df2),
)
schema = syn.store(table_schema)


Welcome, Kenneth Daily!

