# Dataset w/ Array Load

In [1]:
pip install data-repo-client

Note: you may need to restart the kernel to use updated packages.


## Configuration

### Import libraries

In [2]:
import datetime, getpass, uuid
from data_repo_client import RepositoryApi
from data_repo_client import ResourcesApi
from data_repo_client import ApiClient
from data_repo_client import Configuration
from data_repo_client import UnauthenticatedApi

### Authenticate

Retrieve your access token by running these commands in your terminal

  1. Login in with desired user

`gcloud auth login`

  2. Print the token to use in the next step

`gcloud auth print-access-token`

In [3]:
# Set up configuration
config = Configuration()
config.host="https://jade-sh.datarepo-dev.broadinstitute.org/"

# Use access token printed in last step
config.access_token= getpass.getpass("Paste token data ")
apiClient = ApiClient(configuration=config)
apiClient.client_side_validation = False


In [4]:
# Init api 
unauthenticated = UnauthenticatedApi(api_client=apiClient)
resourceApi = ResourcesApi(api_client=apiClient)
repoApi = RepositoryApi(api_client=apiClient)

Billing profile

In [26]:
# Get first billing profile id that you have acccess to
profileId = resourceApi.enumerate_profiles().items[0].id
resourceApi.retrieve_profile(profileId)

{'application_deployment_name': None,
 'biller': 'direct',
 'billing_account_id': '00708C-45D19D-27AAFA',
 'cloud_platform': 'gcp',
 'created_by': 'sholden@broadinstitute.org',
 'created_date': '2022-05-10T13:05:29.766698Z',
 'description': 'test for c89c2668 on 2022-05-10',
 'id': 'c89c2668-3b0d-4c47-810d-c306b52a2088',
 'profile_name': 'test-c89c2668-2022-05-10',
 'resource_group_name': None,
 'subscription_id': None,
 'tenant_id': None}

## Use the new billing profile to create dataset

### Define parameters  & format request

In [15]:
str_uuid = str(uuid.uuid4())
strHashId = str_uuid[0:8]
strToday = str(datetime.date.today())
datasetDescription = "Test dataset w/ Array Column " + strHashId + " on " + strToday
datasetName = "ArrayDataset" + strHashId

datasetRequest = {
    "defaultProfileId": profileId,
    "description": datasetDescription,
    "name": datasetName,
    "schema": {
        "tables": [
            {
                "name": "sample_info",
                "columns": [
                    {
                        "name": "id",
                        "datatype": "string"
                    },
                    {
                        "name": "favoriteAnimals",
                        "datatype": "string",
                        "array_of": "true"
                    }
                ]
            }
        ]
    }
}

print(datasetRequest)


{'defaultProfileId': 'c89c2668-3b0d-4c47-810d-c306b52a2088', 'description': 'Test dataset w/ Array Column c754a1cd on 2022-05-10', 'name': 'ArrayDatasetc754a1cd', 'schema': {'tables': [{'name': 'sample_info', 'columns': [{'name': 'id', 'datatype': 'string'}, {'name': 'favoriteAnimals', 'datatype': 'string', 'array_of': 'true'}]}]}}


### Make async request to create new dataset and check job status

In [16]:
#async request to create dataset
createDatasetThread = repoApi.create_dataset(async_req=True, dataset=datasetRequest)

In [18]:
# retrieve JobModel generated by create_dataset request
createDatasetJobId = createDatasetThread.get().id

{'class_name': 'bio.terra.service.dataset.flight.create.DatasetCreateFlight',
 'completed': None,
 'description': 'Create dataset ArrayDatasetc754a1cd',
 'id': 'moO-FhIhTD-_6Sk2CN7W5g',
 'job_status': 'running',
 'status_code': 202,
 'submitted': '2022-05-10T14:41:34.100586Z'}

In [20]:
# wait for job_status to equal succeeded
repoApi.retrieve_job(createDatasetJobId)

{'class_name': 'bio.terra.service.dataset.flight.create.DatasetCreateFlight',
 'completed': '2022-05-10T14:42:01.831979Z',
 'description': 'Create dataset ArrayDatasetc754a1cd',
 'id': 'moO-FhIhTD-_6Sk2CN7W5g',
 'job_status': 'succeeded',
 'status_code': 201,
 'submitted': '2022-05-10T14:41:34.100586Z'}

In [21]:
#Once job_status = succeeded, check if dataset appears in recently created datasets
repoApi.enumerate_datasets(sort="created_date", direction="desc", limit=5)

{'filtered_total': 4,
 'items': [{'cloud_platform': 'gcp',
            'created_date': '2022-05-10T14:41:52.539677Z',
            'data_project': 'datarepo-dev-805a6735',
            'default_profile_id': 'c89c2668-3b0d-4c47-810d-c306b52a2088',
            'description': 'Test dataset w/ Array Column c754a1cd on '
                           '2022-05-10',
            'id': '18091a44-ea7d-461f-b9dc-e6fee18620fc',
            'name': 'ArrayDatasetc754a1cd',
            'phs_id': None,
            'secure_monitoring_enabled': False,
            'self_hosted': False,
            'storage': [{'cloud_platform': 'gcp',
                         'cloud_resource': 'bigquery',
                         'region': 'us-central1'},
                        {'cloud_platform': 'gcp',
                         'cloud_resource': 'firestore',
                         'region': 'us-east4'},
                        {'cloud_platform': 'gcp',
                         'cloud_resource': 'bucket',
                  

### Retrieve newly created dataset

In [22]:
# Retrieve Dataset
datasetList = repoApi.enumerate_datasets(sort="created_date", direction="desc", limit=1)
newDatasetId = datasetList.items[0].id
repoApi.retrieve_dataset(newDatasetId)

{'access_information': None,
 'created_date': '2022-05-10T14:41:52.539677Z',
 'data_project': 'datarepo-dev-805a6735',
 'default_profile_id': 'c89c2668-3b0d-4c47-810d-c306b52a2088',
 'default_snapshot_id': None,
 'description': 'Test dataset w/ Array Column c754a1cd on 2022-05-10',
 'id': '18091a44-ea7d-461f-b9dc-e6fee18620fc',
 'name': 'ArrayDatasetc754a1cd',
 'phs_id': None,
 'schema': {'assets': [],
            'relationships': [],
            'tables': [{'columns': [{'array_of': False,
                                     'datatype': 'string',
                                     'name': 'id',
                                     'required': False},
                                    {'array_of': True,
                                     'datatype': 'string',
                                     'name': 'favoriteAnimals',
                                     'required': False}],
                        'date_partition_options': None,
                        'int_partition_options

## Ingest Metadata (AKA Tabular Data) into Table

In [27]:
# Ingest Request
str_uuid_ingest = str(uuid.uuid4())
strHashIdIngest = str_uuid_ingest[0:8]
ingestRequest = {
  "table": "sample_info",
  "records": [
    {
      "id": "2355",
      "favoriteAnimals": ["horse", "dog", "zebra", "moose", "kangaroo", "mouse", "rat", "hamster", "cat", "lizard"]
    }
  ],
  "format": "array",
  "load_tag": strHashIdIngest,
  "profile_id": profileId
}

In [28]:
ingestSampleInfoThread = repoApi.ingest_dataset(async_req=True, id=newDatasetId, ingest=ingestRequest)

In [29]:

ingestDatasetJobId = ingestSampleInfoThread.get().id

In [30]:
repoApi.retrieve_job(ingestDatasetJobId)

{'class_name': 'bio.terra.service.dataset.flight.ingest.DatasetIngestFlight',
 'completed': '2022-05-10T15:02:29.667271Z',
 'description': 'Ingest tabular data to sample_info in dataset id '
                '18091a44-ea7d-461f-b9dc-e6fee18620fc',
 'id': 'YKDr1dqwSJ6CC67Va5BD7Q',
 'job_status': 'succeeded',
 'status_code': 200,
 'submitted': '2022-05-10T15:02:12.801973Z'}