# Terra Data Repository on Azure Year 1 Demo

This notebook runs through the following steps:
- Authenticate using B2C
- Create an Azure *billing profile* in TDR
- Create a *dataset*
- Ingest 1000 Genomes data into the *dataset*
- Create a *snapshot* from the *dataset*
- Read the metadata from the *snapshot* into a Pandas data frame
- Read a DRS object from the metadata and use it to access file data


## Import dependencies

In [None]:
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install --upgrade data_repo_client

import datetime, uuid, urllib, os, time, json
import subprocess
import urllib.request

import pandas as pd
from IPython.core.display import display, clear_output, HTML
from data_repo_client import DatasetsApi
from data_repo_client import SnapshotsApi
from data_repo_client import JobsApi
from data_repo_client import ResourcesApi
from data_repo_client import ApiClient
from data_repo_client import Configuration
from data_repo_client import DataRepositoryServiceApi
from data_repo_client import ApiException


In [None]:
# Create the helper functions required for the demo

# Format TDR object
def pretty_print_tdr_object(obj):
  """Render a TDR model or a plain JSON-serializable value as indented JSON.

  Args:
    obj: Either an object exposing ``to_dict()`` (its result is serialized)
      or a value that ``json.dumps`` can serialize directly.

  Returns:
    A JSON string indented with 4 spaces.
  """
  # API models expose to_dict(); plain dicts/lists are serialized as-is
  if hasattr(obj, "to_dict"):
    return json.dumps(obj.to_dict(), indent=4)
  return json.dumps(obj, indent=4)

# Check status of job
def wait_for_job(job_model):
  """Poll a TDR job until it finishes, showing progress in the cell output.

  Uses the notebook-level ``jobs_api`` client to re-read the job every
  3 seconds while its status is "running".

  Args:
    job_model: Job model returned by an asynchronous API call. Its ``id``,
      ``description`` and ``job_status`` attributes are read.

  Returns:
    The value returned by ``jobs_api.retrieve_job_result`` for the job, or
    ``None`` when the job failed and retrieving its result raised
    ``ApiException`` (the exception is displayed instead).

  Raises:
    RuntimeError: If the job reports a status other than "running",
      "failed" or "succeeded".
  """
  counter = 0
  max_dots = 8
  result = job_model
  job_str = '"%s" (%s)' % (job_model.description, job_model.id)
  while True:
    counter += 1
    # Replace the previous status line instead of appending a new one
    clear_output(wait=True)
    if result is None or result.job_status == "running":
      # Cycle 0..max_dots-1 dots to animate the progress line
      dots = '.' * (counter % max_dots)
      display(HTML('<b>Running job %s %s</b>' % (job_str, dots)))
      time.sleep(3)
      result = jobs_api.retrieve_job(job_model.id)
    elif result.job_status == 'failed':
      display(HTML('<b>Running job %s: <span style="color:rgb(219, 50, 20);">failed</span></b>' % job_str))
      try:
        return jobs_api.retrieve_job_result(job_model.id)
      except ApiException as e:
        # Show the API error; there is no result payload to hand back
        display(HTML('<p>Job result is:</p><pre>%s</pre>' % e))
        return None
    elif result.job_status == "succeeded":
      display(HTML('<b>Running job %s: <span style="color:rgb(116, 174, 6);">succeeded</span></b>' % job_str))
      result = jobs_api.retrieve_job_result(job_model.id)
      display(HTML('<p>Job result is:</p><hr/><pre>%s</pre>' % pretty_print_tdr_object(result)))
      return result
    else:
      # Raising a bare string is a TypeError in Python 3; raise a real exception
      raise RuntimeError("Unrecognized job state %s" % result.job_status)


# Convert bytes representation of UUID into string representation
def UUID(bytes):
  """Return the canonical string form of a UUID given its 16 raw bytes."""
  parsed = uuid.UUID(bytes=bytes)
  return str(parsed)

## Set configuration

Obtain a JWT by going through the auth flow here:

https://tdrb2ctest.b2clogin.com/tdrb2ctest.onmicrosoft.com/oauth2/v2.0/authorize?p=B2C_1A_SIGNUP_SIGNIN&client_id=bc8119eb-e425-4ff7-945a-05f90a37fca7&nonce=defaultNonce&redirect_uri=https%3A%2F%2Fjwt.ms&scope=openid&response_type=id_token&prompt=login

Paste the JWT into the `token` variable in the next cell.

In [None]:
# Set up configuration: API host, credentials and the shared API client
config = Configuration()
config.host="https://jade-nm.datarepo-dev.broadinstitute.org/"
# Paste in the JWT token obtained via the auth link above
token=""
config.access_token = token
api_client = ApiClient(configuration=config)
# Turn off the generated client's client-side validation
api_client.client_side_validation = False

# Obtain a SAS token for the folder that contains the source metadata to ingest
# (it is appended as the query string of the ingest path)
ingest_sas_token = ""

# Azure managed application configuration.  These are obtained from the Azure portal
application_deployment_name = "tdrdeployment1"
resource_group_name = "TDR"
subscription_id = "71d52ec1-5886-480a-9d6e-ed98cbf1f69f"
tenant_id = "efc08443-0082-4d6c-8931-c5794c156abd"
profile_name = "azureprofile"
profile_description = "Billing profile that demonstrates use of Azure resources within TDR"

# Remove this once GCP cleanup is merged
billing_account_id = "00708C-45D19D-27AAFA"

# Enter an existing billing profile ID, or leave blank to have a new ID
# generated when the billing profile is created
billing_profile_id = "cdac1ca4-049c-4711-8f4b-1010d71680cb"

# Dataset name/description and the base URL of the metadata files to ingest
dataset_name = "1000genomes13"
dataset_description = "1000 genomes dataset"
ingest_file_base = "https://tdrtestdatauscentral.blob.core.windows.net/1000genomes/metadata"

# Snapshot name/description
snapshot_name = "1000genomessnp13"
snapshot_description = "1000 genomes snapshot"

# Local directory that parquet data is copied into
local_parquet_dir = "/tmp/az"

# Create required API Clients (all share the same authenticated api_client)
jobs_api = JobsApi(api_client=api_client)
resources_api = ResourcesApi(api_client=api_client)
datasets_api = DatasetsApi(api_client=api_client)
snapshots_api = SnapshotsApi(api_client=api_client)
drs_api = DataRepositoryServiceApi(api_client=api_client)

## Create the Terra Data Repo billing profile

In [None]:
# Create the billing profile; generate a fresh profile ID when none was configured
if billing_profile_id in (None, ""):
  billing_profile_id = str(uuid.uuid4())

# Request body describing the Azure-backed billing profile
profile_request = dict(
  biller="direct",
  billingAccountId=billing_account_id,
  description=profile_description,
  id=billing_profile_id,
  profileName=profile_name,
  cloudPlatform="azure",
  applicationDeploymentName=application_deployment_name,
  resourceGroupName=resource_group_name,
  subscriptionId=subscription_id,
  tenantId=tenant_id,
)


# Submit the request and block until the asynchronous job completes
create_profile_result = wait_for_job(
  resources_api.create_profile(billing_profile_request=profile_request)
)

## Create Dataset and ingest data

In [None]:
# (column name, datatype) pairs of the demo_pheno_data table, in schema order.
# None of the columns is an array.
pheno_columns = [
  ("pheno_data_id", "string"),
  ("age", "integer"),
  ("bmi_baseline", "float"),
  ("dbgap_accession_number", "string"),
  ("height_baseline", "float"),
  ("ldl", "float"),
  ("hdl", "float"),
  ("population", "string"),
  ("program_name", "string"),
  ("sample_specimen_id", "string"),
  ("sex", "string"),
  ("total_cholesterol", "float"),
  ("triglycerides", "float"),
  ("bam_file", "fileref"),
  ("bam_file_index", "fileref"),
]

# Dataset creation request: one table plus a default asset rooted at that table
dataset_request = {
  "defaultProfileId": billing_profile_id,
  "description": dataset_description,
  "name": dataset_name,
  "cloudPlatform": "azure",
  "region": "westus2",
  "gcpRegion": "us-east1",
  "schema": {
    "tables": [
      {
        "name": "demo_pheno_data",
        "columns": [
          {"name": column_name, "datatype": column_type, "array_of": False}
          for column_name, column_type in pheno_columns
        ],
        "primaryKey": []
      }
    ],
    "assets": [
      {
        "name": "default",
        "tables": [{"name": "demo_pheno_data", "columns": []}],
        "rootTable": "demo_pheno_data",
        "rootColumn": "pheno_data_id",
        "follow": []
      }
    ]
  }
}

# Submit the request and block until the asynchronous job completes
create_dataset_result = wait_for_job(datasets_api.create_dataset(dataset=dataset_request))

In [None]:
# Fetch the full model of the dataset that was just created and print it as JSON
dataset = datasets_api.retrieve_dataset(create_dataset_result["id"])
print(pretty_print_tdr_object(dataset))

In [None]:
# Ingest the JSON metadata file into the demo_pheno_data table.
# The SAS token is appended to the file URL as its query string.
ingest_request = dict(
  format="json",
  ignore_unknown_values=True,
  load_tag="smallload2",
  max_bad_records=0,
  path="%s/demo-pheno-data-small.json?%s" % (ingest_file_base, ingest_sas_token),
  profile_id=billing_profile_id,
  resolve_existing_files=True,
  table="demo_pheno_data",
)
ingest_request_result = wait_for_job(
  datasets_api.ingest_dataset(dataset.id, ingest=ingest_request)
)

In [None]:
# Look at the dataset parquet location
dataset = datasets_api.retrieve_dataset(dataset.id, include=["ACCESS_INFORMATION"])
# Select the table by name; the second argument of next() is only the default
# returned when no table matches
table = next(
  (t for t in dataset.access_information.parquet.tables if t.name == "demo_pheno_data"),
  None
)
if table is None:
  raise ValueError('Table "demo_pheno_data" not found in the dataset access information')
print(pretty_print_tdr_object(table))

In [None]:
# Local path the table's parquet data is copied to
local_table_path = "%s/%s" % (local_parquet_dir, table.name)
# Remove any copy left by a previous run; -f makes a missing path a no-op
subprocess.run(["rm", "-rf", local_table_path], check=True)
# Download the parquet data with azcopy. Arguments are passed as a list (no
# shell quoting issues) and check=True fails the cell if the copy fails.
subprocess.run(
  ["azcopy", "cp", "%s?%s" % (table.url, table.sas_token), local_parquet_dir, "--recursive"],
  check=True
)
df = pd.read_parquet(local_table_path)
# Convert the UUID from binary to readable UUID
df["datarepo_row_id"] = df["datarepo_row_id"].apply(UUID)
df.head(10)

In [None]:
# Snapshot creation request: a full view of the dataset, with no readers
snapshot_request = dict(
  contents=[
    dict(datasetName=dataset_name, mode="byFullView"),
  ],
  description=snapshot_description,
  name=snapshot_name,
  profileId=billing_profile_id,
  readers=[],
)

# Submit the request and block until the asynchronous job completes
create_snapshot_result = wait_for_job(
  snapshots_api.create_snapshot(snapshot=snapshot_request)
)

In [None]:
# Look at the snapshot parquet location
snapshot = snapshots_api.retrieve_snapshot(create_snapshot_result['id'], include=["ACCESS_INFORMATION"])
# Select the table by name; the second argument of next() is only the default
# returned when no table matches
table = next(
  (t for t in snapshot.access_information.parquet.tables if t.name == "demo_pheno_data"),
  None
)
if table is None:
  raise ValueError('Table "demo_pheno_data" not found in the snapshot access information')
print(pretty_print_tdr_object(table))

In [None]:
# Local path the snapshot table's parquet data is copied to
local_table_path = "%s/%s.parquet" % (local_parquet_dir, table.name)
# Remove any copy left by a previous run; -f makes a missing path a no-op
subprocess.run(["rm", "-rf", local_table_path], check=True)
# Download the parquet data with azcopy. Arguments are passed as a list (no
# shell quoting issues) and check=True fails the cell if the copy fails.
subprocess.run(
  ["azcopy", "cp", "%s?%s" % (table.url, table.sas_token), local_parquet_dir, "--recursive"],
  check=True
)
df = pd.read_parquet(local_table_path)
# Convert the UUID from binary to readable UUID
df["datarepo_row_id"] = df["datarepo_row_id"].apply(UUID)
df.head(10)

## Given a DRS ID, access file data
From the output of the previous cell, copy a DRS ID, extract the object ID (in the format `v1_<uuid>_<uuid>`) and save it in the `drs_id` variable.

The following cell obtains a signed URL from TDR using a DRS ID then reads the first few bytes.

In [None]:
# Look up the DRS object, then request an access URL through its first access method
drs_id = ""
drs_object = drs_api.get_object(drs_id)
first_access_method = drs_object.access_methods[0]
drs_access = drs_api.get_access_url(drs_id, first_access_method.access_id)
print(drs_access)

## Read the first 100 bytes of the file represented by DRS ID

In [None]:
# Read only the first 100 bytes of the file represented by the DRS ID.
# read(100) avoids downloading the whole file, and the context manager
# closes the connection afterwards.
with urllib.request.urlopen(drs_access.url) as response:
  print(response.read(100))