# Azure Year 1 Demo

This notebook runs through the following steps:
- Authenticate using B2C
- Create an Azure *billing Profile* in TDR
- Create a *dataset*
- Ingest 1000 Genomes data into the *dataset*
- Create a *snapshot* from the *dataset*
- Read the metadata from the *snapshot* into a Pandas data frame
- Read a DRS object from the metadata and use it to access file data


## Import dependencies

In [None]:
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install azure-storage-file-datalake
!{sys.executable} -m pip install pyarrowfs-adlgen2
!{sys.executable} -m pip install --upgrade data_repo_client

import datetime, getpass, uuid
import time

import pandas as pd
import pyarrow.fs
import pyarrowfs_adlgen2
from azure.storage.filedatalake import FileSystemClient
from data_repo_client import RepositoryApi
from data_repo_client import DatasetsApi
from data_repo_client import JobsApi
from data_repo_client import ResourcesApi
from data_repo_client import ApiClient
from data_repo_client import Configuration
from data_repo_client import UnauthenticatedApi

## Set configuration

Obtain a JWT by going through the auth flow here:
https://tdrb2ctest.b2clogin.com/tdrb2ctest.onmicrosoft.com/oauth2/v2.0/authorize?p=B2C_1A_SIGNUP_SIGNIN&client_id=bc8119eb-e425-4ff7-945a-05f90a37fca7&nonce=defaultNonce&redirect_uri=https%3A%2F%2Fjwt.ms&scope=openid&response_type=id_token&prompt=login

Save the JWT into the `token` variable in the configuration cell below.

In [None]:
# Set up configuration
config = Configuration()
config.host = "https://jade-nm.datarepo-dev.broadinstitute.org/"

# The JWT is a credential: leave `token` empty to be prompted for it, so the
# value never ends up in the saved notebook. Pasting it here still works for a
# quick local run, but do not commit the notebook with a real token.
token = ""
if not token:
    token = getpass.getpass("B2C JWT (id_token): ").strip()
config.access_token = token
apiClient = ApiClient(configuration=config, client_side_validation = False)

# Azure managed application configuration
applicationDeploymentName = "tdrdeployment1"
resourceGroupName = "TDR"
subscriptionId = "71d52ec1-5886-480a-9d6e-ed98cbf1f69f"
tenantId = "efc08443-0082-4d6c-8931-c5794c156abd"
profileNameBase = "azureprofile"

# Remove this once GCP cleanup is merged
billingAccountId = "00708C-45D19D-27AAFA"

# Dataset settings
datasetNameBase = "1000genomes"
datasetDescription = "1000 genomes dataset"
# Base URL of the JSON metadata files that get ingested into the dataset
ingestFileBase = "https://tdrtestdatauscentral.blob.core.windows.net/1000genomes/metadata"

# Snapshot settings
snapshotNameBase = "1000genomes"

In [None]:
# Per-run values built from a fresh UUID and today's date.
# Run this cell only once per session: every execution generates a new
# billingProfileId, and the names below are derived from it.
strToday = datetime.date.today().isoformat()
billingProfileId = str(uuid.uuid4())
# The first 8 characters of the UUID serve as a short tag in the profile name
strHashId = billingProfileId[:8]
print("Billing Profile Id: " + billingProfileId)

profileName = "-".join([profileNameBase, strHashId, strToday])
profileDescription = "Azure profile {} created on {}".format(strHashId, strToday)

datasetName = "".join([datasetNameBase, strToday])

In [None]:
# Build one client per TDR API group, all sharing the authenticated apiClient
jobsApi, resourcesApi, datasetsApi = (
    apiClass(api_client=apiClient)
    for apiClass in (JobsApi, ResourcesApi, DatasetsApi)
)

## Create the Terra Data Repo billing profile 

In [None]:
# Create Billing Profile
# The request ties the new profile to the Azure managed application
# (deployment name, resource group, subscription and tenant) set in the
# configuration cell.
profileRequest = dict(
    biller="direct",
    billingAccountId=billingAccountId,
    description=profileDescription,
    id=billingProfileId,
    profileName=profileName,
    cloudPlatform="azure",
    applicationDeploymentName=applicationDeploymentName,
    resourceGroupName=resourceGroupName,
    subscriptionId=subscriptionId,
    tenantId=tenantId,
)

# Submit the request asynchronously and keep the id of the job it started
createProfileThread = resourcesApi.create_profile(
    billing_profile_request=profileRequest,
    async_req=True,
)
createProfileJobId = createProfileThread.get().id
# Show the current state of the job
jobsApi.retrieve_job(createProfileJobId)

In [None]:
# Wait until the profile-creation job finishes, then show its result.
# Polling here lets the notebook run top to bottom instead of relying on the
# reader to wait and re-run the cell.
# NOTE(review): assumes an in-progress job reports job_status == "running" — confirm against JobModel.
profileJobDeadline = time.monotonic() + 600  # stop polling after 10 minutes
while (jobsApi.retrieve_job(createProfileJobId).job_status == "running"
       and time.monotonic() < profileJobDeadline):
    time.sleep(5)
jobsApi.retrieve_job_result(createProfileJobId)

## Create Dataset and ingest data

In [None]:
# Build the 1000 Genomes dataset request and submit it.
#
# Each table's columns are declared once as (name, datatype) pairs; both the
# table definitions and the asset column lists are derived from those pairs so
# they cannot drift apart.

def _table_spec(tableName, columns):
    """Return a table definition whose columns are all scalar (array_of False)."""
    return {
        "name": tableName,
        "columns": [
            {"name": columnName, "datatype": datatype, "array_of": False}
            for columnName, datatype in columns
        ],
        "primaryKey": []
    }


def _asset_table(tableName, columns):
    """Return an asset table entry that includes every column of the table."""
    return {
        "name": tableName,
        "columns": [columnName for columnName, _ in columns]
    }


pedigreeColumns = [
    ("Family_ID", "string"),
    ("Individual_ID", "string"),
    ("Paternal_ID", "string"),
    ("Maternal_ID", "string"),
    ("Gender", "integer"),
    ("Phenotype", "integer"),
    ("Population", "string"),
    ("Relationship", "string"),
    ("Siblings", "string"),
    ("Second_Order", "string"),
    ("Third_Order", "string"),
    ("Other_Comments", "string"),
]

bamFileColumns = [
    ("Sample", "string"),
    ("BAM_File_Path", "string"),
    ("BAM_File_Ref", "fileref"),
]

sampleInfoColumns = [
    ("Sample", "string"),
    ("Family_ID", "string"),
    ("Population", "string"),
    ("Population_Description", "string"),
    ("Gender", "string"),
    ("Relationship", "string"),
    ("Unexpected_Parent_Child", "string"),
    ("Non_Paternity", "string"),
    ("Siblings", "string"),
    ("Grandparents", "string"),
    ("Avuncular", "string"),
    ("Half_Siblings", "string"),
    ("Unknown_Second_Order", "string"),
    ("Third_Order", "string"),
    ("In_Low_Coverage_Pilot", "boolean"),
    ("LC_Pilot_Platforms", "string"),
    ("LC_Pilot_Centers", "string"),
    ("In_High_Coverage_Pilot", "boolean"),
    ("HC_Pilot_Platforms", "string"),
    ("HC_Pilot_Centers", "string"),
    ("In_Exon_Targetted_Pilot", "boolean"),
    ("ET_Pilot_Platforms", "string"),
    ("ET_Pilot_Centers", "string"),
    ("Has_Sequence_in_Phase1", "boolean"),
    ("Phase1_LC_Platform", "string"),
    ("Phase1_LC_Centers", "string"),
    ("Phase1_E_Platform", "string"),
    ("Phase1_E_Centers", "string"),
    ("In_Phase1_Integrated_Variant_Set", "boolean"),
    ("Has_Phase1_chrY_SNPS", "boolean"),
    ("Has_phase1_chrY_Deletions", "boolean"),
    ("Has_phase1_chrMT_SNPs", "boolean"),
    ("Main_project_LC_Centers", "string"),
    ("Main_project_LC_platform", "string"),
    ("Total_LC_Sequence", "float"),
    ("LC_Non_Duplicated_Aligned_Coverage", "float"),
    ("Main_Project_E_Centers", "string"),
    ("Main_Project_E_Platform", "string"),
    ("Total_Exome_Sequence", "float"),
    ("X_Targets_Covered_to_20x_or_greater", "float"),
    ("VerifyBam_E_Omni_Free", "float"),
    ("VerifyBam_E_Affy_Free", "float"),
    ("VerifyBam_E_Omni_Chip", "float"),
    ("VerifyBam_E_Affy_Chip", "float"),
    ("VerifyBam_LC_Omni_Free", "float"),
    ("VerifyBam_LC_Affy_Free", "float"),
    ("VerifyBam_LC_Omni_Chip", "float"),
    ("VerifyBam_LC_Affy_Chip", "float"),
    ("LC_Indel_Ratio", "float"),
    ("E_Indel_Ratio", "float"),
    ("LC_Passed_QC", "boolean"),
    ("E_Passed_QC", "boolean"),
    ("In_Final_Phase_Variant_Calling", "boolean"),
    ("Has_Omni_Genotypes", "boolean"),
    ("Has_Axiom_Genotypes", "boolean"),
    ("Has_Affy_6_0_Genotypes", "boolean"),
    ("Has_Exome_LOF_Genotypes", "boolean"),
    ("EBV_Coverage", "float"),
    ("DNA_Source_from_Coriell", "string"),
    ("Has_Sequence_from_Blood_in_Index", "boolean"),
    ("Super_Population", "string"),
    ("Super_Population_Description", "string"),
]

vcfFileColumns = [
    ("VCF_File_Path", "string"),
    ("VCF_File_Ref", "fileref"),
    ("Description", "string"),
]

datasetRequest = {
  "defaultProfileId": billingProfileId,
  "description": datasetDescription,
  # Use the per-run name from the configuration cells rather than a fixed
  # literal, so the request does not collide with a dataset left by an earlier
  # run. Hyphens from the date are replaced with underscores.
  # NOTE(review): assumes TDR dataset names allow only letters, digits and
  # underscores — confirm against the API's name pattern.
  "name": datasetName.replace("-", "_"),
  "cloudPlatform": "azure",
  "region": "centralus",
  "gcpRegion": "us-east1",
  "schema": {
      "tables": [
        _table_spec("pedigree", pedigreeColumns),
        _table_spec("bam_file", bamFileColumns),
        _table_spec("sample_info", sampleInfoColumns),
        _table_spec("vcf_file", vcfFileColumns)
      ],
      "relationships": [
        {
          "name": "pedigree_sample",
          "from": {"table": "pedigree", "column": "Family_ID"},
          "to": {"table": "sample_info", "column": "Family_ID"}
        },
        {
          "name": "sample_bam",
          "from": {"table": "sample_info", "column": "Sample"},
          "to": {"table": "bam_file", "column": "Sample"}
        }
      ],
      # Every asset is rooted at sample_info.Sample; they differ only in which
      # tables they include and which relationships they follow.
      "assets": [
        {
          "name": "sample_pedigree",
          "tables": [
            _asset_table("pedigree", pedigreeColumns),
            _asset_table("sample_info", sampleInfoColumns)
          ],
          "rootTable": "sample_info",
          "rootColumn": "Sample",
          "follow": ["pedigree_sample"]
        },
        {
          "name": "sample_bam",
          "tables": [
            _asset_table("sample_info", sampleInfoColumns),
            _asset_table("bam_file", bamFileColumns)
          ],
          "rootTable": "sample_info",
          "rootColumn": "Sample",
          "follow": ["sample_bam"]
        },
        {
          "name": "default",
          "tables": [
            _asset_table("pedigree", pedigreeColumns),
            _asset_table("sample_info", sampleInfoColumns),
            _asset_table("bam_file", bamFileColumns)
          ],
          "rootTable": "sample_info",
          "rootColumn": "Sample",
          "follow": ["pedigree_sample", "sample_bam"]
        }
      ]
    }
}

# async request to create dataset
createDatasetThread = datasetsApi.create_dataset(async_req=True, dataset=datasetRequest)
# retrieve JobModel generated by create_dataset request
createDatasetJobId = createDatasetThread.get().id
# Check status of job
jobsApi.retrieve_job(createDatasetJobId)

In [None]:
# Wait for the dataset-creation job to finish, then show its final record.
# Polling replaces re-running this cell by hand until job_status changes.
# NOTE(review): assumes an in-progress job reports job_status == "running" — confirm against JobModel.
datasetJobDeadline = time.monotonic() + 600  # stop polling after 10 minutes
while (jobsApi.retrieve_job(createDatasetJobId).job_status == "running"
       and time.monotonic() < datasetJobDeadline):
    time.sleep(5)
jobsApi.retrieve_job(createDatasetJobId)

In [None]:
# Fetch the result of the dataset-creation job and keep it as `dataset`;
# later cells read dataset['id'] from it.
dataset = jobsApi.retrieve_job_result(createDatasetJobId)
print(dataset)

In [None]:
# Ingest Pedigree Data
# Load the pedigree JSON file from the ingest location into the "pedigree"
# table of the dataset created above.
ingest_request = dict(
    format="json",
    ignore_unknown_values=True,
    load_tag="myloadtag",
    max_bad_records=0,
    path=ingestFileBase + "/1000-genomes-pedigree.json",
    profile_id=billingProfileId,
    resolve_existing_files=True,
    table="pedigree",
)
ingestDatasetThread = datasetsApi.ingest_dataset(
    dataset['id'],
    ingest=ingest_request,
    async_req=True,
)
# Keep the id of the job started by the ingest_dataset request
ingestDatasetJobId = ingestDatasetThread.get().id
# Show the current state of the job
jobsApi.retrieve_job(ingestDatasetJobId)


In [None]:
# Wait for the ingest job to finish, then show its final record.
# Polling replaces re-running this cell by hand until job_status changes.
# NOTE(review): assumes an in-progress job reports job_status == "running" — confirm against JobModel.
ingestJobDeadline = time.monotonic() + 600  # stop polling after 10 minutes
while (jobsApi.retrieve_job(ingestDatasetJobId).job_status == "running"
       and time.monotonic() < ingestJobDeadline):
    time.sleep(5)
jobsApi.retrieve_job(ingestDatasetJobId)

In [None]:
# Look at the dataset parquet location.
# Query the dataset created earlier in this notebook rather than a fixed id
# left over from a previous run, which other users may not have.
datasetsApi.retrieve_dataset(dataset['id'], include=['ACCESS_INFORMATION'])

In [None]:
# Load parquet data
# NOTE(review): the storage account URL and parquet file path below come from
# an earlier run — replace them with the values for your dataset (presumably
# taken from the ACCESS_INFORMATION output of the previous cell; confirm).
storageAccountUrl = 'https://tdrdep1uycicoiiinjollozm.dfs.core.windows.net'
pedigreeParquetPath = 'parquet/pedigree/D9Liv9NYSbOli6eMeZpSHQ.parquet'

# The SAS token is a short-lived credential, so prompt for it instead of
# embedding it in the notebook (the token previously hard-coded here expired
# on 2021-10-27).
sas = getpass.getpass("SAS token for the 'metadata' file system: ").strip()

fsc = FileSystemClient(storageAccountUrl, 'metadata', sas)
# Expose the ADLS Gen2 file system to pyarrow so pandas can read from it
handler = pyarrowfs_adlgen2.FilesystemHandler(fsc)
fs = pyarrow.fs.PyFileSystem(handler)
df = pd.read_parquet(pedigreeParquetPath, filesystem=fs)
df.head(10)

In [None]:
# DELETE before publishing: debugging aid that lists billing profiles
# (the call enumerates profiles, not datasets).
resourcesApi.enumerate_profiles()