# 1000 Genomes Load

In [None]:
pip install data-repo-client

## Configuration

### Import libraries

In [None]:
import datetime, getpass, uuid
from data_repo_client import RepositoryApi
from data_repo_client import ResourcesApi
from data_repo_client import ApiClient
from data_repo_client import Configuration
from data_repo_client import UnauthenticatedApi

### Authenticate

Retrieve your access token by running these commands in your terminal

  1. Log in as the desired user

`gcloud auth login`

  2. Print the token to use in the next step

`gcloud auth print-access-token`

In [None]:
# Build the client configuration and point it at the Data Repo instance to use.
config = Configuration()
# Alternative hosts (swap the active assignment below to switch targets):
#   config.host = "https://jade-sh.datarepo-dev.broadinstitute.org/"
#   config.host = "http://localhost:8080/"
config.host = "https://data.shelby-test-tdr-apr-13.bee.envs-terra.bio/"

# Prompt for the access token printed by `gcloud auth print-access-token`.
config.access_token = getpass.getpass("Paste token data ")

# One client object, reused by every API wrapper created later in the notebook.
apiClient = ApiClient(config)
apiClient.client_side_validation = False


In [None]:
# Instantiate the three API wrappers used in this notebook on the shared client.
repoApi = RepositoryApi(apiClient)
resourceApi = ResourcesApi(apiClient)
unauthenticated = UnauthenticatedApi(apiClient)

### Basic Examples

In [None]:
# Example unauthenticated command: fetch the repository configuration through
# the UnauthenticatedApi wrapper and print it.
print(unauthenticated.retrieve_repository_config())

In [None]:
# List billing profiles; as the last expression of the cell, the result is
# displayed as the cell output.
resourceApi.enumerate_profiles()

## Create Billing Profile

### Define parameters  & format request

In [None]:
# Generate a fresh profile id and derive a short, dated name/description from it.
billingProfileId = str(uuid.uuid4())
print(f"Billing Profile Id: {billingProfileId}")
billingAccountId = "00708C-45D19D-27AAFA"

# The first 8 characters of the UUID serve as a short unique tag.
strHashId = billingProfileId[:8]
strToday = datetime.date.today().isoformat()
description = f"test for {strHashId} on {strToday}"
profileName = f"test-{strHashId}-{strToday}"

print(f"Profile Name: {profileName}\nProfile Description: {description}")

In [None]:
# Assemble the billing-profile creation payload as a plain Python dict
# from the parameters defined in the previous cell.
profileRequest = dict(
    biller="direct",
    billingAccountId=billingAccountId,
    description=description,
    id=billingProfileId,
    profileName=profileName,
)
print(profileRequest)

### Make async request to create new profile and check job status

In [None]:
# Submit the profile creation with async_req=True; the call hands back a
# handle, and .get() on that handle yields an object whose `id` is used as the
# job id in the retrieve_job calls.
createProfileThread = resourceApi.create_profile(async_req=True, billing_profile_request=profileRequest)
createProfileJobId = createProfileThread.get().id
# Look the job up once right away; the result is shown as the cell output.
repoApi.retrieve_job(createProfileJobId)

In [None]:
# Re-run this cell until the displayed job status = succeeded
repoApi.retrieve_job(createProfileJobId)

In [None]:
# Check that the new profile now shows up in the profile listing
resourceApi.enumerate_profiles()

### Retrieve newly created profile

In [None]:
# Retrieve the newly created profile by the id that was sent in the create
# request (billingProfileId). Taking items[0] of the profile listing would
# return whichever profile happens to be listed first, which is not
# guaranteed to be the one created above.
profileId = billingProfileId
resourceApi.retrieve_profile(profileId)

## Use the new billing profile to create dataset

### Define parameters  & format request

In [None]:
# A short unique tag plus today's date keep the dataset name/description
# distinct from one run to the next.
str_uuid = str(uuid.uuid4())
strHashId = str_uuid[:8]
strToday = datetime.date.today().isoformat()
datasetDescription = f"Public dataset 1000 Genomes. {strHashId} on {strToday}"
datasetName = f"1000GenomesDataset{strHashId}"

# Dataset creation payload. Tables, relationships and assets are written as
# compact tuples and expanded by comprehensions into the nested dict layout:
#   column       -> {"name": ..., "datatype": ...}
#   relationship -> {"name": ..., "to": {...}, "from": {...}}
#   asset        -> {"name", "rootTable", "rootColumn", "tables", "follow"}
datasetRequest = {
    "defaultProfileId": profileId,
    "description": datasetDescription,
    "name": datasetName,
    "schema": {
        "tables": [
            {
                "name": tableName,
                "columns": [
                    {"name": columnName, "datatype": columnType}
                    for columnName, columnType in columnPairs
                ],
            }
            for tableName, columnPairs in [
                (
                    "sample_info",
                    [
                        ("Sample", "string"),
                        ("Family_ID", "string"),
                        ("Population", "string"),
                        ("Population_Description", "string"),
                        ("Gender", "string"),
                        ("Relationship", "string"),
                        ("Unexpected_Parent_Child", "string"),
                        ("Non_Paternity", "string"),
                        ("Siblings", "string"),
                        ("Grandparents", "string"),
                        ("Avuncular", "string"),
                        ("Half_Siblings", "string"),
                        ("Unknown_Second_Order", "string"),
                        ("Third_Order", "string"),
                        ("In_Low_Coverage_Pilot", "boolean"),
                        ("LC_Pilot_Platforms", "string"),
                        ("LC_Pilot_Centers", "string"),
                        ("In_High_Coverage_Pilot", "boolean"),
                        ("HC_Pilot_Platforms", "string"),
                        ("HC_Pilot_Centers", "string"),
                        ("In_Exon_Targetted_Pilot", "boolean"),
                        ("ET_Pilot_Platforms", "string"),
                        ("ET_Pilot_Centers", "string"),
                        ("Has_Sequence_in_Phase1", "boolean"),
                        ("Phase1_LC_Platform", "string"),
                        ("Phase1_LC_Centers", "string"),
                        ("Phase1_E_Platform", "string"),
                        ("Phase1_E_Centers", "string"),
                        ("In_Phase1_Integrated_Variant_Set", "boolean"),
                        ("Has_Phase1_chrY_SNPS", "boolean"),
                        ("Has_phase1_chrY_Deletions", "boolean"),
                        ("Has_phase1_chrMT_SNPs", "boolean"),
                        ("Main_project_LC_Centers", "string"),
                        ("Main_project_LC_platform", "string"),
                        ("Total_LC_Sequence", "float"),
                        ("LC_Non_Duplicated_Aligned_Coverage", "float"),
                        ("Main_Project_E_Centers", "string"),
                        ("Main_Project_E_Platform", "string"),
                        ("Total_Exome_Sequence", "float"),
                        ("X_Targets_Covered_to_20x_or_greater", "float"),
                        ("VerifyBam_E_Omni_Free", "float"),
                        ("VerifyBam_E_Affy_Free", "float"),
                        ("VerifyBam_E_Omni_Chip", "float"),
                        ("VerifyBam_E_Affy_Chip", "float"),
                        ("VerifyBam_LC_Omni_Free", "float"),
                        ("VerifyBam_LC_Affy_Free", "float"),
                        ("VerifyBam_LC_Omni_Chip", "float"),
                        ("VerifyBam_LC_Affy_Chip", "float"),
                        ("LC_Indel_Ratio", "float"),
                        ("E_Indel_Ratio", "float"),
                        ("LC_Passed_QC", "boolean"),
                        ("E_Passed_QC", "boolean"),
                        ("In_Final_Phase_Variant_Calling", "boolean"),
                        ("Has_Omni_Genotypes", "boolean"),
                        ("Has_Axiom_Genotypes", "boolean"),
                        ("Has_Affy_6_0_Genotypes", "boolean"),
                        ("Has_Exome_LOF_Genotypes", "boolean"),
                        ("EBV_Coverage", "float"),
                        ("DNA_Source_from_Coriell", "string"),
                        ("Has_Sequence_from_Blood_in_Index", "boolean"),
                        ("Super_Population", "string"),
                        ("Super_Population_Description", "string"),
                    ],
                ),
                (
                    "pedigree",
                    [
                        ("Family_ID", "string"),
                        ("Individual_ID", "string"),
                        ("Paternal_ID", "string"),
                        ("Maternal_ID", "string"),
                        ("Gender", "integer"),
                        ("Phenotype", "integer"),
                        ("Population", "string"),
                        ("Relationship", "string"),
                        ("Siblings", "string"),
                        ("Second_Order", "string"),
                        ("Third_Order", "string"),
                        ("Other_Comments", "string"),
                    ],
                ),
                (
                    "bam_file",
                    [
                        ("Sample", "string"),
                        ("BAM_File_Path", "string"),
                        ("BAM_File_Ref", "fileref"),
                    ],
                ),
                (
                    "vcf_file",
                    [
                        ("VCF_File_Path", "string"),
                        ("VCF_File_Ref", "fileref"),
                        ("Description", "string"),
                    ],
                ),
            ]
        ],
        "relationships": [
            {
                "name": relationshipName,
                "to": {"table": toTable, "column": toColumn},
                "from": {"table": fromTable, "column": fromColumn},
            }
            for relationshipName, (toTable, toColumn), (fromTable, fromColumn) in [
                ("pedigree_sample", ("sample_info", "Family_ID"), ("pedigree", "Family_ID")),
                ("sample_bam", ("bam_file", "Sample"), ("sample_info", "Sample")),
            ]
        ],
        "assets": [
            {
                "name": assetName,
                "rootTable": "sample_info",
                "rootColumn": "Sample",
                "tables": [
                    {"name": assetTable, "columns": []}
                    for assetTable in assetTables
                ],
                "follow": [followedRelationship],
            }
            for assetName, assetTables, followedRelationship in [
                ("sample_pedigree", ("sample_info", "pedigree"), "pedigree_sample"),
                ("sample_bam", ("sample_info", "bam_file"), "sample_bam"),
            ]
        ],
    },
}

print(datasetRequest)


### Make async request to create new dataset and check job status

In [None]:
# Async request to create the dataset: with async_req=True the call returns a
# handle, which the following cells call .get() on.
createDatasetThread = repoApi.create_dataset(async_req=True, dataset=datasetRequest)

In [None]:
# Fetch the response to the create_dataset request; it is displayed as the
# cell output.
createDatasetThread.get()

In [None]:
# Retrieve the job model generated by the create_dataset request and keep its
# id for the retrieve_job calls.
createDatasetJobId = createDatasetThread.get().id
# Check status of job
repoApi.retrieve_job(createDatasetJobId)

In [None]:
# Re-run this cell until job_status equals succeeded
repoApi.retrieve_job(createDatasetJobId)

In [None]:
# Once job_status = succeeded, list the 5 most recently created datasets
# (sorted by created_date, descending) to check that the new one appears.
repoApi.enumerate_datasets(sort="created_date", direction="desc", limit=5)

### Retrieve newly created dataset

In [None]:
# Retrieve the dataset created above.
# The listing is narrowed with the unique dataset name generated for the
# create request and then matched exactly. Simply taking the single newest
# dataset could pick up a dataset someone else created in the meantime on a
# shared instance, and would fail with a bare IndexError on an empty listing.
datasetList = repoApi.enumerate_datasets(
    filter=datasetName, sort="created_date", direction="desc", limit=5
)
newDataset = next((d for d in datasetList.items if d.name == datasetName), None)
if newDataset is None:
    raise LookupError(
        f"Dataset {datasetName!r} not found - has the create job succeeded yet?"
    )
newDatasetId = newDataset.id
repoApi.retrieve_dataset(newDatasetId)
# To work against an existing dataset instead, assign its id, e.g.:
# newDatasetId = "c5248002-05d6-469e-bf31-38f1792198f3"

## Ingest Metadata (AKA Tabular Data) into Sample Info and Pedigree tables

In [None]:
# Ingest request for the sample_info table: a JSON file in GCS, tagged with a
# short unique load tag taken from a fresh UUID.
str_uuid_ingest = str(uuid.uuid4())
strHashIdIngest = str_uuid_ingest[:8]
ingestRequest = dict(
    table="sample_info",
    path="gs://jade-testdata/1000_genomes_sample_info.json",
    format="json",
    load_tag=strHashIdIngest,
    profile_id=profileId,
)

In [None]:
# Submit the sample_info ingest against the new dataset; async_req=True returns
# a handle whose .get() is called in the next cell.
ingestSampleInfoThread = repoApi.ingest_dataset(async_req=True, id=newDatasetId, ingest=ingestRequest)

In [None]:

# Take the job id from the ingest response and look the job up; re-run the
# cell to refresh the displayed status.
ingestDatasetJobId = ingestSampleInfoThread.get().id
repoApi.retrieve_job(ingestDatasetJobId)

In [None]:
# Ingest request for the pedigree table: same shape as the sample_info request,
# with its own JSON file and a freshly generated load tag.
str_uuid_ingest = str(uuid.uuid4())
strHashIdIngest = str_uuid_ingest[:8]
ingestPedigreeRequest = dict(
    table="pedigree",
    path="gs://jade-testdata/1000_genomes_pedigree.json",
    format="json",
    load_tag=strHashIdIngest,
    profile_id=profileId,
)

In [None]:
# Submit the pedigree ingest against the new dataset (async handle returned).
ingestPedigreeThread = repoApi.ingest_dataset(async_req=True, id=newDatasetId, ingest=ingestPedigreeRequest)

In [None]:
# Job id of the pedigree ingest; this overwrites the id stored for the
# sample_info ingest.
ingestDatasetJobId = ingestPedigreeThread.get().id

In [None]:
# Look up the pedigree ingest job; re-run to refresh the displayed status.
repoApi.retrieve_job(ingestDatasetJobId)

## Combined ingest into BAM table
The files are listed here: https://github.com/DataBiosphere/jade-search-poc/blob/master/1000Genomes/file-load/filepath-list.txt

In [None]:
# Combined ingest into bam_file: the single inline record carries the tabular
# values together with the source/target description for the fileref column.
str_uuid_ingest = str(uuid.uuid4())
strHashIdIngest = str_uuid_ingest[:8]
ingestRequest = dict(
    table="bam_file",
    records=[
        {
            "Sample": "HG00096",
            "BAM_File_Path": "genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam",
            "BAM_File_Ref": dict(
                source_path="gs://genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam",
                target_path="/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam",
                mime_type="text/plain",
            ),
        }
    ],
    format="array",
    load_tag=strHashIdIngest,
    profile_id=profileId,
)

In [None]:
# Submit the bam_file ingest. NOTE: the variable name is reused from the
# sample_info ingest even though this request targets the bam_file table.
ingestSampleInfoThread = repoApi.ingest_dataset(async_req=True, id=newDatasetId, ingest=ingestRequest)

In [None]:
# Job id of the bam_file ingest (replaces the previously stored ingest job id).
ingestDatasetJobId = ingestSampleInfoThread.get().id

In [None]:
# Look up the bam_file ingest job; re-run to refresh the displayed status.
repoApi.retrieve_job(ingestDatasetJobId)