# Imports

In [1]:
# !pip install import_ipynb polling2 dataclasses_json

In [1]:
import import_ipynb
import os
import pandas as pd
from tdr_dataset_ingest import *
from transform_task import *
from source_files_creation import *
import build_file_manifest as bfm
import biosample_transforms as bst
import sequencingactivity_transforms as sat
import file_transforms as ft
import familymember_transforms as fmt
import diagnosis_transforms as dt
import donor_transforms as dont
import variantcall_transforms as vct
import output_data_profiling as odp
from oauth2client.client import GoogleCredentials
import requests
import logging

importing Jupyter notebook from tdr_dataset_ingest.ipynb
workspace name = anvil_cmg_ingest_resources
workspace project = dsp-data-ingest
workspace bucket = gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46
importing Jupyter notebook from transform_task.ipynb
workspace name = anvil_cmg_ingest_resources
workspace project = dsp-data-ingest
workspace bucket = gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46
importing Jupyter notebook from source_files_creation.ipynb
workspace name = anvil_cmg_ingest_resources
workspace project = dsp-data-ingest
workspace bucket = gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46
importing Jupyter notebook from build_file_manifest.ipynb
importing Jupyter notebook from biosample_transforms.ipynb
importing Jupyter notebook from sequencingactivity_transforms.ipynb
importing Jupyter notebook from file_transforms.ipynb
importing Jupyter notebook from familymember_transforms.ipynb
importing Jupyter notebook from diagnosis_transforms.ipynb
importing Jupyter notebook from donor_

# Pipeline Run Variables

In [5]:
# Has dataset already been created? False if so.  The pipeline will then get the dataset id using the dataset_name value
should_create_dataset = False

# Can the creation of source files be skipped? True if so.
skip_source_file_creation = False

# Configure logging format
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO)

# Transform Variables
params = {}
params["tf_input_dir"] = "ingest_pipeline/input/metadata"
params["tf_output_dir"] = "ingest_pipeline/output/tim_core/metadata"
params["val_output_dir"] = "ingest_pipeline/output/tim_core/validation"
params["tdr_schema_file"] = "ingest_pipeline/output/tim_core/schema/tdr_schema_object.json"
params["data_files_src_bucket"] = "fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46"
params["data_files_src_dirs"] = []  # Leave empty to include all
params["data_files_src_dirs_exclude"] = [] 
params["fileref_columns"] = ["sequencing_id", "seq_filename", "capture_region_bed_file"]

# List of tuples (transform file, table_name (target table/target tsv name minus type))
transformables = [("donor_transforms", "donor"),
                  ("familymember_transforms", "familymember"),
                  ("diagnosis_transforms", "diagnosis"),
                  ("biosample_transforms", "biosample"),
                  ("sequencingactivity_transforms", "sequencingactivity"), 
                  ("file_transforms", "file"),
                  (None, "variantcall"),
                  (None, "dataset"),
                  (None, "project")]

# Ingest Variables
dataset_name = ws_name.replace('-', '_') + '_TEST'
profile_id = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
dataset_key = params["tdr_schema_file"]
parsed_bucket = urlparse(ws_bucket)
bucket_name = parsed_bucket.netloc

## Setup Google Creds
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

## Setup TDR Client
client = TDRClient("data.terra.bio", creds.token)

print(f"should_create_dataset = {should_create_dataset}")
print("tf_input_dir = {in_dir}".format(in_dir = params["tf_input_dir"]))
print("tf_output_dir = {out_dir}".format(out_dir = params["tf_output_dir"]))
print(f"dataset_key = {dataset_key}")
print(f"bucket_name = {bucket_name}")


should_create_dataset = False
tf_input_dir = ingest_pipeline/input/metadata
tf_output_dir = ingest_pipeline/output/tim_core/metadata
dataset_key = ingest_pipeline/output/tim_core/schema/tdr_schema_object.json
bucket_name = fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46


# Pipeline Defs

In [6]:
def get_access_token():
    """Get access token."""

    scopes = ["https://www.googleapis.com/auth/userinfo.profile", "https://www.googleapis.com/auth/userinfo.email"]
    credentials = GoogleCredentials.get_application_default()
    credentials = credentials.create_scoped(scopes)

    return credentials.get_access_token().access_token

def file_exists(key: str) -> bool:
    logging.info(f"Checking file {key}")
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    
    return storage.Blob(bucket=bucket, name=key).exists(storage_client)

def run_ingest(dataset_id: str):    
    # Loop through tfs to get the table files and ingest
    for tf in transformables:
        table = tf[1]
        tf_output_dir = params["tf_output_dir"]
        key = f"{tf_output_dir}/{table}.json"
        metadata_path = f"{ws_bucket}/{key}"
        
        if file_exists(key):
            logging.info(f"Running ingest: {table}")
            ingest_req = TDRIngestRequest(table=table, format="json", path=metadata_path, resolve_existing_files=True, updateStrategy=UpdateEnum.REPLACE)    
            ingest_resp = client.ingest(dataset_id, ingest_req)
        
            poll_resp = client.poll_job_status(ingest_resp.id)
        
            logging.info(f"Ingest {table} response {poll_resp}")
        else:
            logging.warning(f"Metadata file does not exist.  Skipping: {table}")
            continue
            
def run_dataset() -> str:
    logging.info(f"Running create dataset {should_create_dataset}")
    
    dataset_id = ""
    
    # Only create the dataset if needed
    if should_create_dataset == True:
        logging.info("Creating dataset")
        # Grab Dataset config from storage
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        dataset_blob = bucket.blob(dataset_key)
        
        # Grab phsId from the workspace attributes
        w_att = get_workspace_attributes(ws_project, ws_name)
        phs_id = get_workspace_phs_id(w_att)

        ## Build Validated Request
        schema = SchemaModel(**json.loads(dataset_blob.download_as_string(client=None)))        
        dataset_req = TDRDatasetRequest(name=dataset_name, defaultProfileId=profile_id, schema=schema, experimentalSelfHosted=True, phsId=phs_id)

        resp = client.create_dataset(dataset_req)

        poll_resp = client.poll_job_status(resp.id)
        
    logging.info("Getting dataset")
    
    # Now grab the dataset by name   
    dataset_resp = client.get_dataset_by_name(TDRDatasetSearchRequest(filter=dataset_name))
    
    logging.info(f"dataset_resp = {dataset_resp}")
    
    # Grab first item from dataset_resp
    dataset_id = dataset_resp.items[0]["id"]
    logging.info(f"dataset_id = {dataset_id}")
    return dataset_id

def get_dataset_details(id: str) -> TDRDatasetDetail:
    return client.get_dataset_details(id)

def run_transform(params):
    for tf in transformables:
        logging.info(f"Running {tf[0]}")
        if tf[0] == None:
            continue
        elif tf[0] == "biosample_transforms":
            bst.transform(params)
        elif tf[0] == "familymember_transforms":
            fmt.transform(params)
        elif tf[0] == "diagnosis_transforms":
             dt.transform(params)
        elif tf[0] == "donor_transforms":
            dont.transform(params)
        elif tf[0] == "sequencingactivity_transforms":
            sat.transform(params)
        elif tf[0] == "file_transforms":
            ft.transform(params)
        elif tf[0] == "variantcall_transforms":
            logging.warning("Don't forget to test varicantcall_transforms when available!")
        elif tf[0] == "project":
            logging.warning("Don't forget to test project transforms when available!")
        elif tf[0] == "dataset":
            logging.warning("Don't forget to test dataset transforms when available!")
    
def run_source_file_creation():
    if skip_source_file_creation == True:
        logging.info("Skipping source file creation")
    else:
        logging.info("Running source file creation")
        create_source_files()

def run_build_file_manifest(params):
    logging.info("Building data file manifest")
    manifest = bfm.build_manifest(params)
    return manifest
    
def run_output_data_profiling(params, dataset_id):
    logging.info("Running output data profiling")
    
    # Get dataset details to add to params dict
    dataset_details = (get_dataset_details(dataset_id)).accessInformation.bigQuery
    dataset_bq_name = dataset_details.datasetName 
    dataset_bq_project_id = dataset_details.projectId
    params["bq_project"] = dataset_bq_project_id
    params["bq_schema"] = dataset_bq_name
    
    # Profile data
    odp.profile_data(params)

def share_dataset(dataset_id):
    uri = f"https://data.terra.bio/api/repository/v1/datasets/{dataset_id}/policies/steward/members"
    
    headers = {"Authorization": "Bearer " + get_access_token(),
               "accept": "application/json",
               "Content-Type": "application/json"}
    
    email = json.dumps({"email":"prod-dsp-data-ingest@firecloud.org"})
    request = requests.post(uri, headers=headers, data=email)
    
    logging.info(f"Adding {email} as Steward to dataset.")
    
def run_pipeline():
    # Step 1: Create Source Files
    run_source_file_creation()
    
    # Step 2: Build File Manifest
    file_manifest = run_build_file_manifest(params)
    params["file_manifest"] = file_manifest
    
    # Step 3: Transform Structured Data
    run_transform(params)
    
    # Step 4: Create or Retrieve Dataset
    dataset_id = run_dataset()
    share_dataset(dataset_id)

    # Step 5: Ingest Data
    run_ingest(dataset_id)
    
    # Step 6: Share Dataset
    share_dataset(dataset_id)
    
    # Step 7: Profile Output Data 
    run_output_data_profiling(params, dataset_id) 
    
    logging.info("Yay! The pipeline has completed!")
    

# Run Pipeline

In [7]:
run_pipeline()

05/11/2022 02:11:05 PM - INFO: Skipping source file creation
05/11/2022 02:11:05 PM - INFO: Building data file manifest
05/11/2022 02:11:08 PM - INFO: Running donor_transforms
05/11/2022 02:11:14 PM - INFO: Running familymember_transforms
05/11/2022 02:11:18 PM - INFO: Running diagnosis_transforms
05/11/2022 02:11:22 PM - INFO: Running biosample_transforms
05/11/2022 02:11:26 PM - INFO: Running sequencingactivity_transforms
05/11/2022 02:11:29 PM - INFO: Running file_transforms
05/11/2022 02:11:33 PM - INFO: Running None
05/11/2022 02:11:33 PM - INFO: Running None
05/11/2022 02:11:33 PM - INFO: Running None
05/11/2022 02:11:33 PM - INFO: Running create dataset False
05/11/2022 02:11:33 PM - INFO: Getting dataset
05/11/2022 02:11:33 PM - INFO: dataset_resp = TDRDataset(total=295, filteredTotal=1, items=[{'id': 'fce017a3-1fc0-4cd6-9638-6a1f8dcd1c09', 'name': 'anvil_cmg_ingest_resources_TEST', 'description': None, 'defaultProfileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'createdDate': 

params: {'json': {'table': 'donor', 'format': 'json', 'path': 'gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/donor.json', 'resolve_existing_files': True, 'updateStrategy': 'replace'}}


05/11/2022 02:12:05 PM - INFO: Ingest donor response TDRResponse(id='rbwpgO2iTF6tGZXOsxannQ', job_status='succeeded', status_code=200, description='Ingest from gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/donor.json to donor in dataset id fce017a3-1fc0-4cd6-9638-6a1f8dcd1c09', submitted='2022-05-11T14:11:35.008521Z', completed='2022-05-11T14:11:56.177675Z', class_name='bio.terra.service.dataset.flight.ingest.DatasetIngestFlight')
05/11/2022 02:12:05 PM - INFO: Checking file ingest_pipeline/output/tim_core/metadata/familymember.json
05/11/2022 02:12:05 PM - INFO: Running ingest: familymember


params: {'json': {'table': 'familymember', 'format': 'json', 'path': 'gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/familymember.json', 'resolve_existing_files': True, 'updateStrategy': 'replace'}}


05/11/2022 02:12:35 PM - INFO: Ingest familymember response TDRResponse(id='lB07Fc_oREKeW5nrLfMBGA', job_status='succeeded', status_code=200, description='Ingest from gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/familymember.json to familymember in dataset id fce017a3-1fc0-4cd6-9638-6a1f8dcd1c09', submitted='2022-05-11T14:12:05.462133Z', completed='2022-05-11T14:12:24.970298Z', class_name='bio.terra.service.dataset.flight.ingest.DatasetIngestFlight')
05/11/2022 02:12:35 PM - INFO: Checking file ingest_pipeline/output/tim_core/metadata/diagnosis.json
05/11/2022 02:12:36 PM - INFO: Running ingest: diagnosis


params: {'json': {'table': 'diagnosis', 'format': 'json', 'path': 'gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/diagnosis.json', 'resolve_existing_files': True, 'updateStrategy': 'replace'}}


05/11/2022 02:13:06 PM - INFO: Ingest diagnosis response TDRResponse(id='8r2KWmo7TwGtzy4kXDZ9ew', job_status='succeeded', status_code=200, description='Ingest from gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/diagnosis.json to diagnosis in dataset id fce017a3-1fc0-4cd6-9638-6a1f8dcd1c09', submitted='2022-05-11T14:12:36.262334Z', completed='2022-05-11T14:12:56.687016Z', class_name='bio.terra.service.dataset.flight.ingest.DatasetIngestFlight')
05/11/2022 02:13:06 PM - INFO: Checking file ingest_pipeline/output/tim_core/metadata/biosample.json
05/11/2022 02:13:06 PM - INFO: Running ingest: biosample


params: {'json': {'table': 'biosample', 'format': 'json', 'path': 'gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/biosample.json', 'resolve_existing_files': True, 'updateStrategy': 'replace'}}


05/11/2022 02:13:37 PM - INFO: Ingest biosample response TDRResponse(id='F8IU1aqlRvWcSgFwbhE6Uw', job_status='succeeded', status_code=200, description='Ingest from gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/biosample.json to biosample in dataset id fce017a3-1fc0-4cd6-9638-6a1f8dcd1c09', submitted='2022-05-11T14:13:06.774206Z', completed='2022-05-11T14:13:27.602732Z', class_name='bio.terra.service.dataset.flight.ingest.DatasetIngestFlight')
05/11/2022 02:13:37 PM - INFO: Checking file ingest_pipeline/output/tim_core/metadata/sequencingactivity.json
05/11/2022 02:13:37 PM - INFO: Running ingest: sequencingactivity


params: {'json': {'table': 'sequencingactivity', 'format': 'json', 'path': 'gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/sequencingactivity.json', 'resolve_existing_files': True, 'updateStrategy': 'replace'}}


05/11/2022 02:14:08 PM - INFO: Ingest sequencingactivity response TDRResponse(id='_opvVDIzQWeNZnhF7pgwKA', job_status='succeeded', status_code=200, description='Ingest from gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/sequencingactivity.json to sequencingactivity in dataset id fce017a3-1fc0-4cd6-9638-6a1f8dcd1c09', submitted='2022-05-11T14:13:37.934812Z', completed='2022-05-11T14:13:57.292291Z', class_name='bio.terra.service.dataset.flight.ingest.DatasetIngestFlight')
05/11/2022 02:14:08 PM - INFO: Checking file ingest_pipeline/output/tim_core/metadata/file.json
05/11/2022 02:14:08 PM - INFO: Running ingest: file


params: {'json': {'table': 'file', 'format': 'json', 'path': 'gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/file.json', 'resolve_existing_files': True, 'updateStrategy': 'replace'}}


05/11/2022 02:15:09 PM - INFO: Ingest file response TDRResponse(id='PaQLJQIkRWuP3v3CoP7D0Q', job_status='succeeded', status_code=200, description='Ingest from gs://fc-9cd4583e-7855-4b5e-ae88-d8971cfd5b46/ingest_pipeline/output/tim_core/metadata/file.json to file in dataset id fce017a3-1fc0-4cd6-9638-6a1f8dcd1c09', submitted='2022-05-11T14:14:08.427997Z', completed='2022-05-11T14:14:51.376334Z', class_name='bio.terra.service.dataset.flight.ingest.DatasetIngestFlight')
05/11/2022 02:15:09 PM - INFO: Checking file ingest_pipeline/output/tim_core/metadata/variantcall.json
05/11/2022 02:15:09 PM - INFO: Checking file ingest_pipeline/output/tim_core/metadata/dataset.json
05/11/2022 02:15:09 PM - INFO: Checking file ingest_pipeline/output/tim_core/metadata/project.json
05/11/2022 02:15:10 PM - INFO: Adding {"email": "prod-dsp-data-ingest@firecloud.org"} as Steward to dataset.
05/11/2022 02:15:10 PM - INFO: Running output data profiling
05/11/2022 02:15:10 PM - INFO: Building and executing tab

# Donor Transformations using transform_task

In [None]:
# Donor Transformations
src_file1 = TransformerSource(file_name="subject.tsv", primary_key="subject_id", rename_primary_key="entity:subject_id")
d_id = TransformerMap(source_column="subject_id", target_column="donor_id")
e_id = TransformerMap(source_column="ancestry", target_column="reported_ethnicity")
p_id = TransformerMap(source_column="sex", target_column="phenotypic_sex")
s_id = TransformerMap(source_column="sex", target_column="sex_assigned_at_birth")
sf_id = TransformerMap(source_column="twin_id", target_column="sibling_familymember_id")

maps = [d_id, e_id, p_id, s_id, sf_id]

diag_tf = TransformerTransform(source_columns=["donor_id", "disease_id"], target_column="diagnosis_id", transform_type=TransformType.CONCAT_STR_TO_LIST_PREFIX)
fam_tf = TransformerTransform(source_columns=["family_id"], target_column="family_id", transform_type=TransformType.COLS_TO_LIST)
bpf_tf = TransformerTransform(source_columns=["maternal_id", "paternal_id"], target_column="parent_familymember_id", transform_type=TransformType.COLS_TO_LIST)

tfs = [diag_tf, bpf_tf, fam_tf]

donor_pass = ['project_id', 'prior_testing', 'age_at_last_observation', 'congenital_status', 'multiple_datasets', 'ancestry_detail', 'submitter_id', 'dbgap_submission', 'affected_status']

tf_input_dir = "ingest_pipeline/input/metadata"
tf_output_dir = "ingest_pipeline/output/tim_core/metadata"

input_dir = f"{ws_bucket}/{tf_input_dir}"
req_donor = TransformerRequest(input_directory=input_dir, output_directory=tf_output_dir, source_files=[src_file1], destination_table="donor", passthrough_cols=donor_pass, maps=maps, transforms=tfs)


# # Run transform_task transform using above request

In [None]:
pd.set_option('display.max_columns', None)

df = transform(req_donor)
df

# Compare Donor Files

In [None]:
src_file_path = f"{ws_bucket}/{tf_output_dir}/donor.json"

src_file_path2 = f"{ws_bucket}/{tf_output_dir}/donor2.json"
df = pd.read_json(src_file_path, lines=True)
df2 = pd.read_json(src_file_path2, lines=True)

df2 = df2.drop(columns=["family_id"])

df = df.sort_index(axis=1)
df2 = df2.sort_index(axis=1)

#print("df")
#for col in df.columns:
    #print(col)
    
#print("df2")
#for col in df2.columns:
    #print(col)

df.equals(df2)