### Azure Snapshot Export Example
 This notebook:
 - Exports an Azure snapshot using the Python client (the same call can be made from Swagger or curl)
 - Reads the signed URL of each parquet file from the export output
 - Streams each file from Azure and copies it into a GCP bucket
 - Imports the parquet file from the GCP bucket into a BigQuery (BQ) dataset

# Setup

In [32]:
%%capture
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install --upgrade data_repo_client
!{sys.executable} -m pip install pyarrow
!{sys.executable} -m pip install fastparquet
!{sys.executable} -m pip install google-cloud-bigquery
!{sys.executable} -m pip install google-cloud-storage

import pandas as pd
# import pyarrow as pa
import datetime, uuid, urllib, os, time, getpass, uuid, json
# from tdr_utils import TdrUtils
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
from data_repo_client import ApiClient, ApiException, Configuration, DatasetsApi, SnapshotsApi, JobsApi, ResourcesApi, DataRepositoryServiceApi
from IPython.core.display import display, clear_output, HTML
from tdr_utils import TdrUtils

### Authenticate 

In [33]:
# Set up configuration: point the data_repo_client at the target TDR deployment.
# NOTE(review): the host and the gcloud account below are hard-coded to one
# developer's environment - change both before running this elsewhere.
config = Configuration()
config.host="https://data.shelbee.bee.envs-terra.bio/"
# IPython `!` capture: `token` is the list of output lines printed by gcloud.
token=!gcloud auth print-access-token sholden@broadinstitute.org
# The first output line is used as the bearer token.
# NOTE(review): if gcloud fails, token[0] may be missing or hold an error message - confirm before relying on it.
config.access_token = token[0]

apiClient = ApiClient(configuration=config)
# NOTE(review): presumably disables the generated client's local request validation - confirm against data_repo_client.
apiClient.client_side_validation = False

# Create required API Clients (all share the authenticated ApiClient above)
snapshots_api = SnapshotsApi(api_client=apiClient)
jobs_api = JobsApi(api_client=apiClient)
# Project-local helper; used later in this notebook via tdr_utils.wait_for_job(...)
tdr_utils = TdrUtils(jobs_api)


# Snapshot Export

In [34]:
# Example snapshot and table to export.
# Both names are reused by later cells, so keep them defined before running the export.
exisitingSnapshotId = 'e3638824-9ed9-408e-b3f5-cba7585658a3'
exampleTableName = 'variant'

# Preview the table through the TDR endpoint before exporting.
# The filter narrows the preview to two known variant ids; the response below
# reports both the filtered row count and the table's total row count.
snapshots_api.lookup_snapshot_preview_by_id(exisitingSnapshotId, exampleTableName, filter="WHERE id IN ('1:93814411:A:G', '1:85011183:A:C')")

{'filtered_row_count': 2,
 'result': [{'alt': 'C',
             'chromosome': '1',
             'datarepo_row_id': '004EF96A-F144-44FE-B86D-9F77296107EC',
             'id': '1:85011183:A:C',
             'position': 85011183,
             'reference': 'A'},
            {'alt': 'G',
             'chromosome': '1',
             'datarepo_row_id': '00728302-8D2B-40FA-986F-8197D9DC8924',
             'id': '1:93814411:A:G',
             'position': 93814411,
             'reference': 'A'}],
 'total_row_count': 1004}

In [35]:
# Now, let's export the snapshot and take a look at the same "variant" table.
# export_snapshot(...) is passed straight to tdr_utils.wait_for_job(...); the value it
# returns is indexed later as export_snapshot_result['format']['parquet']['location']['tables'].
# NOTE(review): validate_primary_key_uniqueness is passed as the string 'false', not a bool -
# presumably forwarded as-is in the request; confirm against the client before changing it.
export_snapshot_result = tdr_utils.wait_for_job(snapshots_api.export_snapshot(exisitingSnapshotId,  validate_primary_key_uniqueness='false'))


# Copy Azure Parquet Files to GCP Bucket
- Build Function to perform copy

In [36]:
import requests
from google.cloud import storage

def copyFileToGCPBucket(parquet_uri, destination_bucket, destination_file_name):
    """Stream a file from an HTTP(S) URL into a Google Cloud Storage object.

    Args:
        parquet_uri: URL to download from (in this notebook, a signed URL
            taken from the snapshot export result).
        destination_bucket: Name of an existing GCS bucket.
        destination_file_name: Object name (path) to write inside the bucket.

    Raises:
        requests.HTTPError: If the download responds with a 4xx/5xx status.
            Without this check, the error body (e.g. from an expired signed
            URL) would be uploaded to GCS as if it were the parquet file.
    """
    with requests.get(parquet_uri, stream=True) as r:
        # Fail before touching GCS so a bad response never overwrites the destination.
        r.raise_for_status()
        client = storage.Client()
        bucket = client.get_bucket(destination_bucket)
        blob = bucket.blob(destination_file_name)
        # r.raw is the underlying file-like stream, so the body is not
        # loaded into memory by this function before upload.
        blob.upload_from_file(r.raw)

# Import parquet to BigQuery

In [38]:
from google.cloud import bigquery

# Destination for the BigQuery import; both values are read as globals by importToBQ.
# As the names indicate, the project and dataset are expected to exist already.
existing_gcp_project = "broad-jade-sh"
existing_dataset_name = "test_parquet_export"

def importToBQ(new_table_name, source_gs_path):
    """Load a parquet file from GCS into a BigQuery table and print its row count.

    The destination is `<existing_gcp_project>.<existing_dataset_name>.<new_table_name>`,
    built from the module-level project and dataset names.

    Args:
        new_table_name: Name of the destination table inside the dataset.
        source_gs_path: `gs://` URI of the parquet file to load.
    """
    bq = bigquery.Client(project=existing_gcp_project)

    # Fully qualified destination table id.
    target_table = "{}.{}.{}".format(
        existing_gcp_project,
        existing_dataset_name,
        new_table_name,
    )

    parquet_config = bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.PARQUET)

    # Submit the load job, then block until it has finished.
    pending_load = bq.load_table_from_uri(source_gs_path, target_table, job_config=parquet_config)
    pending_load.result()

    # Report the row count of the destination table as it stands after the load.
    loaded_table = bq.get_table(target_table)
    print("Loaded {} rows.".format(loaded_table.num_rows))

# Example Copy and Import to BQ
### Variant Table

In [40]:
# The export result lists, per table, the signed URLs for that table's parquet data
# (the JSON manifest is referenced there as well).
# In this example each table has two URLs under 'paths':
#   - the first is the directory location
#   - the second is the actual parquet file
# A large table can be split across multiple parquet files.
# NOTE(review): index [5] is hard-coded for the "variant" table in this particular snapshot -
# verify the position against the export result when using a different snapshot.
variantParquet = export_snapshot_result['format']['parquet']['location']['tables'][5]['paths'][1]
# Stream the file from Azure into the GCS bucket, then load that object into BigQuery.
copyFileToGCPBucket(variantParquet, "test-data-sholden", "test-parquet-export/variant.parquet")
variant_uri = "gs://test-data-sholden/test-parquet-export/variant.parquet"
importToBQ("variant_table", variant_uri)

Loaded 1004 rows.


### All Data Types Table

In [41]:
# Test with another table - "all_data_types"
# NOTE(review): index [6] is hard-coded for this snapshot's table order; paths[1] is the
# parquet file itself (paths[0] is the directory), as for the variant table.
all_data_types_parquet = export_snapshot_result['format']['parquet']['location']['tables'][6]['paths'][1]
# Same two steps as for the variant table: copy Azure -> GCS, then load GCS -> BigQuery.
copyFileToGCPBucket(all_data_types_parquet, "test-data-sholden", "test-parquet-export/all-data-types.parquet")
all_uri = "gs://test-data-sholden/test-parquet-export/all-data-types.parquet"
importToBQ("all_data_types_table", all_uri)

# NOTE (observations from this import):
# - Array columns are stored as strings in the parquet files, so post-processing is needed
#   to convert them into actual array fields.
# - Datetime and timestamp columns appear to transfer correctly, but the "time" column
#   does not appear to be correct.

Loaded 5 rows.
