### Azure Snapshot Export Example
 This notebook:
 - Exports an Azure snapshot using python client (Same call can be made from swagger/curl)
 - Reads from parquet file included in the output from the export

# Setup

In [45]:
%%capture
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install --upgrade data_repo_client
!{sys.executable} -m pip install pyarrow
!{sys.executable} -m pip install fastparquet
!{sys.executable} -m pip install google-cloud-bigquery
!{sys.executable} -m pip install google-cloud-storage

import pandas as pd
# import pyarrow as pa
import datetime, uuid, urllib, os, time, getpass, uuid, json
# from tdr_utils import TdrUtils
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
from data_repo_client import ApiClient, ApiException, Configuration, DatasetsApi, SnapshotsApi, JobsApi, ResourcesApi, DataRepositoryServiceApi
from IPython.core.display import display, clear_output, HTML
from tdr_utils import TdrUtils

### Authenticate 

Retrieve your access token by running these commands in your terminal

  1. Login in with desired user

`gcloud auth login`

  2. Print the token to use in the next step

`gcloud auth print-access-token`

In [47]:
# Set up configuration
config = Configuration()
config.host="https://data.shelbee.bee.envs-terra.bio/"

# Use access token printed in last step
config.access_token= getpass.getpass("Paste token data ")
apiClient = ApiClient(configuration=config)
apiClient.client_side_validation = False

# Create required API Clients
snapshots_api = SnapshotsApi(api_client=apiClient)
jobs_api = JobsApi(api_client=apiClient)
tdr_utils = TdrUtils(jobs_api)


# Snapshot Export

In [48]:
# Example Snapshot/table to export
exisitingSnapshotId = 'e3638824-9ed9-408e-b3f5-cba7585658a3'
exampleTableName = 'variant'

# View data using TDR endpoint
snapshots_api.lookup_snapshot_preview_by_id(exisitingSnapshotId, exampleTableName)

{'filtered_row_count': 1004,
 'result': [{'alt': 'C',
             'chromosome': '1',
             'datarepo_row_id': '004EF96A-F144-44FE-B86D-9F77296107EC',
             'id': '1:85011183:A:C',
             'position': 85011183,
             'reference': 'A'},
            {'alt': 'G',
             'chromosome': '1',
             'datarepo_row_id': '00728302-8D2B-40FA-986F-8197D9DC8924',
             'id': '1:93814411:A:G',
             'position': 93814411,
             'reference': 'A'},
            {'alt': 'G',
             'chromosome': '1',
             'datarepo_row_id': '00D535C6-4E97-452C-A0B6-7D0B9BB77EF6',
             'id': '1:232940972:A:G',
             'position': 232940972,
             'reference': 'A'},
            {'alt': 'G',
             'chromosome': '1',
             'datarepo_row_id': '0150F103-7093-40F7-B189-B94FB428BAA2',
             'id': '1:195697623:A:G',
             'position': 195697623,
             'reference': 'A'},
            {'alt': 'G',
          

In [50]:
# Now, let's export the snapshot and take a look at the same "variant" table
export_snapshot_result = tdr_utils.wait_for_job(snapshots_api.export_snapshot(exisitingSnapshotId,  validate_primary_key_uniqueness='false'))


# Copy Azure Parquet Files to GCP Bucket
Towards the bottom of the export, we can find the signed URLs for the JSON manifest and the parquet files containing the tabular data.

In [51]:

variantParquet = export_snapshot_result['format']['parquet']['location']['tables'][0]['paths'][1]
print(variantParquet)

https://tdrsheagjudbqsygpddezygu.blob.core.windows.net/e3638824-9ed9-408e-b3f5-cba7585658a3/metadata%2Fparquet%2Fancestry_specific_meta_analysis%2Fancestry_specific_meta_analysis.parquet%2F2E96FCE6-DD21-4BAC-AF59-E7EE29168B4D_2477_0-1.parquet?sv=2022-11-02&spr=https&se=2023-11-06T02%3A24%3A51Z&sp=rl&sig=c2wbR%2B6%2FYnVEzNZBi8qlOXoz%2BdJHkTZWdEMrz3fXxaY%3D&sr=c&rscd=sholdendev%40gmail.com


In [53]:
destinationBucket = "test-data-sholden/test-parquet-export"
# Copy parquet file for variant table to GCS
destinationFileName = "testFile.parquet"
!{sys.executable} -m wget -O - testSourceUrl  | gcloud storage cp - "gs://{destinationBucket}/{destinationFileName}"

/usr/local/opt/python@3.9/bin/python3.9: No module named wget
Copying file://- to gs://test-data-sholden/test-parquet-export/testFile.parquet
  
.


# Import parquet to BigQuery

In [55]:
from google.cloud import bigquery

existing_gcp_project = "broad-jade-sh"
existing_dataset_name = "test_parquet_export"
new_table_name = "variantTable"

# Construct a BigQuery client object.
client = bigquery.Client(project=existing_gcp_project)
# Set table_id to the ID of the table to create.
table_id = "{}.{}.{}".format(existing_gcp_project, existing_dataset_name, new_table_name)

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.PARQUET,
)
#uri = "gs://{}/{}".format(destinationBucket, destinationFileName)
## MANUAL OVERRIDE BECAUSE COPIED FILE ISN'T WORKING
uri = "gs://test-data-sholden/test-parquet-export/41F221D2-4864-4CA7-9C69-787EF7C4D38C_2480_0-1.parquet"

load_job = client.load_table_from_uri(
    uri, table_id, job_config=job_config
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

destination_table = client.get_table(table_id)
print("Loaded {} rows.".format(destination_table.num_rows))

Loaded 1004 rows.
