<a href="https://colab.research.google.com/github/aihgii/gcp-variant-transforms/blob/master/playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set up

### Clone the repo

In [None]:
# Start from the home directory and remove any previous checkout, so re-running
# this cell always yields a fresh clone.
%cd ~/
!rm -rf gcp-variant-transforms
!git clone https://github.com/aihgii/gcp-variant-transforms.git
# Later cells use paths relative to the repository root (requirements.txt,
# setup.py, main.tf), so make it the working directory.
%cd gcp-variant-transforms

### Install requirements

In [None]:
!pip3 install -r requirements.txt

### Import dependencies

In [3]:
import os
from string import Template

### Set up credentials and environment variables

In [None]:
# Authorize the gcloud CLI itself (used below by `gcloud config` and gsutil).
!gcloud auth login

In [None]:
# Obtain application-default credentials.
# NOTE(review): the CREDENTIALS path in the configuration cell is presumably
# the file this command writes -- confirm the location for your runtime.
!gcloud auth application-default login

In [5]:
# User-tunable settings. The trailing "@param" markers render these as Colab
# form fields; PROJECT and BUCKET have no default and must be filled in.
CREDENTIALS = "/content/.config/application_default_credentials.json" # @param {type:"string"}
PROJECT = "" # @param {type:"string"}
REGION = "us-west1" # @param {type:"string"}
ZONE = "us-west1-b" # @param {type:"string"}
BUCKET = "" # @param {type:"string"}
DATASET = "genomics" # @param {type:"string"}
TABLE = "variants" # @param {type:"string"}

# Export every setting as an environment variable so that the shell cells and
# the main.tf template rendering below can read them.
for env_name, env_value in (
    ("GOOGLE_APPLICATION_CREDENTIALS", CREDENTIALS),
    ("GOOGLE_CLOUD_PROJECT", PROJECT),
    ("GOOGLE_CLOUD_REGION", REGION),
    ("GOOGLE_CLOUD_ZONE", ZONE),
    ("GCS_BUCKET", BUCKET),
    ("BQ_DATASET", DATASET),
    ("BQ_TABLE", TABLE),
):
    os.environ[env_name] = env_value

In [None]:
# Point the gcloud CLI at the project chosen in the configuration cell
# (exported there as GOOGLE_CLOUD_PROJECT).
!gcloud config set project $GOOGLE_CLOUD_PROJECT

### Set up infrastructure

In [None]:
# Install Terraform from the HashiCorp apt repository.
# --yes lets gpg overwrite an existing keyring, so the cell can be re-run
# without an interactive "Overwrite?" prompt.
!wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --yes --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg
!echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
# apt-get with -y keeps the install non-interactive.
!sudo apt-get update && sudo apt-get install -y terraform

In [8]:
%%writefile main.tf

# Terraform template for the playground infrastructure.
# The upper-case placeholders are filled in from environment variables by the
# rendering cell that follows; avoid dollar signs in comments, since the whole
# file is passed through string.Template.

terraform {
  required_providers {
    google = {
      source = "hashicorp/google"
      version = "4.51.0"
    }
  }
}

provider "google" {
  credentials = "${GOOGLE_APPLICATION_CREDENTIALS}"
  project = "${GOOGLE_CLOUD_PROJECT}"
  region = "${GOOGLE_CLOUD_REGION}"
  zone = "${GOOGLE_CLOUD_ZONE}"
}

# Enable the APIs used by the pipelines and by the resources below.
module "project-services" {
  source  = "terraform-google-modules/project-factory/google//modules/project_services"
  version = "~> 14.4"

  project_id = "${GOOGLE_CLOUD_PROJECT}"

  activate_apis = [
    "bigquery.googleapis.com",
    "compute.googleapis.com",
    "dataflow.googleapis.com",
    "lifesciences.googleapis.com",
    "storage-component.googleapis.com"
  ]
}

# The resource's local name is a fixed identifier. Substituting the bucket
# name here would produce invalid HCL for bucket names that contain dots or
# start with a digit, both of which are legal for buckets but not for
# Terraform identifiers.
resource "google_storage_bucket" "bucket" {
  name          = "${GCS_BUCKET}"
  location      = "US"
  force_destroy = true

  public_access_prevention = "enforced"

  # Wait for the required APIs to be enabled before creating the bucket.
  depends_on = [module.project-services]
}

resource "google_bigquery_dataset" "genomics" {
  dataset_id                  = "${BQ_DATASET}"

  # Wait for the required APIs to be enabled before creating the dataset.
  depends_on = [module.project-services]
}

Writing main.tf


In [9]:
# Render main.tf in place: read the template text, replace every placeholder
# with the matching environment variable, and write the result back.
# Template.substitute raises KeyError if a placeholder has no matching variable.
with open('main.tf', 'r') as template_file:
    template_text = template_file.read()

rendered_config = Template(template_text).substitute(os.environ)

with open('main.tf', 'w') as config_file:
    config_file.write(rendered_config)

In [None]:
# Initialize the working directory for the configuration in main.tf
# (downloads the google provider and the project-services module it declares).
!terraform init

In [None]:
# Preview the changes Terraform would make; nothing is created by this cell.
!terraform plan

In [None]:
# Create the resources declared in main.tf (API activation, bucket, dataset).
# NOTE(review): without -auto-approve this is expected to wait for an
# interactive "yes" -- confirm the notebook runtime forwards stdin.
!terraform apply

## Playground

### Copying test data

In [None]:
# Copy the public Platinum Genomes VCF files matching NA1287*_S1.genome.vcf
# into the playground bucket; the recorded run transferred 2 files, 10.2 GiB.
!gsutil cp \
    gs://genomics-public-data/platinum-genomes/vcf/NA1287*_S1.genome.vcf \
    gs://$GCS_BUCKET/platinum-genomes/vcf/

Copying gs://genomics-public-data/platinum-genomes/vcf/NA12877_S1.genome.vcf [Content-Type=text/x-vcard]...
Copying gs://genomics-public-data/platinum-genomes/vcf/NA12878_S1.genome.vcf [Content-Type=text/x-vcard]...
/ [2 files][ 10.2 GiB/ 10.2 GiB]                                                
Operation completed over 2 objects/10.2 GiB.                                     


### Running VCF files to BigQuery Preprocessor

#### Direct runner

In [None]:
# Run the preprocessor over the VCF files copied into the bucket. No --runner
# flag is passed in this cell. The report and the resolved headers are written
# to the bucket root; temporary files go under temp/.
!python -m gcp_variant_transforms.vcf_to_bq_preprocess \
  --input_pattern gs://$GCS_BUCKET/platinum-genomes/vcf/*.vcf \
  --report_path gs://$GCS_BUCKET/report.tsv \
  --job_name vcf-to-bigquery-preprocess-direct-runner \
  --resolved_headers_path gs://$GCS_BUCKET/resolved_headers.vcf \
  --temp_location gs://$GCS_BUCKET/temp

#### Dataflow runner

In [None]:
# Same preprocessor run, submitted with --runner DataflowRunner in the
# configured project and region. It writes to the same report.tsv and
# resolved_headers.vcf paths as the direct-runner cell, and additionally
# passes --report_all_conflicts true.
!python -m gcp_variant_transforms.vcf_to_bq_preprocess \
  --input_pattern gs://$GCS_BUCKET/platinum-genomes/vcf/*.vcf \
  --report_path gs://$GCS_BUCKET/report.tsv \
  --job_name vcf-to-bigquery-preprocess \
  --resolved_headers_path gs://$GCS_BUCKET/resolved_headers.vcf \
  --report_all_conflicts true \
  --setup_file ./setup.py \
  --runner DataflowRunner \
  --project $GOOGLE_CLOUD_PROJECT \
  --region $GOOGLE_CLOUD_REGION \
  --temp_location gs://$GCS_BUCKET/temp \
  --requirements_file requirements.txt

### Running VCF files to BigQuery transformation

#### Direct runner

In [None]:
# Load the VCF files from the bucket into the BigQuery table
# PROJECT:DATASET.TABLE chosen in the configuration cell. No --runner flag is
# passed in this cell.
!python -m gcp_variant_transforms.vcf_to_bq \
  --input_pattern gs://$GCS_BUCKET/platinum-genomes/vcf/*.vcf \
  --output_table $GOOGLE_CLOUD_PROJECT:$BQ_DATASET.$BQ_TABLE \
  --job_name vcf-to-bigquery-direct-runner \
  --temp_location gs://$GCS_BUCKET/temp

#### Dataflow runner

In [None]:
# Same load, submitted with --runner DataflowRunner in the configured project
# and region. It targets the same output table as the direct-runner cell.
!python -m gcp_variant_transforms.vcf_to_bq \
  --input_pattern gs://$GCS_BUCKET/platinum-genomes/vcf/*.vcf \
  --output_table $GOOGLE_CLOUD_PROJECT:$BQ_DATASET.$BQ_TABLE \
  --job_name vcf-to-bigquery \
  --setup_file ./setup.py \
  --runner DataflowRunner \
  --project $GOOGLE_CLOUD_PROJECT \
  --region $GOOGLE_CLOUD_REGION \
  --temp_location gs://$GCS_BUCKET/temp \
  --requirements_file requirements.txt