# Create AWS HealthOmics Analytic Stores to Import Genomic Data and Run Queries

Follow the steps in this notebook to:
1. Create AWS HealthOmics Reference, Variant, and Annotation Stores.
2. Import a reference genome, variant files, and a ClinVar annotation file from S3 into the respective data stores.
3. Query the variant and annotation data.

## Prerequisites and package dependencies

In [None]:
!pip install awswrangler

In [1]:
from datetime import datetime
from pprint import pprint
import urllib
import urllib.parse

import boto3
import botocore.exceptions

from utils import *

### Create service role

In [2]:
# Timestamp (format: YYYYMMDDTHHMMSS) appended to the names of the resources
# created in this notebook
dt_fmt = '%Y%m%dT%H%M%S'
ts = datetime.now().strftime(dt_fmt)

# Permissions policy for the service role: three Allow statements, each on Resource "*"
policy = {
    "Version": "2012-10-17",
    "Statement": [
        # all HealthOmics actions
        {"Effect": "Allow", "Action": ["omics:*"], "Resource": "*"},
        # list and accept AWS RAM resource share invitations
        {
            "Effect": "Allow",
            "Action": [
                "ram:AcceptResourceShareInvitation",
                "ram:GetResourceShareInvitations",
            ],
            "Resource": "*",
        },
        # S3 bucket/object read and write actions
        {
            "Effect": "Allow",
            "Action": [
                "s3:GetBucketLocation",
                "s3:PutObject",
                "s3:GetObject",
                "s3:ListBucket",
                "s3:AbortMultipartUpload",
                "s3:ListMultipartUploadParts",
                "s3:GetObjectAcl",
                "s3:PutObjectAcl",
            ],
            "Resource": "*",
        },
    ],
}

# Trust policy: allows the omics.amazonaws.com service principal to assume the role
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "omics.amazonaws.com"},
            "Action": "sts:AssumeRole",
        },
    ],
}

In [3]:
# Shared base name for the IAM role and policy, suffixed with the notebook timestamp
omics_iam_name = 'multimodal-omics-' + ts
create_omics_role(omics_iam_name, policy, trust_policy)

### Create Omics client

In [5]:
# HealthOmics client pinned to us-east-1; later cells read the region back
# from omics.meta.region_name
omics = boto3.client(service_name='omics', region_name='us-east-1')

### Source data
Set the source data bucket to the regional replica of the Synthea Coherent dataset. If you want to use different source data, replace the bucket name here and any S3 URIs where it is used in the rest of the notebook.

In [None]:
# Bucket name = fixed prefix + the region of the Omics client
SOURCE_BUCKET_NAME = "guidance-multimodal-hcls-healthai-machinelearning-" + omics.meta.region_name

## Create reference store and import reference genome

### Create reference store 

In [6]:
# Create a reference store only when the lookup helper reports none.
# NOTE(review): get_ref_store_id comes from utils; presumably it returns None when
# no reference store exists in the client's region - confirm against the helper.
print(f"Checking for a reference store in region: {omics.meta.region_name}")
if get_ref_store_id(omics) is None:  # identity check is the idiomatic None test
    response = omics.create_reference_store(name='myReferenceStore')
    print(response)
else:
    print("Congratulations, you have an existing reference store!")

Checking for a reference store in region: us-east-1
Congratulations, you have an existing reference store!


### Import reference genome to reference store

In [4]:
# S3 locations of the source files, keyed by data type
SOURCE_S3_URIS = dict(
    reference=f"s3://{SOURCE_BUCKET_NAME}/genomic/reference/hg19.fa",
)

In [7]:
# If using a different reference genome, replace "hg19" with a different prefix

ref_name = f'hg19-{ts}'

# Single import source: the reference file on S3, stored under ref_name
reference_source = {
    'sourceFile': SOURCE_S3_URIS["reference"],
    'name': ref_name,
    'tags': {'SourceLocation': '1kg'},
}

ref_import_job = omics.start_reference_import_job(
    referenceStoreId=get_ref_store_id(omics),
    roleArn=get_role_arn(omics_iam_name),
    sources=[reference_source],
)

In [None]:
# Re-fetch the import job record by id and display it
submitted_job_id = ref_import_job['id']
ref_import_job = omics.get_reference_import_job(
    id=submitted_job_id,
    referenceStoreId=get_ref_store_id(omics),
)
ref_import_job

In [None]:
# Block until the reference import job completes; a WaiterError is reported, not re-raised
job_id = ref_import_job['id']
try:
    waiter = omics.get_waiter('reference_import_job_completed')
    waiter.wait(id=job_id, referenceStoreId=ref_import_job['referenceStoreId'])
except botocore.exceptions.WaiterError as e:
    print(f"reference import job {job_id} FAILED")
    print(e)
else:
    print(f"reference import job {job_id} complete")

### !!! Wait until the above import job has finished !!!

In [None]:
# Look up the imported reference by the name it was given at import time
ref_list = omics.list_references(
    referenceStoreId=get_ref_store_id(omics),
    filter={"name": ref_name},
)

resp = ref_list
pprint(ref_list)

In [None]:
# Store this reference
ref = omics.get_reference_metadata(
    referenceStoreId=get_ref_store_id(omics), 
    id=ref_list['references'][0]['id'])
ref

## Create Variant Store and import VCF files

In [12]:
# Root of the source bucket; the next cell lists the VCF objects under it
SOURCE_VARIANT_URI = "s3://" + SOURCE_BUCKET_NAME

In [13]:
# generate a list of VCF files to import

# Split the s3:// URI into bucket name and key prefix (path without its leading "/")
source = urllib.parse.urlparse(SOURCE_VARIANT_URI)
bucket_name = source.netloc
prefix = source.path[1:]

s3r = boto3.resource('s3')

# `bucket` is only ever the Bucket resource; the plain name lives in `bucket_name`
bucket = s3r.Bucket(bucket_name)
objects = bucket.objects.filter(Prefix=prefix, MaxKeys=10_000)
ext = '_dna.vcf'

# Keep only the objects whose key ends with the VCF suffix, as full s3:// URIs
vcf_list = [f"s3://{o.bucket_name}/{o.key}" for o in objects if o.key.endswith(ext)]

# Fail here with a clear message instead of later inside the variant import call
assert vcf_list, f"no objects ending in {ext!r} found under {SOURCE_VARIANT_URI}"

### Create Variant Store

In [None]:
# Variant store name carries the lower-cased notebook timestamp
var_store_name = 'synthea_newvariants_' + ts.lower()

# The store is tied to the reference genome imported earlier
reference_arn = get_reference_arn(ref_name, omics)
var_store = omics.create_variant_store(
    name=var_store_name,
    reference={"referenceArn": reference_arn},
)

response = var_store
response

### !!! Wait until the Variant Store is created !!!

In [None]:
# Block until the variant store is created; a WaiterError is reported, not re-raised
store_name = var_store['name']
try:
    waiter = omics.get_waiter('variant_store_created')
    waiter.wait(name=store_name)
except botocore.exceptions.WaiterError as e:
    print(f"variant store {store_name} FAILED:")
    print(e)
else:
    print(f"variant store {store_name} ready for use")

# Replace the creation response with the store's current description
var_store = omics.get_variant_store(name=store_name)

### Import VCF files

In [17]:
# One {"source": <s3 uri>} item per VCF file found above
l_vcf = [{"source": uri} for uri in vcf_list]

# Submit a single import job covering all VCF files
response = omics.start_variant_import_job(destinationName=var_store['name'],
                                          roleArn=get_role_arn(omics_iam_name),
                                          items=l_vcf)

## Query Variant Store with Amazon Athena

In [None]:
# To run Athena queries on the data, use AWS Lake Formation to create resource links to the database.
# For the following function to work, the IAM user running this notebook must be a Data Lake Administrator.

create_resource_link('omicsdb', var_store, store_type='variant')

In [19]:
# Omics Analytic Stores require Athena engine version 3 for querying
# https://docs.aws.amazon.com/athena/latest/ug/versions.html

# Locate or create a suitable workgroup for Athena queries.
# NOTE(review): this client uses the default region, while the `omics` client is
# pinned to us-east-1 - confirm both resolve to the same region.

athena = boto3.client('athena')

athena_workgroups = athena.list_work_groups()['WorkGroups']

athena_workgroup = None
for wg in athena_workgroups:
    print(wg['EngineVersion']['EffectiveEngineVersion'])
    if wg['EngineVersion']['EffectiveEngineVersion'] == 'Athena engine version 3':
        print(f"Workgroup '{wg['Name']}' found using Athena engine version 3")
        athena_workgroup = wg
        break
else:
    # Loop finished without `break`: no engine-v3 workgroup exists yet
    print("No workgroups with Athena engine version 3 found. creating one")
    athena.create_work_group(
        Name='omics',
        Configuration={
            "EngineVersion": {
                "SelectedEngineVersion": "Athena engine version 3"
            }
        }
    )
    # The create call's response carries no workgroup details; fetch them so that
    # athena_workgroup has a 'Name' entry whichever branch ran
    athena_workgroup = athena.get_work_group(WorkGroup='omics')['WorkGroup']

athena_workgroup

Athena engine version 3
Workgroup 'omics' found using Athena engine version 3


{'Name': 'omics',
 'State': 'ENABLED',
 'Description': '',
 'CreationTime': datetime.datetime(2023, 3, 21, 20, 36, 59, 255000, tzinfo=tzlocal()),
 'EngineVersion': {'SelectedEngineVersion': 'Athena engine version 3',
  'EffectiveEngineVersion': 'Athena engine version 3'}}

In [26]:
# Use AWS SDK for pandas (awswrangler) to submit the query and get results as a pandas DataFrame

import awswrangler as wr

# Query through the workgroup selected above rather than assuming it is called "omics";
# fall back to "omics" when the stored value carries no 'Name'
athena_workgroup_name = (athena_workgroup or {}).get('Name', 'omics')

df_var = wr.athena.read_sql_query(
    f"select sampleid, contigname, start, referenceallele, alternatealleles, calls from {var_store['name']} limit 10;",
    database="omicsdb", workgroup=athena_workgroup_name)
df_var

Unnamed: 0,sampleid,contigname,start,referenceallele,alternatealleles,calls
0,69eab197-6c14-7fcf-16d8-a18a222b82a4,1,46932823,A,[G],"[0, 1]"
1,c7fff683-fd1b-f937-71ae-a490a80c9197,1,46932823,A,[G],"[0, 1]"
2,c7fff683-fd1b-f937-71ae-a490a80c9197,1,55039973,G,"[A, T]","[0, 1]"
3,c7fff683-fd1b-f937-71ae-a490a80c9197,1,46932823,A,[G],"[0, 1]"
4,1c906349-d5f7-3b79-9385-291d6ca12ddc,1,46932823,A,[G],"[0, 1]"
5,69eab197-6c14-7fcf-16d8-a18a222b82a4,1,46932823,A,[G],"[0, 1]"
6,69eab197-6c14-7fcf-16d8-a18a222b82a4,1,55039973,G,"[A, T]","[0, 1]"
7,1c906349-d5f7-3b79-9385-291d6ca12ddc,1,46932823,A,[G],"[0, 1]"
8,1c906349-d5f7-3b79-9385-291d6ca12ddc,1,55039973,G,"[A, T]","[0, 1]"
9,dccfd9ed-8080-2743-5c17-7888e93617d5,1,46932823,A,[G],"[0, 1]"


## Create Annotation Store and import ClinVar annotation file

In [None]:
# ClinVar annotation file to import once the store exists
SOURCE_ANNOTATION_URI = f"s3://{SOURCE_BUCKET_NAME}/genomic/annotation/clinvar.vcf.gz"

# Annotation store name carries the lower-cased notebook timestamp
ann_store_name = 'synthea_annotations_' + ts.lower()

# VCF-format annotation store tied to the reference genome imported earlier
reference_arn = get_reference_arn(ref_name, omics)
response = omics.create_annotation_store(
    name=ann_store_name,
    reference={"referenceArn": reference_arn},
    storeFormat='VCF',
)
ann_store = response

# Block until the annotation store is created; a WaiterError is reported, not re-raised
created_store_name = ann_store['name']
try:
    waiter = omics.get_waiter('annotation_store_created')
    waiter.wait(name=created_store_name)
except botocore.exceptions.WaiterError as e:
    print(f"annotation store {created_store_name} FAILED:")
    print(e)
else:
    print(f"annotation store {created_store_name} ready for use")

# Replace the creation response with the store's current description
ann_store = omics.get_annotation_store(name=created_store_name)


### !!! Wait until the Annotation Store is created !!!

### Import annotation file

In [None]:
# Import the single ClinVar file into the annotation store
annotation_items = [{"source": SOURCE_ANNOTATION_URI}]

response = omics.start_annotation_import_job(
    destinationName=ann_store['name'],
    roleArn=get_role_arn(omics_iam_name),
    items=annotation_items,
)
response

## Query Annotation Store with Amazon Athena

In [None]:
# Same helper call as for the variant store, now for the annotation store, so that
# it can be queried from the 'omicsdb' database.
# NOTE(review): create_resource_link is defined in utils - confirm what it creates.
create_resource_link('omicsdb', ann_store, store_type='annotation')

In [27]:
# Query through the workgroup selected earlier rather than assuming it is called "omics";
# fall back to "omics" when the stored value carries no 'Name'
athena_workgroup_name = (athena_workgroup or {}).get('Name', 'omics')

# First ten annotation rows ordered by contig name, returned as a pandas DataFrame
df_ann = wr.athena.read_sql_query(
    f"select contigname, start, referenceallele, alternatealleles, attributes from {ann_store['name']} order by contigname limit 10;",
    database="omicsdb", workgroup=athena_workgroup_name)
df_ann

Unnamed: 0,contigname,start,referenceallele,alternatealleles,attributes
0,1,926009,G,[T],"[(CLNSIG, Likely_benign), (GENEINFO, SAMD11:14..."
1,1,926026,C,[T],"[(CLNSIG, Likely_benign), (GENEINFO, SAMD11:14..."
2,1,925975,T,[C],"[(CLNSIG, Uncertain_significance), (GENEINFO, ..."
3,1,926002,C,[T],"[(CLNSIG, Uncertain_significance), (GENEINFO, ..."
4,1,926013,G,[A],"[(CLNSIG, Uncertain_significance), (GENEINFO, ..."
5,1,926024,G,[A],"[(CLNSIG, Likely_benign), (GENEINFO, SAMD11:14..."
6,1,925955,C,[T],"[(CLNSIG, Likely_benign), (GENEINFO, SAMD11:14..."
7,1,925968,C,[T],"[(CLNSIG, Likely_benign), (GENEINFO, SAMD11:14..."
8,1,925985,C,[T],"[(CLNSIG, Likely_benign), (GENEINFO, SAMD11:14..."
9,1,925951,G,[A],"[(RS, 1640863258), (CLNSIG, Uncertain_signific..."
