# VCF Import Status Tracker Setup

This notebook sets up the infrastructure for tracking HealthOmics VCF import job statuses with an improved Lambda function.

## Prerequisites

In [None]:
!python3 -m pip install --upgrade -q botocore
!python3 -m pip install --upgrade -q boto3
!python3 -m pip install --upgrade -q awscli

In [62]:
import os
# Select the AWS profile through the AWS_PROFILE environment variable.
# The value below is a placeholder and must be replaced before running.
os.environ['AWS_PROFILE'] = 'YOUR_AWS_PROFILE'  # Update with your AWS profile

In [None]:
import boto3
import json
import time
import zipfile
import uuid
import pprint
import logging
import botocore.exceptions
import subprocess
import shutil
import sys
import re
print(f"Boto3 version: {boto3.__version__}")

In [64]:
# Initialize AWS clients
# One module-level client per service; later cells and helper functions read
# these globals (e.g. `lambda_client`, `s3`, `dynamodb`, `iam_client`).
sts_client = boto3.client('sts')
iam_client = boto3.client('iam')
lambda_client = boto3.client('lambda')
dynamodb = boto3.client('dynamodb')
omics = boto3.client('omics')
s3 = boto3.client('s3')
events_client = boto3.client('events')

In [None]:
# Get AWS account information
# `region` comes from the default boto3 session configuration and `account_id`
# from STS GetCallerIdentity; both are reused when building ARNs further down.
session = boto3.session.Session()
region = session.region_name
account_id = sts_client.get_caller_identity()["Account"]
print(f"Region: {region}")
print(f"Account ID: {account_id}")

## 1. Create DynamoDB Table

In [None]:
# Create DynamoDB table for tracking VCF import jobs
# `table_name` is also read later by the IAM policy, the Lambda environment
# variables and show_all_records().
table_name = 'VcfImportTracking3'

try:
    # Key schema: a single string partition key 'SampleID' (no sort key),
    # billed on demand.
    response = dynamodb.create_table(
        TableName=table_name,
        AttributeDefinitions=[
            {
                'AttributeName': 'SampleID',
                'AttributeType': 'S'
            }
        ],
        KeySchema=[
            {
                'AttributeName': 'SampleID',
                'KeyType': 'HASH'
            }
        ],
        BillingMode='PAY_PER_REQUEST'
    )
    print(f"✅ Table {table_name} created successfully!")
    print(f"Status: {response['TableDescription']['TableStatus']}")
    
except botocore.exceptions.ClientError as e:
    # An already-existing table is reported as success so the cell can be re-run.
    if e.response['Error']['Code'] == 'ResourceInUseException':
        print(f"✅ Table {table_name} already exists")
    else:
        # Any other client error is printed only; it is not re-raised.
        print(f"❌ Error creating table: {e}")

## 2. Create IAM Role for Lambda

In [None]:
# Create IAM role for Lambda function
# `role_name` is reused below for the inline policy, the iam:PassRole resource
# and the Lambda role ARN.
role_name = 'VcfProcessorLambdaRole3'

# Trust policy for Lambda
# Both the Lambda and the HealthOmics (omics) service principals are allowed
# to assume this role.
lambda_trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": [
                    "lambda.amazonaws.com",
                    "omics.amazonaws.com"
                ]
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

try:
    # Check if role exists
    try:
        iam_client.get_role(RoleName=role_name)
        print(f"✅ Role {role_name} already exists")
    except iam_client.exceptions.NoSuchEntityException:
        # Create the role
        iam_client.create_role(
            RoleName=role_name,
            AssumeRolePolicyDocument=json.dumps(lambda_trust_policy),
            Description='Role for VCF Processor Lambda function'
        )
        print(f"✅ Role {role_name} created successfully!")
        
except Exception as e:
    # Failures are printed only, so later cells still run even if the role is missing.
    print(f"❌ Error with role operation: {e}")

In [None]:
# Attach comprehensive policy to the role
# This cell only builds the inline policy document; the attachment itself
# happens in the try-block that follows in the same cell.
policy_name = 'VcfProcessorPolicy3'

lambda_policy = {
    "Version": "2012-10-17",
    "Statement": [
        # CloudWatch Logs access for the function's own logging.
        {
            "Effect": "Allow",
            "Action": [
                "logs:CreateLogGroup",
                "logs:CreateLogStream",
                "logs:PutLogEvents"
            ],
            "Resource": "arn:aws:logs:*:*:*"
        },
        # S3 object/bucket access.
        # NOTE(review): granted on every bucket ("*"); consider scoping to the VCF bucket.
        {
            "Effect": "Allow",
            "Action": [
                "s3:GetBucketLocation",
                "s3:PutObject",
                "s3:GetObject",
                "s3:ListBucket",
                "s3:AbortMultipartUpload",
                "s3:ListMultipartUploadParts",
                "s3:GetObjectAcl",
                "s3:PutObjectAcl"
            ],
            "Resource": "*"
        },
        # Read/write access to the tracking table, in any region/account
        # that has a table with this name.
        {
            "Effect": "Allow",
            "Action": [
                "dynamodb:PutItem",
                "dynamodb:UpdateItem",
                "dynamodb:GetItem",
                "dynamodb:Scan"
            ],
            "Resource": [
                f"arn:aws:dynamodb:*:*:table/{table_name}"
            ]
        },
        # HealthOmics variant-store and reference-store operations.
        {
            "Effect": "Allow",
            "Action": [
                "omics:StartVariantImportJob",
                "omics:GetVariantImportJob",
                "omics:ListVariantStores",
                "omics:CreateVariantStore",
                "omics:GetVariantStore",
                "omics:ListReferenceStores",
                "omics:GetReferenceStore",
                "omics:ListReferences", 
                "omics:GetReference",
                "omics:GetReferenceMetadata"
            ],
            "Resource": "*"
        },
        # AWS RAM resource-share invitation handling.
        {
            "Effect": "Allow",
            "Action": [
                "ram:AcceptResourceShareInvitation",
                "ram:GetResourceShareInvitations"
            ],
            "Resource": "*"
        },
        # Lake Formation permission and settings management.
        {
            "Effect": "Allow",
            "Action": [
                "lakeformation:GrantPermissions",
                "lakeformation:RevokePermissions",
                "lakeformation:ListPermissions",
                "lakeformation:GetDataAccess",
                "lakeformation:GetDataLakeSettings",
                "lakeformation:PutDataLakeSettings"
            ],
            "Resource": "*"
        },
        # Glue catalog access.
        # NOTE(review): the first four lakeformation:* actions below repeat
        # the previous statement; only GetResourceLFTags/ListLFTags are new.
        {
            "Effect": "Allow",
            "Action": [
                "glue:CreateTable",
                "glue:GetTable",
                "glue:UpdateTable",
                "glue:DeleteTable",
                "glue:GetTables",
                "glue:GetDatabase",
                "glue:GetDatabases",
                "lakeformation:GetDataAccess",
                "lakeformation:GrantPermissions",
                "lakeformation:RevokePermissions",
                "lakeformation:ListPermissions",
                "lakeformation:GetResourceLFTags",
                "lakeformation:ListLFTags"
            ],
            "Resource": "*"
        },
        # Lets the role pass itself (the trust policy also admits omics.amazonaws.com).
        {
            "Effect": "Allow",
            "Action": "iam:PassRole",
            "Resource": f"arn:aws:iam::{account_id}:role/{role_name}"
        },
        # KMS usage for a single key.
        # NOTE(review): the ARN is a literal placeholder (AWS_ACCOUNT_ID,
        # AWS_KMS_KEY_ID, fixed us-east-1) and is not derived from
        # `account_id`/`region`; it must be edited by hand before use.
        {
            "Effect": "Allow",
            "Action": [
                "kms:Decrypt",
                "kms:DescribeKey",
                "kms:GenerateDataKey",
                "kms:Encrypt"
            ],
             "Resource": "arn:aws:kms:us-east-1:AWS_ACCOUNT_ID:key/AWS_KMS_KEY_ID"
        }
    ]
}

# Always write the inline policy. put_role_policy creates the policy or
# overwrites an existing one of the same name, so re-running this cell after
# changing `lambda_policy` (for example a new `table_name`) refreshes the
# role instead of silently keeping a stale document.
try:
    iam_client.put_role_policy(
        RoleName=role_name,
        PolicyName=policy_name,
        PolicyDocument=json.dumps(lambda_policy)
    )
    print(f"✅ Policy {policy_name} attached to role successfully!")

except Exception as e:
    # Reported only; the notebook continues so the user can fix and re-run.
    print(f"❌ Error with policy operation: {e}")

## 3. Setup HealthOmics Resources

### Create reference store

In [69]:
def get_role_arn(role_name):
    """Return the ARN of the IAM role named ``role_name``.

    Args:
        role_name: Name of an existing IAM role.

    Returns:
        The role's ARN string.

    Raises:
        botocore.exceptions.ClientError: re-raised (after printing a short
            message) when the role cannot be loaded.
    """
    try:
        iam = boto3.resource('iam')
        role = iam.Role(role_name)
        role.load()  # calls GetRole to load attributes
    # Use the fully qualified name: the bare `ClientError` is only imported by
    # a later cell, so referencing it here failed with NameError on a fresh kernel.
    except botocore.exceptions.ClientError:
        print("Couldn't get role named %s."%role_name)
        raise
    else:
        return role.arn

In [None]:
# Re-create the HealthOmics client pinned to us-east-1, then reuse the first
# existing reference store or create one named 'RefStore' if none is listed.
omics = boto3.client('omics', region_name='us-east-1')
list_ref_stores = omics.list_reference_stores()
ref_store = list_ref_stores.get('referenceStores')
if not ref_store:
    response = omics.create_reference_store(name='RefStore')
    print(response)
    ref_store_id = response['id']
else:
    # Only the first listed store is considered.
    ref_store_id = ref_store[0]['id']
ref_store_id
# The reference import below is disabled. While it stays commented out,
# `ref_import_job` is never assigned by this notebook.
# reference_s3_uri = 's3://YOUR_S3_BUCKET/YOUR_PREFIX/hg38_alt_aware_nohla.fa'
# ref_import_job = omics.start_reference_import_job(
#     referenceStoreId = ref_store_id,
#     roleArn = get_role_arn(role_name),
#     sources=[{
#         'sourceFile': reference_s3_uri,
#         'name': 'YOUR_REFERENCE_NAME',
#         'tags': {'SourceLocation': '1kg'}
#    }])

In [None]:
# The call that starts a reference import job is commented out in the
# previous cell, so `ref_import_job` exists only if the user started a job
# manually. Look it up defensively instead of failing with NameError.
_started_ref_import_job = globals().get('ref_import_job')
if _started_ref_import_job:
    # Refresh the job description to see its current status.
    ref_import_job = omics.get_reference_import_job(
        referenceStoreId=ref_store_id, 
        id=_started_ref_import_job['id'])
else:
    ref_import_job = None
    print("ℹ️ No reference import job was started in this session; skipping status check.")
ref_import_job

In [None]:
# Look up the reference genome by name inside the selected reference store.
ref = omics.list_references(referenceStoreId=ref_store_id,
    filter ={"name": "YOUR_REFERENCE_NAME"})
print(ref)
# Fail with an actionable message instead of a bare IndexError when the
# filter matches nothing (e.g. the placeholder name was not replaced or the
# reference has not been imported yet).
if not ref.get('references'):
    raise RuntimeError(
        f"No reference named 'YOUR_REFERENCE_NAME' found in reference store {ref_store_id}. "
        "Import a reference or update the name filter before continuing."
    )
# The first match supplies the id/ARN used when creating the variant store.
reference_id = ref['references'][0]['id']
reference_arn = ref['references'][0]['arn']

### Create Variant store

In [111]:
# Placeholder names; replace both before running the variant-store cells.
# (Plain string literal: the previous f-string prefix had no placeholders.)
var_store_name = 'YOUR_VARIANT_STORE_NAME'
ref_name = 'YOUR_REFERENCE_NAME'  ## Change this reference name to match one you have created if needed

In [None]:
import boto3
from botocore.exceptions import ClientError

# Initialize the omics client
# NOTE(review): this rebinds the global `omics`, replacing the client that the
# reference-store cell created with region_name='us-east-1' by one that uses
# the session's default region — confirm both are meant to be the same region.
omics = boto3.client('omics')

def get_existing_variant_store(store_name):
    """Return the variant store summary whose name equals ``store_name``.

    All pages of ``list_variant_stores`` are inspected by following
    ``nextToken``; previously only the first page was searched, so a store
    listed on a later page was reported as missing.

    Args:
        store_name: Exact variant store name to look for.

    Returns:
        The matching store dict from the listing, or ``None`` when no store
        matches or the listing call fails with a ``ClientError`` (the error
        is printed).
    """
    try:
        request_kwargs = {}
        while True:
            response = omics.list_variant_stores(**request_kwargs)
            # Check if any store on this page matches the name
            for store in response.get('variantStores', []):
                if store['name'] == store_name:
                    return store
            next_token = response.get('nextToken')
            if not next_token:
                # Last page reached without a match.
                return None
            request_kwargs = {'nextToken': next_token}
    except ClientError as e:
        print(f"Error listing variant stores: {e}")
        return None

# Create the variant store against `reference_arn`, falling back to the
# existing store of the same name when the service reports a conflict.
try:
    # Try to create the variant store
    response = omics.create_variant_store(
        name=var_store_name, 
        reference={"referenceArn": reference_arn}
    )
    var_store = response
    print(f"Created new variant store: {response['id']}")
    
except ClientError as e:
    if e.response['Error']['Code'] == 'ConflictException':
        # Store already exists, get the existing one
        print(f"Variant store '{var_store_name}' already exists, retrieving existing store...")
        existing_store = get_existing_variant_store(var_store_name)
        if existing_store:
            # `var_store` and `response` both end up pointing at the listing entry.
            var_store = existing_store
            response = existing_store
        else:
            # Conflict reported but the store was not found in the listing.
            raise Exception(f"Could not find existing variant store: {var_store_name}")
    else:
        # Any other client error is propagated unchanged.
        raise e

In [None]:
# Replace `var_store` with the full GetVariantStore description, looked up by name.
var_store = omics.get_variant_store(name=var_store['name'])
print (var_store)

In [114]:
# Configuration - Update these values for your setup
# NOTE(review): this cell runs after the variant store has been created and
# overwrites `var_store_name` and the `reference_arn` discovered from
# list_references with placeholder literals — confirm whether it is meant to
# be run before the creation cell instead.
var_store_name = 'YOUR_VARIANT_STORE_NAME'  # Update with your variant store name
reference_arn = 'arn:aws:omics:us-east-1:AWS_ACCOUNT_ID:referenceStore/YOUR_REF_STORE_ID/reference/YOUR_REFERENCE_ID'  # Update with your reference ARN

In [None]:
def enable_analytics_via_cli(store_name='YOUR_VARIANT_STORE_NAME', region='us-east-1'):
    """Run ``aws omics update-variant-store`` for a variant store.

    The store name and region used to be hard-coded inside the command; they
    are now parameters whose defaults reproduce the previous command exactly,
    so existing zero-argument calls behave as before.

    Args:
        store_name: Name of the variant store to update (e.g. ``var_store_name``).
        region: AWS region passed to the CLI via ``--region``.

    Returns:
        ``True`` when the CLI exits with status 0, ``False`` when it exits
        non-zero or the command could not be executed at all.
    """
    
    try:
        # AWS CLI command; built as an argument list so no shell quoting is involved
        cmd = [
            'aws', 'omics', 'update-variant-store',
            '--name', store_name,
            '--description', 'Store with analytics enabled for Athena queries',
            '--region', region
        ]
        
        print("🔧 Running AWS CLI command...")
        print(f"Command: {' '.join(cmd)}")
        
        result = subprocess.run(cmd, capture_output=True, text=True)
        
        if result.returncode == 0:
            print("✅ Command executed successfully!")
            print("Response:", result.stdout)
            return True
        else:
            print("❌ Command failed!")
            print("Error:", result.stderr)
            return False
            
    except Exception as e:
        # Covers failures to launch the process (e.g. the CLI is not installed).
        print(f"❌ Error executing command: {e}")
        return False

# Run the command
# `success` is True only when the CLI exited with status 0.
success = enable_analytics_via_cli()

if success:
    # Guidance printed for the user; nothing is polled or verified here.
    print("\n🎯 Next steps:")
    print("1. Wait 10-15 minutes for analytics to be fully enabled")
    print("2. Try your Athena queries again")
    print("3. The resource links should now work properly")

In [None]:
# Get variant store details
if var_store:
    try:
        var_store_details = omics.get_variant_store(name=var_store['name'])
        print(f"📋 Variant Store Details:")
        print(f"   Name: {var_store_details['name']}")
        print(f"   ID: {var_store_details['id']}")
        print(f"   Status: {var_store_details['status']}")
        print(f"   ARN: {var_store_details['storeArn']}")
        
        # Store these for Lambda environment variables
        # (read by the deployment cell; they stay undefined if the lookup above fails)
        variant_store_name = var_store_details['name']
        variant_store_id = var_store_details['id']
        
    except Exception as e:
        # Printed only; `variant_store_name`/`variant_store_id` are not set in this case.
        print(f"❌ Error getting variant store details: {e}")
else:
    print("❌ No variant store available")

## 4. Deploy Improved Lambda Function

In [131]:
# Lambda deployment functions
def create_lambda_zip(source_file, output_zip):
    """Build a Lambda deployment zip from ``source_file`` plus boto3/botocore.

    The package is staged in a private temporary directory. The previous
    implementation staged it in a fixed ``lambda_package_temp`` directory in
    the current working directory and deleted any directory of that name
    before and after the build, which could destroy unrelated user files.

    Args:
        source_file: Path to the Lambda handler source file; it is placed at
            the root of the archive under its base name.
        output_zip: Path of the zip archive to create (overwritten if present).

    Returns:
        ``True`` when the archive was written, ``False`` on any error (the
        error is printed, not raised).
    """
    import tempfile  # local import: only this helper needs it

    print(f"Creating zip file {output_zip} with {source_file} and dependencies")

    try:
        # TemporaryDirectory removes the staging area on exit, success or failure.
        with tempfile.TemporaryDirectory(prefix="lambda_package_") as temp_dir:
            # Copy the source file
            source_filename = os.path.basename(source_file)
            shutil.copy(source_file, os.path.join(temp_dir, source_filename))

            # Install dependencies next to the handler using the kernel's interpreter
            print("Installing dependencies...")
            subprocess.check_call([
                sys.executable, "-m", "pip", "install",
                "boto3", "botocore",
                "--target", temp_dir,
                "--no-cache-dir"
            ])

            # Create the zip file with paths relative to the staging directory
            with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for root, _, files in os.walk(temp_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.relpath(file_path, temp_dir)
                        zipf.write(file_path, arcname)

        print(f"✅ Successfully created {output_zip} with dependencies")
        return True
    except Exception as e:
        print(f"❌ Error creating zip file: {e}")
        return False

In [132]:
def deploy_lambda_function(function_name, zip_file, role_arn, handler, runtime, timeout, environment_vars):
    """Create the Lambda function, or update its code and configuration.

    When the function already exists its code is updated first; the
    configuration update is issued only after the ``function_updated`` waiter
    reports that the code update finished. The previous fixed 10-second sleep
    could be too short, making the configuration update fail because an
    update was still in progress.

    Args:
        function_name: Lambda function name.
        zip_file: Path to the deployment package.
        role_arn: Execution role ARN.
        handler: Handler in ``module.function`` form.
        runtime: Lambda runtime identifier.
        timeout: Function timeout in seconds.
        environment_vars: Mapping of environment variable names to values.

    Returns:
        The response of ``create_function`` or
        ``update_function_configuration``, or ``None`` on any failure (the
        error is printed, not raised).
    """
    
    try:
        with open(zip_file, 'rb') as file_data:
            zip_bytes = file_data.read()
    except Exception as e:
        print(f"❌ Error reading zip file: {e}")
        return None
    
    # Check if function exists
    try:
        lambda_client.get_function(FunctionName=function_name)
        function_exists = True
        print(f"🔄 Lambda function {function_name} exists. Updating...")
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == 'ResourceNotFoundException':
            function_exists = False
            print(f"🆕 Creating new Lambda function {function_name}...")
        else:
            print(f"❌ Error checking Lambda function: {e}")
            return None
    
    try:
        if function_exists:
            # Update function code
            print("Updating function code...")
            lambda_client.update_function_code(
                FunctionName=function_name,
                ZipFile=zip_bytes
            )
            
            # Block until Lambda reports the code update as finished instead
            # of guessing a duration.
            print("Waiting for code update to complete...")
            lambda_client.get_waiter('function_updated').wait(FunctionName=function_name)
            
            # Update configuration
            print("Updating function configuration...")
            response = lambda_client.update_function_configuration(
                FunctionName=function_name,
                Role=role_arn,
                Handler=handler,
                Runtime=runtime,
                Timeout=timeout,
                Environment={
                    'Variables': environment_vars
                }
            )
            print(f"✅ Lambda function {function_name} updated successfully!")
            return response
        else:
            # Create new function
            response = lambda_client.create_function(
                FunctionName=function_name,
                Runtime=runtime,
                Role=role_arn,
                Handler=handler,
                Code={
                    'ZipFile': zip_bytes
                },
                Timeout=timeout,
                Environment={
                    'Variables': environment_vars
                }
            )
            print(f"✅ Lambda function {function_name} created successfully!")
            return response
    except Exception as e:
        print(f"❌ Error deploying Lambda function: {e}")
        return None

In [None]:
# Deploy the improved Lambda function
# Make sure you have lambda_function_fixed_final.py in the current directory

# Parameters
source_file = 'lambda_function_fixed_final.py'  # Use the fixed_final version
zip_file = 'lambda_function.zip'
function_name = 'VcfProcessor3'
runtime = 'python3.9'
# The handler's module part must match the base name of `source_file`.
handler = 'lambda_function_fixed_final.lambda_handler'  # Updated handler
role_arn = f'arn:aws:iam::{account_id}:role/{role_name}'
timeout = 900

# Environment variables for Lambda
# `variant_store_name`/`variant_store_id` come from the "Get variant store
# details" cell and exist only if that lookup succeeded.
environment_vars = {
    'VARIANT_STORE_NAME': variant_store_name,
    'VARIANT_STORE_ID': variant_store_id,
    'DYNAMODB_TABLE': table_name
}

print(f"📦 Deploying Lambda function with:")
print(f"   Source: {source_file}")
print(f"   Function Name: {function_name}")
print(f"   Variant Store: {variant_store_name} ({variant_store_id})")
print(f"   DynamoDB Table: {table_name}")

# Execute deployment
# Package first; deploy only when the zip was built.
if create_lambda_zip(source_file, zip_file):
    response = deploy_lambda_function(
        function_name,
        zip_file,
        role_arn,
        handler,
        runtime,
        timeout,
        environment_vars
    )
    
    if response:
        print(f"🎯 Function ARN: {response.get('FunctionArn')}")
        print(f"📊 Function State: {response.get('State')}")
        # Only assigned on success; later cells test for its existence.
        lambda_function_arn = response.get('FunctionArn')
    else:
        print("❌ Failed to deploy Lambda function")
else:
    print("❌ Failed to create deployment package")

## 5. Configure S3 Event Trigger

In [119]:
# Configure S3 bucket to trigger Lambda on VCF file uploads
def configure_s3_event_notification(s3_uri, lambda_function_arn, event_types, suffix=None):
    """Configure an S3 bucket to invoke a Lambda function on the given events.

    Two defects of the earlier version are addressed:

    * The bucket's notification configuration was replaced wholesale, which
      removed every notification already configured on the bucket. The
      current configuration is now read first and only the entry for
      ``lambda_function_arn`` is added or replaced.
    * The permission StatementId embedded the raw bucket name. Bucket names
      may contain characters (such as dots) that Lambda does not accept in a
      StatementId, so the id is sanitised and capped at 100 characters.

    Args:
        s3_uri: ``s3://bucket/optional/prefix`` — the path part becomes a
            key-prefix filter.
        lambda_function_arn: ARN of the function to invoke.
        event_types: List of S3 event names, e.g. ``['s3:ObjectCreated:*']``.
        suffix: Optional key-suffix filter.

    Returns:
        ``True`` on success, ``False`` when any AWS call fails (the error is
        printed).

    Raises:
        ValueError: if ``s3_uri`` is not an ``s3://`` URI.
    """
    
    # Parse S3 URI
    match = re.match(r's3://([^/]+)/?(.*)', s3_uri)
    if not match:
        raise ValueError(f"Invalid S3 URI format: {s3_uri}")
    
    bucket_name = match.group(1)
    prefix = match.group(2)
    
    # Create filter rules
    filter_rules = []
    if prefix:
        filter_rules.append({'Name': 'prefix', 'Value': prefix})
    if suffix:
        filter_rules.append({'Name': 'suffix', 'Value': suffix})
    
    # Notification entry for this function
    lambda_config = {
        'LambdaFunctionArn': lambda_function_arn,
        'Events': event_types
    }
    if filter_rules:
        lambda_config['Filter'] = {
            'Key': {'FilterRules': filter_rules}
        }
    
    try:
        # Add Lambda permission for S3
        raw_statement_id = f"s3-{bucket_name}-to-lambda-{lambda_function_arn.split(':')[-1]}"
        # StatementId accepts only letters, digits, '-' and '_', max 100 chars.
        statement_id = re.sub(r'[^A-Za-z0-9_-]', '-', raw_statement_id)[:100]
        
        try:
            lambda_client.add_permission(
                FunctionName=lambda_function_arn,
                StatementId=statement_id,
                Action='lambda:InvokeFunction',
                Principal='s3.amazonaws.com',
                SourceArn=f"arn:aws:s3:::{bucket_name}"
            )
            print(f"✅ Added S3 permission for bucket {bucket_name}")
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == 'ResourceConflictException':
                print(f"✅ S3 permission already exists for bucket {bucket_name}")
            else:
                print(f"❌ Error adding S3 permission: {e}")
                return False
        
        # Start from the bucket's current configuration so unrelated
        # topic/queue/Lambda notifications are preserved.
        existing_config = s3.get_bucket_notification_configuration(Bucket=bucket_name)
        notification_config = {
            key: value for key, value in existing_config.items()
            if key != 'ResponseMetadata'
        }
        # Drop any previous entry for this function so a re-run replaces it
        # rather than adding an overlapping duplicate.
        other_lambda_configs = [
            config for config in notification_config.get('LambdaFunctionConfigurations', [])
            if config.get('LambdaFunctionArn') != lambda_function_arn
        ]
        notification_config['LambdaFunctionConfigurations'] = other_lambda_configs + [lambda_config]
        
        # Apply notification configuration
        s3.put_bucket_notification_configuration(
            Bucket=bucket_name,
            NotificationConfiguration=notification_config
        )
        
        print(f"✅ S3 event notification configured for bucket {bucket_name} with prefix '{prefix}'")
        return True
        
    except Exception as e:
        print(f"❌ Error configuring S3 event notification: {e}")
        return False

In [None]:
# Configure S3 event trigger
# Update this S3 URI to match your bucket and prefix
s3_uri = "s3://YOUR_S3_BUCKET/YOUR_PREFIX/"  # UPDATE THIS
event_types = ['s3:ObjectCreated:*']
suffix = '.hard-filtered.vcf.gz'

print(f"🔗 Configuring S3 event trigger:")
print(f"   S3 URI: {s3_uri}")
print(f"   File suffix: {suffix}")

# `lambda_function_arn` is assigned only when the deployment cell succeeded.
# It used to be printed before this guard, which raised NameError precisely in
# the situation the guard is meant to handle.
if 'lambda_function_arn' in locals():
    print(f"   Lambda ARN: {lambda_function_arn}")
    success = configure_s3_event_notification(
        s3_uri,
        lambda_function_arn,
        event_types,
        suffix
    )
    
    if success:
        print("🎉 S3 event notification configured successfully!")
    else:
        print("❌ Failed to configure S3 event notification")
else:
    print("⚠️ Lambda function ARN not available. Deploy Lambda function first.")

## 6. Setup Scheduled Status Checking (Optional)

In [None]:
# # Optional: Set up EventBridge rule for scheduled status checking
# # This provides a fallback mechanism for status updates

# RULE_NAME = "VcfStatusCheckSchedule"
# SCHEDULE_EXPRESSION = "rate(10 minutes)"  # Check every 10 minutes

# print(f"⏰ Setting up scheduled status checking:")
# print(f"   Rule Name: {RULE_NAME}")
# print(f"   Schedule: {SCHEDULE_EXPRESSION}")

# try:
#     # Create EventBridge rule
#     rule_response = events_client.put_rule(
#         Name=RULE_NAME,
#         ScheduleExpression=SCHEDULE_EXPRESSION,
#         Description="Triggers VCF status checker as fallback",
#         State='ENABLED'
#     )
#     print(f"✅ EventBridge rule created: {rule_response['RuleArn']}")
    
#     # Add Lambda as target
#     if 'lambda_function_arn' in locals():
#         target_response = events_client.put_targets(
#             Rule=RULE_NAME,
#             Targets=[
#                 {
#                     'Id': '1',
#                     'Arn': lambda_function_arn
#                 }
#             ]
#         )
        
#         if target_response['FailedEntryCount'] == 0:
#             print(f"✅ Lambda function added as EventBridge target")
#         else:
#             print(f"❌ Failed to add Lambda as target: {target_response['FailedEntries']}")
        
#         # Grant EventBridge permission to invoke Lambda
#         try:
#             lambda_client.add_permission(
#                 FunctionName=function_name,
#                 StatementId="AllowExecutionFromEventBridge",
#                 Action="lambda:InvokeFunction",
#                 Principal="events.amazonaws.com",
#                 SourceArn=f"arn:aws:events:{region}:{account_id}:rule/{RULE_NAME}"
#             )
#             print(f"✅ EventBridge permission granted")
#         except botocore.exceptions.ClientError as e:
#             if e.response['Error']['Code'] == 'ResourceConflictException':
#                 print(f"✅ EventBridge permission already exists")
#             else:
#                 print(f"❌ Error granting EventBridge permission: {e}")
#     else:
#         print("⚠️ Lambda function ARN not available for EventBridge target")
        
# except Exception as e:
#     print(f"❌ Error setting up EventBridge rule: {e}")

## 7. Testing and Verification

In [None]:
# Check DynamoDB table contents
def show_all_records():
    """Print every item stored in the DynamoDB table named by ``table_name``.

    A single ``scan`` call returns only one page of results, so the scan is
    repeated with ``ExclusiveStartKey`` until no ``LastEvaluatedKey`` is
    returned. Previously only the first page was shown.

    Returns:
        None. The record count and each record's attributes are printed.
    """
    dynamodb_resource = boto3.resource('dynamodb')
    table = dynamodb_resource.Table(table_name)
    
    items = []
    scan_kwargs = {}
    while True:
        page = table.scan(**scan_kwargs)
        items.extend(page.get('Items', []))
        last_key = page.get('LastEvaluatedKey')
        if not last_key:
            break
        # Continue the scan where the previous page stopped.
        scan_kwargs = {'ExclusiveStartKey': last_key}
    
    print(f"Found {len(items)} records:\n")
    
    for i, item in enumerate(items, 1):
        print(f"Record {i}:")
        for key, value in item.items():
            print(f"  {key}: {value}")
        print()

# Print the current contents of the tracking table.
show_all_records()

### Testing variant store queries

In [137]:
# Clients for AWS RAM (resource shares) and Glue (data catalog).
ram = boto3.client('ram')
glue = boto3.client('glue')

# Account id and caller ARN of the identity running the notebook.
# NOTE(review): `AWS_IDENITY_ARN` is spelled this way in the source (likely
# meant "IDENTITY"); it is not referenced by the other cells shown in this notebook.
caller_identity = boto3.client('sts').get_caller_identity()
AWS_ACCOUNT_ID = caller_identity['Account']
AWS_IDENITY_ARN = caller_identity['Arn']

First we'll list available shared resources from OTHER-ACCOUNTS in AWS RAM and look for the resource that matches the id of the Variant store we created above.

In [None]:
# List Glue databases shared with this account through AWS RAM and pick the
# one whose ARN contains the variant store id.
response = ram.list_resources(resourceOwner='OTHER-ACCOUNTS', resourceType='glue:Database')

# Default to None so the final expression (and later cells) never hit a
# NameError when nothing matches; previously the name was left undefined.
variantstore_resource = None

if not response.get('resources'):
    print('no shared resources found. verify that you have successfully created an Omics Analytics store')
else:
    variantstore_resources = [resource for resource in response['resources'] if var_store['id'] in resource['arn']]
    if not variantstore_resources:
        print(f"no shared resources matching variant store id {var_store['id']} found")
    else:
        # The first matching shared resource is used.
        variantstore_resource = variantstore_resources[0]

variantstore_resource

In [None]:
# Fetch the resource share that contains the matched Glue database; its
# 'owningAccountId' is used as the catalog id when creating tables below.
resource_share = ram.get_resource_shares(
    resourceOwner='OTHER-ACCOUNTS', 
    resourceShareArns=[variantstore_resource['resourceShareArn']])['resourceShares'][0]
resource_share

#### Create a Database

In [None]:
def create_your_own_database(database_name='vcf_analysis_db', role_name='VcfProcessorLambdaRole3'):
    """Create a Glue database and grant a role ALL Lake Formation permissions on it.

    The function is safe to re-run: when the database already exists the
    creation step is skipped and the permission grant is still applied.
    Previously the "already exists" error aborted the function before the
    grant, so a re-run could never repair missing permissions.

    Args:
        database_name: Name of the Glue database to create.
        role_name: IAM role (in the caller's account) that receives the
            permissions. Defaults to the role name used by this notebook.

    Returns:
        None. Progress and errors are printed.
    """
    
    glue = boto3.client('glue')
    lakeformation = boto3.client('lakeformation')
    sts_client = boto3.client('sts')
    
    account_id = sts_client.get_caller_identity()['Account']
    role_arn = f'arn:aws:iam::{account_id}:role/{role_name}'
    
    try:
        # Create database (tolerating an existing one)
        try:
            glue.create_database(
                DatabaseInput={
                    'Name': database_name,
                    'Description': 'Database for VCF analysis and genomics data'
                }
            )
            print(f"✅ Created database: {database_name}")
        except glue.exceptions.AlreadyExistsException:
            print(f"✅ Database {database_name} already exists")
        
        # Grant the role ALL permissions on the database
        lakeformation.grant_permissions(
            Principal={
                'DataLakePrincipalIdentifier': role_arn
            },
            Resource={
                'Database': {
                    'Name': database_name
                }
            },
            Permissions=['ALL']
        )
        print(f"✅ Granted ALL permissions on {database_name}")
    except Exception as e:
        print(f"❌ Error: {e}")

# Run this instead
# (creates the notebook's own Glue database rather than using the shared one)
create_your_own_database()

Now that we have resource links created, we can start querying the data using Amazon Athena. You don't need to wait for all the import jobs to complete to start doing this. Queries can be made while data imports in the background.

To query Amazon Omics Analytics stores, you need to use Athena engine version 3. The following code checks whether you have an existing Athena workgroup that satisfies this criterion. If not, it will create one called `omics`.

In [None]:
# # Delete a single table
# try:
#     response = glue.delete_table(
#         DatabaseName='vcf_analysis_db',
#         Name=var_store['name']
#     )
#     print(f"Table deleted successfully: {response}")
# except Exception as e:
#     print(f"Error deleting table: {e}")

In [None]:
# Create a table in 'vcf_analysis_db' whose definition consists only of a
# TargetTable, i.e. it points at the table of the same name in the shared
# database owned by the resource share's account.
# NOTE(review): there is no handling for an already-existing table, so
# re-running this cell presumably fails — confirm (see the commented-out
# delete_table cell above).
glue.create_table(
    DatabaseName='vcf_analysis_db',
    TableInput = {
        "Name": var_store['name'],
        "TargetTable": {
            "CatalogId": resource_share['owningAccountId'],
            "DatabaseName": f"variant_{account_id}_{var_store['id']}",
            "Name": var_store['name'],
        }
    }
)

In [None]:
# List the Athena workgroups visible to the caller; the next cell searches
# this list for an engine-version-3 workgroup.
athena = boto3.client('athena')
athena_workgroups = athena.list_work_groups()['WorkGroups']
athena_workgroups

In [None]:
# Pick the first workgroup running Athena engine version 3, or create one
# named 'omics' when none exists.
athena_workgroup = None
for wg in athena_workgroups:
    # Use .get so a summary without engine-version details is skipped instead
    # of raising KeyError.
    effective_engine_version = wg.get('EngineVersion', {}).get('EffectiveEngineVersion')
    print(effective_engine_version)
    if effective_engine_version == 'Athena engine version 3':
        print(f"Workgroup '{wg['Name']}' found using Athena engine version 3")
        athena_workgroup = wg
        break
else:
    # for/else: reached only when the loop finished without `break`.
    print("No workgroups with Athena engine version 3 found. creating one")
    athena.create_work_group(
        Name='omics',
        Configuration={
            "EngineVersion": {
                "SelectedEngineVersion": "Athena engine version 3"
            }
        }
    )
    # The create call's response carries no workgroup data, so record the
    # name explicitly; `athena_workgroup['Name']` then works on both paths.
    athena_workgroup = {'Name': 'omics'}

athena_workgroup

In [None]:
!{sys.executable} -m pip install awswrangler

In [142]:
import awswrangler as wr
import pandas as pd
from datetime import datetime

In [None]:
def enable_analytics_via_cli(store_name='YOUR_VARIANT_STORE_NAME', region='us-east-1'):
    """Run ``aws omics update-variant-store`` for a variant store.

    This cell re-defines the helper declared earlier in the notebook. It used
    to hard-code a leftover store name ('hcagentsvs3') that disagreed with the
    placeholder used by the earlier definition and by the Athena queries
    below; the store name and region are now parameters with the same
    defaults as the earlier definition.

    Args:
        store_name: Name of the variant store to update (e.g. ``var_store_name``).
        region: AWS region passed to the CLI via ``--region``.

    Returns:
        ``True`` when the CLI exits with status 0, ``False`` when it exits
        non-zero or the command could not be executed at all.
    """
    
    try:
        # AWS CLI command; built as an argument list so no shell quoting is involved
        cmd = [
            'aws', 'omics', 'update-variant-store',
            '--name', store_name,
            '--description', 'Store with analytics enabled for Athena queries',
            '--region', region
        ]
        
        print("🔧 Running AWS CLI command...")
        print(f"Command: {' '.join(cmd)}")
        
        result = subprocess.run(cmd, capture_output=True, text=True)
        
        if result.returncode == 0:
            print("✅ Command executed successfully!")
            print("Response:", result.stdout)
            return True
        else:
            print("❌ Command failed!")
            print("Error:", result.stderr)
            return False
            
    except Exception as e:
        # Covers failures to launch the process (e.g. the CLI is not installed).
        print(f"❌ Error executing command: {e}")
        return False

# Run the command
# `success` is True only when the CLI exited with status 0.
success = enable_analytics_via_cli()

if success:
    # Guidance printed for the user; nothing is polled or verified here.
    print("\n🎯 Next steps:")
    print("1. Wait 10-15 minutes for analytics to be fully enabled")
    print("2. Try your Athena queries again")
    print("3. The resource links should now work properly")

In [86]:
# NOTE(review): hard-coded table name 'hcagentsvs3' (other cells use the
# YOUR_VARIANT_STORE_NAME placeholder), and `simple_query` is not passed to
# any query call in the cells shown here.
simple_query = "SELECT * from vcf_analysis_db.hcagentsvs3 LIMIT 10"

In [None]:
import awswrangler as wr

# This should work now (after analytics enables):
# Reads five rows from the variant-store table into a DataFrame.
# NOTE(review): the workgroup is hard-coded to "datasets-workgroup" rather
# than the `athena_workgroup` selected earlier — confirm it exists and runs
# Athena engine version 3.
df = wr.athena.read_sql_query(
    "SELECT * FROM vcf_analysis_db.YOUR_VARIANT_STORE_NAME LIMIT 5",
    database="vcf_analysis_db",
    workgroup="datasets-workgroup"
)

In [None]:
# Create a second table in 'vcf_analysis_db' that targets the table
# "annotationstore_cliinvar" in a shared database.
# NOTE(review): the target database name contains the literal placeholder
# YOUR_ANNOTATION_STORE_ID and must be edited before running; also confirm
# the "variant_" database prefix is correct for an annotation store.
glue.create_table(
    DatabaseName='vcf_analysis_db',
    TableInput = {
        "Name": "annotationstore_cliinvar",
        "TargetTable": {
            "CatalogId": resource_share['owningAccountId'],
            "DatabaseName": f"variant_{account_id}_YOUR_ANNOTATION_STORE_ID",
            "Name": "annotationstore_cliinvar",
        }
    }
)

In [146]:
# Rebinds `simple_query` to the placeholder table name.
# NOTE(review): the variable is not passed to any query call in the cells shown here.
simple_query = "SELECT * from vcf_analysis_db.YOUR_VARIANT_STORE_NAME LIMIT 10"

In [None]:
import awswrangler as wr

# This should work now (after analytics enables):
# Same query as the earlier Athena cell; `df` is overwritten with the result.
# NOTE(review): the workgroup is hard-coded to "datasets-workgroup" rather
# than the `athena_workgroup` selected earlier.
df = wr.athena.read_sql_query(
    "SELECT * FROM vcf_analysis_db.YOUR_VARIANT_STORE_NAME LIMIT 5",
    database="vcf_analysis_db",
    workgroup="datasets-workgroup"
)

## 8. Setup Summary and Next Steps

In [None]:
# Display setup summary
# Optional components are reported only when their variables exist:
# `RULE_NAME` is defined solely by the commented-out scheduling section and
# `OMICS_RULE_NAME` is not defined by any cell in this notebook, so reading
# them unconditionally raised NameError on a normal top-to-bottom run.
print("🎉 VCF Import Status Tracker Setup Complete!")
print("=" * 50)
print()
print("📋 Components Created:")
print(f"   ✅ DynamoDB Table: {table_name}")
print(f"   ✅ IAM Role: {role_name}")
print(f"   ✅ Lambda Function: {function_name}")
if 'variant_store_name' in locals():
    print(f"   ✅ Variant Store: {variant_store_name} ({variant_store_id})")
print(f"   ✅ S3 Event Trigger: Configured")
if 'RULE_NAME' in globals():
    print(f"   ✅ Scheduled Status Check: {RULE_NAME}")
else:
    print("   ⚠️ Scheduled Status Check: not configured (optional section 6 was not run)")
print()
print("🔧 Key Improvements in Lambda Function:")
print("   ✅ fixed_final status checking logic")
print("   ✅ Always updates DynamoDB to match HealthOmics status")
print("   ✅ Better error handling for missing jobs")
print("   ✅ Consistent timestamp formatting")
print("   ✅ Clearer logging and debugging")
print()
print("📊 Monitoring:")
print(f"   📈 CloudWatch Logs: /aws/lambda/{function_name}")
print(f"   📊 DynamoDB Table: {table_name}")
if 'OMICS_RULE_NAME' in globals():
    print(f"   ⚡ HealthOmics EventBridge Rule: {OMICS_RULE_NAME}")
else:
    print("   ⚡ HealthOmics EventBridge Rule: not configured")
print()
print("🚀 Next Steps:")
print("   1. Update S3 URI in the S3 event configuration section")
print("   2. Upload a test VCF file to trigger the workflow")
print("   3. Monitor CloudWatch logs for execution details")
print("   4. Check DynamoDB table for real-time status updates")
print("   5. Verify that status updates happen instantly when jobs complete")
print()
print("🔍 Troubleshooting:")
print("   - Check CloudWatch logs if jobs fail")
print("   - Verify IAM permissions if access denied")
print("   - Ensure S3 bucket and prefix are correct")
print("   - Confirm HealthOmics variant store is active")
print("   - Use manual status check for debugging if needed")