In [None]:
import time
import json
from time import gmtime, strftime
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.processing import Processor, ProcessingOutput

# Look up the AWS account ID of the calling identity through STS.
client = boto3.client("sts")
account = client.get_caller_identity()["Account"]

# SageMaker session and the default S3 bucket it resolves to.
sess = sagemaker.Session()
default_bucket = sess.default_bucket()

# Region of the default boto3 session and the notebook's execution role.
region = boto3.session.Session().region_name
role = get_execution_role()

In [None]:
# User inputs
# `prefix` is the top-level key prefix for this project; it is used both for
# the S3 location of the genetic databases and for the FSx mount path, so the
# S3 URI is derived from it instead of repeating the literal.
prefix = 'alphafoldv2'
s3_genetic_db_bucket = f's3://{default_bucket}/{prefix}/alphafold-genetic-db/'

# Networking for the FSx for Lustre file system created later in this notebook.
# Replace the placeholders with a valid VPC subnet ID and security group ID
# from the VPC you want to train in. Only the first subnet ID is used when the
# file system is created.
vpc_subnet_ids = ['xxxxxx']
security_group_ids = ['xxxxxx']

In [None]:
# Quietly install the sagemaker-studio-image-build package (presumably the
# provider of the `sm-docker` command used by the build cells below), then
# print the help text of `sm-docker build` to confirm the command is available.
!pip install -q sagemaker-studio-image-build
!sm-docker build -h

In [None]:
# Opt-in switch: set to True to build the image used for the genetic database
# download job. Left False so a full re-run does not rebuild the image.
setup_database_processor = False
if setup_database_processor:
    # Build context is the current directory; the Dockerfile lives under ./docker.
    # The --repository value is the repository:tag the image is published as.
    !sm-docker build . --repository sagemaker-studio-alphafold:processor --file ./docker/Dockerfile.processor

In [None]:
setup_alphafold = False
if setup_alphafold:
    !cd docker;sm-docker build . --repository sagemaker-studio-alphafold:estimator --file ./Dockerfile.alphafold --compute-type BUILD_GENERAL1_MEDIUM

In [None]:
setup_openfold = False
if setup_openfold:
    !git clone -b v1.0.1 --single-branch https://github.com/aqlaboratory/openfold.git

    !cd ~/openfold
    !sm-docker build . --repository sagemaker-studio-openfold:base-v1.0.1 --file ./Dockerfile --compute-type BUILD_GENERAL1_MEDIUM

    !cd ~/protein-folding-on-sagemaker/docker
    !sm-docker build . --repository sagemaker-studio-openfold:v1.0.1 --file ./Dockerfile.openfold 
    
    !aws s3 cp --no-sign-request s3://openfold/openfold_params/finetuning_ptm_2.pt ./source_dir/

In [None]:
# ECR image URIs for the three containers used in this notebook. Each tag must
# match the `--repository <name>:<tag>` value of the corresponding
# `sm-docker build` cell, otherwise the job cannot find the image.
ecr_registry = f'{account}.dkr.ecr.{region}.amazonaws.com'
processor_image_uri = f'{ecr_registry}/sagemaker-studio-alphafold:processor'
# The AlphaFold build cell publishes the image as `sagemaker-studio-alphafold:estimator`.
alphafold_image_uri = f'{ecr_registry}/sagemaker-studio-alphafold:estimator'
openfold_image_uri = f'{ecr_registry}/sagemaker-studio-openfold:v1.0.1'

In [None]:
# Opt-in switch: set to True to launch the genetic database download job.
download_database = False

if download_database:
    # Arguments handed to the download script inside the processor image.
    db_preset = 'full_dbs'  # <full_dbs|reduced_dbs>
    download_script_in_image = '/alphafold/scripts/download_all_data.sh'
    output_dir_in_image = '/opt/ml/processing/alphafold-genetic-db'

    # Single-instance processing job whose entrypoint is the download script.
    # The job gets a 3000 GB volume and may run for up to 432000 s (5 days).
    processor = Processor(
        image_uri=processor_image_uri,
        role=role,
        instance_count=1,
        instance_type='ml.t3.xlarge',
        volume_size_in_gb=3000,
        max_runtime_in_seconds=432000,
        base_job_name='alphafold-genetic-db-prep',
        sagemaker_session=sess,
        entrypoint=[download_script_in_image],
    )

    # Upload the contents of the download directory to S3 once the job ends.
    output = [
        ProcessingOutput(
            output_name='alphafold-genetic-db',
            destination=s3_genetic_db_bucket,
            source=output_dir_in_image,
            s3_upload_mode='EndOfJob',
        )
    ]

    # Start the job without blocking the notebook or streaming its logs.
    processor.run(
        outputs=output,
        arguments=[output_dir_in_image, db_preset],
        wait=False,
        logs=False,
    )

In [None]:
# Opt-in switch: set to True to create the FSx for Lustre file system and link
# it to the S3 location that holds the genetic databases.
setup_file_system = False

fsx_client = boto3.client("fsx")

if setup_file_system:
    # Create a 4800 GiB SSD PERSISTENT_2 Lustre file system in the first
    # configured subnet, protected by the configured security groups.
    fsx_response = fsx_client.create_file_system(
        FileSystemType='LUSTRE',
        StorageCapacity=4800,
        StorageType='SSD',
        SubnetIds=[vpc_subnet_ids[0]],
        SecurityGroupIds=security_group_ids,
        LustreConfiguration={
            'DeploymentType': 'PERSISTENT_2',
            'PerUnitStorageThroughput': 250
        }
    )

    FileSystemIds = str(fsx_response["FileSystem"]["FileSystemId"])

    # Poll once a minute until the file system leaves the CREATING state.
    fsx_status = "CREATING"
    while fsx_status == "CREATING":
        time.sleep(60)
        fsx_describe = fsx_client.describe_file_systems(
            FileSystemIds=[FileSystemIds]
        )
        fsx_status = fsx_describe["FileSystems"][0]["Lifecycle"]
        print(fsx_status)

    # Do not try to attach a data repository to a file system that did not
    # come up; surface the failure instead.
    if fsx_status != "AVAILABLE":
        raise RuntimeError(
            f"FSx file system {FileSystemIds} ended in state {fsx_status!r} "
            "instead of 'AVAILABLE'"
        )

    # An S3 VPC gateway endpoint is also required so that jobs running inside
    # the VPC can read the FASTA file from S3 without internet access.
    # Follow the steps in
    # https://docs.aws.amazon.com/sagemaker/latest/dg/train-vpc.html#train-vpc-s3

    # Link the file system path to the S3 prefix holding the genetic databases,
    # importing and exporting new, changed and deleted objects automatically.
    data_rep_response = fsx_client.create_data_repository_association(
        FileSystemId=FileSystemIds,
        FileSystemPath=f'/{prefix}/alphafold-genetic-db/',
        DataRepositoryPath=s3_genetic_db_bucket,
        S3={
            'AutoImportPolicy': {
                'Events': ['NEW', 'CHANGED', 'DELETED']
            },
            'AutoExportPolicy': {
                'Events': ['NEW', 'CHANGED', 'DELETED']
            }
        }
    )