### Docker image to run the Clay model on GPU instances

#### Clone Clay model repo at specific commit

See the Clay GitHub repository: https://github.com/Clay-foundation/model/tree/main

In [1]:
!pip install gitpython

Collecting gitpython
  Using cached GitPython-3.1.44-py3-none-any.whl.metadata (13 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython)
  Using cached gitdb-4.0.12-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython)
  Using cached smmap-5.0.2-py3-none-any.whl.metadata (4.3 kB)
Using cached GitPython-3.1.44-py3-none-any.whl (207 kB)
Using cached gitdb-4.0.12-py3-none-any.whl (62 kB)
Using cached smmap-5.0.2-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, gitpython
Successfully installed gitdb-4.0.12 gitpython-3.1.44 smmap-5.0.2


In [2]:
# Git URL of the upstream Clay model repository to clone.
URL = "https://github.com/Clay-foundation/model.git"
SHA = "32518ce" # Abbreviated hash of the latest commit before v1.5; full commit: https://github.com/Clay-foundation/model/commit/32518ceed8f75f116f3325bdb68c62eeab9ddbae

In [3]:
from git import Repo

def clone_repo_at_commit(git_url, local_dir, commit_hash):
    """
    Clone a git repository and check out a specific commit.

    If ``local_dir`` already contains a git checkout (e.g. the cell is
    re-run), that checkout is reused: its ``origin`` remote is fetched and
    the requested commit is checked out, instead of failing because the
    target directory already exists.

    Args:
        git_url (str): URL of the git repository
        local_dir (str): Local directory where to clone the repository
        commit_hash (str): The specific commit hash to checkout

    Raises:
        Exception: Any error raised while cloning, fetching or checking out
            is printed and then re-raised, so that later cells do not run
            against a missing or wrong checkout.
    """
    import os  # local import: `os` is only imported in a later cell of this notebook

    try:
        if os.path.isdir(os.path.join(local_dir, ".git")):
            # Existing checkout: reuse it and make sure the commit is available locally.
            # NOTE(review): assumes the existing clone's `origin` points at git_url.
            repo = Repo(local_dir)
            repo.remotes.origin.fetch()
        else:
            # Fresh clone of the repository
            repo = Repo.clone_from(git_url, local_dir)

        # Checkout the specific commit
        repo.git.checkout(commit_hash)

        print(f"Successfully cloned repository at commit {commit_hash}")

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        # Propagate the failure instead of silently continuing.
        raise

In [4]:
# Clone the Clay repository into ./clay_assets and check out the pinned commit.
clone_repo_at_commit(git_url=URL, local_dir="./clay_assets", commit_hash=SHA)

Successfully cloned repository at commit 32518ce


#### Download model checkpoint from HuggingFace

See the Clay Hugging Face repository: https://huggingface.co/made-with-clay/Clay

In [5]:
import os
# Directory that receives the downloaded checkpoint. It sits under ./clay_assets,
# which the Dockerfile below copies into the image.
artifact_dir="./clay_assets/checkpoints/"
os.makedirs(artifact_dir,exist_ok=True)

In [6]:
# Download URL of the clay-v1-base.ckpt checkpoint file in the Clay Hugging Face repository.
hf_ckpt_path = "https://huggingface.co/made-with-clay/Clay/resolve/main/v1/clay-v1-base.ckpt"

In [7]:
!wget --quiet -P {artifact_dir} {hf_ckpt_path}

#### Write the Dockerfile

In [8]:
%%writefile Dockerfile

# Build from the SageMaker Distribution image: https://gallery.ecr.aws/sagemaker/sagemaker-distribution
FROM public.ecr.aws/sagemaker/sagemaker-distribution:1.8.0-gpu

ARG NB_USER="sagemaker-user"
ARG NB_UID=1000
ARG NB_GID=100

ENV MAMBA_USER=$NB_USER

# Run the remaining build steps as root (apt-get and writes to /usr/local/bin need it).
# Spelled out explicitly rather than relying on an undefined $ROOT variable expanding to empty.
USER root

# Install system dependencies
RUN apt-get update && apt-get install -y \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Install micromamba
RUN wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba \
    && mv bin/micromamba /usr/local/bin/ \
    && rm -rf bin

# Set up Mamba environment
ENV MAMBA_ROOT_PREFIX=/opt/conda
ENV PATH=$MAMBA_ROOT_PREFIX/bin:$PATH

# Copy environment files (both must be present in the build context next to this Dockerfile)
COPY environment.yml /tmp/environment.yml
COPY requirements.txt /tmp/requirements.txt

# Create the environment non-interactively, install the pip requirements into it,
# and clean the package caches in the same layer
RUN micromamba create --yes -f /tmp/environment.yml && \
    micromamba run -n claymodel pip install -r /tmp/requirements.txt && \
    micromamba clean --all --yes

# Set environment variable for the environment name
ENV ENV_NAME=claymodel

# Set the default environment for inference
ENV SAGEMAKER_JOB_CONDA_ENV=claymodel

# Put the environment's bin directory first on PATH (key=value form; the
# space-separated ENV form is legacy syntax)
ENV PATH=/opt/conda/envs/$ENV_NAME/bin:$PATH

# Copy model files
COPY  clay_assets/ /home/sagemaker-user/clay-model

# Add healthcheck to verify code is running at default path
HEALTHCHECK --interval=30s --timeout=3s \
  CMD pgrep -f "python3 /opt/ml/processing/input/code/" || exit 1

# Run the container command through bash. No explicit environment activation happens
# here; the claymodel environment's bin directory is already first on PATH (set above).
ENTRYPOINT ["/bin/bash", "-c"]

Overwriting Dockerfile


#### Build and tag Docker image

In [9]:
import boto3
import sagemaker

# SageMaker session; its boto region is used as the ECR region
sagemaker_session = sagemaker.Session()
ECR_REGION = sagemaker_session.boto_region_name

# AWS account ID of the calling identity, looked up through STS
sts_client = boto3.client('sts')
ECR_ACCOUNT_ID = sts_client.get_caller_identity()["Account"]

# ECR repository name and the "name:tag" reference used for build, tag and push
REPO_NAME = "clay-gpu-container-new"
IMG_NAME = f"{REPO_NAME}:latest"

# Echo the resolved settings, one per line
print(
    f"Region: {ECR_REGION}\n"
    f"Account Number: {ECR_ACCOUNT_ID}\n"
    f"ECR Repository Name: {REPO_NAME}\n"
    f"Image Name: {IMG_NAME}"
)



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


Region: us-west-2
Account Number: 785721480234
ECR Repository Name: clay-gpu-container-new
Image Name: clay-gpu-container-new:latest


In [10]:
# Log the local Docker client in to this account's ECR registry; the password from the AWS CLI is passed via stdin.
!aws ecr get-login-password --region {ECR_REGION} | docker login --username AWS --password-stdin {ECR_ACCOUNT_ID}.dkr.ecr.{ECR_REGION}.amazonaws.com

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [24]:
# Build the image from ./Dockerfile using the current directory as build context; --quiet prints only the image ID.
!docker build --quiet -f Dockerfile -t {IMG_NAME} .

sha256:bb6d5e04b0119ce36bafee8bbee5abe1720f6034f54ad1b95d52936f8ffea71d


In [25]:
# Add the fully qualified ECR registry name as a tag so the image can be pushed there.
!docker tag {IMG_NAME} {ECR_ACCOUNT_ID}.dkr.ecr.{ECR_REGION}.amazonaws.com/{IMG_NAME}

#### Push to ECR

Ensure that the ECR repository exists. Create it if it does not.

In [26]:
def ensure_ecr_repository(repository_name, region=ECR_REGION):
    """
    Return the details of an ECR repository, creating the repository first
    when it does not exist yet.

    Args:
        repository_name (str): Name of the ECR repository
        region (str, optional): AWS region handed to the boto3 ECR client.
            Defaults to ECR_REGION.

    Returns:
        dict: Repository details as returned by ECR (the existing repository's
        description, or the description of the newly created one)

    Raises:
        Exception: Any error is printed and then re-raised unchanged.
    """
    try:
        client = boto3.client('ecr', region_name=region)

        try:
            # Look the repository up by name; ECR signals a missing one with an exception
            described = client.describe_repositories(
                repositoryNames=[repository_name]
            )
        except client.exceptions.RepositoryNotFoundException:
            # Not there yet: create it with scan-on-push and AES256 encryption
            print(f"Creating repository '{repository_name}'...")
            created = client.create_repository(
                repositoryName=repository_name,
                imageScanningConfiguration={'scanOnPush': True},
                encryptionConfiguration={'encryptionType': 'AES256'}
            )
            print(f"Repository '{repository_name}' created successfully")
            return created['repository']

        print(f"Repository '{repository_name}' already exists")
        return described['repositories'][0]

    except Exception as err:
        print(f"Error managing ECR repository: {str(err)}")
        raise

In [27]:
# Make sure the target ECR repository exists before pushing.
try:
    repository = ensure_ecr_repository(REPO_NAME)
    print(f"Repository URI: {repository['repositoryUri']}")
except Exception as e:
    print(f"Failed to ensure repository exists: {str(e)}")
    # Re-raise so a "Run All" stops here instead of going on to push to a
    # repository that could not be verified or created.
    raise

Repository 'clay-gpu-container-new' already exists
Repository URI: 785721480234.dkr.ecr.us-west-2.amazonaws.com/clay-gpu-container-new


In [28]:
# Push the image to ECR under its fully qualified registry name.
!docker push {ECR_ACCOUNT_ID}.dkr.ecr.{ECR_REGION}.amazonaws.com/{IMG_NAME}

The push refers to repository [785721480234.dkr.ecr.us-west-2.amazonaws.com/clay-gpu-container-new]

[1B6e7ec919: Preparing 
[1Bd80307d3: Preparing 
[1B5057b437: Preparing 
[1Bc53c0e14: Preparing 
[1B27eabf42: Preparing 
[1Bc730229e: Preparing 
[1B26c0cb1e: Preparing 
[1Bc997ce61: Preparing 
[1B6e9815a6: Preparing 
[1B3aaf6c2e: Preparing 
[1B8bf32273: Preparing 
[1B00563650: Preparing 
[1Beb4c8f45: Preparing 
[1Bc8626fed: Preparing 
[1B3e97fc3c: Preparing 
[1B506e9e5b: Preparing 
[1Bdc818c02: Preparing 
[1B1c0396e6: Preparing 
[1B65b2433b: Preparing 
[1B9a8dd506: Preparing 
[1B0f0de58e: Preparing 
[1B79ea90a8: Preparing 
[1Ba4d57ea2: Preparing 
[1B90b4dbd4: Preparing 
[1Bd878abc1: Preparing 
[1B139153e9: Preparing 
[1Bd77f74a6: Preparing 
[1B6b6c1568: Preparing 
[1Bea765464: Preparing 
[1B90a66736: Preparing 
[1B0967d0a5: Preparing 
[1Bbbffc6af: Preparing 
[1Bbd6fa39e: Preparing 
[1B6fef1257: Preparing 
[1Bcc5f894c: Preparing 
[1B5c43fa03: Preparing 

#### Pull from ECR (required to work with SageMaker notebooks in Local Mode)

In [29]:
# Pull the image back from ECR under its fully qualified registry name.
!docker pull {ECR_ACCOUNT_ID}.dkr.ecr.{ECR_REGION}.amazonaws.com/{IMG_NAME}

latest: Pulling from clay-gpu-container-new
Digest: sha256:11097cd852899a6e917dae99c6819151c11c66301fe4ba678cf0e3d61a5aa7a7
Status: Image is up to date for 785721480234.dkr.ecr.us-west-2.amazonaws.com/clay-gpu-container-new:latest
785721480234.dkr.ecr.us-west-2.amazonaws.com/clay-gpu-container-new:latest
