Skip to content

Commit

Permalink
3 test workflow (#4)
Browse files Browse the repository at this point in the history
* build: support docker builds for both cuda and cpu-only pytorch

* build: add python pipeline to make deps command

* refactor: python item parser to support arbitrary files

* feat: add script to download model files

* refactor: clean up old requirements

* ci: add build and test workflow

* fix: sync python lock file

* ci: skip test environment tear down

* ci: fix docker commands for test setup/teardown to work in a headless environment

* fix: run docker python tasks inside venv to prevent lambda base image dependency conflicts

* test: add small delay to test environment setup for CI

* test: increase delay for test env setup for CI

* fix: database migration run scripts argument handling

* test: disable approximate vector search in testing only

* test: display full test logs

* test: remove variable cursor from paginated item recommendations test
  • Loading branch information
ae9is committed May 29, 2024
1 parent 3df418c commit e063f7e
Show file tree
Hide file tree
Showing 21 changed files with 925 additions and 621 deletions.
3 changes: 3 additions & 0 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ MODEL_API_URL=http://localhost:5000

PYTHON_ENV=development
CUDA_VISIBLE_DEVICES=0
TORCH_VERSION=cpu

MODEL_DIR=data/models/blair-roberta-base
MODEL_NAME=hyp1231/blair-roberta-base

ITEM_META_FILE=data/import/meta_Musical_Instruments.jsonl
REVIEWS_FILE=data/import/Musical_Instruments.jsonl
88 changes: 88 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
name: main

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  # Manual run from GitHub UI
  workflow_dispatch:
  # Wednesdays at 0400
  # schedule:
  #   - cron: '0 4 * * 3'

jobs:
  build-and-test:
    timeout-minutes: 10
    runs-on: ubuntu-latest
    strategy:
      matrix:
        java-version: [17]
    env:
      ENV_FILE: .env
      # Fallback values only — expected to be overwritten by .env below
      TORCH_VERSION: cpu
      MODEL_NAME: hyp1231/blair-roberta-base
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Export every KEY=VALUE pair from .env into the job environment
      - name: Load environment variables
        run: cat ${{ env.ENV_FILE }} >> $GITHUB_ENV

      # v4 for consistency with the other actions here; v2 is deprecated
      # (runs on an end-of-life Node version)
      - name: Use Java ${{ matrix.java-version }}
        uses: actions/setup-java@v4
        with:
          java-version: ${{ matrix.java-version }}
          cache: 'gradle'
          distribution: 'liberica'

      - name: Setup pdm
        uses: pdm-project/setup-pdm@v4
      - name: Restore cached venv
        id: cache-venv-restore
        uses: actions/cache/restore@v4
        with:
          path: |
            .venv
          key: venv-${{ runner.os }}-${{ hashFiles('pdm.lock') }}
          restore-keys: |
            venv-${{ runner.os }}-
            venv-
      # Always run install: a restore-keys partial hit leaves the venv stale
      - name: Install dependencies
        run: |
          pdm install
          pdm install-${{ env.TORCH_VERSION }}
      # Skip the (no-op, warning-emitting) save when the exact key was restored
      - name: Save venv to cache
        id: cache-venv-save
        if: steps.cache-venv-restore.outputs.cache-hit != 'true'
        uses: actions/cache/save@v4
        with:
          path: |
            .venv
          key: ${{ steps.cache-venv-restore.outputs.cache-primary-key }}

      - name: Restore cached model files
        id: cache-model-restore
        uses: actions/cache/restore@v4
        with:
          path: |
            data/models
          key: models-${{ env.MODEL_NAME }}
      # No restore-keys above, so cache-hit == 'true' means an exact match
      # and the download can be skipped entirely
      - name: Download model files
        if: steps.cache-model-restore.outputs.cache-hit != 'true'
        run: make get-model
      - name: Save model files to cache
        id: cache-model-save
        if: steps.cache-model-restore.outputs.cache-hit != 'true'
        uses: actions/cache/save@v4
        with:
          path: |
            data/models
          key: ${{ steps.cache-model-restore.outputs.cache-primary-key }}

      - name: Install
        run: make deps

      - name: Build
        run: make docker-build

      - name: Test
        run: make test-ci
11 changes: 7 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
FROM public.ecr.aws/lambda/python:3.12 as build
# No python 3.12 yet in AL2023, so we use the Lambda team's image for now.
# ref: https://github.com/amazonlinux/amazon-linux-2023/issues/483
# However, note to actually run on Lambda would need to swap out GPU torch for CPU-only in pyproject.toml.
# To actually run on Lambda or other platforms without CUDA, swap out GPU torch for CPU-only.
#FROM public.ecr.aws/amazonlinux/amazonlinux:2023 as build

ENV TASK_ROOT=/var/task

# Non-root user and group (only with AL2023 not Lambda base images)
#RUN dnf install -y shadow-utils
#RUN groupadd -g 888 python && useradd -r -u 888 -g python python
#ENV TASK_ROOT=/var/task
#RUN mkdir -p "${TASK_ROOT}"
#RUN chown python:python "${TASK_ROOT}"
#WORKDIR "${TASK_ROOT}"
Expand All @@ -18,6 +19,7 @@ FROM public.ecr.aws/lambda/python:3.12 as build
#USER 888
RUN python3.12 -m venv "${TASK_ROOT}"
ENV PATH="${TASK_ROOT}/bin:${PATH}"
RUN source "${TASK_ROOT}/bin/activate"
RUN python3.12 -m ensurepip
RUN python3.12 -m pip install --no-cache-dir --disable-pip-version-check -U gunicorn uvicorn[standard]

Expand All @@ -27,8 +29,9 @@ ARG MODEL_DIR=${MODEL_DIR:-data/models/blair-roberta-base}
COPY --chown=python:python "${MODEL_DIR}"/* ./amazonrev/model/

# Project dependencies
COPY --chown=python:python requirements.prod.txt ./
RUN python3.12 -m pip install --no-cache-dir --disable-pip-version-check -U -r requirements.prod.txt
ARG TORCH_VERSION=${TORCH_VERSION:-cpu}
COPY --chown=python:python requirements.prod.${TORCH_VERSION}.txt ./
RUN python3.12 -m pip install --no-cache-dir --disable-pip-version-check -U -r requirements.prod.${TORCH_VERSION}.txt

# Copy project source
COPY --chown=python:python src/main/python/amazonrev/*.py ./amazonrev/
Expand Down
25 changes: 20 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,28 @@ TESTDB_PROCESS_ID = $(shell docker ps | grep reviews-pg-test | awk '{print $$1}'
clean:
${GRADLE} clean

# A small delay between bringing up docker compose services and executing the database scripts
# is needed for CI workflow. (Docker compose exits before the containers are fully up.)
test-env-up:
docker compose -f docker-compose-test.yml up -d
sleep 5
bash docker-db-up.sh reviews-pg-test 1 1

test-env-down:
bash docker-db-down.sh reviews-pg-test 1 1
docker compose -f docker-compose-test.yml down

test-py:
test-py: test-env-up
MODEL_API_URL=http://localhost:5001 pdm test

test-java:
test-java: test-env-up
SPRING_DATASOURCE_URL=jdbc:postgresql://localhost:5433/reviews MODEL_API_URL=http://localhost:5001 ${GRADLE} test --rerun-tasks

test: test-env-up test-java test-py test-env-down
test-ci: test-java test-py

# In make v4.4+ can just replace this with .WAIT
test: test-env-up test-ci WAIT test-env-down
WAIT: test-java test-py

build:
${GRADLE} build
Expand All @@ -32,17 +39,25 @@ run:

deps:
${GRADLE} dependencies
pdm install

get-model:
pdm get-model

parse:
${GRADLE} runParser

embeddings:
pdm parser

docker-build:
docker-build: docker-build-java docker-build-py

docker-build-java:
${GRADLE} bootBuildImage --imageName=${NAME}/graphql-api

docker-build-py:
printf "PYTHON_ENV=${PYTHON_ENV}\nMODEL_DIR=./model\n" > .env.dockerfile
docker build -t ${NAME}/model-api --build-arg MODEL_DIR=${MODEL_DIR} -f Dockerfile .
docker build -t ${NAME}/model-api --build-arg MODEL_DIR=${MODEL_DIR} --build-arg TORCH_VERSION=${TORCH_VERSION} -f Dockerfile .

docker-login:
aws ecr get-login-password --region ${AWS_REGION} --profile ${AWS_PROFILE} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ Open http://localhost:8080/graphiql?path=/graphql

<img src="img/screenshot-reviews.png" width=400 />

### GPU-accelerated containers

To run the Python API inside a Docker container with CUDA enabled in PyTorch, the container host must first set up Docker with the NVIDIA Container Toolkit.

See: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html

## Test

Build the Python API Docker image first using:
Expand Down
5 changes: 5 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,8 @@ dependencies {
tasks.named('test') {
useJUnitPlatform()
}

test {
// ref: https://docs.gradle.org/current/dsl/org.gradle.api.tasks.testing.logging.TestLogging.html
testLogging.showStandardStreams = true
}
2 changes: 2 additions & 0 deletions data/models/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ See:
- [BLaIR-roberta-large](https://huggingface.co/hyp1231/blair-roberta-large)

Set environment variable `MODEL_DIR` appropriately in `.env`.

Note: we bypass the default Hugging Face cache directory (`~/.cache/huggingface/hub`) to make it easier to bundle the model files into the Python API Docker image.
6 changes: 3 additions & 3 deletions docker-db-down.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# Export and drop database tables.
# Executes against a running docker postgres container.
container="${1:-postgres}"
with_test_data="${2:-0}"
force="${3:-0}"
with_test_data=$2
force=$3
DB_PROCESS_ID=`docker ps | grep "${container}" | awk '{print $1}' | head -n 1`

if [ "${force}" ]; then
Expand All @@ -30,7 +30,7 @@ started=`date`
for scriptpath in `ls -r migrations/*.down.sql`; do
script=`basename "${scriptpath}"`
echo "Running script: ${script} at `date`..."
docker exec --user postgres -it ${DB_PROCESS_ID} psql -d ${POSTGRES_DB} -f "/export/${script}"
docker exec --user postgres ${DB_PROCESS_ID} psql -d ${POSTGRES_DB} -f "/export/${script}"
done
stopped=`date`
echo "Started at: ${started}"
Expand Down
10 changes: 7 additions & 3 deletions docker-db-up.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
# Executes against a running docker postgres container.
# Make sure to generate loadable data files first by running `make parse`.
container="${1:-postgres}"
with_test_data="${2:-0}"
force="${3:-0}"
with_test_data=$2
force=$3
DB_PROCESS_ID=`docker ps | grep "${container}" | awk '{print $1}' | head -n 1`

if [ "${force}" ]; then
Expand All @@ -28,6 +28,10 @@ fi

# Make sure to fix permissions on your mounted volume if docker is run as root
cp migrations/*.sql "${export_dir}" || { echo "Cannot copy new migrations to docker mount point, quitting!"; exit 1; }
if [ "${with_test_data}" ]; then
# Handle test-only database creation differences
rename --force 's/.test.sql/.sql/' "${export_dir}"/*.test.sql
fi

if [ "${force}" ]; then
wipe=1
Expand All @@ -44,7 +48,7 @@ started=`date`
for scriptpath in migrations/*.up.sql; do
script=`basename "${scriptpath}"`
echo "Running script: ${script} at `date`..."
docker exec --user postgres -it ${DB_PROCESS_ID} psql -d ${POSTGRES_DB} -f "/export/${script}"
docker exec --user postgres ${DB_PROCESS_ID} psql -d ${POSTGRES_DB} -f "/export/${script}"
done
stopped=`date`
echo "Started at: ${started}"
Expand Down
13 changes: 13 additions & 0 deletions migrations/016_create_item_embed.up.test.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
-- Test-only variant of the item_embed migration: the *.test.sql suffix is
-- renamed over the regular *.sql file by docker-db-up.sh during test setup.
-- Uses postgres extension pgvector: https://github.com/pgvector/pgvector
-- Hidden size 768 pre-defined when training model: models/blair-roberta-base/config.json
CREATE EXTENSION IF NOT EXISTS vector;
CREATE TABLE IF NOT EXISTS item_embed (
  item_id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
  title vector(768)
);
-- Bulk-load the exported embeddings. Delimiter/quote are the non-printing
-- ASCII record/unit separator bytes (\x1e/\x1f) — presumably chosen so they
-- cannot appear inside title text; confirm against the exporter. HEADER
-- skips the first row of the file.
COPY item_embed FROM '/export/item_embed.csv' WITH CSV DELIMITER E'\x1e' QUOTE E'\x1f' NULL AS '' HEADER;
-- Disable approximate search for testing:
--CREATE INDEX ON item_embed USING hnsw (
--  title vector_ip_ops
--);
--ANALYZE item_embed;
2 changes: 2 additions & 0 deletions migrations/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@ The export+drop and create+load steps have been combined here for convenience, a
Foreign key constraints are used, along with bigint primary keys (instead of uuid/varchar). This could be changed to support sharding or to ease complex migrations.

If you prefer, you may use the migrate tool instead of the shell scripts: https://github.com/golang-migrate/migrate

In [docker-db-up.sh](/docker-db-up.sh), *.test.sql versions of files replace their *.sql counterparts for testing setup only.
Loading

0 comments on commit e063f7e

Please sign in to comment.