Skip to content

Commit

Permalink
The tokenizer model is downloaded in the Dockerfile to /tram/data, which
is not a convenient location for developer environments. Move it to the
relative path data/ml-models, which is where all the other model files
are stored.
Browse files Browse the repository at this point in the history
  • Loading branch information
mehaase committed Feb 6, 2024
1 parent cc59b14 commit ead1eac
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 11 deletions.
20 changes: 10 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,15 @@ RUN --mount=type=cache,target=/var/cache/apt --mount=type=cache,target=/var/lib/

# Handle custom CA certificate, if specified.
RUN if test -n "${TRAM_CA_URL}" -a -n "${TRAM_CA_THUMBPRINT}" ; then \
echo "Installing certificate authority from ${TRAM_CA_URL}" && \
curl -sk "${TRAM_CA_URL}" -o /usr/local/share/ca-certificates/tram_ca.crt && \
DOWNLOAD_CA_THUMBPRINT=$(openssl x509 -in /usr/local/share/ca-certificates/tram_ca.crt -fingerprint -noout | cut -d= -f2) && \
if test "${DOWNLOAD_CA_THUMBPRINT}" = "${TRAM_CA_THUMBPRINT}"; then \
update-ca-certificates; \
else \
printf "\n=====\nERROR\nExpected thumbprint: %s\nActual thumbprint: %s\n=====\n" "${TRAM_CA_THUMBPRINT}" "${DOWNLOAD_CA_THUMBPRINT}"; \
exit 1; \
fi; \
echo "Installing certificate authority from ${TRAM_CA_URL}" && \
curl -sk "${TRAM_CA_URL}" -o /usr/local/share/ca-certificates/tram_ca.crt && \
DOWNLOAD_CA_THUMBPRINT=$(openssl x509 -in /usr/local/share/ca-certificates/tram_ca.crt -fingerprint -noout | cut -d= -f2) && \
if test "${DOWNLOAD_CA_THUMBPRINT}" = "${TRAM_CA_THUMBPRINT}"; then \
update-ca-certificates; \
else \
printf "\n=====\nERROR\nExpected thumbprint: %s\nActual thumbprint: %s\n=====\n" "${TRAM_CA_THUMBPRINT}" "${DOWNLOAD_CA_THUMBPRINT}"; \
exit 1; \
fi; \
fi

ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
Expand Down Expand Up @@ -110,7 +110,7 @@ RUN --mount=type=cache,target=/root/.cache \
curl -kJL -o ${bert_data_dir}/${bert_config_localfile} $bert_config_url

# run this command without cache volume mounted, so model is stored on image
RUN python3 -c "import os; import transformers; os.environ['CURL_CA_BUNDLE'] = ''; mdl = transformers.AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased'); mdl.save_pretrained('/tram/data/priv-allenai-scibert-scivocab-uncased')"
RUN python3 -c "import os; import transformers; os.environ['CURL_CA_BUNDLE'] = ''; mdl = transformers.AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased'); mdl.save_pretrained('/tram/data/ml-models/priv-allenai-scibert-scivocab-uncased')"

# Generate and Run Django migrations scripts, collectstatic app files
RUN tram makemigrations tram && \
Expand Down
4 changes: 3 additions & 1 deletion src/tram/ml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,9 @@ def predict_samples(self, samples: list[str]):
to that technique. The sum of each row will always be 1.
"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("/tram/data/priv-allenai-scibert-scivocab-uncased")
tokenizer = AutoTokenizer.from_pretrained(
settings.ML_MODEL_DIR + "/priv-allenai-scibert-scivocab-uncased"
)
bert = (
BertForSequenceClassification.from_pretrained(
settings.ML_MODEL_DIR + "/bert_model"
Expand Down

0 comments on commit ead1eac

Please sign in to comment.