Merge pull request #1 from c0sogi/temp-branch
Dependency solution
c0sogi committed Jul 31, 2023
2 parents 5f38c79 + 6a02465 commit 6cff18a
Showing 38 changed files with 5,008 additions and 686 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,53 @@
name: Continuous Integration

on:
push:
branches:
- master
pull_request:
branches:
- master

jobs:
build-and-test:
name: Build and Test
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ['3.8', '3.9', '3.10', '3.11']

steps:
- name: Check out code
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}

- name: Setup Python, install dependencies, and run tests
run: |
python -m pip install --upgrade pip
python -m llama_api.server.app_settings --install-pkgs
python -m unittest discover tests
build-release:
if: github.event_name == 'push' && success()
needs: build-and-test
name: Create Release
runs-on: ubuntu-latest
steps:
- name: Shorten SHA
id: shorten_sha
run: echo "::set-output name=sha::$(echo ${GITHUB_SHA:0:7})"
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.shorten_sha.outputs.sha }}
release_name: master-${{ steps.shorten_sha.outputs.sha }}
draft: false
prerelease: false
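For reference, the test step above relies on plain unittest discovery (python -m unittest discover tests) after installing dependencies with python -m llama_api.server.app_settings --install-pkgs. Below is a minimal sketch of the kind of test module that discovery would pick up; the file name tests/test_smoke.py and its contents are illustrative and not part of this commit.

# tests/test_smoke.py -- hypothetical module for `python -m unittest discover tests`
import unittest


class TestSmoke(unittest.TestCase):
    def test_llama_api_is_importable(self) -> None:
        # The CI job installs dependencies first, so importing the package
        # in this repository should succeed on every OS/Python combination
        # in the matrix above.
        import llama_api  # noqa: F401

        self.assertTrue(hasattr(llama_api, "__name__"))


if __name__ == "__main__":
    unittest.main()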
9 changes: 5 additions & 4 deletions .gitignore
@@ -2,9 +2,10 @@ models/ggml/*
models/gptq/*
!models/ggml/llama_cpp_models_here.txt
!models/gptq/exllama_models_here.txt
repositories/
*.log
.venv
.vscode
*.pyc
PRIVATE_*
repositories
.venv/
.vscode/
.test-venv/
PRIVATE_*
22 changes: 13 additions & 9 deletions Dockerfile
@@ -1,18 +1,21 @@
### Dockerfile for Python 3.11.4 & CUDA 12.1.1 & Ubuntu 22.04
### Approximately 7 ~ 10 minutes to build
### Approximately 5 ~ 10 minutes to build

# Select the required CUDA version.
# Select the required CUDA version.
ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
FROM nvidia/cuda:${CUDA_IMAGE}
ENV PYTHON_VERSION="3.11.4"
ENV PYTHON_VERSION_SHORT="3.11"
ENV HOST 0.0.0.0
ENV PORT=8000

# Copy the necessary files.
COPY requirements.txt /tmp/requirements.txt
# Copy the necessary files.
COPY requirements.txt /app/requirements.txt
COPY pyproject.toml /app/pyproject.toml
COPY llama_api /app/llama_api

# Install the dependencies needed for the Python installation, then install and configure Python.
# Install the necessary applications, and then install Python.
# Then, install the necessary Python packages(Dependencies).
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
zlib1g-dev \
@@ -34,11 +37,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& update-alternatives --install /usr/bin/python python /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
&& update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
&& python3 -m pip install --upgrade pip \
&& pip install --no-cache-dir -r /tmp/requirements.txt \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
&& rm -rf /tmp/*
&& rm -rf /tmp/* \
&& cd /app \
&& python3 -m llama_api.server.app_settings --force-cuda --install-pkgs

# Set the working directory and run the server.
# Set the working directory and start the server.
WORKDIR /app
ENTRYPOINT [ "python3", "-m", "main" "--port", "${PORT}" ]
ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ]
9 changes: 9 additions & 0 deletions LICENSE.md
@@ -0,0 +1,9 @@
MIT License

Copyright (c) 2023 Andrei Betlen

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 changes: 26 additions & 4 deletions docker-compose.yml
@@ -2,9 +2,7 @@ version: '3'

services:
llama-api:
build:
context: .
dockerfile: Dockerfile
image: cosogi/llama-api:230730
entrypoint: ["python3", "-m", "main", "--port", "8000"]
environment:
- MAX_WORKERS=1
@@ -14,11 +12,35 @@ services:
- ./model_definitions.py:/app/model_definitions.py
- ./main.py:/app/main.py
- ./requirements.txt:/app/requirements.txt
- ./pyproject.toml:/app/pyproject.toml
ports:
- 8000:8000
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
capabilities: [gpu]

# services:
# llama-api:
# build:
# context: .
# dockerfile: Dockerfile
# entrypoint: ["python3", "-m", "main", "--port", "8000"]
# environment:
# - MAX_WORKERS=1
# volumes:
# - ./models:/app/models
# - ./llama_api:/app/llama_api
# - ./model_definitions.py:/app/model_definitions.py
# - ./main.py:/app/main.py
# - ./requirements.txt:/app/requirements.txt
# ports:
# - 8000:8000
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# capabilities: [gpu]
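Once the cosogi/llama-api:230730 service above is up, it publishes port 8000. A hedged client sketch using only the standard library; the /v1/chat/completions route and the "my-model" key are assumptions, since neither the API surface nor model_definitions.py appears in this diff.

import json
from urllib import request

# Assumed OpenAI-style route and a hypothetical model key -- adjust both to
# whatever model_definitions.py actually declares.
payload = {
    "model": "my-model",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 32,
}
req = request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with request.urlopen(req) as resp:
    body = json.loads(resp.read())
    print(body["choices"][0]["message"]["content"])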
9 changes: 5 additions & 4 deletions llama_api/mixins/prompt_utils.py
@@ -1,3 +1,4 @@
from typing import List
from ..schemas.api import APIChatMessage, TextGenerationSettings


@@ -9,7 +10,7 @@ class PromptUtilsMixin:
ai_fallback_input_role: str = "Assistant"

@staticmethod
def get_stop_strings(*roles: str) -> list[str]:
def get_stop_strings(*roles: str) -> List[str]:
"""A helper method to generate stop strings for a given set of roles.
Stop strings are required to stop text completion API from generating
text that does not belong to the current chat turn.
@@ -33,7 +34,7 @@ def get_stop_strings(*roles: str) -> List[str]:

@classmethod
def convert_messages_into_prompt(
cls, messages: list[APIChatMessage], settings: TextGenerationSettings
cls, messages: List[APIChatMessage], settings: TextGenerationSettings
) -> str:
"""A helper method to convert list of messages into one text prompt."""

@@ -48,7 +49,7 @@ def convert_messages_into_prompt(
input_role = ai_input_role = message.role
chat_history += f"### {input_role}:{message.content}"

prompt_stop: list[str] = cls.get_stop_strings(
prompt_stop: List[str] = cls.get_stop_strings(
cls.user_input_role, cls.system_input_role, ai_input_role
)
if isinstance(settings.stop, str):
@@ -61,7 +62,7 @@ def is_possible_to_generate_stops(

@staticmethod
def is_possible_to_generate_stops(
decoded_text: str, stops: list[str]
decoded_text: str, stops: List[str]
) -> bool:
"""A helper method to check if
the decoded text contains any of the stop tokens."""
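The substantive change in this file is replacing built-in generic annotations such as list[str] with typing.List. That matters because the CI matrix added above includes Python 3.8, where subscripting the built-in list raises TypeError (PEP 585 generics need 3.9+), so the old annotations break as soon as the module is imported. A self-contained sketch of the difference; the "### {role}:" stop-string format mirrors the prompt prefix used in convert_messages_into_prompt, but the helper itself is hypothetical.

import sys
from typing import List


def stop_strings(*roles: str) -> List[str]:  # valid on Python 3.8 through 3.11
    # Hypothetical variant, only here to give the annotation something to annotate.
    return [f"### {role}:" for role in roles]


print(stop_strings("User", "System", "Assistant"))

if sys.version_info >= (3, 9):
    print(list[str])  # PEP 585 generic alias, fine on 3.9+
else:
    try:
        list[str]  # evaluating this is what breaks `-> list[str]` on 3.8
    except TypeError as exc:
        print("Python", ".".join(map(str, sys.version_info[:2])), "->", exc)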
13 changes: 6 additions & 7 deletions llama_api/modules/base.py
@@ -1,9 +1,8 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Iterator, TypeVar
from typing import Any, Iterator, List, TypeVar

from ..mixins.prompt_utils import PromptUtilsMixin
from ..mixins.waiter import WaiterMixin
from ..schemas.api import (
APIChatMessage,
ChatCompletion,
@@ -24,7 +23,7 @@ class BaseLLMModel:
max_total_tokens: int = 2048


class BaseCompletionGenerator(ABC, PromptUtilsMixin, WaiterMixin):
class BaseCompletionGenerator(ABC, PromptUtilsMixin):
"""Base class for all completion generators."""

@abstractmethod
@@ -57,14 +56,14 @@ def generate_completion_with_streaming(

@abstractmethod
def generate_chat_completion(
self, messages: list[APIChatMessage], settings: TextGenerationSettings
self, messages: List[APIChatMessage], settings: TextGenerationSettings
) -> ChatCompletion:
"""Generate a completion for a given prompt."""
...

@abstractmethod
def generate_chat_completion_with_streaming(
self, messages: list[APIChatMessage], settings: TextGenerationSettings
self, messages: List[APIChatMessage], settings: TextGenerationSettings
) -> Iterator[ChatCompletionChunk]:
"""Generate a completion for a given prompt,
yielding chunks of text as they are generated."""
@@ -92,9 +91,9 @@ def from_pretrained(cls, model_name: str) -> "BaseEmbeddingGenerator":
@abstractmethod
def generate_embeddings(
self,
texts: list[str],
texts: List[str],
**kwargs: Any,
) -> list[list[float]]:
) -> List[List[float]]:
"""Generate embeddings for a list of texts."""
...

45 changes: 26 additions & 19 deletions llama_api/modules/exllama.py
@@ -1,12 +1,9 @@
"""Wrapper for exllama to generate text completions."""

# flake8: noqa

from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, Iterator, Optional
from typing import TYPE_CHECKING, Dict, Iterator, List, Optional

from torch import IntTensor, cuda
from torch import IntTensor, Tensor, cuda

from ..schemas.models import ExllamaModel
from ..utils.completions import (
@@ -15,8 +12,9 @@
make_completion,
make_completion_chunk,
)
from ..utils.dependency import import_repository
from ..utils.logger import ApiLogger
from ..utils.path import import_repository, resolve_model_path_to_posix
from ..utils.path import resolve_model_path_to_posix
from .base import BaseCompletionGenerator

with import_repository(
@@ -41,6 +39,15 @@
assert cuda.is_available(), "CUDA must be available to use ExLlama."


def _encode(tokenizer: ExLlamaTokenizer, text: str) -> Tensor:
"""Encode a text string into a tensor."""
result = tokenizer.encode(text)
if isinstance(result, tuple):
return result[0]
else:
return result


def _make_config(llm_model: "ExllamaModel") -> ExLlamaConfig:
"""Create a config object for the ExLlama model."""
model_folder_path = Path(
@@ -52,7 +59,7 @@ def _make_config(llm_model: "ExllamaModel") -> ExLlamaConfig:
config = ExLlamaConfig((model_folder_path / "config.json").as_posix())

# Find the model checkpoint
model_file_found: list[Path] = []
model_file_found: List[Path] = []
for ext in (".safetensors", ".pt", ".bin"):
model_file_found.extend(model_folder_path.glob(f"*{ext}"))
if model_file_found:
@@ -107,7 +114,7 @@ class ExllamaCompletionGenerator(BaseCompletionGenerator):
tokenizer: Optional[ExLlamaTokenizer] = None
generator: Optional[ExLlamaGenerator] = None
_llm_model: Optional["ExllamaModel"] = None
_completion_status: dict[
_completion_status: Dict[
str, int
] = {} # key: completion_id, value: number of completion tokens

@@ -272,7 +279,7 @@ def generate_completion_with_streaming(
text=last_token if last_token is not None else "",
finish_reason="length"
if self._completion_status.get(
completion_id, self.tokenizer.encode(generated_text).shape[1]
completion_id, _encode(self.tokenizer, generated_text).shape[1]
)
>= settings.max_tokens
else "stop",
@@ -284,9 +291,9 @@ def generate_completion(
assert self.tokenizer is not None and self.config is not None
completion_id: str = settings.completion_id
generated_text: str = self._generate_text(prompt, settings=settings)
n_prompt_tokens: int = self.tokenizer.encode(prompt).shape[1]
n_prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1]
n_completion_tokens: int = self._completion_status.get(
completion_id, self.tokenizer.encode(generated_text).shape[1]
completion_id, _encode(self.tokenizer, generated_text).shape[1]
)
return make_completion(
id=completion_id,
@@ -301,7 +308,7 @@ def generate_completion(

def generate_chat_completion_with_streaming(
self,
messages: list["APIChatMessage"],
messages: List["APIChatMessage"],
settings: "TextGenerationSettings",
) -> Iterator["ChatCompletionChunk"]:
assert self.config is not None and self.tokenizer is not None
@@ -328,23 +335,23 @@ def generate_chat_completion_with_streaming(
content=last_token if last_token is not None else "",
finish_reason="length"
if self._completion_status.get(
completion_id, self.tokenizer.encode(generated_text).shape[1]
completion_id, _encode(self.tokenizer, generated_text).shape[1]
)
else "stop",
)

def generate_chat_completion(
self,
messages: list["APIChatMessage"],
messages: List["APIChatMessage"],
settings: "TextGenerationSettings",
) -> "ChatCompletion":
assert self.tokenizer is not None and self.config is not None
completion_id: str = settings.completion_id
prompt = self.convert_messages_into_prompt(messages, settings=settings)
generated_text: str = self._generate_text(prompt, settings=settings)
prompt_tokens: int = self.tokenizer.encode(prompt).shape[1]
prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1]
completion_tokens: int = self._completion_status.get(
completion_id, self.tokenizer.encode(generated_text).shape[1]
completion_id, _encode(self.tokenizer, generated_text).shape[1]
)
return make_chat_completion(
id=completion_id,
@@ -357,10 +364,10 @@ def generate_chat_completion(
else "stop",
)

def encode(self, message: str, /) -> list[int]:
def encode(self, message: str, /) -> List[int]:
assert self.tokenizer is not None, "Tokenizer is not initialized"
return self.tokenizer.encode(message).flatten().tolist()
return _encode(self.tokenizer, message).flatten().tolist()

def decode(self, tokens: list[int], /) -> str:
def decode(self, tokens: List[int], /) -> str:
assert self.tokenizer is not None, "Tokenizer is not initialized"
return str(self.tokenizer.decode(IntTensor(tokens)))
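The new _encode helper above exists because, as the added code shows, ExLlamaTokenizer.encode may return either a tensor or a tuple whose first element is the tensor, while every token count in this module reads shape[1]. A small illustrative sketch of that normalization pattern using plain torch tensors (no ExLlama objects involved):

from typing import Tuple, Union

import torch


def token_count(encoded: Union[torch.Tensor, Tuple[torch.Tensor, ...]]) -> int:
    # Same normalization as _encode: unwrap a tuple if needed, then read the
    # sequence length from shape (batch, sequence_length).
    ids = encoded[0] if isinstance(encoded, tuple) else encoded
    return int(ids.shape[1])


ids = torch.zeros((1, 5))         # stand-in for a real encode() result
print(token_count(ids))           # 5
print(token_count((ids, ids)))    # 5; the tuple form is unwrapped the same way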