Merge pull request #1 from c0sogi/temp-branch
Dependency solution
c0sogi committed Jul 31, 2023
2 parents 5f38c79 + 6a02465 commit 6cff18a
Showing 38 changed files with 5,008 additions and 686 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,53 @@
name: Continuous Integration

on:
push:
branches:
- master
pull_request:
branches:
- master

jobs:
build-and-test:
name: Build and Test
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ['3.8', '3.9', '3.10', '3.11']

steps:
- name: Check out code
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}

- name: Setup Python, install dependencies, and run tests
run: |
python -m pip install --upgrade pip
python -m llama_api.server.app_settings --install-pkgs
python -m unittest discover tests
build-release:
if: github.event_name == 'push' && success()
needs: build-and-test
name: Create Release
runs-on: ubuntu-latest
steps:
- name: Shorten SHA
id: shorten_sha
run: echo "::set-output name=sha::$(echo ${GITHUB_SHA:0:7})"
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.shorten_sha.outputs.sha }}
release_name: master-${{ steps.shorten_sha.outputs.sha }}
draft: false
prerelease: false
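For reference, the test step above relies on plain unittest discovery (python -m unittest discover tests) after installing dependencies with python -m llama_api.server.app_settings --install-pkgs. Below is a minimal sketch of the kind of test module that discovery would pick up; the file name tests/test_smoke.py and its contents are illustrative and not part of this commit.

# tests/test_smoke.py -- hypothetical module for `python -m unittest discover tests`
import unittest


class TestSmoke(unittest.TestCase):
    def test_llama_api_is_importable(self) -> None:
        # The CI job installs dependencies first, so importing the package
        # in this repository should succeed on every OS/Python combination
        # in the matrix above.
        import llama_api  # noqa: F401

        self.assertTrue(hasattr(llama_api, "__name__"))


if __name__ == "__main__":
    unittest.main()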
9 changes: 5 additions & 4 deletions .gitignore
@@ -2,9 +2,10 @@ models/ggml/*
models/gptq/*
!models/ggml/llama_cpp_models_here.txt
!models/gptq/exllama_models_here.txt
repositories/
*.log
.venv
.vscode
*.pyc
PRIVATE_*
repositories
.venv/
.vscode/
.test-venv/
PRIVATE_*
22 changes: 13 additions & 9 deletions Dockerfile
@@ -1,18 +1,21 @@
### Dockerfile for Python 3.11.4 & CUDA 12.1.1 & Ubuntu 22.04
### Approximately 7 ~ 10 minutes to build
### Approximately 5 ~ 10 minutes to build

# Select the required CUDA version.
# Select the required CUDA version.
ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
FROM nvidia/cuda:${CUDA_IMAGE}
ENV PYTHON_VERSION="3.11.4"
ENV PYTHON_VERSION_SHORT="3.11"
ENV HOST 0.0.0.0
ENV PORT=8000

# Copy the necessary files.
COPY requirements.txt /tmp/requirements.txt
# Copy the necessary files.
COPY requirements.txt /app/requirements.txt
COPY pyproject.toml /app/pyproject.toml
COPY llama_api /app/llama_api

# Install the dependencies needed for the Python installation, then install and configure Python.
# Install the necessary applications, and then install Python.
# Then, install the necessary Python packages(Dependencies).
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
zlib1g-dev \
@@ -34,11 +37,12 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& update-alternatives --install /usr/bin/python python /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
&& update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python${PYTHON_VERSION_SHORT} 1 \
&& python3 -m pip install --upgrade pip \
&& pip install --no-cache-dir -r /tmp/requirements.txt \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
&& rm -rf /tmp/*
&& rm -rf /tmp/* \
&& cd /app \
&& python3 -m llama_api.server.app_settings --force-cuda --install-pkgs

# Set the working directory and run the server.
# Set the working directory and start the server.
WORKDIR /app
ENTRYPOINT [ "python3", "-m", "main" "--port", "${PORT}" ]
ENTRYPOINT [ "python3", "-m", "main", "--port", "${PORT}" ]
9 changes: 9 additions & 0 deletions LICENSE.md
@@ -0,0 +1,9 @@
MIT License

Copyright (c) 2023 Andrei Betlen

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 changes: 26 additions & 4 deletions docker-compose.yml
@@ -2,9 +2,7 @@ version: '3'

services:
llama-api:
build:
context: .
dockerfile: Dockerfile
image: cosogi/llama-api:230730
entrypoint: ["python3", "-m", "main", "--port", "8000"]
environment:
- MAX_WORKERS=1
@@ -14,11 +12,35 @@ services:
- ./model_definitions.py:/app/model_definitions.py
- ./main.py:/app/main.py
- ./requirements.txt:/app/requirements.txt
- ./pyproject.toml:/app/pyproject.toml
ports:
- 8000:8000
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
capabilities: [gpu]

# services:
# llama-api:
# build:
# context: .
# dockerfile: Dockerfile
# entrypoint: ["python3", "-m", "main", "--port", "8000"]
# environment:
# - MAX_WORKERS=1
# volumes:
# - ./models:/app/models
# - ./llama_api:/app/llama_api
# - ./model_definitions.py:/app/model_definitions.py
# - ./main.py:/app/main.py
# - ./requirements.txt:/app/requirements.txt
# ports:
# - 8000:8000
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# capabilities: [gpu]
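Once the cosogi/llama-api:230730 service above is up, it publishes port 8000. A hedged client sketch using only the standard library; the /v1/chat/completions route and the "my-model" key are assumptions, since neither the API surface nor model_definitions.py appears in this diff.

import json
from urllib import request

# Assumed OpenAI-style route and a hypothetical model key -- adjust both to
# whatever model_definitions.py actually declares.
payload = {
    "model": "my-model",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 32,
}
req = request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with request.urlopen(req) as resp:
    body = json.loads(resp.read())
    print(body["choices"][0]["message"]["content"])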
9 changes: 5 additions & 4 deletions llama_api/mixins/prompt_utils.py
@@ -1,3 +1,4 @@
from typing import List
from ..schemas.api import APIChatMessage, TextGenerationSettings


@@ -9,7 +10,7 @@ class PromptUtilsMixin:
ai_fallback_input_role: str = "Assistant"

@staticmethod
def get_stop_strings(*roles: str) -> list[str]:
def get_stop_strings(*roles: str) -> List[str]:
"""A helper method to generate stop strings for a given set of roles.
Stop strings are required to stop text completion API from generating
text that does not belong to the current chat turn.
@@ -33,7 +34,7 @@ def get_stop_strings(*roles: str) -> List[str]:

@classmethod
def convert_messages_into_prompt(
cls, messages: list[APIChatMessage], settings: TextGenerationSettings
cls, messages: List[APIChatMessage], settings: TextGenerationSettings
) -> str:
"""A helper method to convert list of messages into one text prompt."""

@@ -48,7 +49,7 @@ def convert_messages_into_prompt(
input_role = ai_input_role = message.role
chat_history += f"### {input_role}:{message.content}"

prompt_stop: list[str] = cls.get_stop_strings(
prompt_stop: List[str] = cls.get_stop_strings(
cls.user_input_role, cls.system_input_role, ai_input_role
)
if isinstance(settings.stop, str):
@@ -61,7 +62,7 @@ def is_possible_to_generate_stops(

@staticmethod
def is_possible_to_generate_stops(
decoded_text: str, stops: list[str]
decoded_text: str, stops: List[str]
) -> bool:
"""A helper method to check if
the decoded text contains any of the stop tokens."""
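The substantive change in this file is replacing built-in generic annotations such as list[str] with typing.List. That matters because the CI matrix added above includes Python 3.8, where subscripting the built-in list raises TypeError (PEP 585 generics need 3.9+), so the old annotations break as soon as the module is imported. A self-contained sketch of the difference; the "### {role}:" stop-string format mirrors the prompt prefix used in convert_messages_into_prompt, but the helper itself is hypothetical.

import sys
from typing import List


def stop_strings(*roles: str) -> List[str]:  # valid on Python 3.8 through 3.11
    # Hypothetical variant, only here to give the annotation something to annotate.
    return [f"### {role}:" for role in roles]


print(stop_strings("User", "System", "Assistant"))

if sys.version_info >= (3, 9):
    print(list[str])  # PEP 585 generic alias, fine on 3.9+
else:
    try:
        list[str]  # evaluating this is what breaks `-> list[str]` on 3.8
    except TypeError as exc:
        print("Python", ".".join(map(str, sys.version_info[:2])), "->", exc)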
13 changes: 6 additions & 7 deletions llama_api/modules/base.py
@@ -1,9 +1,8 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Iterator, TypeVar
from typing import Any, Iterator, List, TypeVar

from ..mixins.prompt_utils import PromptUtilsMixin
from ..mixins.waiter import WaiterMixin
from ..schemas.api import (
APIChatMessage,
ChatCompletion,
@@ -24,7 +23,7 @@ class BaseLLMModel:
max_total_tokens: int = 2048


class BaseCompletionGenerator(ABC, PromptUtilsMixin, WaiterMixin):
class BaseCompletionGenerator(ABC, PromptUtilsMixin):
"""Base class for all completion generators."""

@abstractmethod
@@ -57,14 +56,14 @@ def generate_completion_with_streaming(

@abstractmethod
def generate_chat_completion(
self, messages: list[APIChatMessage], settings: TextGenerationSettings
self, messages: List[APIChatMessage], settings: TextGenerationSettings
) -> ChatCompletion:
"""Generate a completion for a given prompt."""
...

@abstractmethod
def generate_chat_completion_with_streaming(
self, messages: list[APIChatMessage], settings: TextGenerationSettings
self, messages: List[APIChatMessage], settings: TextGenerationSettings
) -> Iterator[ChatCompletionChunk]:
"""Generate a completion for a given prompt,
yielding chunks of text as they are generated."""
@@ -92,9 +91,9 @@ def from_pretrained(cls, model_name: str) -> "BaseEmbeddingGenerator":
@abstractmethod
def generate_embeddings(
self,
texts: list[str],
texts: List[str],
**kwargs: Any,
) -> list[list[float]]:
) -> List[List[float]]:
"""Generate embeddings for a list of texts."""
...

45 changes: 26 additions & 19 deletions llama_api/modules/exllama.py
@@ -1,12 +1,9 @@
"""Wrapper for exllama to generate text completions."""

# flake8: noqa

from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, Iterator, Optional
from typing import TYPE_CHECKING, Dict, Iterator, List, Optional

from torch import IntTensor, cuda
from torch import IntTensor, Tensor, cuda

from ..schemas.models import ExllamaModel
from ..utils.completions import (
@@ -15,8 +12,9 @@
make_completion,
make_completion_chunk,
)
from ..utils.dependency import import_repository
from ..utils.logger import ApiLogger
from ..utils.path import import_repository, resolve_model_path_to_posix
from ..utils.path import resolve_model_path_to_posix
from .base import BaseCompletionGenerator

with import_repository(
@@ -41,6 +39,15 @@
assert cuda.is_available(), "CUDA must be available to use ExLlama."


def _encode(tokenizer: ExLlamaTokenizer, text: str) -> Tensor:
"""Encode a text string into a tensor."""
result = tokenizer.encode(text)
if isinstance(result, tuple):
return result[0]
else:
return result


def _make_config(llm_model: "ExllamaModel") -> ExLlamaConfig:
"""Create a config object for the ExLlama model."""
model_folder_path = Path(
@@ -52,7 +59,7 @@ def _make_config(llm_model: "ExllamaModel") -> ExLlamaConfig:
config = ExLlamaConfig((model_folder_path / "config.json").as_posix())

# Find the model checkpoint
model_file_found: list[Path] = []
model_file_found: List[Path] = []
for ext in (".safetensors", ".pt", ".bin"):
model_file_found.extend(model_folder_path.glob(f"*{ext}"))
if model_file_found:
@@ -107,7 +114,7 @@ class ExllamaCompletionGenerator(BaseCompletionGenerator):
tokenizer: Optional[ExLlamaTokenizer] = None
generator: Optional[ExLlamaGenerator] = None
_llm_model: Optional["ExllamaModel"] = None
_completion_status: dict[
_completion_status: Dict[
str, int
] = {} # key: completion_id, value: number of completion tokens

@@ -272,7 +279,7 @@ def generate_completion_with_streaming(
text=last_token if last_token is not None else "",
finish_reason="length"
if self._completion_status.get(
completion_id, self.tokenizer.encode(generated_text).shape[1]
completion_id, _encode(self.tokenizer, generated_text).shape[1]
)
>= settings.max_tokens
else "stop",
@@ -284,9 +291,9 @@ def generate_completion(
assert self.tokenizer is not None and self.config is not None
completion_id: str = settings.completion_id
generated_text: str = self._generate_text(prompt, settings=settings)
n_prompt_tokens: int = self.tokenizer.encode(prompt).shape[1]
n_prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1]
n_completion_tokens: int = self._completion_status.get(
completion_id, self.tokenizer.encode(generated_text).shape[1]
completion_id, _encode(self.tokenizer, generated_text).shape[1]
)
return make_completion(
id=completion_id,
@@ -301,7 +308,7 @@ def generate_completion(

def generate_chat_completion_with_streaming(
self,
messages: list["APIChatMessage"],
messages: List["APIChatMessage"],
settings: "TextGenerationSettings",
) -> Iterator["ChatCompletionChunk"]:
assert self.config is not None and self.tokenizer is not None
@@ -328,23 +335,23 @@ def generate_chat_completion_with_streaming(
content=last_token if last_token is not None else "",
finish_reason="length"
if self._completion_status.get(
completion_id, self.tokenizer.encode(generated_text).shape[1]
completion_id, _encode(self.tokenizer, generated_text).shape[1]
)
else "stop",
)

def generate_chat_completion(
self,
messages: list["APIChatMessage"],
messages: List["APIChatMessage"],
settings: "TextGenerationSettings",
) -> "ChatCompletion":
assert self.tokenizer is not None and self.config is not None
completion_id: str = settings.completion_id
prompt = self.convert_messages_into_prompt(messages, settings=settings)
generated_text: str = self._generate_text(prompt, settings=settings)
prompt_tokens: int = self.tokenizer.encode(prompt).shape[1]
prompt_tokens: int = _encode(self.tokenizer, prompt).shape[1]
completion_tokens: int = self._completion_status.get(
completion_id, self.tokenizer.encode(generated_text).shape[1]
completion_id, _encode(self.tokenizer, generated_text).shape[1]
)
return make_chat_completion(
id=completion_id,
@@ -357,10 +364,10 @@ def generate_chat_completion(
else "stop",
)

def encode(self, message: str, /) -> list[int]:
def encode(self, message: str, /) -> List[int]:
assert self.tokenizer is not None, "Tokenizer is not initialized"
return self.tokenizer.encode(message).flatten().tolist()
return _encode(self.tokenizer, message).flatten().tolist()

def decode(self, tokens: list[int], /) -> str:
def decode(self, tokens: List[int], /) -> str:
assert self.tokenizer is not None, "Tokenizer is not initialized"
return str(self.tokenizer.decode(IntTensor(tokens)))
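The new _encode helper above exists because, as the added code shows, ExLlamaTokenizer.encode may return either a tensor or a tuple whose first element is the tensor, while every token count in this module reads shape[1]. A small illustrative sketch of that normalization pattern using plain torch tensors (no ExLlama objects involved):

from typing import Tuple, Union

import torch


def token_count(encoded: Union[torch.Tensor, Tuple[torch.Tensor, ...]]) -> int:
    # Same normalization as _encode: unwrap a tuple if needed, then read the
    # sequence length from shape (batch, sequence_length).
    ids = encoded[0] if isinstance(encoded, tuple) else encoded
    return int(ids.shape[1])


ids = torch.zeros((1, 5))         # stand-in for a real encode() result
print(token_count(ids))           # 5
print(token_count((ids, ids)))    # 5; the tuple form is unwrapped the same way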