exllama support with updated docker image
c0sogi committed Jul 3, 2023
1 parent b5fd98e commit 11761da
Showing 47 changed files with 17,300 additions and 15,765 deletions.
2 changes: 1 addition & 1 deletion .env-sample
@@ -1,7 +1,7 @@
 # DELETE THESE COMMENT LINE!!
 # DEFAULT_LLM_MODEL is defined in `LLM_MODELS` in `app\models\llms.py`
 
-API_ENV="local"
+API_ENV="test"
 PORT=8000
 DEFAULT_LLM_MODEL="gpt_3_5_turbo"
 MYSQL_DATABASE="traffic"
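Context, not part of the diff: `API_ENV` is read back through `load_dotenv()` and `os.environ` in `app/common/config.py` (see that file's hunks below), so this one-line change is what steers `Config.get()` from `LocalConfig` to `TestConfig`. A minimal sketch of the lookup, assuming a `.env` copied from this sample:

```python
from os import environ

from dotenv import load_dotenv

load_dotenv()  # loads .env from the working directory into the process environment
print(environ.get("API_ENV", "local"))  # -> "test" with the value above
```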
6 changes: 3 additions & 3 deletions .gitignore
@@ -2,10 +2,10 @@
 acme.json
 PRIVATE_*
 venv/
+.cache/
 *.pyc
 *.log
-llama_models/ggml/*
-llama_models/gptq/*
+llama_models/*
 !llama_models/ggml/llama_cpp_models_here.txt
-!llama_models/gptq/gptq_models_here.txt
+!llama_models/gptq/exllama_models_here.txt
 deprecated_*
3 changes: 3 additions & 0 deletions .gitmodules
@@ -4,3 +4,6 @@
 [submodule "repositories/llama_cpp"]
 	path = repositories/llama_cpp
 	url = https://github.com/abetlen/llama-cpp-python
+[submodule "repositories/exllama"]
+	path = repositories/exllama
+	url = https://github.com/turboderp/exllama
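Because exllama arrives as a git submodule, a fresh checkout of this commit needs the submodule fetched before the Docker build or a local install can use it. The usual command is `git submodule update --init`; the same step scripted in Python, purely as an illustration:

```python
# Illustration only: fetch the newly added exllama submodule after checking out this commit.
import subprocess

subprocess.run(
    ["git", "submodule", "update", "--init", "--recursive", "repositories/exllama"],
    check=True,  # raise CalledProcessError if git exits non-zero
)
```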
12 changes: 3 additions & 9 deletions app/common/app_settings.py
@@ -1,11 +1,5 @@
 from contextlib import asynccontextmanager
-from multiprocessing import Process
-from os import kill
-from signal import SIGINT
-from threading import Event, Thread
-from urllib import parse
-
-import requests
+from threading import Thread
 
 from fastapi import Depends, FastAPI
 from fastapi.staticfiles import StaticFiles
 from starlette.middleware import Middleware
@@ -70,7 +64,7 @@ async def on_startup():
     except ImportError:
         ApiLogger.ccritical("uvloop not installed!")
 
-    if config.llama_cpp_completion_url:
+    if config.llama_completion_url:
         # Start Llama CPP server monitoring
         ApiLogger.ccritical("Llama CPP server monitoring started!")
         shared.thread = Thread(
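The hunk cuts off before the `Thread(...)` arguments, so the exact call is not visible here; what is being wired up is a daemon thread that keeps polling the Llama CPP server, plus a terminate `Event` so it can be stopped (see `monitor_llama_cpp_server` in `app/common/app_settings_llama_cpp.py` below). A generic sketch of that pattern, with all names assumed rather than taken from the repository:

```python
# Sketch of the daemon-thread supervision pattern, not the repository's exact code.
import time
from threading import Event, Thread

terminate = Event()

def monitor() -> None:
    while not terminate.is_set():
        # ... poll the backend here and restart it if the health check fails ...
        time.sleep(1.0)

thread = Thread(target=monitor, daemon=True)  # daemon=True: never blocks interpreter exit
thread.start()
# on shutdown:
terminate.set()
thread.join()
```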
20 changes: 11 additions & 9 deletions app/common/app_settings_llama_cpp.py
@@ -34,12 +34,12 @@ def start_llama_cpp_server(config: Config, shared: Shared):
         shared.process.terminate()
         shared.process.join()
 
-    if config.llama_cpp_server_port is None:
+    if config.llama_server_port is None:
         raise NotImplementedError("Llama CPP server port is not set")
 
     ApiLogger.ccritical("Starting Llama CPP server")
     shared.process = Process(
-        target=run_llama_cpp, args=(config.llama_cpp_server_port,), daemon=True
+        target=run_llama_cpp, args=(config.llama_server_port,), daemon=True
     )
     shared.process.start()
 
@@ -62,27 +62,29 @@ def monitor_llama_cpp_server(
     - `config: Config`: An object representing the server configuration.
     - `shared: Shared`: An object representing shared data."""
     thread_sigterm: Event = shared.thread_terminate_signal
-    if not config.llama_cpp_completion_url:
+    if not config.llama_completion_url:
         return
     while True:
-        if not check_health(config.llama_cpp_completion_url):
+        if not check_health(config.llama_completion_url):
             if thread_sigterm.is_set():
                 break
-            if config.is_llama_cpp_booting:
+            if config.is_llama_booting:
                 continue
             ApiLogger.cerror("Llama CPP server is not available")
-            config.is_llama_cpp_available = False
-            config.is_llama_cpp_booting = True
+            config.is_llama_available = False
+            config.is_llama_booting = True
             try:
                 start_llama_cpp_server(config=config, shared=shared)
             except (ImportError, NotImplementedError):
                 ApiLogger.cerror("ImportError: Llama CPP server is not available")
                 return
             except Exception:
                 ApiLogger.cexception("Unknown error: Llama CPP server is not available")
+                config.is_llama_booting = False
                 continue
         else:
-            config.is_llama_cpp_booting = False
-            config.is_llama_cpp_available = True
+            config.is_llama_booting = False
+            config.is_llama_available = True
             shutdown_llama_cpp_server(shared)


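`check_health` itself is not shown in this diff; for the loop above it only has to answer whether the completion endpoint is reachable. A hypothetical probe along those lines (the URL handling, timeout, and use of `requests` are assumptions, not the repository's implementation):

```python
# Hypothetical health probe; the real check_health in this repository may differ.
import requests

def check_health(completion_url: str, timeout: float = 2.0) -> bool:
    """Return True if the server behind the completion URL answers at all."""
    base_url = completion_url.rsplit("/v1/", 1)[0]  # e.g. http://localhost:8002
    try:
        requests.get(base_url, timeout=timeout)
        return True
    except requests.RequestException:
        return False
```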
50 changes: 23 additions & 27 deletions app/common/config.py
@@ -1,17 +1,20 @@
-from __future__ import annotations
-
 import logging
 from dataclasses import dataclass, field
 from os import environ
 from pathlib import Path
 from re import Pattern, compile
-from typing import Optional
+from sys import modules
+from typing import Optional, Union
 from urllib import parse
 
 from aiohttp import ClientTimeout
 from dotenv import load_dotenv
 
 load_dotenv()
 
+# API Server Variables
+API_ENV: str = environ.get("API_ENV", "local")
+print(f"- API_ENV: {API_ENV}")
+DOCKER_MODE: bool = environ.get("DOCKER_MODE", "True").lower() == "true"
+print(f"- DOCKER_MODE: {DOCKER_MODE}")
 
 
 class SingletonMetaClass(type):
@@ -23,10 +26,6 @@ def __call__(cls, *args, **kwargs):
         return cls._instances[cls]
 
 
-# API Server Variables
-API_ENV: str = environ.get("API_ENV", "local")
-DOCKER_MODE: bool = environ.get("DOCKER_MODE", "True").lower() == "true"
-
 EXCEPT_PATH_LIST: tuple = (
     "/",
     "/openapi.json",
@@ -144,13 +143,13 @@ class Config(metaclass=SingletonMetaClass):
     shared_vectorestore_name: str = QDRANT_COLLECTION
     trusted_hosts: list[str] = field(default_factory=lambda: ["*"])
     allowed_sites: list[str] = field(default_factory=lambda: ["*"])
-    llama_cpp_completion_url: Optional[str] = "http://localhost:8002/v1/completions"
-    llama_cpp_embedding_url: Optional[str] = "http://localhost:8002/v1/embeddings"
-    llama_cpp_server_port: Optional[int] = 8002
+    llama_completion_url: Optional[str] = "http://localhost:8002/v1/completions"
+    llama_embedding_url: Optional[str] = "http://localhost:8002/v1/embeddings"
+    llama_server_port: Optional[int] = 8002
 
     def __post_init__(self):
-        self.is_llama_cpp_available: bool = False
-        self.is_llama_cpp_booting: bool = False
+        self.is_llama_available: bool = False
+        self.is_llama_booting: bool = False
         if not DOCKER_MODE:
             self.port = 8001
             self.mysql_host = "localhost"
@@ -185,25 +184,22 @@ def __post_init__(self):
     @staticmethod
     def get(
         option: Optional[str] = None,
-    ) -> LocalConfig | ProdConfig | TestConfig:
-        if environ.get("PYTEST_RUNNING") is not None:
-            return TestConfig()
+    ) -> Union["LocalConfig", "ProdConfig", "TestConfig"]:
+        if option is not None:
+            return {
+                "prod": ProdConfig,
+                "local": LocalConfig,
+                "test": TestConfig,
+            }[option]()
         else:
-            if option is not None:
+            if API_ENV is not None:
                 return {
                     "prod": ProdConfig,
                     "local": LocalConfig,
                     "test": TestConfig,
-                }[option]()
+                }[API_ENV.lower()]()
             else:
-                if API_ENV is not None:
-                    return {
-                        "prod": ProdConfig,
-                        "local": LocalConfig,
-                        "test": TestConfig,
-                    }[API_ENV]()
-                else:
-                    return LocalConfig()
+                return LocalConfig()
 
 
 @dataclass
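Read together with the `.env-sample` change above, the rewritten `Config.get()` resolves in this order: an explicit `option` argument, then `API_ENV` (now lowercased before the lookup), then `LocalConfig` as the fallback; the old `PYTEST_RUNNING` short-circuit is gone. A short usage sketch:

```python
# Usage sketch of the rewritten Config.get() resolution order.
from app.common.config import Config

cfg = Config.get()          # no option: uses API_ENV.lower(), e.g. "test" -> TestConfig()
prod = Config.get("prod")   # an explicit option wins regardless of API_ENV
```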
6 changes: 5 additions & 1 deletion app/common/constants.py
@@ -105,7 +105,11 @@ class ChatTurnTemplates:
         input_variables=["role", "content"],
         template_format="f-string",
     )
-
+    ROLE_CONTENT_6: PromptTemplate = PromptTemplate(
+        template="{role}: {content}</s>",
+        input_variables=["role", "content"],
+        template_format="f-string",
+    )
 
 class SummarizationTemplates:
     TEXT__MARKUP: PromptTemplate = PromptTemplate(
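The new `ROLE_CONTENT_6` turn template differs from the existing ones only in its trailing `</s>`, the end-of-sequence marker that Llama-family chat formats typically expect after each turn. What one rendered turn looks like, using plain `str.format` for illustration rather than the repository's `PromptTemplate`:

```python
# Illustration of what ROLE_CONTENT_6 produces for a single chat turn.
template = "{role}: {content}</s>"
print(template.format(role="user", content="Hello"))     # user: Hello</s>
print(template.format(role="assistant", content="Hi!"))   # assistant: Hi!</s>
```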
2 changes: 0 additions & 2 deletions app/common/lotties.py
@@ -1,5 +1,3 @@
-from typing import Self
-
 from app.common.mixins import EnumMixin
 
 