refactor & fixed chatroom name not modifying bug
c0sogi committed Jun 28, 2023
1 parent a1cb2eb commit b5fd98e
Showing 26 changed files with 1,127 additions and 1,207 deletions.
73 changes: 7 additions & 66 deletions app/common/app_settings.py
@@ -1,3 +1,4 @@
from contextlib import asynccontextmanager
from multiprocessing import Process
from os import kill
from signal import SIGINT
@@ -16,6 +17,7 @@
from starlette_admin.views import DropDown, Link

from app.auth.admin import MyAuthProvider
from app.common.app_settings_llama_cpp import monitor_llama_cpp_server
from app.common.config import JWT_SECRET, Config
from app.database.connection import cache, db
from app.database.schemas.auth import ApiKeys, ApiWhiteLists, Users
@@ -30,72 +32,6 @@
from app.viewmodels.admin import ApiKeyAdminView, UserAdminView


def check_health(url: str) -> bool:
"""Check if the given url is available or not"""
try:
schema = parse.urlparse(url).scheme
netloc = parse.urlparse(url).netloc
if requests.get(f"{schema}://{netloc}/health").status_code != 200:
return False
return True
except Exception:
return False


def start_llama_cpp_server(shared: Shared):
"""Start Llama CPP server. if it is already running, terminate it first."""
from app.start_llama_cpp_server import run

if shared.process.is_alive():
ApiLogger.cwarning("Terminating existing Llama CPP server")
shared.process.terminate()
shared.process.join()

ApiLogger.ccritical("Starting Llama CPP server")
shared.process = Process(target=run, daemon=True)
shared.process.start()


def shutdown_llama_cpp_server(shared: Shared):
"""Shutdown Llama CPP server."""
ApiLogger.ccritical("Shutting down Llama CPP server")
if shared.process.is_alive() and shared.process.pid:
kill(shared.process.pid, SIGINT)
shared.process.join()


def monitor_llama_cpp_server(
config: Config,
shared: Shared,
) -> None:
"""Monitors the Llama CPP server and handles server availability.
Parameters:
- `config: Config`: An object representing the server configuration.
- `shared: Shared`: An object representing shared data."""
thread_sigterm: Event = shared.thread_terminate_signal
if not config.llama_cpp_completion_url:
return
while True:
if not check_health(config.llama_cpp_completion_url):
if thread_sigterm.is_set():
break
if config.is_llama_cpp_booting:
continue
ApiLogger.cerror("Llama CPP server is not available")
config.is_llama_cpp_available = False
config.is_llama_cpp_booting = True
try:
start_llama_cpp_server(shared)
except ImportError:
ApiLogger.cerror("ImportError: Llama CPP server is not available")
return
else:
config.is_llama_cpp_booting = False
config.is_llama_cpp_available = True
shutdown_llama_cpp_server(shared)


async def on_startup():
"""
Performs necessary operations during application startup.
@@ -300,4 +236,9 @@ def create_app(config: Config) -> FastAPI:
)
new_app.state.config = config
new_app.state.shared = Shared()

@new_app.get("/health")
async def health():
return "ok"

return new_app
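
For context, the health route added above returns the plain string "ok"; the sketch below is one way to probe it from outside the app. It is a minimal sketch and not part of the commit, assuming the main API listens on http://localhost:8001 (the non-Docker default port set in config.py); the helper name and timeout are illustrative.

# Minimal sketch, not part of the commit: probe the new /health route.
# Assumes the main app is reachable at http://localhost:8001.
import requests


def is_api_up(base_url: str = "http://localhost:8001") -> bool:
    """Return True if GET /health answers 200 with the JSON string "ok"."""
    try:
        response = requests.get(f"{base_url}/health", timeout=2)
        return response.status_code == 200 and response.json() == "ok"
    except requests.RequestException:
        return False


if __name__ == "__main__":
    print(is_api_up())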
134 changes: 134 additions & 0 deletions app/common/app_settings_llama_cpp.py
@@ -0,0 +1,134 @@
from contextlib import asynccontextmanager
from multiprocessing import Process
from os import kill
from signal import SIGINT
from threading import Event
from urllib import parse

import requests
from fastapi import FastAPI
from starlette.middleware.cors import CORSMiddleware

from app.common.config import Config
from app.shared import Shared
from app.utils.logger import ApiLogger


def check_health(url: str) -> bool:
"""Check if the given url is available or not"""
try:
schema = parse.urlparse(url).scheme
netloc = parse.urlparse(url).netloc
if requests.get(f"{schema}://{netloc}/health").status_code != 200:
return False
return True
except Exception:
return False


def start_llama_cpp_server(config: Config, shared: Shared):
"""Start Llama CPP server. if it is already running, terminate it first."""

if shared.process.is_alive():
ApiLogger.cwarning("Terminating existing Llama CPP server")
shared.process.terminate()
shared.process.join()

if config.llama_cpp_server_port is None:
raise NotImplementedError("Llama CPP server port is not set")

ApiLogger.ccritical("Starting Llama CPP server")
shared.process = Process(
target=run_llama_cpp, args=(config.llama_cpp_server_port,), daemon=True
)
shared.process.start()


def shutdown_llama_cpp_server(shared: Shared):
"""Shutdown Llama CPP server."""
ApiLogger.ccritical("Shutting down Llama CPP server")
if shared.process.is_alive() and shared.process.pid:
kill(shared.process.pid, SIGINT)
shared.process.join()


def monitor_llama_cpp_server(
config: Config,
shared: Shared,
) -> None:
"""Monitors the Llama CPP server and handles server availability.
Parameters:
- `config: Config`: An object representing the server configuration.
- `shared: Shared`: An object representing shared data."""
thread_sigterm: Event = shared.thread_terminate_signal
if not config.llama_cpp_completion_url:
return
while True:
if not check_health(config.llama_cpp_completion_url):
if thread_sigterm.is_set():
break
if config.is_llama_cpp_booting:
continue
ApiLogger.cerror("Llama CPP server is not available")
config.is_llama_cpp_available = False
config.is_llama_cpp_booting = True
try:
start_llama_cpp_server(config=config, shared=shared)
except (ImportError, NotImplementedError):
ApiLogger.cerror("ImportError: Llama CPP server is not available")
return
except Exception:
continue
else:
config.is_llama_cpp_booting = False
config.is_llama_cpp_available = True
shutdown_llama_cpp_server(shared)


@asynccontextmanager
async def lifespan_llama_cpp(app: FastAPI):
ApiLogger.ccritical("🦙 Llama.cpp server is running")
yield
ApiLogger.ccritical("🦙 Shutting down llama.cpp server...")


def create_app_llama_cpp():
from app.routers import v1

new_app = FastAPI(
title="🦙 llama.cpp Python API",
version="0.0.1",
lifespan=lifespan_llama_cpp,
)
new_app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)

@new_app.get("/health")
async def health():
return "ok"

new_app.include_router(v1.router)
return new_app


def run_llama_cpp(port: int) -> None:
from uvicorn import Config, Server

from maintools import initialize_before_launch

initialize_before_launch()

Server(
config=Config(
create_app_llama_cpp(),
host="0.0.0.0",
port=port,
log_level="warning",
)
).run()
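
The monitor thread normally launches this sub-application via start_llama_cpp_server; as a rough sketch of what that amounts to, run_llama_cpp can also be driven directly in a child process. This standalone launcher is an assumption for illustration, not code from the commit, and presumes the app package (and its maintools dependency) is importable and that port 8002 is free.

# Minimal sketch, not part of the commit: launch the llama.cpp sub-app directly.
from multiprocessing import Process

from app.common.app_settings_llama_cpp import run_llama_cpp

if __name__ == "__main__":
    proc = Process(target=run_llama_cpp, args=(8002,), daemon=True)
    proc.start()
    proc.join()  # block until the embedded uvicorn server exits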
3 changes: 2 additions & 1 deletion app/common/config.py
@@ -146,9 +146,10 @@ class Config(metaclass=SingletonMetaClass):
allowed_sites: list[str] = field(default_factory=lambda: ["*"])
llama_cpp_completion_url: Optional[str] = "http://localhost:8002/v1/completions"
llama_cpp_embedding_url: Optional[str] = "http://localhost:8002/v1/embeddings"
llama_cpp_server_port: Optional[int] = 8002

def __post_init__(self):
self.is_llama_cpp_available: bool = self.llama_cpp_completion_url is not None
self.is_llama_cpp_available: bool = False
self.is_llama_cpp_booting: bool = False
if not DOCKER_MODE:
self.port = 8001
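
With this change, is_llama_cpp_available starts out False and is only flipped to True by monitor_llama_cpp_server after a successful health check, instead of being inferred from the completion URL. Below is a minimal sketch of reading the new port field and the two flags, assuming the singleton metaclass hands back the already-initialized instance when Config() is called with no arguments.

# Minimal sketch, assuming Config() returns the populated singleton instance.
from app.common.config import Config

config = Config()
print(config.llama_cpp_server_port)   # 8002 unless overridden
print(config.is_llama_cpp_available)  # False until the monitor's health check passes
print(config.is_llama_cpp_booting)    # True only while a (re)start is in flight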
9 changes: 9 additions & 0 deletions app/common/constants.py
@@ -49,6 +49,10 @@ class DescriptionTemplates:
),
input_variables=[],
)
USER_AI__VERT_SHORT: PromptTemplate = PromptTemplate(
template="You are a helpful AI assistant.",
input_variables=[],
)

USER_AI__GAME: PromptTemplate = PromptTemplate(
template=(
@@ -96,6 +100,11 @@ class ChatTurnTemplates:
input_variables=["role", "content"],
template_format="f-string",
)
ROLE_CONTENT_5: PromptTemplate = PromptTemplate(
template="{role}: {content}\n",
input_variables=["role", "content"],
template_format="f-string",
)


class SummarizationTemplates:
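
ROLE_CONTENT_5 is a plain "{role}: {content}" turn template with a trailing newline. Here is a minimal sketch of rendering it, assuming PromptTemplate here is LangChain's and exposes the usual format() method.

# Minimal sketch, assuming ChatTurnTemplates.ROLE_CONTENT_5 is a LangChain PromptTemplate.
from app.common.constants import ChatTurnTemplates

turn = ChatTurnTemplates.ROLE_CONTENT_5.format(role="Human", content="Hello!")
print(repr(turn))  # 'Human: Hello!\n'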
28 changes: 28 additions & 0 deletions app/models/llms.py
@@ -268,6 +268,7 @@ class LLMModels(EnumMixin):
max_total_tokens=2048, # context tokens (n_ctx)
max_tokens_per_request=1024, # The maximum number of tokens to generate.
token_margin=8,
prefix_template=None,
tokenizer=LlamaTokenizer("victor123/WizardLM-13B-1.0"),
model_path="wizardLM-13B-Uncensored.ggmlv3.q5_K_M.bin", # The filename of model. Must end with .bin.
user_chat_roles=UserChatRoles(
@@ -348,6 +349,33 @@
prefix_template=None,
embedding=True,
)
orca_mini_3b = LlamaCppModel(
name="orca_mini_3B-GGML",
max_total_tokens=2048, # context tokens (n_ctx)
max_tokens_per_request=1024, # The maximum number of tokens to generate.
token_margin=8,
tokenizer=LlamaTokenizer("psmathur/orca_mini_3b"),
model_path="orca-mini-3b.ggmlv3.q4_1.bin", # The filename of model. Must end with .bin.
chat_turn_prompt=ChatTurnTemplates.ROLE_CONTENT_2,
user_chat_roles=UserChatRoles(
user="User",
ai="Response",
system="System",
),
)
guanaco_33b = LlamaCppModel(
name="guanaco-33B-GGML",
max_total_tokens=2048, # context tokens (n_ctx)
max_tokens_per_request=1024, # The maximum number of tokens to generate.
token_margin=8,
tokenizer=LlamaTokenizer("timdettmers/guanaco-33b-merged"),
model_path="guanaco-33B.ggmlv3.q3_K_S.bin", # The filename of model. Must end with .bin.
user_chat_roles=UserChatRoles(
user="Human",
ai="Assistant",
system="System",
),
)

@classmethod
def find_model_by_name(cls, name: str) -> LLMModel | None:
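
The two new GGML entries are resolved the same way as the existing ones through find_model_by_name; a minimal sketch of looking one up follows. The exact matching rules of find_model_by_name are not shown in this hunk, so lookup by enum member name is an assumption.

# Minimal sketch, assuming find_model_by_name accepts the enum member name.
from app.models.llms import LLMModels

model = LLMModels.find_model_by_name("orca_mini_3b")
if model is not None:
    print(model.name, model.max_total_tokens)  # "orca_mini_3B-GGML", 2048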