exllama support with updated docker image
c0sogi committed Jul 3, 2023
1 parent b5fd98e commit 11761da
Showing 47 changed files with 17,300 additions and 15,765 deletions.
2 changes: 1 addition & 1 deletion .env-sample
@@ -1,7 +1,7 @@
 # DELETE THESE COMMENT LINE!!
 # DEFAULT_LLM_MODEL is defined in `LLM_MODELS` in `app\models\llms.py`
 
-API_ENV="local"
+API_ENV="test"
 PORT=8000
 DEFAULT_LLM_MODEL="gpt_3_5_turbo"
 MYSQL_DATABASE="traffic"
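Context, not part of the diff: `API_ENV` is read back through `load_dotenv()` and `os.environ` in `app/common/config.py` (see that file's hunks below), so this one-line change is what steers `Config.get()` from `LocalConfig` to `TestConfig`. A minimal sketch of the lookup, assuming a `.env` copied from this sample:

```python
from os import environ

from dotenv import load_dotenv

load_dotenv()  # loads .env from the working directory into the process environment
print(environ.get("API_ENV", "local"))  # -> "test" with the value above
```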
6 changes: 3 additions & 3 deletions .gitignore
@@ -2,10 +2,10 @@
 acme.json
 PRIVATE_*
 venv/
+.cache/
 *.pyc
 *.log
-llama_models/ggml/*
-llama_models/gptq/*
+llama_models/*
 !llama_models/ggml/llama_cpp_models_here.txt
-!llama_models/gptq/gptq_models_here.txt
+!llama_models/gptq/exllama_models_here.txt
 deprecated_*
3 changes: 3 additions & 0 deletions .gitmodules
@@ -4,3 +4,6 @@
 [submodule "repositories/llama_cpp"]
 	path = repositories/llama_cpp
 	url = https://github.com/abetlen/llama-cpp-python
+[submodule "repositories/exllama"]
+	path = repositories/exllama
+	url = https://github.com/turboderp/exllama
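Because exllama arrives as a git submodule, a fresh checkout of this commit needs the submodule fetched before the Docker build or a local install can use it. The usual command is `git submodule update --init`; the same step scripted in Python, purely as an illustration:

```python
# Illustration only: fetch the newly added exllama submodule after checking out this commit.
import subprocess

subprocess.run(
    ["git", "submodule", "update", "--init", "--recursive", "repositories/exllama"],
    check=True,  # raise CalledProcessError if git exits non-zero
)
```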
12 changes: 3 additions & 9 deletions app/common/app_settings.py
@@ -1,11 +1,5 @@
 from contextlib import asynccontextmanager
-from multiprocessing import Process
-from os import kill
-from signal import SIGINT
-from threading import Event, Thread
-from urllib import parse
-
-import requests
+from threading import Thread
 
 from fastapi import Depends, FastAPI
 from fastapi.staticfiles import StaticFiles
 from starlette.middleware import Middleware
@@ -70,7 +64,7 @@ async def on_startup():
     except ImportError:
         ApiLogger.ccritical("uvloop not installed!")
 
-    if config.llama_cpp_completion_url:
+    if config.llama_completion_url:
         # Start Llama CPP server monitoring
         ApiLogger.ccritical("Llama CPP server monitoring started!")
         shared.thread = Thread(
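The hunk cuts off before the `Thread(...)` arguments, so the exact call is not visible here; what is being wired up is a daemon thread that keeps polling the Llama CPP server, plus a terminate `Event` so it can be stopped (see `monitor_llama_cpp_server` in `app/common/app_settings_llama_cpp.py` below). A generic sketch of that pattern, with all names assumed rather than taken from the repository:

```python
# Sketch of the daemon-thread supervision pattern, not the repository's exact code.
import time
from threading import Event, Thread

terminate = Event()

def monitor() -> None:
    while not terminate.is_set():
        # ... poll the backend here and restart it if the health check fails ...
        time.sleep(1.0)

thread = Thread(target=monitor, daemon=True)  # daemon=True: never blocks interpreter exit
thread.start()
# on shutdown:
terminate.set()
thread.join()
```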
20 changes: 11 additions & 9 deletions app/common/app_settings_llama_cpp.py
@@ -34,12 +34,12 @@ def start_llama_cpp_server(config: Config, shared: Shared):
         shared.process.terminate()
         shared.process.join()
 
-    if config.llama_cpp_server_port is None:
+    if config.llama_server_port is None:
         raise NotImplementedError("Llama CPP server port is not set")
 
     ApiLogger.ccritical("Starting Llama CPP server")
     shared.process = Process(
-        target=run_llama_cpp, args=(config.llama_cpp_server_port,), daemon=True
+        target=run_llama_cpp, args=(config.llama_server_port,), daemon=True
     )
     shared.process.start()
 
@@ -62,27 +62,29 @@ def monitor_llama_cpp_server(
     - `config: Config`: An object representing the server configuration.
     - `shared: Shared`: An object representing shared data."""
     thread_sigterm: Event = shared.thread_terminate_signal
-    if not config.llama_cpp_completion_url:
+    if not config.llama_completion_url:
         return
     while True:
-        if not check_health(config.llama_cpp_completion_url):
+        if not check_health(config.llama_completion_url):
             if thread_sigterm.is_set():
                 break
-            if config.is_llama_cpp_booting:
+            if config.is_llama_booting:
                 continue
             ApiLogger.cerror("Llama CPP server is not available")
-            config.is_llama_cpp_available = False
-            config.is_llama_cpp_booting = True
+            config.is_llama_available = False
+            config.is_llama_booting = True
             try:
                 start_llama_cpp_server(config=config, shared=shared)
             except (ImportError, NotImplementedError):
                 ApiLogger.cerror("ImportError: Llama CPP server is not available")
                 return
             except Exception:
                 ApiLogger.cexception("Unknown error: Llama CPP server is not available")
+                config.is_llama_booting = False
                 continue
         else:
-            config.is_llama_cpp_booting = False
-            config.is_llama_cpp_available = True
+            config.is_llama_booting = False
+            config.is_llama_available = True
             shutdown_llama_cpp_server(shared)


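`check_health` itself is not shown in this diff; for the loop above it only has to answer whether the completion endpoint is reachable. A hypothetical probe along those lines (the URL handling, timeout, and use of `requests` are assumptions, not the repository's implementation):

```python
# Hypothetical health probe; the real check_health in this repository may differ.
import requests

def check_health(completion_url: str, timeout: float = 2.0) -> bool:
    """Return True if the server behind the completion URL answers at all."""
    base_url = completion_url.rsplit("/v1/", 1)[0]  # e.g. http://localhost:8002
    try:
        requests.get(base_url, timeout=timeout)
        return True
    except requests.RequestException:
        return False
```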
50 changes: 23 additions & 27 deletions app/common/config.py
@@ -1,17 +1,20 @@
-from __future__ import annotations
-
 import logging
 from dataclasses import dataclass, field
 from os import environ
 from pathlib import Path
 from re import Pattern, compile
-from typing import Optional
+from sys import modules
+from typing import Optional, Union
 from urllib import parse
 
 from aiohttp import ClientTimeout
 from dotenv import load_dotenv
 
 load_dotenv()
 
+# API Server Variables
+API_ENV: str = environ.get("API_ENV", "local")
+print(f"- API_ENV: {API_ENV}")
+DOCKER_MODE: bool = environ.get("DOCKER_MODE", "True").lower() == "true"
+print(f"- DOCKER_MODE: {DOCKER_MODE}")
 
 
 class SingletonMetaClass(type):
@@ -23,10 +26,6 @@ def __call__(cls, *args, **kwargs):
         return cls._instances[cls]
 
 
-# API Server Variables
-API_ENV: str = environ.get("API_ENV", "local")
-DOCKER_MODE: bool = environ.get("DOCKER_MODE", "True").lower() == "true"
-
 EXCEPT_PATH_LIST: tuple = (
     "/",
     "/openapi.json",
@@ -144,13 +143,13 @@ class Config(metaclass=SingletonMetaClass):
     shared_vectorestore_name: str = QDRANT_COLLECTION
     trusted_hosts: list[str] = field(default_factory=lambda: ["*"])
     allowed_sites: list[str] = field(default_factory=lambda: ["*"])
-    llama_cpp_completion_url: Optional[str] = "http://localhost:8002/v1/completions"
-    llama_cpp_embedding_url: Optional[str] = "http://localhost:8002/v1/embeddings"
-    llama_cpp_server_port: Optional[int] = 8002
+    llama_completion_url: Optional[str] = "http://localhost:8002/v1/completions"
+    llama_embedding_url: Optional[str] = "http://localhost:8002/v1/embeddings"
+    llama_server_port: Optional[int] = 8002
 
     def __post_init__(self):
-        self.is_llama_cpp_available: bool = False
-        self.is_llama_cpp_booting: bool = False
+        self.is_llama_available: bool = False
+        self.is_llama_booting: bool = False
         if not DOCKER_MODE:
             self.port = 8001
             self.mysql_host = "localhost"
@@ -185,25 +184,22 @@ def __post_init__(self):
     @staticmethod
     def get(
         option: Optional[str] = None,
-    ) -> LocalConfig | ProdConfig | TestConfig:
-        if environ.get("PYTEST_RUNNING") is not None:
-            return TestConfig()
+    ) -> Union["LocalConfig", "ProdConfig", "TestConfig"]:
+        if option is not None:
+            return {
+                "prod": ProdConfig,
+                "local": LocalConfig,
+                "test": TestConfig,
+            }[option]()
         else:
-            if option is not None:
+            if API_ENV is not None:
                 return {
                     "prod": ProdConfig,
                     "local": LocalConfig,
                     "test": TestConfig,
-                }[option]()
+                }[API_ENV.lower()]()
             else:
-                if API_ENV is not None:
-                    return {
-                        "prod": ProdConfig,
-                        "local": LocalConfig,
-                        "test": TestConfig,
-                    }[API_ENV]()
-                else:
-                    return LocalConfig()
+                return LocalConfig()
 
 
 @dataclass
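Read together with the `.env-sample` change above, the rewritten `Config.get()` resolves in this order: an explicit `option` argument, then `API_ENV` (now lowercased before the lookup), then `LocalConfig` as the fallback; the old `PYTEST_RUNNING` short-circuit is gone. A short usage sketch:

```python
# Usage sketch of the rewritten Config.get() resolution order.
from app.common.config import Config

cfg = Config.get()          # no option: uses API_ENV.lower(), e.g. "test" -> TestConfig()
prod = Config.get("prod")   # an explicit option wins regardless of API_ENV
```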
6 changes: 5 additions & 1 deletion app/common/constants.py
@@ -105,7 +105,11 @@ class ChatTurnTemplates:
         input_variables=["role", "content"],
         template_format="f-string",
     )
-
+    ROLE_CONTENT_6: PromptTemplate = PromptTemplate(
+        template="{role}: {content}</s>",
+        input_variables=["role", "content"],
+        template_format="f-string",
+    )
 
 class SummarizationTemplates:
     TEXT__MARKUP: PromptTemplate = PromptTemplate(
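The new `ROLE_CONTENT_6` turn template differs from the existing ones only in its trailing `</s>`, the end-of-sequence marker that Llama-family chat formats typically expect after each turn. What one rendered turn looks like, using plain `str.format` for illustration rather than the repository's `PromptTemplate`:

```python
# Illustration of what ROLE_CONTENT_6 produces for a single chat turn.
template = "{role}: {content}</s>"
print(template.format(role="user", content="Hello"))     # user: Hello</s>
print(template.format(role="assistant", content="Hi!"))   # assistant: Hi!</s>
```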
2 changes: 0 additions & 2 deletions app/common/lotties.py
@@ -1,5 +1,3 @@
-from typing import Self
-
 from app.common.mixins import EnumMixin
 
 