diff --git a/koboldcpp.py b/koboldcpp.py
index 9c050cc879212..cb3d45573b662 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -25,7 +25,6 @@ class load_model_inputs(ctypes.Structure):
("blasthreads", ctypes.c_int),
("max_context_length", ctypes.c_int),
("batch_size", ctypes.c_int),
- ("f16_kv", ctypes.c_bool),
("low_vram", ctypes.c_bool),
("use_mmq", ctypes.c_bool),
("executable_path", ctypes.c_char_p),
@@ -78,6 +77,10 @@ class generation_outputs(ctypes.Structure):
_fields_ = [("status", ctypes.c_int),
("text", ctypes.c_char * 32768)]
+class token_count_outputs(ctypes.Structure):
+ _fields_ = [("count", ctypes.c_int),
+ ("ids", ctypes.POINTER(ctypes.c_int))]
+
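Reviewer note: `token_count` now returns a struct carrying a count plus a pointer to the token ids, instead of a bare int. A minimal standalone sketch of how such a ctypes struct is populated and read back (the field layout mirrors this patch; no koboldcpp library is loaded here, and the id values are made up):

```python
import ctypes

class token_count_outputs(ctypes.Structure):
    _fields_ = [("count", ctypes.c_int),
                ("ids", ctypes.POINTER(ctypes.c_int))]

# Build a fake result the way the C side would: an int array plus its length.
ids_buf = (ctypes.c_int * 4)(1, 15043, 3186, 29991)
result = token_count_outputs(count=4,
                             ids=ctypes.cast(ids_buf, ctypes.POINTER(ctypes.c_int)))

# Read it back the way the server handler does after calling handle.token_count(...).
token_ids = [result.ids[i] for i in range(result.count)]
print(token_ids)  # [1, 15043, 3186, 29991]
```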
handle = None
def getdirpath():
@@ -219,7 +222,7 @@ def init_library():
handle.get_total_gens.restype = ctypes.c_int
handle.get_last_stop_reason.restype = ctypes.c_int
handle.abort_generate.restype = ctypes.c_bool
- handle.token_count.restype = ctypes.c_int
+ handle.token_count.restype = token_count_outputs
handle.get_pending_output.restype = ctypes.c_char_p
def load_model(model_filename):
@@ -232,7 +235,6 @@ def load_model(model_filename):
inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
inputs.use_mmq = (True if (args.usecublas and "mmq" in args.usecublas) else False)
inputs.blasthreads = args.blasthreads
- inputs.f16_kv = True
inputs.use_mmap = (not args.nommap)
inputs.use_mlock = args.usemlock
inputs.lora_filename = "".encode("UTF-8")
@@ -393,14 +395,14 @@ def bring_terminal_to_foreground():
modelbusy = threading.Lock()
requestsinqueue = 0
defaultport = 5001
-KcppVersion = "1.51.1.yr2-ROCm"
+KcppVersion = "1.52.yr1.rc1-ROCm"
showdebug = True
showsamplerwarning = True
showmaxctxwarning = True
session_kudos_earned = 0
session_jobs = 0
session_starttime = None
-exitcounter = 0
+exitcounter = -1
punishcounter = 0 #causes a timeout if too many errors
rewardcounter = 0 #reduces error counts for successful jobs
totalgens = 0
@@ -627,9 +629,87 @@ async def handle_request(self, genparams, api_format, stream_flag):
except Exception as e:
print(e)
+ def noscript_webui(self):
+ global modelbusy
+ import html
+ import urllib.parse as urlparse
+ parsed_url = urlparse.urlparse(self.path)
+ parsed_dict = urlparse.parse_qs(parsed_url.query)
+ reply = ""
+ status = str(parsed_dict['status'][0]) if 'status' in parsed_dict else "Ready To Generate"
+ prompt = str(parsed_dict['prompt'][0]) if 'prompt' in parsed_dict else ""
+ max_length = int(parsed_dict['max_length'][0]) if 'max_length' in parsed_dict else 100
+ temperature = float(parsed_dict['temperature'][0]) if 'temperature' in parsed_dict else 0.7
+ top_k = int(parsed_dict['top_k'][0]) if 'top_k' in parsed_dict else 100
+ top_p = float(parsed_dict['top_p'][0]) if 'top_p' in parsed_dict else 0.9
+ rep_pen = float(parsed_dict['rep_pen'][0]) if 'rep_pen' in parsed_dict else 1.1
+ use_default_badwordsids = int(parsed_dict['use_default_badwordsids'][0]) if 'use_default_badwordsids' in parsed_dict else 0
+ gencommand = (parsed_dict['generate'][0] if 'generate' in parsed_dict else "")=="Generate"
+
+ if modelbusy.locked():
+ status = "Model is currently busy, try again later."
+ elif gencommand:
+ if prompt=="" or max_length<=0:
+ status = "Need a valid prompt and length to generate."
+ else:
+ if max_length>512:
+ max_length = 512
+ epurl = f"http://localhost:{args.port}"
+ if args.host!="":
+ epurl = f"http://{args.host}:{args.port}"
+ gen_payload = {"prompt": prompt,"max_length": max_length,"temperature": temperature,"prompt": prompt,"top_k": top_k,"top_p": top_p,"rep_pen": rep_pen,"use_default_badwordsids":use_default_badwordsids}
+ respjson = make_url_request(f'{epurl}/api/v1/generate', gen_payload)
+ reply = html.escape(respjson["results"][0]["text"])
+ status = "Generation Completed"
+
+ if "generate" in parsed_dict:
+ del parsed_dict["generate"]
+ parsed_dict["prompt"] = prompt + reply
+ parsed_dict["status"] = status
+ updated_query_string = urlparse.urlencode(parsed_dict, doseq=True)
+ updated_path = parsed_url._replace(query=updated_query_string).geturl()
+ self.path = updated_path
+ self.send_response(302)
+ self.send_header("location", self.path)
+ self.end_headers(content_type='text/html')
+ return
+
+ finalhtml = f'''<!doctype html>
+<html lang="en"><head>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<title>KoboldCpp NoScript Mode</title></head><body>
+<h2>KoboldCpp NoScript Mode</h2>
+<div>
+<p>KoboldCpp can be used without Javascript enabled, however this is not recommended.
+<br>If you have Javascript, please use <a href="/">Kobold Lite WebUI</a> instead.</p><hr>
+<form action="/noscript">
+Enter Prompt:<br>
+<textarea name="prompt" cols="60" rows="8" wrap="soft" placeholder="Enter Prompt Here">{prompt}</textarea><hr>
+<b>{status}</b><br><hr>
+<label>Gen. Amount</label> <input type="text" size="4" value="{max_length}" name="max_length"><br>
+<label>Temperature</label> <input type="text" size="4" value="{temperature}" name="temperature"><br>
+<label>Top-K</label> <input type="text" size="4" value="{top_k}" name="top_k"><br>
+<label>Top-P</label> <input type="text" size="4" value="{top_p}" name="top_p"><br>
+<label>Rep. Pen</label> <input type="text" size="4" value="{rep_pen}" name="rep_pen"><br>
+<label>Prevent EOS</label> <input type="checkbox" name="use_default_badwordsids" value="1" {"checked" if use_default_badwordsids else ""}><br>
+<input type="submit" name="generate" value="Generate"> (Please be patient)
+</form></div></body></html>'''
+ finalhtml = finalhtml.encode('utf-8')
+ self.send_response(200)
+ self.send_header('content-length', str(len(finalhtml)))
+ self.end_headers(content_type='text/html')
+ self.wfile.write(finalhtml)
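Reviewer note: the endpoint is driven entirely by query-string parameters, so it can be exercised without a browser. A hedged sketch, assuming a server already running on the default port 5001 (the parameter names are the ones parsed above; urlopen follows the 302 redirect automatically):

```python
import urllib.parse, urllib.request

params = urllib.parse.urlencode({
    "prompt": "Once upon a time",
    "max_length": 32,
    "temperature": 0.7,
    "generate": "Generate",  # triggers generation; omit it to just render the form
})
# The 302 redirect carries the updated prompt/status back in the query string.
with urllib.request.urlopen(f"http://localhost:5001/noscript?{params}") as resp:
    print(resp.geturl())   # final URL after the redirect
    print(resp.read(200))  # start of the rendered HTML
```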
def do_GET(self):
- global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens, preloaded_story
+ global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens, preloaded_story, exitcounter
self.path = self.path.rstrip('/')
response_body = None
content_type = 'application/json'
@@ -641,6 +721,10 @@ def do_GET(self):
else:
response_body = self.embedded_kailite
+ elif self.path in ["/noscript", "/noscript?"] or self.path.startswith(('/noscript?','noscript?')): #it's possible for the root url to have ?params without /
+ self.noscript_webui()
+ return
+
elif self.path.endswith(('/api/v1/model', '/api/latest/model')):
response_body = (json.dumps({'result': friendlymodelname }).encode())
@@ -671,7 +755,7 @@ def do_GET(self):
lastc = handle.get_last_token_count()
totalgens = handle.get_total_gens()
stopreason = handle.get_last_stop_reason()
- response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "total_gens":totalgens, "stop_reason":stopreason, "queue":requestsinqueue, "idle":(0 if modelbusy.locked() else 1)}).encode())
+ response_body = (json.dumps({"last_process":lastp,"last_eval":laste,"last_token_count":lastc, "total_gens":totalgens, "stop_reason":stopreason, "queue":requestsinqueue, "idle":(0 if modelbusy.locked() else 1), "hordeexitcounter":exitcounter}).encode())
elif self.path.endswith('/api/extra/generate/check'):
pendtxtStr = ""
@@ -730,8 +814,11 @@ def do_POST(self):
try:
genparams = json.loads(body)
countprompt = genparams.get('prompt', "")
- count = handle.token_count(countprompt.encode("UTF-8"))
- response_body = (json.dumps({"value": count}).encode())
+ rawcountdata = handle.token_count(countprompt.encode("UTF-8"))
+ countlimit = rawcountdata.count if (rawcountdata.count>=0 and rawcountdata.count<50000) else 0
+ # the above protects the server in case the count limit got corrupted
+ countdata = [rawcountdata.ids[i] for i in range(countlimit)]
+ response_body = (json.dumps({"value": len(countdata),"ids": countdata}).encode())
except Exception as e:
utfprint("Count Tokens - Body Error: " + str(e))
@@ -953,7 +1040,7 @@ def show_new_gui():
ctk.set_appearance_mode("dark")
root = ctk.CTk()
root.geometry(str(windowwidth) + "x" + str(windowheight))
- root.title("KoboldCpp v"+KcppVersion)
+ root.title("KoboldCpp v"+KcppVersion+" MIXTRAL FANSERVICE EDITION")
root.resizable(False,False)
tabs = ctk.CTkFrame(root, corner_radius = 0, width=windowwidth, height=windowheight-50)
@@ -1022,7 +1109,7 @@ def show_new_gui():
keepforeground = ctk.IntVar()
quietmode = ctk.IntVar(value=0)
- lowvram_var = ctk.IntVar()
+ lowvram_var = ctk.IntVar(value=1)
mmq_var = ctk.IntVar(value=1)
blas_threads_var = ctk.StringVar()
blas_size_var = ctk.IntVar()
@@ -1320,6 +1407,15 @@ def auto_gpu_heuristics():
changed_gpu_choice_var()
return
+ def on_picked_model_file(filepath):
+ if filepath.lower().endswith('.kcpps'):
+ #load it as a config file instead
+ with open(filepath, 'r') as f:
+ dict = json.load(f)
+ import_vars(dict)
+ else:
+ autoset_gpu_layers(filepath)
+
def autoset_gpu_layers(filepath): #shitty algo to determine how many layers to use
try:
global gui_layers_untouched
@@ -1339,6 +1435,12 @@ def autoset_gpu_layers(filepath): #shitty algo to determine how many layers to u
layerlimit = int(min(200,mem/sizeperlayer))
else:
layerlimit = 200 #assume full offload
+
+ if layerlimit>=200:
+ lowvram_var.set(0)
+ else:
+ lowvram_var.set(1)
+
old_gui_layers_untouched = gui_layers_untouched
gui_layers_zeroed = gpulayers_var.get()=="" or gpulayers_var.get()=="0"
if (gui_layers_untouched or gui_layers_zeroed) and layerlimit>0:
@@ -1508,7 +1610,7 @@ def changerunmode(a,b,c):
makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=3)
# load model
- makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170, onchoosefile=autoset_gpu_layers)
+ makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170, onchoosefile=on_picked_model_file)
# Hardware Tab
hardware_tab = tabcontent["Hardware"]
@@ -1583,7 +1685,7 @@ def togglerope(a,b,c):
# Model Tab
model_tab = tabcontent["Model"]
- makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1, onchoosefile=autoset_gpu_layers)
+ makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1, onchoosefile=on_picked_model_file)
makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)
makefileentry(model_tab, "Preloaded Story:", "Select Preloaded Story File", preloadstory_var, 7)
@@ -2022,23 +2124,48 @@ def show_gui_msgbox(title,message):
except Exception as ex2:
pass
+def print_with_time(txt):
+ from datetime import datetime
+ print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)
+
+def make_url_request(url, data, method='POST', headers={}):
+ import urllib.request
+ try:
+ request = None
+ if method=='POST':
+ json_payload = json.dumps(data).encode('utf-8')
+ request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
+ request.add_header('content-type', 'application/json')
+ else:
+ request = urllib.request.Request(url, headers=headers, method=method)
+ response_data = ""
+ with urllib.request.urlopen(request) as response:
+ response_data = response.read().decode('utf-8')
+ json_response = json.loads(response_data)
+ return json_response
+ except urllib.error.HTTPError as e:
+ try:
+ errmsg = e.read().decode('utf-8')
+ print_with_time(f"Error: {e} - {errmsg}")
+ except Exception as e:
+ print_with_time(f"Error: {e}")
+ return None
+ except Exception as e:
+ print_with_time(f"Error: {e} - {response_data}")
+ return None
+
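Reviewer note: `make_url_request` was hoisted out of the horde worker with injectable headers, so the noscript UI and the worker now share it. A usage sketch, assuming the function from this patch is in scope and a local server is running ("0000000000" is the Horde anonymous key, used here purely as an example):

```python
# GET (no payload): returns parsed JSON, or None on any error.
info = make_url_request("http://localhost:5001/api/v1/info/version", None, method='GET')

# POST with custom headers, as the horde wrapper below does.
reply = make_url_request("https://horde.koboldai.net/api/v2/generate/text/pop",
                         {"name": "myworker"}, method='POST',
                         headers={"apikey": "0000000000"})
```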
#A very simple and stripped down embedded horde worker with no dependencies
def run_horde_worker(args, api_key, worker_name):
- import urllib.request
from datetime import datetime
global friendlymodelname, maxhordectx, maxhordelen, exitcounter, punishcounter, modelbusy, session_starttime
epurl = f"http://localhost:{args.port}"
if args.host!="":
epurl = f"http://{args.host}:{args.port}"
- def print_with_time(txt):
- print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)
-
def submit_completed_generation(url, jobid, sessionstart, submit_dict):
global exitcounter, punishcounter, session_kudos_earned, session_jobs, rewardcounter
- reply = make_url_request(url, submit_dict)
+ reply = make_url_request_horde(url, submit_dict)
if not reply:
- exitcounter += 1
punishcounter += 1
print_with_time(f"Error, Job submit failed.")
else:
@@ -2056,60 +2183,46 @@ def submit_completed_generation(url, jobid, sessionstart, submit_dict):
rewardcounter += 1
if rewardcounter > 50:
rewardcounter = 0
- if exitcounter > 5:
+ if exitcounter > 1:
exitcounter -= 1
- def make_url_request(url, data, method='POST'):
- try:
- request = None
- headers = {"apikey": api_key,'User-Agent':'KoboldCppEmbeddedWorkerV2','Client-Agent':'KoboldCppEmbedWorker:2'}
- if method=='POST':
- json_payload = json.dumps(data).encode('utf-8')
- request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
- request.add_header('content-type', 'application/json')
- else:
- request = urllib.request.Request(url, headers=headers, method=method)
- response_data = ""
- with urllib.request.urlopen(request) as response:
- response_data = response.read().decode('utf-8')
- json_response = json.loads(response_data)
- return json_response
- except urllib.error.HTTPError as e:
- try:
- errmsg = e.read().decode('utf-8')
- print_with_time(f"Error: {e} - {errmsg}, Make sure your Horde API key and worker name is valid.")
- except Exception as e:
- print_with_time(f"Error: {e}, Make sure your Horde API key and worker name is valid.")
- return None
- except Exception as e:
- print_with_time(f"Error: {e} - {response_data}, Make sure your Horde API key and worker name is valid.")
- return None
+ def make_url_request_horde(url, data, method='POST'):
+ headers = headers = {"apikey": api_key,'User-Agent':'KoboldCppEmbeddedWorkerV2','Client-Agent':'KoboldCppEmbedWorker:2'}
+ ret = make_url_request(url, data, method, headers)
+ if not ret:
+ print("Make sure your Horde API key and worker name is valid!")
+ return ret
current_id = None
current_payload = None
current_generation = None
session_starttime = datetime.now()
sleepy_counter = 0 #if this exceeds a value, worker becomes sleepy (slower)
+ exitcounter = 0
print(f"===\nEmbedded Horde Worker '{worker_name}' Starting...\n(To use your own KAI Bridge/Scribe worker instead, don't set your API key)")
BRIDGE_AGENT = f"KoboldCppEmbedWorker:2:https://github.com/LostRuins/koboldcpp"
cluster = "https://horde.koboldai.net"
- while exitcounter < 35:
+ while exitcounter < 10:
time.sleep(3)
- readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
+ readygo = make_url_request_horde(f'{epurl}/api/v1/info/version', None,'GET')
if readygo:
print_with_time(f"Embedded Horde Worker '{worker_name}' is started.")
break
- while exitcounter < 40:
+ while exitcounter < 10:
currentjob_attempts = 0
current_generation = None
- if punishcounter >= 8:
+ if punishcounter >= 5:
punishcounter = 0
- penaltymult = (1 + (exitcounter//10))
- print_with_time(f"Horde Worker Paused for {penaltymult*10} min - Too many errors. It will resume automatically, but you should restart it.")
- print_with_time(f"Caution: Too many failed jobs may lead to entering maintenance mode.")
- time.sleep(600 * penaltymult)
+ exitcounter += 1
+ if exitcounter < 10:
+ penaltytime = (2 ** exitcounter)
+ print_with_time(f"Horde Worker Paused for {penaltytime} min - Too many errors. It will resume automatically, but you should restart it.")
+ print_with_time(f"Caution: Too many failed jobs may lead to entering maintenance mode.")
+ time.sleep(60 * penaltytime)
+ else:
+ print_with_time(f"Horde Worker Exit limit reached, too many errors.")
#first, make sure we are not generating
if modelbusy.locked():
@@ -2126,9 +2239,8 @@ def make_url_request(url, data, method='POST'):
"softprompts": [],
"bridge_agent": BRIDGE_AGENT,
}
- pop = make_url_request(f'{cluster}/api/v2/generate/text/pop',gen_dict)
+ pop = make_url_request_horde(f'{cluster}/api/v2/generate/text/pop',gen_dict)
if not pop:
- exitcounter += 1
punishcounter += 1
print_with_time(f"Failed to fetch job from {cluster}. Waiting 10 seconds...")
time.sleep(10)
@@ -2148,9 +2260,9 @@ def make_url_request(url, data, method='POST'):
print_with_time(f"Job received from {cluster} for {current_payload.get('max_length',80)} tokens and {current_payload.get('max_context_length',1024)} max context. Starting generation...")
#do gen
- while exitcounter < 35:
+ while exitcounter < 10:
if not modelbusy.locked():
- current_generation = make_url_request(f'{epurl}/api/v1/generate', current_payload)
+ current_generation = make_url_request_horde(f'{epurl}/api/v1/generate', current_payload)
if current_generation:
break
else:
@@ -2460,15 +2572,21 @@ def main(launch_args,start_server=True):
basepath = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(basepath, "klite.embd"), mode='rb') as f:
embedded_kailite = f.read()
+ # patch it with extra stuff
+ origStr = "Sorry, Kobold Lite requires Javascript to function."
+ patchedStr = "Sorry, Kobold Lite requires Javascript to function.
You can use
KoboldCpp NoScript mode instead."
+ embedded_kailite = embedded_kailite.decode("UTF-8","ignore")
+ embedded_kailite = embedded_kailite.replace(origStr, patchedStr)
+ embedded_kailite = embedded_kailite.encode()
print("Embedded Kobold Lite loaded.")
- except:
+ except Exception as e:
print("Could not find Kobold Lite. Embedded Kobold Lite will not be available.")
try:
basepath = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(basepath, "kcpp_docs.embd"), mode='rb') as f:
embedded_kcpp_docs = f.read()
- except:
+ except Exception as e:
print("Could not find Embedded KoboldCpp API docs.")
if args.port_param!=defaultport:
diff --git a/koboldcpp.sh b/koboldcpp.sh
new file mode 100755
index 0000000000000..50074e402e176
--- /dev/null
+++ b/koboldcpp.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+if [ ! -f "bin/micromamba" ]; then
+ curl -Ls https://anaconda.org/conda-forge/micromamba/1.5.3/download/linux-64/micromamba-1.5.3-0.tar.bz2 | tar -xvj bin/micromamba
+fi
+
+if [[ ! -f "conda/envs/linux/bin/python" || $1 == "rebuild" ]]; then
+ bin/micromamba create --no-shortcuts -r conda -n linux -f environment.yaml -y
+ bin/micromamba create --no-shortcuts -r conda -n linux -f environment.yaml -y
+ bin/micromamba run -r conda -n linux make clean
+fi
+
+bin/micromamba run -r conda -n linux make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1 LLAMA_PORTABLE=1
+
+if [[ $1 == "rebuild" ]]; then
+ echo Rebuild complete, you can now try to launch Koboldcpp.
+elif [[ $1 == "dist" ]]; then
+ bin/micromamba remove -r conda -n linux --force ocl-icd -y
+ bin/micromamba run -r conda -n linux pyinstaller --noconfirm --onefile --collect-all customtkinter --add-data='./koboldcpp_default.so:.' --add-data='./koboldcpp_cublas.so:.' --add-data='./koboldcpp_openblas.so:.' --add-data='./koboldcpp_clblast.so:.' --add-data='./koboldcpp_clblast_noavx2.so:.' --add-data='./klite.embd:.' --add-data='./kcpp_docs.embd:.' --add-data='./rwkv_vocab.embd:.' --add-data='./rwkv_world_vocab.embd:.' --clean --console koboldcpp.py -n "koboldcpp-linux-x64"
+ bin/micromamba install -r conda -n linux ocl-icd -c conda-forge -y
+else
+ bin/micromamba run -r conda -n linux python koboldcpp.py $*
+fi
diff --git a/llama.cpp b/llama.cpp
index 2d7499f7f5914..99b1604b7b1e4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -75,6 +75,7 @@
#include
#include
#include
+#include
#include
#if defined(_MSC_VER)
@@ -91,7 +92,8 @@
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
-#define LLAMA_MAX_NODES 8192
+#define LLAMA_MAX_NODES   8192
+#define LLAMA_MAX_EXPERTS 8
//
// logging
@@ -193,6 +195,7 @@ enum llm_arch {
LLM_ARCH_REFACT,
LLM_ARCH_BLOOM,
LLM_ARCH_STABLELM,
+ LLM_ARCH_QWEN,
LLM_ARCH_UNKNOWN,
};
@@ -209,6 +212,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
+ { LLM_ARCH_QWEN, "qwen" },
};
enum llm_kv {
@@ -229,6 +233,8 @@ enum llm_kv {
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
+ LLM_KV_EXPERT_COUNT,
+ LLM_KV_EXPERT_USED_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -279,6 +285,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -336,10 +344,14 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
- LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_DOWN_EXP,
+ LLM_TENSOR_FFN_GATE_EXP,
+ LLM_TENSOR_FFN_UP_EXP,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
};
@@ -358,10 +370,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+ { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
},
},
{
@@ -519,6 +535,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_QWEN,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
{
LLM_ARCH_UNKNOWN,
@@ -567,27 +599,16 @@ struct LLM_TN {
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
}
+
+ std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
+ return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." + suffix;
+ }
};
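Reviewer note: the new two-index overload exists for the per-expert MoE tensors such as `blk.%d.ffn_gate.%d`. A Python sketch of the equivalent name expansion (format strings copied from the tensor-name table above; the helper name is made up for illustration):

```python
# Mirrors LLM_TN::operator()(tensor, suffix, bid, xid) for MoE expert weights.
def expert_tensor_name(fmt, suffix, bid, xid):
    return fmt % (bid, xid) + "." + suffix

print(expert_tensor_name("blk.%d.ffn_gate.%d", "weight", 0, 3))
# blk.0.ffn_gate.3.weight
```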
//
// gguf helpers
//
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-do { \
- const std::string skey(key); \
- const int kid = gguf_find_key(ctx, skey.c_str()); \
- if (kid >= 0) { \
- enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
- if (ktype != (type)) { \
- throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
- } \
- (dst) = func(ctx, kid); \
- } else if (req) { \
- throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
- } \
-} while (0)
-
static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
{ LLAMA_ROPE_SCALING_NONE, "none" },
{ LLAMA_ROPE_SCALING_LINEAR, "linear" },
@@ -621,7 +642,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
}
}
-static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
@@ -1165,6 +1186,8 @@ struct llama_hparams {
uint32_t n_layer;
uint32_t n_rot;
uint32_t n_ff;
+ uint32_t n_expert = 0;
+ uint32_t n_expert_used = 0;
float f_norm_eps;
float f_norm_rms_eps;
@@ -1179,15 +1202,18 @@ struct llama_hparams {
float f_max_alibi_bias;
bool operator!=(const llama_hparams & other) const {
- if (this->vocab_only != other.vocab_only) return true;
- if (this->n_vocab != other.n_vocab) return true;
- if (this->n_ctx_train != other.n_ctx_train) return true;
- if (this->n_embd != other.n_embd) return true;
- if (this->n_head != other.n_head) return true;
- if (this->n_head_kv != other.n_head_kv) return true;
- if (this->n_layer != other.n_layer) return true;
- if (this->n_rot != other.n_rot) return true;
- if (this->n_ff != other.n_ff) return true;
+ if (this->vocab_only != other.vocab_only) return true;
+ if (this->n_vocab != other.n_vocab) return true;
+ if (this->n_ctx_train != other.n_ctx_train) return true;
+ if (this->n_embd != other.n_embd) return true;
+ if (this->n_head != other.n_head) return true;
+ if (this->n_head_kv != other.n_head_kv) return true;
+ if (this->n_layer != other.n_layer) return true;
+ if (this->n_rot != other.n_rot) return true;
+ if (this->n_ff != other.n_ff) return true;
+ if (this->n_expert != other.n_expert) return true;
+ if (this->n_expert_used != other.n_expert_used) return true;
+
if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
@@ -1232,6 +1258,7 @@ struct llama_cparams {
float yarn_beta_slow;
bool mul_mat_q;
+ bool offload_kqv;
};
struct llama_layer {
@@ -1253,6 +1280,9 @@ struct llama_layer {
struct ggml_tensor * wqkv;
// attention bias
+ struct ggml_tensor * bq;
+ struct ggml_tensor * bk;
+ struct ggml_tensor * bv;
struct ggml_tensor * bo;
struct ggml_tensor * bqkv;
@@ -1265,6 +1295,12 @@ struct llama_layer {
struct ggml_tensor * ffn_down; // w2
struct ggml_tensor * ffn_up; // w3
+ // ff MoE
+ struct ggml_tensor * ffn_gate_inp;
+ struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
+ struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
+
// ff bias
struct ggml_tensor * ffn_down_b; // b2
struct ggml_tensor * ffn_up_b; // b3
@@ -1297,8 +1333,8 @@ struct llama_kv_cache {
std::vector<llama_kv_cell> cells;
- struct ggml_tensor * k = NULL;
- struct ggml_tensor * v = NULL;
+ std::vector<struct ggml_tensor *> k_l; // per layer
+ std::vector<struct ggml_tensor *> v_l;
struct ggml_context * ctx = NULL;
@@ -1311,8 +1347,10 @@ struct llama_kv_cache {
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
- ggml_cuda_free_data(k);
- ggml_cuda_free_data(v);
+ for (size_t i = 0; i < k_l.size(); ++i) {
+ ggml_cuda_free_data(k_l[i]);
+ ggml_cuda_free_data(v_l[i]);
+ }
}
#endif
}
@@ -1507,9 +1545,11 @@ struct llama_context {
static bool llama_kv_cache_init(
const struct llama_hparams & hparams,
struct llama_kv_cache & cache,
- ggml_type wtype,
+ ggml_type ktype,
+ ggml_type vtype,
uint32_t n_ctx,
- int n_gpu_layers) {
+ int n_gpu_layers,
+ bool offload) {
const uint32_t n_embd = hparams.n_embd_gqa();
const uint32_t n_layer = hparams.n_layer;
@@ -1525,7 +1565,7 @@ static bool llama_kv_cache_init(
cache.cells.clear();
cache.cells.resize(n_ctx);
- cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+ cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
memset(cache.buf.data, 0, cache.buf.size);
struct ggml_init_params params;
@@ -1535,37 +1575,44 @@ static bool llama_kv_cache_init(
cache.ctx = ggml_init(params);
+ size_t vram_kv_cache = 0;
+
if (!cache.ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
return false;
}
- cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
- cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
- ggml_set_name(cache.k, "cache_k");
- ggml_set_name(cache.v, "cache_v");
+ cache.k_l.reserve(n_layer);
+ cache.v_l.reserve(n_layer);
- (void) n_gpu_layers;
+ const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
-#ifdef GGML_USE_CUBLAS
- if (ggml_cublas_loaded()) {
- size_t vram_kv_cache = 0;
+ GGML_UNUSED(offload);
- if (n_gpu_layers > (int)n_layer + 1) {
- ggml_cuda_assign_buffers_no_scratch(cache.v);
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
- vram_kv_cache += ggml_nbytes(cache.v);
- }
- if (n_gpu_layers > (int)n_layer + 2) {
- ggml_cuda_assign_buffers_no_scratch(cache.k);
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
- vram_kv_cache += ggml_nbytes(cache.k);
- }
- if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ for (int i = 0; i < (int) n_layer; i++) {
+ ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
+ ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
+ ggml_format_name(k, "cache_k_l%d", i);
+ ggml_format_name(v, "cache_v_l%d", i);
+ cache.k_l.push_back(k);
+ cache.v_l.push_back(v);
+#ifdef GGML_USE_CUBLAS
+ if (i >= i_gpu_start) {
+ if (offload) {
+ ggml_cuda_assign_buffers_no_scratch(k);
+ vram_kv_cache += ggml_nbytes(k);
+ ggml_cuda_assign_buffers_no_scratch(v);
+ vram_kv_cache += ggml_nbytes(v);
+ }
}
+#endif // GGML_USE_CUBLAS
+ }
+
+ if (vram_kv_cache > 0) {
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
}
-#endif
+
+ GGML_UNUSED(n_gpu_layers);
return true;
}
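Reviewer note: with one K and one V tensor per layer, the total cache size is unchanged; only the granularity (and hence the per-layer offload flexibility) differs. A back-of-envelope sizing sketch in Python, under stated assumptions: illustrative 7B-class shapes, and f16 for both ktype and vtype:

```python
# Per-layer tensors: K and V are each n_embd_gqa * n_ctx elements.
n_layer, n_ctx, n_embd_gqa = 32, 4096, 4096   # example 7B-style shapes
bytes_per_elem = 2                            # f16 assumed for both K and V

per_layer = 2 * n_embd_gqa * n_ctx * bytes_per_elem   # K + V for one layer
total = n_layer * per_layer
print(f"KV cache: {total / 1024**2:.0f} MiB")  # 2048 MiB at these shapes
```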
@@ -1786,6 +1833,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
return buf;
}
+namespace GGUFMeta {
+ template<typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+ struct GKV_Base_Type {
+ static constexpr gguf_type gt = gt_;
+
+ static T getter(const gguf_context * ctx, const int kid) {
+ return gfun(ctx, kid);
+ }
+ };
+
+ template<typename T> struct GKV_Base;
+
+ template<> struct GKV_Base<bool>: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
+ template<> struct GKV_Base