diff --git a/convert.py b/convert.py
index de6c39c67672b..e340d2273f378 100644
--- a/convert.py
+++ b/convert.py
@@ -998,9 +998,9 @@ def write_vocab(self, vocab: Vocab) -> None:
 def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
     of = OutputFile(fname_out)
     params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
-                    n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
+                    n_head=1, n_layer=0)
     of = OutputFile(fname_out)
-    of.write_file_header(params)
+    of.write_file_header(params, file_type=GGMLFileType.AllF32)
     of.write_vocab(vocab)
     of.fout.close()
 
diff --git a/examples/common.cpp b/examples/common.cpp
index fd164eceb366e..92c7c07f8fe73 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -536,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }
 
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx        = params.n_ctx;
@@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.logits_all   = params.perplexity;
     lparams.embedding    = params.embedding;
 
-    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+        return std::make_tuple(nullptr, nullptr);
+    }
 
+    llama_context * lctx = llama_new_context_with_model(model, lparams);
     if (lctx == NULL) {
-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return NULL;
+        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        llama_free_model(model);
+        return std::make_tuple(nullptr, nullptr);
     }
 
     if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(lctx,
+        int err = llama_model_apply_lora_from_file(model,
                                              params.lora_adapter.c_str(),
                                              params.lora_base.empty() ? NULL : params.lora_base.c_str(),
                                             params.n_threads);
         if (err != 0) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return NULL;
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
         }
     }
 
-    return lctx;
+    return std::make_tuple(model, lctx);
 }
 
 void console_init(console_state & con_st) {
diff --git a/examples/common.h b/examples/common.h
index 6c2953cb2a7c6..713320179e2be 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include <tuple>
 
 #if !defined (_WIN32)
 #include
@@ -95,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 // Model utils
 //
 
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
 
 //
 // Console utils
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 860f99f672c9c..369eac1d1c391 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
 
     // load the model
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 941312f9cc756..c1e6bf126804e 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
     g_ctx = &ctx;
 
     // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -139,6 +140,7 @@ int main(int argc, char ** argv) {
 
         llama_print_timings(ctx);
         llama_free(ctx);
+        llama_free_model(model);
 
         return 0;
     }
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
     if (params.export_cgraph) {
         llama_eval_export(ctx, "llama.ggml");
         llama_free(ctx);
+        llama_free_model(model);
 
         return 0;
     }
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index ae8cfe0afc0b7..b59f5971e3dd2 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {
 
     llama_init_backend();
 
+    llama_model * model;
     llama_context * ctx;
 
     // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {
 
     llama_print_timings(ctx);
     llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6b8018ee28432..9cea472dedb82 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "Loading model\n");
 
     const int64_t t_main_start_us = ggml_time_us();
+    llama_model * model;
     llama_context * ctx;
 
     {
@@ -330,12 +331,20 @@ int main(int argc, char ** argv) {
         lparams.f16_kv     = false;
         lparams.use_mlock  = false;
 
-        ctx = llama_init_from_file(params.model.c_str(), lparams);
+        model = llama_load_model_from_file(params.model.c_str(), lparams);
 
-        if (ctx == NULL) {
+        if (model == NULL) {
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
             return 1;
         }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+            llama_free_model(model);
+            return 1;
+        }
     }
 
     const auto &tensors = llama_internal_get_tensor_map(ctx);
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                 "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
             llama_free(ctx);
+            llama_free_model(model);
             return 1;
         }
         included_layers++;
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
 
 
     llama_free(ctx);
+    llama_free_model(model);
     // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index da4d37ad03de7..4c868850317fe 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -35,12 +35,22 @@ int main(int argc, char ** argv) {
     auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
 
     // init
-    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    auto model = llama_load_model_from_file(params.model.c_str(), lparams);
+    if (model == nullptr) {
+        return 1;
+    }
+    auto ctx = llama_new_context_with_model(model, lparams);
+    if (ctx == nullptr) {
+        llama_free_model(model);
+        return 1;
+    }
     auto tokens = std::vector<llama_token>(params.n_ctx);
     auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
 
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        llama_free(ctx);
+        llama_free_model(model);
         return 1;
     }
@@ -84,6 +94,8 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str);
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx);
+            llama_free_model(model);
             return 1;
         }
         n_past += 1;
@@ -91,23 +103,27 @@ int main(int argc, char ** argv) {
 
     printf("\n\n");
 
-    // free old model
+    // free old context
     llama_free(ctx);
 
-    // load new model
-    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+    // make new context
+    auto ctx2 = llama_new_context_with_model(model, lparams);
 
     // Load state (rng, logits, embedding and kv_cache) from file
     {
         FILE *fp_read = fopen("dump_state.bin", "rb");
         if (state_size != llama_get_state_size(ctx2)) {
             fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
             return 1;
         }
 
         const size_t ret = fread(state_mem, 1, state_size, fp_read);
         if (ret != state_size) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
             return 1;
         }
@@ -138,6 +154,8 @@ int main(int argc, char ** argv) {
         printf("%s", next_token_str);
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx2);
+            llama_free_model(model);
             return 1;
         }
         n_past += 1;
@@ -145,5 +163,8 @@ int main(int argc, char ** argv) {
 
     printf("\n\n");
 
+    llama_free(ctx2);
+    llama_free_model(model);
+
     return 0;
 }
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8aaf103395faa..8aea0f7cd8f12 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -115,6 +115,7 @@ struct llama_server_context {
     std::vector<llama_token> embd;
     std::vector<llama_token> last_n_tokens;
 
+    llama_model * model = nullptr;
     llama_context * ctx = nullptr;
     gpt_params params;
 
@@ -130,6 +131,10 @@ struct llama_server_context {
             llama_free(ctx);
             ctx = nullptr;
         }
+        if (model) {
+            llama_free_model(model);
+            model = nullptr;
+        }
     }
 
     void rewind() {
@@ -150,8 +155,8 @@ struct llama_server_context {
     bool loadModel(const gpt_params & params_) {
         params = params_;
 
-        ctx = llama_init_from_gpt_params(params);
-        if (ctx == nullptr) {
+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        if (model == nullptr) {
             LOG_ERROR("unable to load model", { { "model", params_.model } });
             return false;
         }
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 76f991cdc028f..fc45c93406bc4 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -68,11 +68,12 @@ int main(int argc, char ** argv)
 
     llama_init_backend();
 
-    llama_context * ctx ;
+    llama_model * model;
+    llama_context * ctx;
 
-    ctx = llama_init_from_gpt_params( params );
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
 
-    if ( ctx == NULL )
+    if ( model == NULL )
     {
         fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
         return 1;
@@ -170,6 +171,7 @@ int main(int argc, char ** argv)
 
     } // wend of main loop
 
     llama_free( ctx );
+    llama_free_model( model );
 
     return 0;
 }
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 7ec85951adc57..61c829e5c0f8a 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) {
     struct llama_context_params llama_params = llama_context_default_params();
     llama_params.vocab_only = true;
 
-    struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
+    struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
+    struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
 
     struct llama_vocab vocab;
     {
@@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) {
     delete[] compute_addr;
     delete[] compute_buf_0;
     delete[] compute_buf_1;
+    llama_free(lctx);
+    llama_free_model(lmodel);
     ggml_free(model.ctx);
 
     return 0;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 9fd3d7ea0686a..566dc26919a84 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2688,7 +2688,7 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
+        || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
        || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
 
     switch (tensor->op) {
diff --git a/ggml.c b/ggml.c
index 61f8fed1bb43c..961d90ce6fe84 100644
--- a/ggml.c
+++ b/ggml.c
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include <stdarg.h>
 
 #ifdef GGML_USE_METAL
 #include
@@ -4736,10 +4737,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
     return tensor;
 }
 
+struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
+    va_end(args);
+    return tensor;
+}
+
 struct ggml_tensor * ggml_view_tensor(
         struct ggml_context * ctx,
         const struct ggml_tensor * src) {
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
+    ggml_format_name(result, "%s (view)", src->name);
 
     result->nb[0] = src->nb[0];
     result->nb[1] = src->nb[1];
@@ -5901,6 +5911,11 @@ struct ggml_tensor * ggml_cpy_impl(
 
     // make a view of the destination
     struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    if (strlen(b->name) > 0) {
+        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
+    } else {
+        ggml_format_name(result, "%s (copy)", a->name);
+    }
 
     result->op   = GGML_OP_CPY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5937,6 +5952,7 @@ struct ggml_tensor * ggml_cont_impl(
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    ggml_format_name(result, "%s (cont)", a->name);
 
     result->op   = GGML_OP_CONT;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5980,6 +5996,7 @@ struct ggml_tensor * ggml_reshape(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6004,6 +6021,7 @@ struct ggml_tensor * ggml_reshape_1d(
 
     const int64_t ne[1] = { ne0 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6029,6 +6047,7 @@ struct ggml_tensor * ggml_reshape_2d(
 
     const int64_t ne[2] = { ne0, ne1 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6055,6 +6074,7 @@ struct ggml_tensor * ggml_reshape_3d(
 
     const int64_t ne[3] = { ne0, ne1, ne2 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6083,6 +6103,7 @@ struct ggml_tensor * ggml_reshape_4d(
 
     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
+    ggml_format_name(result, "%s (reshaped)", a->name);
 
     result->op   = GGML_OP_RESHAPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6107,10 +6128,12 @@ struct ggml_tensor * ggml_view_1d(
     }
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6143,10 +6166,12 @@ struct ggml_tensor * ggml_view_2d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6185,10 +6210,12 @@ struct ggml_tensor * ggml_view_3d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6229,10 +6256,12 @@ struct ggml_tensor * ggml_view_4d(
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
 
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+    ggml_format_name(result, "%s (view)", a->name);
 
     ggml_scratch_save(ctx);
 
     struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ggml_set_name(offs, "offset");
     memcpy(offs->data, &offset, 2*sizeof(int32_t));
 
     ggml_scratch_load(ctx);
@@ -6278,6 +6307,7 @@ struct ggml_tensor * ggml_permute(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (permuted)", a->name);
 
     int ne[GGML_MAX_DIMS];
     int nb[GGML_MAX_DIMS];
@@ -6337,6 +6367,7 @@ struct ggml_tensor * ggml_transpose(
     }
 
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    ggml_format_name(result, "%s (transposed)", a->name);
 
     result->ne[0] = a->ne[1];
     result->ne[1] = a->ne[0];
@@ -14882,7 +14913,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         if (skip_cpu) {
             return;
         }
-        GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+        GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
         GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
@@ -16006,7 +16037,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
 
         if (strlen(node->name) == 0) {
-            snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+            ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
         }
 
         cgraph->leafs[cgraph->n_leafs] = node;
@@ -16015,7 +16046,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
 
         if (strlen(node->name) == 0) {
-            snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+            ggml_format_name(node, "node_%d", cgraph->n_nodes);
         }
 
         cgraph->nodes[cgraph->n_nodes] = node;
@@ -17399,6 +17430,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
     return NULL;
 }
 
+static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
+    struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
+    struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
+
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
+            gparent0 ? (void *) gparent0 : (void *) parent,
+            gparent0 ? "g" : "x",
+            gparent ? (void *) gparent : (void *) node,
+            gparent ? "g" : "x",
+            gparent ? "empty" : "vee",
+            gparent ? "dashed" : "solid",
+            label);
+}
+
+static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
+    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
+            (void *) parent, "x",
+            (void *) node, "x",
+            label);
+}
+
 void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
     char color[16];
 
@@ -17434,7 +17485,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
                 (void *) node, color);
 
         if (strlen(node->name) > 0) {
-            fprintf(fp, "%s |", node->name);
+            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+        } else {
+            fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }
 
         if (node->n_dims == 2) {
@@ -17443,7 +17496,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
             fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
         }
-
         if (node->grad) {
             fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
         } else {
@@ -17462,18 +17514,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
                 (void *) node, color);
 
         if (strlen(node->name) > 0) {
-            fprintf(fp, "%s | ", node->name);
+            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
+        } else {
+            fprintf(fp, "(%s)|", ggml_type_name(node->type));
         }
-        if (ggml_nelements(node) == 1) {
-            if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
-                fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
-            }
-            else {
-                fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
+
+        fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+        if (ggml_nelements(node) < 5) {
+            fprintf(fp, " | (");
+            for (int j = 0; j < ggml_nelements(node); j++) {
+                if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
+                    fprintf(fp, "%d", ggml_get_i32_1d(node, j));
+                }
+                else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
+                    fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
+                }
+                else {
+                    fprintf(fp, "#");
+                }
+                if (j < ggml_nelements(node) - 1) {
+                    fprintf(fp, ", ");
+                }
             }
-        }
-        else {
-            fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
+            fprintf(fp, ")");
         }
         fprintf(fp, "\"; ]\n");
     }
@@ -17481,30 +17544,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
     for (int i = 0; i < gb->n_nodes; i++) {
         struct ggml_tensor * node = gb->nodes[i];
 
-        struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
-
         if (node->src0) {
-            struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
-
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
-                    parent0 ? (void *) parent0 : (void *) node->src0,
-                    parent0 ? "g" : "x",
-                    parent ? (void *) parent : (void *) node,
-                    parent ? "g" : "x",
-                    parent ? "empty" : "vee",
-                    parent ? "dashed" : "solid");
+            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
         }
 
         if (node->src1) {
-            struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
-
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
-                    parent1 ? (void *) parent1 : (void *) node->src1,
-                    parent1 ? "g" : "x",
-                    parent ? (void *) parent : (void *) node,
-                    parent ? "g" : "x",
-                    parent ? "empty" : "vee",
-                    parent ? "dashed" : "solid");
+            ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
+        }
+
+        for (int j = 0; j < GGML_MAX_OPT; j++) {
+            if (node->opt[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "opt %d", j);
+                ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
+            }
         }
     }
 
@@ -17512,15 +17565,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         struct ggml_tensor * node = gb->leafs[i];
 
         if (node->src0) {
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
-                    (void *) node->src0, "x",
-                    (void *) node, "x");
+            ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
         }
 
         if (node->src1) {
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
-                    (void *) node->src1, "x",
-                    (void *) node, "x");
+            ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
+        }
+
+        for (int j = 0; j < GGML_MAX_OPT; j++) {
+            if (node->opt[j]) {
+                char label[16];
+                snprintf(label, sizeof(label), "opt %d", j);
+                ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
+            }
         }
     }
diff --git a/ggml.h b/ggml.h
index 18c78551f3dcd..4b6b7284510f9 100644
--- a/ggml.h
+++ b/ggml.h
@@ -563,6 +563,7 @@ extern "C" {
 
     GGML_API const char *         ggml_get_name(const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
 
     //
     // operations on tensors with backpropagation
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 43a6d2e2449f7..ce2b6da150514 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -78,6 +78,7 @@ static std::vector smartcontext;
 static std::vector stop_sequence;
 static std::vector top_picks;
 static int remaining_tokens = 0;
+static int stopper_unused_tokens = 0;
 static std::string concat_output = "";
 
 inline bool IsNanCheck(float f)
@@ -760,6 +761,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
 bool gpttype_generate_abort()
 {
+    stopper_unused_tokens = remaining_tokens;
     remaining_tokens = 0;
     return true;
 }
@@ -900,7 +902,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     current_context_tokens.resize(n_past);
 
     remaining_tokens = params.n_predict;
-    int stopper_unused_tokens = 0;
+    stopper_unused_tokens = 0;
     int input_consumed = 0;
     std::mt19937 rng(params.seed);
     concat_output = "";
diff --git a/klite.embd b/klite.embd
index ce641d7ffb439..c79babbe74430 100644
--- a/klite.embd
+++ b/klite.embd
@@ -1,6 +1,6 @@