```c++
common_params params;
--> common_params_parse;
common_init();
llama_backend_init();
llama_numa_init(params.numa);
// load the model and apply lora adapter, if any
--> common_init_from_params;
llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get();
struct results_perplexity results;
--> perplexity;
llama_perf_context_print(ctx);
--> llama_backend_free;
```

# [`common_params_parse`](https://github.com/ggml-org/llama.cpp/blob/master/common/arg.cpp#L1183)

```c++
common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY);

bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {}
```

# [`common_init_from_params`](https://github.com/ggml-org/llama.cpp/blob/master/common/common.cpp#L888)

```c++
common_init_result llama_init = common_init_from_params(params);

struct common_init_result common_init_from_params(common_params & params) {}
```

<details>
<summary>struct common_init_result</summary>

```c++
// note: defines object's lifetime
// Returned by common_init_from_params; the caller borrows raw pointers with .get().
struct common_init_result {
    llama_model_ptr   model;   // set via iparams.model.reset(model) once the model is loaded
    llama_context_ptr context; // set via iparams.context.reset(lctx) after the warmup decode

    std::vector<llama_adapter_lora_ptr> lora; // LoRA adapter handles (not populated in the path traced here)
};
```

</details>

```c++
common_init_result iparams;

auto mparams = common_model_params_to_llama(params);
--> llama_model_load_from_file --> llama_model_load_from_file_impl --> llama_model_load --> llama_model * model;
auto cparams = common_context_params_to_llama(params);
--> llama_init_from_model --> llama_context::llama_context --> llama_context * lctx;
llama_set_warmup(lctx, warmup=true); --> cparams.warmup = warmup;
std::vector<llama_token> tmp;
llama_token bos = llama_vocab_bos(vocab);
llama_token eos = llama_vocab_eos(vocab);
tmp.push_back(bos);
tmp.push_back(eos);
--> llama_decode;
llama_memory_clear(llama_get_memory(lctx), true);
llama_synchronize(lctx);
llama_perf_context_reset(lctx);
llama_set_warmup(lctx, false);

iparams.model.reset(model);
iparams.context.reset(lctx);
return iparams;
```

## [`llama_model_load`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama.cpp#L87)

```c++
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);

struct llama_model * llama_model_load_from_file(
    const char * path_model,
    struct llama_model_params params) {
    std::vector<std::string> splits = {};
    --> return llama_model_load_from_file_impl(path_model, splits, params);
}

static struct llama_model * llama_model_load_from_file_impl(
        const std::string & path_model,
        std::vector<std::string> & splits,
        struct llama_model_params params) {
    llama_model * model = new llama_model(params);
    --> const int status = llama_model_load(path_model, splits, *model, params);
    return model;
}

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {}
```

<details>
<summary>struct llama_model</summary>

```c++
// In-memory model. llama_model_load populates it step by step:
// load_arch -> load_hparams -> load_vocab -> load_stats -> load_tensors.
struct llama_model {
    llm_type type = LLM_TYPE_UNKNOWN;
    llm_arch arch = LLM_ARCH_UNKNOWN;

    std::string name = "n/a";

    llama_hparams hparams = {}; // filled by load_hparams (values read with ml.get_key)
    llama_vocab   vocab;        // filled by load_vocab -> llama_vocab::load

    // weight tensors: assigned by create_tensor() inside load_tensors; stay nullptr when absent
    struct ggml_tensor * tok_embd   = nullptr;
    struct ggml_tensor * type_embd  = nullptr;
    struct ggml_tensor * pos_embd   = nullptr;
    struct ggml_tensor * tok_norm   = nullptr;
    struct ggml_tensor * tok_norm_b = nullptr;

    struct ggml_tensor * output_norm     = nullptr;
    struct ggml_tensor * output_norm_b   = nullptr;
    struct ggml_tensor * output          = nullptr; // falls back to a duplicate of tok_embd when the file has no output weight
    struct ggml_tensor * output_b        = nullptr;
    struct ggml_tensor * output_norm_enc = nullptr;

    std::vector<llama_layer> layers; // resized to n_layer in load_tensors, one entry per repeating layer

    llama_model_params params;

    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv; // name -> value strings emplaced by load_hparams

    // list of devices used in this model
    std::vector<ggml_backend_dev_t> devices;

    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name; // populated at the end of load_tensors

    // note: can mutate `cparams`
    // TODO: move this to new llm_arch_model_i interface
    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;

    // TODO: move this to new llm_arch_model_i interface
    llm_graph_result_ptr build_graph(
            const llm_graph_params & params,
                       ggml_cgraph * gf,
                    llm_graph_type   type) const;

private:
    // private state; the traced code stores n_bytes, desc_str, cpu_buft_list, dev_input/dev_layer/dev_output,
    // ctxs, mappings and bufs here
    struct impl;
    std::unique_ptr<impl> pimpl;
};

```
    
</details>

```c++
--> llama_model_loader::llama_model_loader --> llama_model_loader ml;
ml.print_info();
model.load_arch(ml);
--> llama_model::load_hparams;
--> llama_model::load_vocab --> llama_vocab::load --> llama_vocab::impl::load;
model.load_stats(ml);
model.print_info();
--> llama_model::load_tensors;
return 0;
```

### [`llama_model_loader::llama_model_loader`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model-loader.cpp#L468)

```c++
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);

llama_model_loader::llama_model_loader(
        const std::string & fname,
        std::vector<std::string> & splits,
        bool use_mmap,
        bool check_tensors,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {}
```

```c++
// Load the main GGUF
struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
    /*.no_alloc = */ true,
    /*.ctx      = */ &ctx,
};

llama_model_loader:: gguf_context_ptr meta;
--> gguf_init_from_file --> gguf_init_from_file_impl --> meta;

files.emplace_back(new llama_file(fname.c_str(), "rb"));
contexts.emplace_back(ctx);

for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
    std::string tensor_name = std::string(cur->name);
    n_elements += ggml_nelements(cur);
    n_bytes    += ggml_nbytes(cur);
    weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
}

n_kv      = gguf_get_n_kv(meta.get());
n_tensors = weights_map.size();

fver = (enum llama_fver) gguf_get_version(meta.get());

this->use_mmap = use_mmap;
this->check_tensors = check_tensors;
```

#### [`gguf_init_from_file_impl`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/gguf.cpp#L319)

```c++
meta.reset(gguf_init_from_file(fname.c_str(), params));

struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
    FILE * file = ggml_fopen(fname, "rb");
    --> struct gguf_context * result = gguf_init_from_file_impl(file, params);
    fclose(file);
    return result;
}

struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {}
```

<details>
<summary>struct gguf_context</summary>

```c++
// Parsed GGUF metadata: header, key/value pairs and per-tensor info (built by gguf_init_from_file_impl).
struct gguf_context {
    uint32_t version = GGUF_VERSION; // overwritten with the version read from the file header

    std::vector<struct gguf_kv> kv;            // one entry per KV pair read from the file
    std::vector<struct gguf_tensor_info> info; // one entry per tensor: name, shape, type, data offset

    size_t alignment = GGUF_DEFAULT_ALIGNMENT; // replaced by the GGUF_KEY_GENERAL_ALIGNMENT value when that key is present
    size_t offset    = 0; // offset of `data` from beginning of file
    size_t size      = 0; // size of `data` in bytes

    void * data = nullptr;
};
```
    
</details>

```c++
const struct gguf_reader gr(file);
struct gguf_context * ctx = new gguf_context;
// file magic
gr.read(magic, 4);
// header
gr.read(ctx->version);
gr.read(n_tensors);
gr.read(n_kv);
// KV pairs
for (int64_t i = 0; ok && i < n_kv; ++i)
    gr.read(key);
    gr.read(type);
    is_array = true; gr.read(type); gr.read(n);
    gguf_read_emplace_helper<xxx>    (gr, ctx->kv, key, is_array, n);
const int alignment_idx = gguf_find_key(ctx, GGUF_KEY_GENERAL_ALIGNMENT);
ctx->alignment = alignment_idx == -1 ? GGUF_DEFAULT_ALIGNMENT : gguf_get_val_u32(ctx, alignment_idx);

// read the tensor info
for (int64_t i = 0; ok && i < n_tensors; ++i)
    struct gguf_tensor_info info;
    std::string name; gr.read(name); ggml_set_name(&info.t, name.c_str()); // tensor name
    uint32_t n_dims = -1; gr.read(n_dims); gr.read(info.t.ne); // tensor shape
    gr.read(info.t.type); // tensor type
    // calculate byte offsets given the tensor shape and type
    const size_t  type_size = ggml_type_size(info.t.type);
    const int64_t blck_size = ggml_blck_size(info.t.type);
    info.t.nb;
    gr.read(info.offset); // tensor data offset within buffer
    ctx->info.push_back(info);

// store the current file offset - this is where the data section starts
ctx->offset = ftell(file);

// compute the total size of the data section, taking into account the alignment
ctx->size = 0;
for (size_t i = 0; i < ctx->info.size(); ++i)
    const gguf_tensor_info & ti = ctx->info[i];
    ctx->size += GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment);


// load the tensor data only if requested
// compute the exact size needed for the new ggml_context
const size_t mem_size = n_tensors * ggml_tensor_overhead();

struct ggml_init_params pdata = {
    /*mem_size   =*/ mem_size,
    /*mem_buffer =*/ nullptr,
    /*no_alloc   =*/ params.no_alloc,
};
--> ggml_init --> *params.ctx;
struct ggml_context * ctx_data = *params.ctx;

// create the tensors
for (size_t i = 0; i < ctx->info.size(); ++i) {
    const struct gguf_tensor_info & info = ctx->info[i];
    --> ggml_new_tensor --> ggml_new_tensor_impl --> struct ggml_tensor * cur;
    ggml_set_name(cur, info.t.name);
}
return ctx;
```

##### [`ggml_init`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml.c#L1420)

```c++
*params.ctx = ggml_init(pdata);

struct ggml_context * ggml_init(struct ggml_init_params params) {}
```

<details>
<summary>struct ggml_context</summary>

```c++
// Bookkeeping for one ggml memory pool. ggml_init fills every field with a positional
// initializer, so the member order here must not change.
struct ggml_context {
    size_t mem_size;         // pool size in bytes (padded to GGML_MEM_ALIGN when ggml_init allocates the pool itself)
    void * mem_buffer;       // pool storage: the caller's params.mem_buffer, else ggml_aligned_malloc(mem_size)
    bool   mem_buffer_owned; // true when ggml_init allocated mem_buffer
    bool   no_alloc;         // copied from ggml_init_params.no_alloc

    int    n_objects;        // starts at 0; incremented as tensors are created

    struct ggml_object * objects_begin; // first object header in the pool, NULL while the pool is empty
    struct ggml_object * objects_end;   // last object header; ggml_new_object appends right after it
};

size_t ggml_tensor_overhead(void) {
    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
}
```

</details>


```c++
struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));

// allow to call ggml_init with 0 size
if (params.mem_size == 0) {
    params.mem_size = GGML_MEM_ALIGN;
}

const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

*ctx = (struct ggml_context) {
    /*.mem_size           =*/ mem_size,
    /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
    /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
    /*.no_alloc           =*/ params.no_alloc,
    /*.n_objects          =*/ 0,
    /*.objects_begin      =*/ NULL,
    /*.objects_end        =*/ NULL,
};

return ctx;
```

##### [`ggml_new_tensor_impl`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml.c#L1647)

```c++
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, info.t.type, GGML_MAX_DIMS, info.t.ne);

struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type      type,
        int                   n_dims,
        const int64_t       * ne) {
    --> return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
}

static struct ggml_tensor * ggml_new_tensor_impl(
        struct ggml_context * ctx,
        enum   ggml_type      type,
        int                   n_dims,
        const int64_t       * ne,
        struct ggml_tensor  * view_src,
        size_t                view_offs) {}
```

<details>
<summary>struct ggml_tensor</summary>

```c++
// n-dimensional tensor
// ggml_new_tensor_impl initializes the members positionally, so their order is part of the contract.
struct ggml_tensor {
    enum ggml_type type;

    struct ggml_backend_buffer * buffer; // NULL on creation

    int64_t ne[GGML_MAX_DIMS]; // number of elements
    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                               // nb[0] = ggml_type_size(type)
                               // nb[1] = nb[0]   * (ne[0] / ggml_blck_size(type)) + padding
                               // nb[i] = nb[i-1] * ne[i-1]

    // compute data
    enum ggml_op op; // GGML_OP_NONE on creation; op builders such as ggml_get_rows overwrite it

    // op params - allocated as int32_t for alignment
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];

    int32_t flags;

    struct ggml_tensor * src[GGML_MAX_SRC]; // operands of `op`, filled by the op builders (src[0], src[1], ...)

    // source tensor and offset for views
    struct ggml_tensor * view_src;
    size_t               view_offs;

    void * data; // NULL on creation unless this is a view, then view_src->data + view_offs

    char name[GGML_MAX_NAME]; // zeroed on creation; set with ggml_set_name

    void * extra; // extra things e.g. for ggml-cuda.cu

    char padding[8];
};

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
```
</details>


```c++
--> ggml_new_object --> obj_new;
struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
*result = (struct ggml_tensor) {
    /*.type         =*/ type,
    /*.buffer       =*/ NULL,
    /*.ne           =*/ { 1, 1, 1, 1 },
    /*.nb           =*/ { 0, 0, 0, 0 },
    /*.op           =*/ GGML_OP_NONE,
    /*.op_params    =*/ { 0 },
    /*.flags        =*/ 0,
    /*.src          =*/ { NULL },
    /*.view_src     =*/ view_src,
    /*.view_offs    =*/ view_offs,
    /*.data         =*/ view_src != NULL ? view_src->data + view_offs : NULL,
    /*.name         =*/ { 0 },
    /*.extra        =*/ NULL,
    /*.padding      =*/ { 0 },
};

result->ne;
result->nb;
ctx->n_objects++;
return result;
```

[`ggml_new_object`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml.c#L1525)

```c++
struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE);

static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {}
```

<details>
<summary>struct ggml_object</summary>

```c++
// Header that ggml_new_object writes into the context pool directly in front of each allocation.
struct ggml_object {
    size_t offs; // payload offset from the start of mem_buffer (previous object's end + GGML_OBJECT_SIZE)
    size_t size; // payload size, padded to GGML_MEM_ALIGN

    struct ggml_object * next; // next header in the pool; NULL for the most recently added object

    enum ggml_object_type type; // e.g. GGML_OBJECT_TYPE_TENSOR

    char padding[4];
};

static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
```
</details>


```c++
// always insert objects at the end of the context's memory pool
struct ggml_object * obj_cur = ctx->objects_end;

const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
const size_t cur_end  = cur_offs + cur_size;

// align to GGML_MEM_ALIGN
size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);

char * const mem_buffer = ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);

*obj_new = (struct ggml_object) {
    .offs = cur_end + GGML_OBJECT_SIZE,
    .size = size_needed,
    .next = NULL,
    .type = type,
};

GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs);

if (obj_cur != NULL) {
    obj_cur->next = obj_new;
} else {
    // this is the first object in this context
    ctx->objects_begin = obj_new;
}

ctx->objects_end = obj_new;
return obj_new;
```

### [`llama_model::load_hparams`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L423)

```c++
model.load_hparams(ml);

void llama_model::load_hparams(llama_model_loader & ml) {}
```

```c++
const gguf_context * ctx = ml.meta.get();
// get metadata as string
// gguf metadata
llama_model:: std::unordered_map<std::string, std::string> gguf_kv;
gguf_kv.emplace(name, value);
hparams.xxx = xxx; // via ml.get_key
pimpl->n_bytes = ml.n_bytes;
pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
```

### [`llama_vocab::impl::load`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-vocab.cpp#L1372)

```c++
model.load_vocab(ml);

void llama_model::load_vocab(llama_model_loader & ml) {
    const auto kv = LLM_KV(arch);
    --> vocab.load(ml, kv);
}

void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
    --> pimpl->load(ml, kv);
}

void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {}
```

```c++
struct gguf_context * ctx = ml.meta.get();
// determine vocab type
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
// for now, only BPE models have pre-tokenizers
llama_vocab::impl::
    std::unordered_map<std::string, llama_token> token_to_id;
    std::vector<token_data> id_to_token;
--> init_tokenizer(type); --> tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
// special tokens
    std::set<llama_token> special_eog_ids; // set of all tokens that cause "end of generation"
// build special tokens cache
    std::vector<llama_token> cache_special_tokens;
// build token to piece cache
    std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
```

### [`llama_model::load_tensors`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L1467)

```c++
model.load_tensors(ml);

bool llama_model::load_tensors(llama_model_loader & ml) {}
```

```c++
// build a list of buffer types for the CPU and GPU devices
pimpl->cpu_buft_list = make_cpu_buft_list(devices);
// calculate the split points
std::vector<float> splits(n_devices());
std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
// sum and normalize the splits to get the split points
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {}
// assign the input layer, there is very little benefit to offloading the input layer, so always keep it on the CPU
pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
// assign the repeating layers to the devices according to the splits
pimpl->dev_layer.resize(n_layer);
for (int il = 0; il < n_layer; ++il) pimpl->dev_layer[il] = get_layer_buft_list(il);
// assign the output layer
pimpl->dev_output = get_layer_buft_list(n_layer);
// one ggml context per buffer type
int max_n_tensors = ml.n_tensors;
max_n_tensors += 1;         // duplicated output tensor
max_n_tensors += n_layer*2; // duplicated rope freq tensors
const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
    auto it = ctx_map.find(buft);
    if (it == ctx_map.end()) {
        ggml_init_params params = {
            /*.mem_size   =*/ ctx_size,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ggml_context * ctx = ggml_init(params);
        ctx_map[buft] = ctx;
        pimpl->ctxs.emplace_back(ctx);
        return ctx;
    }
    return it->second;
};
// create tensors for the weights
auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
    ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
    llm_tensor_info info = llm_tensor_info_for(tn_tensor);
    // select the buffer type for this tensor
    switch (info.layer) {}
    --> select_weight_buft --> weight_buft_supported --> buft;
    ggml_context * ctx = ctx_for_buft(buft);
    --> return llama_model_loader::create_tensor;
}
```

```c++
layers.resize(n_layer);
// TODO: move to a separate function
const auto tn = LLM_TN(arch);

tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

// if output is NULL, init from the input tok embed
if (output == NULL)
    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);

for (int i = 0; i < n_layer; ++i)
    auto & layer = layers[i];

    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

    // optional bias tensors
    layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);
    layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
    layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     TENSOR_NOT_REQUIRED);

    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);

    // optional MLP bias
    layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
    layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
    layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
```

```c++
ml.done_getting_tensors();
--> llama_model_loader::init_mappings --> ml.mappings;
pimpl->mappings.reserve(ml.mappings.size());
for (auto & mapping : ml.mappings)
    pimpl->mappings.emplace_back(std::move(mapping));

// create the backend buffers
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
ctx_bufs.reserve(ctx_map.size());

// Ensure we have enough capacity for the maximum backend buffer we will potentially create
pimpl->bufs.reserve(ctx_map.size());

for (auto & it : ctx_map)
    ggml_context * ctx = it.second;
    llama_buf_map buf_map;
    buf_map.reserve(1); // ml.files.size()
    //if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft)
    //for (uint32_t idx = 0; idx < ml.files.size(); idx++)
    // only the mmap region containing the tensors in the model is mapped to the backend buffer
    // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
    // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
    void * addr = nullptr;
    size_t first, last; // NOLINT
    --> llama_model_loader::get_mapping_range;
    --> ggml_backend_dev_buffer_from_host_ptr --> ggml_backend_cpu_buffer_from_ptr --> ggml_backend_buffer_t buf;
    // indicate that this buffer contains weights, this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
    ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
    pimpl->bufs.emplace_back(buf);
    buf_map.emplace(0, buf);

    ctx_bufs.emplace_back(ctx, buf_map);

// populate tensors_by_name
for (auto & ctx : pimpl->ctxs) for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) tensors_by_name.emplace_back(ggml_get_name(cur), cur);

// load tensor data
for (auto & it : ctx_bufs)
    ggml_context * ctx = it.first;
    auto & bufs = it.second;
    --> llama_model_loader::load_all_data;

return true;
```

#### [`weight_buft_supported`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L138)

```c++
ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);

// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
    for (const auto & cur : buft_list)
        --> if (weight_buft_supported(hparams, tensor, op, cur_buft = cur.second, cur_dev = cur.first)) {
            return cur_buft;
}

// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {}
```

```c++
ggml_init_params params = {
    /*.mem_size   =*/ ggml_tensor_overhead()*8,
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ true,
};
ggml_context_ptr ctx_ptr { ggml_init(params) };

ggml_context * ctx = ctx_ptr.get();
ggml_tensor * op_tensor = nullptr;
```

```c++
switch (op) {
    case GGML_OP_GET_ROWS:
        {
            ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
            op_tensor = ggml_get_rows(ctx, w, b);
        } break;
    case GGML_OP_MUL_MAT:
        {
            ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
            op_tensor = ggml_mul_mat(ctx, w, b);
        } break;
    case GGML_OP_ADD:
        {
            ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
            op_tensor = ggml_add(ctx, a, w);
        } break;
    case GGML_OP_MUL:
        {
            ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
            op_tensor = ggml_mul(ctx, a, w);
        } break;
    case GGML_OP_DIV:
        {
            ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
            op_tensor = ggml_div(ctx, a, w);
        } break;
    case GGML_OP_ROPE:
        {
            int n_embd_head = hparams.n_embd_head_v;
            int n_head = hparams.n_head();
            ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
            ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
            op_tensor = ggml_rope_ext(
                ctx, a, b, w,
                0, 0, 0, 0, 0,
                0, 0, 0, 0
            );
        } break;
}
```


<details>
<summary>ggml_new_tensor</summary>

```c++
struct ggml_tensor * ggml_new_tensor_1d()
    return ggml_new_tensor(ctx, type, 1, &ne0);

struct ggml_tensor * ggml_new_tensor_3d()
    const int64_t ne[3] = { ne0, ne1, ne2 };
    return ggml_new_tensor(ctx, type, 3, ne);

struct ggml_tensor * ggml_new_tensor_4d()
    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
    return ggml_new_tensor(ctx, type, 4, ne);

struct ggml_tensor * ggml_get_rows()
    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
    result->op     = GGML_OP_GET_ROWS;
    result->src[0] = a;
    result->src[1] = b;
    return result;

static struct ggml_tensor * ggml_rope_impl()
    int sections[4] = {0, 0, 0, 0};

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

    int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
    memcpy(params +  5, &freq_base,    sizeof(float));
    memcpy(params +  6, &freq_scale,   sizeof(float));
    memcpy(params +  7, &ext_factor,   sizeof(float));
    memcpy(params +  8, &attn_factor,  sizeof(float));
    memcpy(params +  9, &beta_fast,    sizeof(float));
    memcpy(params + 10, &beta_slow,    sizeof(float));
    memcpy(params + 11, &sections,     sizeof(int)*4);
    ggml_set_op_params(result, params, sizeof(params));

    result->op     = GGML_OP_ROPE;
    result->src[0] = a;
    result->src[1] = b;
    result->src[2] = c;

    return result;
```
    
</details>

```c++
w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
ggml_backend_buffer_free(w->buffer);
return op_supported;
```

#### [`llama_model_loader::create_tensor`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model-loader.cpp#L789)

```c++
return ml.create_tensor(ctx, tn, ne, flags);

struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {}
```

```c++
--> check_tensor_dims --> cur;
bool duplicated = flags & TENSOR_DUPLICATED;
--> ggml_dup_tensor --> tensor;
ggml_set_name(tensor, ggml_get_name(cur));
if (duplicated) {
    size_data += ggml_nbytes(cur);
} else {
    n_created++;
}
return tensor;
```

[`llama_model_loader::check_tensor_dims`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model-loader.cpp#L759)

```c++
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
    const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
    return cur;
}
```

[`ggml_dup_tensor`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml.c#L1698)

```c++
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);

struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
    return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
}
```

#### [`llama_model_loader::init_mappings`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model-loader.cpp#L845)

```c++
ml.init_mappings(true, nullptr);

void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {}
```

```c++
--> llama_mmap::llama_mmap --> llama_mmap::impl::impl --> mapping;
llama_model_loader:: std::vector<std::pair<size_t, size_t>> mmaps_used;
mmaps_used.emplace_back(mapping->size(), 0);

using llama_mmaps  = std::vector<std::unique_ptr<llama_mmap>>;
llama_model_loader:: llama_mmaps mappings;
mappings.emplace_back(std::move(mapping));
```

```c++
std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);


llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch, bool numa) : pimpl(std::make_unique<impl>(file, prefetch, numa)) {}
struct llama_mmap::impl {
    // mapped ranges of the file; seeded below with a single entry (0, file size)
    std::vector<std::pair<size_t, size_t>> mapped_fragments;

    // Maps the whole of `file` into memory, read-only and shared.
    //   prefetch - upper bound on the bytes to pre-load; init_mappings passes
    //              (size_t) -1 for "everything" or 0 for "nothing"
    //   numa     - not used in this excerpt
    // (the `size` / `addr` members assigned here are declared in the full upstream struct)
    impl(struct llama_file * file, size_t prefetch, bool numa) {
        size = file->size();
        int fd = file->file_id();
        int flags = MAP_SHARED;
        // any non-zero prefetch asks the kernel to populate the mapping up front
        if (prefetch) { flags |= MAP_POPULATE; }
        --> addr = mmap(NULL, file->size(), PROT_READ, flags, fd, 0);
        if (prefetch > 0) {
            // hint that the first min(file size, prefetch) bytes will be needed soon;
            // a failed hint is only logged, the mapping itself stays valid
            if (posix_madvise(addr, std::min(file->size(), prefetch), POSIX_MADV_WILLNEED)) {
                LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
        }
        mapped_fragments.emplace_back(0, file->size());
    }
};
```

#### [`llama_model_loader::get_mapping_range`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model-loader.cpp#L878)

```c++
ml.get_mapping_range(&first, &last, &addr, idx=0, ctx);

void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {}
```

```c++
const auto & mapping = mappings.at(idx);

*first = mapping->size();
*last  = 0;
*addr = mapping->addr();
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
    const auto * weight = get_weight(ggml_get_name(tensor));
    if (!weight || weight->idx != idx) {
        continue;
    }
    *first = std::min(*first, weight->offs);
    *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
}
```

<details>
<summary>struct llama_model_loader::llama_tensor_weight</summary>

```c++
// Holds information on a model weight
// One instance per tensor name is stored in llama_model_loader::weights_map
// (the constructor used by weights_map.emplace is omitted from this excerpt).
struct llama_tensor_weight {
    uint16_t  idx; // source file index
    size_t   offs; // tensor data offset in the original file
    ggml_tensor * tensor; // presumably the metadata tensor handed to the constructor - confirm against upstream
};

const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const {
    auto pos = weights_map.find(name);
    if (pos != weights_map.end()) {
        return &pos->second;
    }

    return nullptr;
}
```

</details>


#### [`ggml_backend_cpu_buffer_from_ptr`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-backend.cpp#L2013)

```c++
// Call site: wrap the byte range [addr + first, addr + last) in a backend buffer.
ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, ggml_get_max_tensor_size(ctx));

// Generic entry point: forwards to the device's buffer_from_host_ptr hook.
ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
    --> return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
}

// CPU device hook: dev and max_tensor_size are not used in the code shown.
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
    --> return ggml_backend_cpu_buffer_from_ptr(ptr, size);
}

// Creates a buffer object around caller-provided memory: ptr is passed as the
// buffer context and size as the buffer size; no data memory is allocated here.
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
    --> return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
}
```

<details>
<summary>struct ggml_backend_buffer</summary>

```c++
// A backend buffer: an interface table plus the state it operates on.
struct ggml_backend_buffer {
    // buffer operations (e.g. get_base), supplied by whoever creates the buffer
    struct ggml_backend_buffer_i  iface;
    // buffer type this buffer was created with
    ggml_backend_buffer_type_t    buft;
    // opaque pointer; the CPU buffer code stores its data pointer here
    void * context;
    // size given at creation; 0 is used for dummy (zero-sized) buffers
    size_t size;
    // set to GGML_BACKEND_BUFFER_USAGE_ANY by ggml_backend_buffer_init
    enum ggml_backend_buffer_usage usage;
};

// handle type used throughout the backend API
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
```

</details>

```c++
// backend buffer

// Creates a ggml_backend_buffer with `new` and fills it from the arguments.
//   buft    - buffer type stored in the buffer
//   iface   - buffer operations, copied by value
//   context - opaque pointer stored as-is
//   size    - stored as the buffer size
// usage always starts as GGML_BACKEND_BUFFER_USAGE_ANY.
// Returns the raw pointer to the new object.
ggml_backend_buffer_t ggml_backend_buffer_init(
               ggml_backend_buffer_type_t buft,
        struct ggml_backend_buffer_i      iface,
               void *                     context,
               size_t                     size) {
    ggml_backend_buffer_t buffer = new ggml_backend_buffer {
        /* .interface = */ iface,
        /* .buft      = */ buft,
        /* .context   = */ context,
        /* .size      = */ size,
        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY
    };

    return buffer;
}
```

<details>
<summary>struct ggml_backend_buffer_type</summary>
    
```c++
// Describes a kind of buffer: how to allocate it and query its properties.
struct ggml_backend_buffer_type {
    // operations such as get_name, alloc_buffer, get_alignment, is_host
    struct ggml_backend_buffer_type_i  iface;
    // device associated with this buffer type (may be NULL)
    ggml_backend_dev_t device;
    // opaque pointer for the implementation (may be NULL)
    void * context;
};

// handle type used throughout the backend API
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
```

</details>

```c++

// Returns the buffer type used for CPU buffers that wrap caller-provided memory.
// The descriptor is a function-local static, so every call returns the same address.
static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
        /* .iface   = */ {
            /* .get_name         = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
        },
        // no device is attached to this buffer type (see FIXME)
        /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ NULL,
    };

    return &ggml_backend_cpu_buffer_type;
}
```

#### [`llama_model_loader::load_all_data`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model-loader.cpp#L918)

```c++
ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data);

bool llama_model_loader::load_all_data(
        struct ggml_context * ctx,
        llama_buf_map & bufs,
        llama_mlocks * lmlocks,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {}
```

```c++
// Pass 1: point every tensor of ctx at its data inside the mapped file
// (the code shown here copies no tensor data).
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur))
    const auto * weight = get_weight(ggml_get_name(cur));
    size_t n_size = ggml_nbytes(cur);
    const auto & mapping = mappings.at(weight->idx);
    ggml_backend_buffer_t buf_mmap = bufs.at(weight->idx);
    // address of this tensor's data: mapping base + offset in the file
    uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;
    --> ggml_backend_tensor_alloc(buf_mmap, cur, data);
    // track the lowest and highest byte offsets used in this file
    auto & mmap_used = mmaps_used[weight->idx];
    mmap_used.first  = std::min(mmap_used.first,  weight->offs);
    mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
    size_done += n_size;
// check if this is the last call and do final cleanup
for (uint32_t idx = 0; idx < mappings.size(); idx++)
    const auto & mmap_used = mmaps_used.at(idx);
    auto & mapping = mappings.at(idx);
    // unmap the part of the file before the first used byte ...
    --> mapping->unmap_fragment(0, mmap_used.first);
    // ... and, if anything was used, the part after the last used byte
    --> if (mmap_used.second != 0) mapping->unmap_fragment(mmap_used.second, mapping->size());
return true;
```

```c++
// Binds `tensor` to `buffer` with its data at `addr`, then calls the buffer's
// per-tensor init routine and returns its status.
enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr)
    tensor->buffer = buffer;
    tensor->data = addr;
    return ggml_backend_buffer_init_tensor(buffer, tensor);

// Thin forwarder to the pimpl implementation below.
void llama_mmap::unmap_fragment(size_t first, size_t last) { pimpl->unmap_fragment(first, last); }

struct llama_mmap::impl {
    // Unmaps the byte range given by first/last (offsets from the start of the
    // mapping) after align_range has adjusted it using the system page size.
    void unmap_fragment(size_t first, size_t last) {
        int page_size = sysconf(_SC_PAGESIZE);
        // presumably shrinks the range to whole pages - confirm in align_range
        align_range(&first, &last, page_size);
        size_t len = last - first;
        void * next_page_start = (uint8_t *) addr + first;
        // NOTE(review): the munmap return value is not checked in this excerpt
        munmap(next_page_start, len);
        // NOTE(review): new_mapped_fragments is not declared in this excerpt, and this
        // line records (last, frag.second) for every fragment - the fragment
        // bookkeeping shown here is abbreviated
        for (const auto & frag : mapped_fragments) new_mapped_fragments.emplace_back(last, frag.second);
        mapped_fragments = std::move(new_mapped_fragments);
    }
}
```

## [`llama_context::llama_context`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-context.cpp#L18)

```c++
// Call site: create the context for an already loaded model.
llama_context * lctx = llama_init_from_model(model, cparams);

// Creates a llama_context with `new` and returns the raw pointer.
// `model` is dereferenced without a null check in the code shown.
llama_context * llama_init_from_model(
                 llama_model * model,
        llama_context_params   params) {
    --> auto * ctx = new llama_context(*model, params);
    return ctx;
}

// Constructor: stores a reference to the model; the body is elided here.
llama_context::llama_context(
        const llama_model & model,
              llama_context_params params) :
    model(model) {}
```

```c++
// GPU backends
// add ACCEL backends (such as BLAS)
// add CPU backend
llama_context:: ggml_backend_t backend_cpu = nullptr;
--> ggml_backend_init_by_type --> ggml_backend_dev_init --> ggml_backend_cpu_device_init_backend --> ggml_backend_cpu_init --> backend_cpu;
backends.emplace_back(backend_cpu);
// create a list of the set_n_threads functions in the backends
llama_context:: std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>>
set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
// graph outputs buffer
// resized during inference when a batch uses more outputs
--> llama_context::output_reserve;

// init the memory module
llama_memory_params params_mem = {
    /*.type_k   =*/ params.type_k,
    /*.type_v   =*/ params.type_v,
    /*.swa_full =*/ params.swa_full,
};
llama_context:: std::unique_ptr<llama_memory_i> memory;
--> llama_model::create_memory --> llama_context:: memory;

// init backends
backend_buft.clear();
backend_ptrs.clear();
//for (auto & backend : backends)
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
backend_buft.push_back(buft);
backend_ptrs.push_back(backend.get());

const size_t max_nodes = this->graph_max_nodes();
// memory buffers used to evaluate the model
llama_context:: std::vector<uint8_t> buf_compute_meta;
// buffer used to store the computation graph and the tensor meta data
--> ggml_graph_nbytes --> buf_compute_meta;
llama_context:: ggml_backend_sched_ptr sched;
--> ggml_backend_sched_new --> sched;

// reserve worst-case graph
const uint32_t n_seqs = cparams.n_seq_max;
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
// simulate full KV cache
--> llama_kv_cache_unified_state::llama_kv_cache_unified_state --> mstate;

// reserve pp graph first so that buffers are only allocated once
--> llama_context::graph_reserve --> gf;
// reserve with tg graph to get the number of splits and nodes
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
```

### [`ggml_backend_cpu_init`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-cpu/ggml-cpu.cpp)

```c++
// Call site: create the CPU backend with no parameter string.
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);

// Finds a device of the requested type and initializes a backend on it.
// Returns nullptr when no such device is found.
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
    if (!dev) {
        return nullptr;
    }
    --> return ggml_backend_dev_init(dev, params);
}


// Returns the first device, in enumeration order, whose type equals `type`.
// NOTE(review): the no-match return path is not shown in this excerpt.
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type)
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == type) {
            return dev;
        }
    }

// Dispatches to the device's init_backend hook.
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params)
    --> return device->iface.init_backend(device, params);

// CPU device hook: dev and params are not used in the code shown.
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params)
    --> return ggml_backend_cpu_init();

// Body is walked through in the next code block.
ggml_backend_t ggml_backend_cpu_init(void) {}
```

<details>
<summary>struct ggml_backend</summary>

```c++

// A backend instance: identity, interface table, owning device and private state.
struct ggml_backend {
    // identifies the backend implementation
    ggml_guid_t guid;
    // backend operations
    struct ggml_backend_i iface;
    // device this backend runs on
    ggml_backend_dev_t device;
    // opaque pointer for the implementation's own state
    void * context;
};

// handle type used throughout the backend API
typedef struct ggml_backend * ggml_backend_t;
```
    
</details>

```c++
// initialize CPU backend now to avoid slowing the first graph computation
ggml_cpu_init();

// per-backend CPU state, stored below as the backend's context
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;

// defaults: default thread count, no threadpool, no work buffer, no abort callback
ctx->n_threads           = GGML_DEFAULT_N_THREADS;
ctx->threadpool          = NULL;
ctx->work_data           = NULL;
ctx->work_size           = 0;
ctx->abort_callback      = NULL;
ctx->abort_callback_data = NULL;

// the backend object ties the CPU interface to device 0 of the CPU registry
ggml_backend_t cpu_backend = new ggml_backend {
    /* .guid      = */ ggml_backend_cpu_guid(),
    /* .interface = */ ggml_backend_cpu_i,
    /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
    /* .context   = */ ctx,
};

// both ctx and cpu_backend are created with new; their release is not shown here
return cpu_backend;
```

### [`llama_context::output_reserve`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-context.cpp#L1239)

```c++
output_reserve(params.n_seq_max);

int32_t llama_context::output_reserve(int32_t n_outputs) {}
```

```c++
// map batch token positions to ids of the logits and embd buffers
llama_context:: std::vector<int32_t> output_ids;
output_ids.resize(n_batch);

// never reserve fewer than n_seq_max() outputs
const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
// element counts; logits take n_vocab entries per output (0 when has_logits is false)
logits_size = has_logits ? n_vocab*n_outputs_max : 0;
// total size in bytes: logits and embeddings are both stored as float
const size_t new_size  = (logits_size + embd_size) * sizeof(float);
auto * buft = ggml_backend_cpu_buffer_type();
// host buffer for the model output (logits and embeddings)
llama_context:: ggml_backend_buffer_ptr buf_output;
--> ggml_backend_buft_alloc_buffer --> ggml_backend_cpu_buffer_type_alloc_buffer --> buf_output;

--> ggml_backend_buffer_get_base --> ggml_backend_cpu_buffer_get_base --> output_base;
// logits, when present, start at the base of the output buffer
logits = has_logits ? output_base : nullptr;
// set all ids as invalid (negative)
std::fill(output_ids.begin(), output_ids.end(), -1);
this->n_outputs = 0;
this->n_outputs_max = n_outputs_max;
return n_outputs_max;
```

[`ggml_backend_cpu_buffer_type_alloc_buffer`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-backend.cpp#L1950)

```c++
// Call site: buf_output takes ownership of the newly allocated buffer.
ggml_backend_buffer_ptr llama_context:: buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));

// Allocates a buffer of `size` bytes through the buffer type's alloc_buffer hook.
// A zero size bypasses the hook and yields a buffer with an empty interface,
// a NULL context and size 0.
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    if (size == 0) {
        // return a dummy buffer for zero-sized allocations
        return ggml_backend_buffer_init(buft, {}, NULL, 0);
    }
    --> return buft->iface.alloc_buffer(buft, size);
}

// CPU alloc hook: allocates the memory and stores the data pointer as the buffer context.
// NOTE(review): the result of ggml_aligned_malloc is not checked in this excerpt.
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * data = ggml_aligned_malloc(size);
    return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
}

```

[`ggml_backend_cpu_buffer_get_base`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-backend.cpp#L1869)

```c++
// Call site: the output buffer's base address is used as a float array.
float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());

// Returns the buffer's base address via its get_base hook.
// Zero-sized buffers return NULL without calling the hook.
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    // get_base is optional if the buffer is zero-sized
    if (buffer->size == 0) {
        return NULL;
    }
    return buffer->iface.get_base(buffer);
}

// CPU get_base hook: the context pointer is the data address; if it is not a
// multiple of TENSOR_ALIGNMENT it is adjusted with GGML_PAD before being returned.
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    uintptr_t data = (uintptr_t)buffer->context;

    // align the buffer
    if (data % TENSOR_ALIGNMENT != 0) {
        data = GGML_PAD(data, TENSOR_ALIGNMENT);
    }

    return (void *)data;
}
```

### [`llama_model::create_memory`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L13206)


```c++
memory.reset(model.create_memory(params_mem, cparams));

llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {}

```

### [`ggml_graph_nbytes`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml.c#L5961)

```c++
// Call site: room for max_nodes tensor headers plus one graph of max_nodes nodes (no grads).
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

// Bytes needed to store a graph of `size` nodes: one object header plus the
// graph's own byte count padded to GGML_MEM_ALIGN.
size_t ggml_graph_overhead_custom(size_t size, bool grads) {
    return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
}

// Body is walked through in the next code block.
static size_t ggml_graph_nbytes(size_t size, bool grads) {}
```

```c++
// the hash table is sized from twice the node count
size_t hash_size = ggml_hash_size(size * 2);
// Size computation by pointer arithmetic: p starts at address 0 and is advanced
// past each section in turn, so its final value is the total byte count.
// (presumably incr_ptr_aligned aligns p before advancing - confirm in its definition)
void * p = 0;
incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
// NOTE(review): the `grads` parameter is not used in the lines shown here
incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));

size_t nbytes = (size_t) p;
return nbytes;
```

### [`ggml_backend_sched_new`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-backend.cpp#L1455)

```c++
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload));

ggml_backend_sched_t ggml_backend_sched_new(
        ggml_backend_t * backends,
        ggml_backend_buffer_type_t * bufts,
        int n_backends,
        size_t graph_size,
        bool parallel,
        bool op_offload) {}
```

### [`llama_kv_cache_unified_state::llama_kv_cache_unified_state`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-kv-cache-unified.cpp#L1668)

```c++
// Call site: obtain a memory state that represents a full cache.
const auto mstate = memory->init_full();

// Creates a full-cache state object bound to this cache.
llama_memory_state_ptr llama_kv_cache_unified::init_full() {
    return std::make_unique<llama_kv_cache_unified_state>(this);
}

class llama_kv_cache_unified_state : public llama_memory_state_i {
public:
    // used to create a full-cache state
    llama_kv_cache_unified_state(
            llama_kv_cache_unified * kv);
}

// Full-cache constructor: status is SUCCESS, n_kv is the cache's size as
// reported by kv->get_size(), and head is 0.
llama_kv_cache_unified_state::llama_kv_cache_unified_state(
        llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
    n_kv = kv->get_size();
    head = 0;
}           
```

### [`llama_context::graph_reserve`](https://github.com/ggml-org/llama.cpp/blob/master/src/llama-context.cpp#L1331)

```c++
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mstate.get());
auto * gf = graph_reserve(1, 1, 1, mstate.get());

ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate) {}
```

## `llama_decode`

```c++
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));

int32_t llama_decode(
        llama_context * ctx,
        llama_batch   batch) {}
```

# `perplexity`

```c++
results = perplexity(ctx, params, n_ctx);

static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) {}
```

# `llama_backend_free`