
Commit

Merge branch 'LostRuins:concedo' into main
YellowRoseCx committed Jun 22, 2023
2 parents eb094f0 + e6ddb15 commit b6ff890
Showing 6 changed files with 109 additions and 10 deletions.
4 changes: 2 additions & 2 deletions gpttype_adapter.cpp
@@ -672,7 +672,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
{
if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
{
ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format, inputs.gpulayers);
if(res==ModelLoadResult::FAIL)
{
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -734,7 +734,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
}
else if(file_format==FileFormat::MPT_1)
{
bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab);
bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab, inputs.gpulayers);
if(res==false)
{
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
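Both hunks above make the same kind of change: the adapter now forwards the user-requested GPU layer count, inputs.gpulayers, into the legacy loaders. A minimal sketch of the calling pattern, using only identifiers visible in the hunks (error handling abbreviated); when the build lacks CLBlast or gpulayers is 0, the extra argument is simply ignored by the loaders:

    // inputs.gpulayers travels from the front end into each per-architecture loader,
    // where it drives the CLBlast offload blocks added in the files below.
    ModelLoadResult neox_res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab,
                                                   file_format, inputs.gpulayers);
    bool mpt_ok = mpt_model_load(params.model, mpt_ctx_v3, vocab, inputs.gpulayers);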
23 changes: 23 additions & 0 deletions otherarch/gpt2_v3.cpp
@@ -345,6 +345,29 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g

fin.close();

//gpu offload
#if defined(GGML_USE_CLBLAST)
if(gpulayers>0)
{
const auto & hparams = model.hparams;
size_t vram_total = 0;
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
}
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#endif

return ModelLoadResult::SUCCESS;
}

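The block added above is the template repeated below (with architecture-specific tensor names) in the GPT-J, MPT, and NeoX loaders: clamp the requested count to the model's layer count, mark each large per-layer weight matrix for the GPU backend, upload it with ggml_cl_transform_tensor, and tally bytes for the final VRAM log line. A condensed sketch of that shared pattern, assuming ggml's CLBlast backend as used in the diff (the offload_weight helper is illustrative, not part of the commit):

    #if defined(GGML_USE_CLBLAST)
    // Illustrative helper: route one weight tensor to the OpenCL backend, upload its
    // data to device memory, and return its size so the caller can tally VRAM use.
    static size_t offload_weight(struct ggml_tensor * w) {
        w->backend = GGML_BACKEND_GPU;            // evaluate this tensor's matmuls on the GPU
        ggml_cl_transform_tensor(w->data, w);     // copy the weights into VRAM
        return ggml_nbytes(w);
    }
    #endif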
31 changes: 29 additions & 2 deletions otherarch/gptj_v3.cpp
@@ -15,7 +15,9 @@

#include "model_adapter.h"


#if defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif

// load the model's weights from a file
ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
@@ -331,7 +333,32 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g

fin.close();


//gpu offload
#if defined(GGML_USE_CLBLAST)
if(gpulayers>0)
{
const auto & hparams = model.hparams;
size_t vram_total = 0;
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
layer.c_attn_q_proj_w->backend = GGML_BACKEND_GPU;
layer.c_attn_k_proj_w->backend = GGML_BACKEND_GPU;
layer.c_attn_v_proj_w->backend = GGML_BACKEND_GPU;
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
ggml_cl_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
ggml_cl_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
ggml_cl_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
}
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#endif

return ModelLoadResult::SUCCESS;
}
31 changes: 28 additions & 3 deletions otherarch/mpt_v3.cpp
@@ -15,10 +15,12 @@

#include "model_adapter.h"


#if defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif

// load the model's weights from a file
bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab, int gpulayers) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

auto fin = std::ifstream(fname, std::ios::binary);
@@ -75,7 +77,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
std::string word;
std::vector<char> buf(128);

for (int i = 0; i < n_vocab; i++) {
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));

@@ -278,6 +280,29 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo

fin.close();

//gpu offload
#if defined(GGML_USE_CLBLAST)
if(gpulayers>0)
{
const auto & hparams = model.hparams;
size_t vram_total = 0;
const int n_gpu = std::min(gpulayers, int(hparams.n_layers));
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
layer.ffn_up_proj->backend = GGML_BACKEND_GPU;
layer.ffn_down_proj->backend = GGML_BACKEND_GPU;
layer.c_attn_wqkv_weight->backend = GGML_BACKEND_GPU;
layer.c_attn_out_proj_weight->backend = GGML_BACKEND_GPU;
ggml_cl_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_nbytes(layer.ffn_up_proj);
ggml_cl_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_nbytes(layer.ffn_down_proj);
ggml_cl_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_nbytes(layer.c_attn_wqkv_weight);
ggml_cl_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight);
}
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#endif

return true;
}

29 changes: 27 additions & 2 deletions otherarch/neox_v3.cpp
@@ -13,10 +13,12 @@
#include <vector>
#include <iostream>


#if defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif

// load the model's weights from a file
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

auto fin = std::ifstream(fname, std::ios::binary);
@@ -318,6 +320,29 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &

fin.close();

//gpu offload
#if defined(GGML_USE_CLBLAST)
if(gpulayers>0)
{
const auto & hparams = model.hparams;
size_t vram_total = 0;
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
}
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#endif

return ModelLoadResult::SUCCESS;
}

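With CLBlast compiled in and gpulayers non-zero, each loader now reports what it offloaded via the two fprintf calls shown above. For example, a NeoX load with two offloaded layers would print lines of this shape (the layer count and MB figure are placeholders derived from the format strings, not measured values):

    gpt_neox_model_load: [opencl] offloading 2 layers to GPU
    gpt_neox_model_load: [opencl] total VRAM used: 123 MB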
1 change: 0 additions & 1 deletion otherarch/otherarch.h
@@ -43,7 +43,6 @@ struct gptj_layer {
struct ggml_tensor * c_mlp_fc_b;

struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_w_trans; //for backwards compatibility
struct ggml_tensor * c_mlp_proj_b;
};
struct gptj_layer_v2 {
