Skip to content

Commit

Permalink
Use mmap to load GGUF models
Browse files Browse the repository at this point in the history
  • Loading branch information
cafaxo committed Apr 22, 2024
1 parent 77f7b8f commit 97e867b
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 3 deletions.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ version = "0.1.0"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand Down
1 change: 1 addition & 0 deletions src/Llama2.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ using SIMD
using LoopVectorization
using Random
using Distributions
using Mmap

export ModelConfig, CharTokenizer, LanguageModel
export load_gguf_model, load_karpathy_model, encode, sample
Expand Down
43 changes: 40 additions & 3 deletions src/load_gguf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ function align_offset(offset, alignment)
return offset + (alignment - (offset % alignment)) % alignment
end

function read_ggml_tensor(tensor_type::GGML_TYPE, size, file::IOStream)
function _read_ggml_tensor(tensor_type::GGML_TYPE, size, file::IOStream)
N = length(size)

if tensor_type == GGML_TYPE_F32
Expand All @@ -188,7 +188,44 @@ function read_ggml_tensor(tensor_type::GGML_TYPE, size, file::IOStream)
return tensor
end

function load_gguf_model(filename::AbstractString)
"""
    _read_ggml_tensor_mmap(tensor_type::GGML_TYPE, size, file::IOStream)

Memory-map a single GGML tensor of the given `tensor_type` and logical `size`
starting at the current position of `file`, then advance the stream past the
mapped bytes. Returns the mmapped `Array` without copying the data into RAM.

Supported types: `GGML_TYPE_F32` and the K-quant block types
`GGML_TYPE_Q4_K` / `GGML_TYPE_Q5_K` / `GGML_TYPE_Q6_K`; any other type errors.
"""
function _read_ggml_tensor_mmap(tensor_type::GGML_TYPE, size, file::IOStream)
    N = length(size)

    if tensor_type == GGML_TYPE_F32
        tensor = mmap(file, Array{Float32,N}, Tuple(size))
    else
        # K-quant types pack the first dimension into super-blocks of QK_K
        # elements, so dimension 1 becomes a block count.
        blocktype =
            tensor_type == GGML_TYPE_Q4_K ? block_q4_K :
            tensor_type == GGML_TYPE_Q5_K ? block_q5_K :
            tensor_type == GGML_TYPE_Q6_K ? block_q6_K :
            error("tensor type $tensor_type not implemented")

        @assert size[1] % QK_K == 0
        dims = (size[1] ÷ QK_K, size[2:end]...)
        tensor = mmap(file, Array{blocktype,N}, dims)
    end

    # mmap does not advance the stream position, so skip past the mapped
    # bytes explicitly to leave `file` positioned after this tensor.
    seek(file, position(file) + sizeof(tensor))

    return tensor
end

"""
    read_ggml_tensor(tensor_type::GGML_TYPE, size, file::IOStream, mmap)

Read one GGML tensor from `file`: memory-map it when `mmap` is `true`,
otherwise read it into a freshly allocated array.
"""
function read_ggml_tensor(tensor_type::GGML_TYPE, size, file::IOStream, mmap)
    reader = mmap ? _read_ggml_tensor_mmap : _read_ggml_tensor
    return reader(tensor_type, size, file)
end

function load_gguf_model(filename::AbstractString; mmap=true)
header = nothing
tensor_dict = nothing

Expand All @@ -204,7 +241,7 @@ function load_gguf_model(filename::AbstractString)
# read tensors
@showprogress desc="Loading model..." for tensor_info in tensor_info_list
seek(file, pad_offset + tensor_info.offset)
tensor_dict[tensor_info.name] = read_ggml_tensor(tensor_info.typ, tensor_info.dimensions, file)
tensor_dict[tensor_info.name] = read_ggml_tensor(tensor_info.typ, tensor_info.dimensions, file, mmap)
end
end

Expand Down

0 comments on commit 97e867b

Please sign in to comment.