Merged

403 commits
f6553b6
linenoise.cpp refactoring (#11301)
ericcurtin Jan 21, 2025
e168c69
rpc : better caching of the base buffer pointer (#11331)
rgerganov Jan 21, 2025
72d643c
export-lora : fix tok_embd tensor (#11330)
ngxson Jan 21, 2025
e114217
Add Jinja template support (#11016)
ochafik Jan 21, 2025
d86ffd0
llava : support Minicpm-omni (#11289)
tc-mb Jan 22, 2025
c01b442
`common`: utils to split / join / repeat strings (from json converter…
ochafik Jan 22, 2025
8d944ed
Adding logprobs to /v1/completions (#11344)
jpodivin Jan 22, 2025
f65ef3e
`minja`: sync at https://github.com/google/minja/commit/0f5f7f2b3770e…
ochafik Jan 22, 2025
49cf58f
server : fix draft context not being released (#11354)
slaren Jan 22, 2025
108c0fc
readme : add plugin links (#11355)
ggerganov Jan 22, 2025
d3d8ae3
main : update README documentation for batch size (#11353)
slaren Jan 22, 2025
42b02c9
vulkan: fix diag_mask_inf (#11323)
jeffbolznv Jan 23, 2025
779d3ef
vulkan: sort shaders for more deterministic binary (#11315)
jeffbolznv Jan 23, 2025
d83ccfe
Vulkan-run-test: fix mmq_wg_denoms (#11343)
AMD-dwang Jan 23, 2025
60b0c92
Treat hf.co/ prefix the same as hf:// (#11350)
ericcurtin Jan 23, 2025
7e181b8
server : add more clean up when cancel_tasks is called (#11340)
ngxson Jan 23, 2025
e1bc50b
Add -ngl (#11372)
ericcurtin Jan 23, 2025
261849c
Update documentation (#11373)
ericcurtin Jan 23, 2025
aaa017e
tests: fix some mul_mat test gaps (#11375)
jeffbolznv Jan 23, 2025
199111c
server : (webui) put DeepSeek R1 CoT in a collapsible <details> eleme…
stduhpf Jan 24, 2025
49667f6
Update llama-run README.md (#11386)
ericcurtin Jan 24, 2025
606696e
cmake : avoid -march=native when reproducible build is wanted (#11366)
bmwiedemann Jan 24, 2025
ff4f398
CPU/CUDA: fix (GQA) mul mat back, add CUDA support (#11380)
JohannesGaessler Jan 24, 2025
db9c9cc
docs : Update readme to build targets for local docker build (#11368)
JafarAbdi Jan 24, 2025
9d760e9
release : pack /lib in the packages (#11392)
ggerganov Jan 24, 2025
b56f691
rocBLAS: Avoid fp32->fp16->fp32 conversion on cdna (#11356)
IMbackK Jan 24, 2025
95413b2
CUDA: fix FP16 cuBLAS GEMM (#11396)
JohannesGaessler Jan 24, 2025
6a9aeb2
hip : Add hipGraph and VMM support to ROCM (#11362)
IMbackK Jan 24, 2025
399f530
CANN: Add Ascend CANN build ci (#10217)
xuedinge233 Jan 24, 2025
f528852
ci : fix line breaks on windows builds (#11409)
ggerganov Jan 25, 2025
4892a37
docker : fix CPU ARM build (#11403)
slaren Jan 25, 2025
60e9350
server : fix cleaning up stream task (#11418)
ngxson Jan 25, 2025
61decf8
docker : add GGML_CPU_ARM_ARCH arg to select ARM architecture to buil…
slaren Jan 25, 2025
1755c91
build: add /bigobj to MSVC build (#11407)
jeffbolznv Jan 25, 2025
db99bda
Hip: disable VMM on hip as it seams that it dosent work in some confi…
IMbackK Jan 25, 2025
88d00e3
vulkan: compile shaders on-demand (#11406)
jeffbolznv Jan 25, 2025
91ef385
build: apply MSVC /bigobj option to c/cpp files only (#11423)
jeffbolznv Jan 26, 2025
c17f246
readme : update hot topics
ggerganov Jan 26, 2025
681c8ab
rpc: fix register position (#11424)
thxCode Jan 26, 2025
115672b
cmake: add ggml find package (#11369)
bandoti Jan 26, 2025
d000fe1
docker: add missing vulkan library to base layer and update to 24.04 …
rare-magma Jan 26, 2025
7aa1c11
metal : use residency sets (#11427)
ggerganov Jan 26, 2025
25dbd30
docker : fix ARM build and Vulkan build (#11434)
ngxson Jan 26, 2025
d050cba
metal: Handle null returned from MTLCreateSystemDefaultDevice() (#11441)
Jan 27, 2025
07f8474
llama: refactor llama_decode_impl (#11381)
JohannesGaessler Jan 27, 2025
6d43fd3
llama : minor fixes for up llama load model speed (#11448)
lexasub Jan 27, 2025
9a19ea0
AMD: parse the architecture as supplied by gcnArchName (#11244)
Haus1 Jan 27, 2025
25d07fe
Add new hf protocol for ollama (#11449)
ericcurtin Jan 27, 2025
95d5df0
Handle missing model in CLI parameters for llama-run (#11399)
engelmi Jan 28, 2025
6bb7538
SYCL : SOFTMAX F16 mask support and other fixes (#11261)
qnixsynapse Jan 28, 2025
ecd9266
docker: add perplexity and bench commands to full image (#11438)
rare-magma Jan 28, 2025
2cf5a67
cmake : don't fail on `GGML_CPU=OFF` (#11457)
someone13574 Jan 28, 2025
4e9762c
docker: allow installing pip packages system-wide (#11437)
rare-magma Jan 28, 2025
4ded7c5
Add github protocol pulling and http:// (#11465)
ericcurtin Jan 28, 2025
0fecf56
HIP: Only call rocblas_initialize on rocblas versions with the multip…
sARY77 Jan 28, 2025
0c55745
HIP: Supress transformation warning in softmax.cu
IMbackK Jan 28, 2025
6f71350
ci : fix build CPU arm64 (#11472)
ngxson Jan 28, 2025
6d56592
server : Fixed wrong function name in llamacpp server unit test (#11473)
peidaqi Jan 28, 2025
f9326b9
cmake: add hints for locating ggml on Windows using Llama find-packag…
Emreerdog Jan 28, 2025
29a94ae
llama: fix missing k_cache store for rwkv6qwen2 (#11445)
MollySophia Jan 29, 2025
dd3fc99
embedding : enable --no-warmup option (#11475)
danbev Jan 29, 2025
9f0a464
ggml-cpu : fix ggml_graph_compute_thread did not terminate on abort. …
issixx Jan 17, 2025
5d8d9e9
ggml : add option to not print stack on abort (ggml/1081)
WilliamTambellini Jan 23, 2025
49b473c
sync : ggml
ggerganov Jan 29, 2025
6156441
Parse https://ollama.com/library/ syntax (#11480)
ericcurtin Jan 29, 2025
3228998
vulkan: Catch pipeline creation failure and print an error message (#…
jeffbolznv Jan 29, 2025
c9bc513
server : update auto gen files comments [no ci] (#11484)
danbev Jan 29, 2025
66e995d
vulkan: implement initial support for IQ2 and IQ3 quantizations (#11360)
remyoudompheng Jan 29, 2025
3274221
server : add /apply-template endpoint for additional use cases of Min…
pnb Jan 29, 2025
5b0c80f
server : update json snippets in README.md [no ci] (#11492)
danbev Jan 30, 2025
b7f8ef8
readme : reference examples relative links (#11505)
guspan-tanadi Jan 30, 2025
ceb5bda
server : (docs) added response format for /apply-template [no ci] (#1…
isaac-mcfadyen Jan 30, 2025
c6f62f8
server : use lambda instead of std::bind (#11507)
danbev Jan 30, 2025
e205f34
vocab : correctly identify LF token for GPT-2 style BPE tokenizer (#1…
mgroeber9110 Jan 30, 2025
cd3a67c
sync: minja (#11499)
ochafik Jan 30, 2025
083c041
CUDA/HIP: add warp_size to cuda_device_info
IMbackK Jan 29, 2025
efac32f
HIP: Prepare reduction operators for wave 64
IMbackK Jan 29, 2025
48ece9d
HIP: require at least HIP 5.5
IMbackK Jan 29, 2025
475cf6f
Tool call support (generic + native for Llama, Functionary, Hermes, M…
ochafik Jan 30, 2025
2821281
`ci`: ccache for all github worfklows (#11516)
ochafik Jan 30, 2025
b56116b
server : update help metrics processing/deferred (#11512)
danbev Jan 31, 2025
6732c9f
common: Add missing va_end (#11529)
stevegrubb Jan 31, 2025
7bb3baf
server : fix --jinja when there's no tools or schema (typo was forcin…
ochafik Jan 31, 2025
87cdd2a
Fix chatml fallback for unsupported builtin templates (when --jinja n…
ochafik Jan 31, 2025
0f92342
fix stop regression (#11543)
ochafik Jan 31, 2025
b200d49
`tool-call`: fix llama 3.x and functionary 3.2, play nice w/ pydantic…
ochafik Jan 31, 2025
7a5f7fa
`ci`: use sccache on windows instead of ccache (#11545)
ochafik Jan 31, 2025
0c2584c
ci: simplify cmake build commands (#11548)
ochafik Feb 1, 2025
b959690
Implement s3:// protocol (#11511)
ericcurtin Feb 1, 2025
c31719a
`sync`: minja (https://github.com/google/minja/commit/418a2364b56dc9b…
ochafik Feb 1, 2025
4a62c55
ci: use sccache on windows HIP jobs (#11553)
ochafik Feb 1, 2025
be4e8dd
llama : add support for GLM-Edge and GLM-Edge-V series models (#10573)
piDack Feb 2, 2025
ec7d0d6
sampling : support for llguidance grammars (#10224)
mmoskal Feb 2, 2025
25b2b4a
Fix exotic ci env that lacks ostringstream::str (#11581)
ochafik Feb 2, 2025
3a8d0f6
`tool-call`: support Command R7B (+ return tool_plan "thoughts" in AP…
ochafik Feb 2, 2025
26bf6a4
Name colors (#11573)
ericcurtin Feb 2, 2025
3910c03
CUDA: use mma PTX instructions for FlashAttention (#11583)
JohannesGaessler Feb 2, 2025
f0e2073
nit: more informative crash when grammar sampler fails (#11593)
ochafik Feb 2, 2025
fa058fb
HIP: add GGML_CUDA_CC_IS_* for amd familys as increasing cc archtectu…
IMbackK Feb 2, 2025
5707afb
CUDA/HIP: add support for selectable warp size to mmv (#11519)
IMbackK Feb 2, 2025
b915c51
HIP: fix flash_attn_stream_k_fixup warning (#11604)
JohannesGaessler Feb 2, 2025
b4db35d
server : (webui) Fix Shift+Enter handling (#11609)
mashdragon Feb 3, 2025
9ead527
CUDA: fix Volta FlashAttention logic (#11615)
JohannesGaessler Feb 3, 2025
981cf59
sync : ggml
ggerganov Feb 3, 2025
6f5c477
server : remove CPPHTTPLIB_NO_EXCEPTIONS define (#11622)
danbev Feb 3, 2025
0646894
server : (webui) allow typing and submitting during llm response (#11…
woof-dog Feb 3, 2025
8b1f10a
server : (webui) revert hacky solution from #11626 (#11634)
ngxson Feb 3, 2025
bca40bc
`tool-call`: allow `--chat-template chatml` w/ `--jinja`, default to …
ochafik Feb 3, 2025
9993103
ci : do not stale-close roadmap issues
ggerganov Feb 4, 2025
a3c15e3
cmake: Add ability to pass in GGML_BUILD_NUMBER (ggml/1096)
ckastner Feb 3, 2025
5ad0acf
sync : ggml
ggerganov Feb 4, 2025
bbcacff
authors : update
ggerganov Feb 4, 2025
31ba24f
metal : use residency set for other platforms (#11648)
jhen0409 Feb 4, 2025
10f895e
swift : fix llama-vocab api usage (#11645)
jhen0409 Feb 4, 2025
4698791
readme : add llm_client Rust crate to readme bindings (#11628)
ShelbyJenkins Feb 4, 2025
164b091
`tool-call`: command r7b fix for normal responses (#11608)
ochafik Feb 4, 2025
5232f83
arg : list RPC devices first when using --list-devices (#11655)
rgerganov Feb 4, 2025
42870f1
server : add try..catch to places not covered by set_exception_handle…
ngxson Feb 4, 2025
6539dfd
HIP: force max threads per block to be 1024 (#11621)
fxzjshm Feb 4, 2025
253c3b7
CUDA: non-contiguous (RMS) norm support (#11659)
JohannesGaessler Feb 4, 2025
b1d71c7
`sync`: minja (#11641)
ochafik Feb 5, 2025
a8f9d53
llava: add quantization for the visual projector LLAVA, Qwen2VL (#11644)
samkoesnadi Feb 5, 2025
a429492
CUDA: support for mat. mul. with ne03 != ne13 (#11656)
JohannesGaessler Feb 5, 2025
2064610
metal : adjust support conditions for norm operators (#11671)
ggerganov Feb 5, 2025
79a1137
readme : add link to Autopen under UIs (#11684)
blackhole89 Feb 6, 2025
291de72
metal : avoid breaking build when metal API predates TARGET_OS_VISION…
charles-dyfis-net Feb 6, 2025
0bc4b41
vulkan: use smaller combined allocations to avoid fragmentation (#11551)
jeffbolznv Feb 6, 2025
f5852d3
vulkan: initial support for IQ4_XS quantization (#11501)
remyoudompheng Feb 6, 2025
1fba5cf
vulkan: optimize coopmat2 iq2/iq3 callbacks (#11521)
jeffbolznv Feb 6, 2025
ed40f3a
ggml : fix LoongArch compile error with 128-bit SIMD (#11701)
junchao-loongson Feb 6, 2025
635d815
build : fix llama.pc (#11658)
angt Feb 6, 2025
c77e367
llama : add log about loading model tensors (#11699)
ggerganov Feb 6, 2025
ee19491
SYCL: Adjust support condition for norm operators (#11674)
qnixsynapse Feb 6, 2025
3a169b4
docs: update fedora cuda guide for 12.8 release (#11393)
teihome Feb 6, 2025
de783dc
server : (webui) migrate project to ReactJS with typescript (#11688)
ngxson Feb 6, 2025
9383022
rpc: fix known RCE in rpc-server (ggml/1103)
retr0reg Feb 6, 2025
048f0b2
sync : ggml
ggerganov Feb 6, 2025
a806ef3
llama : fix old glm4 models (#11670)
tv1wnd Feb 6, 2025
f2a6480
ggml : optimize and build warning fix for LoongArch (#11709)
MQ-mengqing Feb 7, 2025
cc712a6
common : add default embeddings presets (#11677)
danbev Feb 7, 2025
cf7f1c1
SYCL: remove XMX info from print devices (#11712)
qnixsynapse Feb 7, 2025
bf14ca7
llama : add llama_sampler_init for safe usage of llama_sampler_free (…
cfillion Feb 7, 2025
718eeea
vulkan: print shared memory size (#11719)
jeffbolznv Feb 7, 2025
379f703
llama : fix progress dots (#11730)
magicse Feb 7, 2025
5935bce
vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729)
cfillion Feb 7, 2025
b9d4ef0
llama : fix defrag logic (#11707)
ggerganov Feb 7, 2025
52e8682
Make logging more verbose (#11714)
ericcurtin Feb 7, 2025
11d2abd
server : (webui) fix numeric settings being saved as string (#11739)
ngxson Feb 8, 2025
28455fe
readme : update front-end framework (#11753)
pothitos Feb 8, 2025
aabc1da
CUDA: fix min. version for movmatrix (#11751)
JohannesGaessler Feb 8, 2025
d746ee0
ggml: Fix data race in ggml threadpool (#11736)
kkontny Feb 8, 2025
2ec90cb
cont : fix mmap flag print (#11699)
ggerganov Feb 8, 2025
648a7f3
server : minor log updates (#11760)
ggerganov Feb 8, 2025
f712b05
server : (webui) increase edit textarea size (#11763)
woof-dog Feb 8, 2025
fe3cf95
server : (webui) revamp Settings dialog, add Pyodide interpreter (#11…
ngxson Feb 8, 2025
cbd23ea
vulkan: account for lookup tables when checking shared memory size (#…
jeffbolznv Feb 9, 2025
1b432c5
There's a better way of clearing lines (#11756)
ericcurtin Feb 9, 2025
a51d625
vulkan: add environment variable GGML_VK_PREFER_HOST_MEMORY to avoid …
wbruna Feb 10, 2025
a19a443
vulkan: Make Vulkan optional at runtime (#11493). (#11494)
daym Feb 10, 2025
a822bab
Update README.md [no ci] (#11781)
pascal-lc Feb 10, 2025
c8ad386
sync: minja (https://github.com/google/minja/commit/a72057e5190de2c61…
ochafik Feb 10, 2025
b9fa290
server : correct signal handler (#11795)
ngxson Feb 10, 2025
9a046b7
llama-mmap: fix missing include (#11796)
wgottwalt Feb 10, 2025
77508dc
server : (webui) introduce conversation branching + idb storage (#11792)
ngxson Feb 10, 2025
86d0ede
docs: utilize the forward slash (/) as the path separator for Unix-li…
MambaWong Feb 10, 2025
ad3d7ba
fix: typos in documentation files (#11791)
maximevtush Feb 10, 2025
505301c
CUDA: use arch list for compatibility check (#11775)
JohannesGaessler Feb 10, 2025
cf1c48e
server : use common_token_to_piece instead of common_detokenize (#11740)
danbev Feb 11, 2025
29559ee
Fix #11802: Compile bug - RegQueryValueExA changed to RegQueryValueEx…
sheldonrobinson Feb 11, 2025
a9f3977
docs: add OpenCL (#11697)
lhez Feb 11, 2025
f587a54
llama : fix typo in llama-grammar.h [no ci] (#11816)
danbev Feb 12, 2025
a9d8dfe
CUDA: fix CUDART_VERSION checks (#11821)
JohannesGaessler Feb 12, 2025
b036636
ggml-cpu: Fix duplicate MATMUL_INT8 (#11817)
ownia Feb 12, 2025
7f412bc
ggml : fix multi-threaded clamp_f32 (#11824)
Burton2000 Feb 12, 2025
71fc726
cleanup: fix compile warnings associated with gnu_printf (#11811)
bandoti Feb 12, 2025
a20cf2a
HIP: Switch to std::vector in rocblas version check (#11820)
IMbackK Feb 12, 2025
e464a0d
sync : ggml
ggerganov Feb 12, 2025
91f4699
Fix: Compile failure due to Microsoft STL breaking change (#11836)
MrSMlT Feb 12, 2025
9e84838
HIP: Remove GCN from list of devices that avoid MMQ (#11831)
IMbackK Feb 12, 2025
af746ea
server : (webui) Give copy button back to all message bubbles (#11814)
woof-dog Feb 12, 2025
0d927c3
ggml : x2 speed for WASM by optimizing SIMD (#11453)
ngxson Feb 12, 2025
0fed52c
ggml-cpu : add chunking support to mul_mat_id (#11666)
slaren Feb 13, 2025
61f6c1f
llama : update llama_decode_internal ref [no ci] (#11840)
danbev Feb 13, 2025
fed584c
llama.cpp: fix warning message (#11839)
okuvshynov Feb 13, 2025
943fb01
sampling: add Top-nσ sampler (#11223)
VJHack Feb 13, 2025
b8f7379
`server`: fix tool-call of DeepSeek R1 Qwen, return reasoning_content…
ochafik Feb 13, 2025
e54c613
musa: bump MUSA SDK version to rc3.1.1 (#11822)
yeahdongcn Feb 13, 2025
9e8a353
llama : add --completion-bash option (#11846)
danbev Feb 13, 2025
323aaad
server : (docs) Update wrong tool calling example (#11809)
RezaRahemtola Feb 13, 2025
a32b415
llamafile: use member variable instead of constant for iq4nlt (#11780)
jmorganca Feb 13, 2025
af928ae
readme : minor
ggerganov Feb 13, 2025
8ab2037
llama-bench : fix unexpected global variable initialize sequence issu…
theraininsky Feb 14, 2025
6b7ffed
vulkan: linux builds + small subgroup size fixes (#11767)
netrunnereve Feb 14, 2025
899fbde
ggml: optimize some vec dot functions for LoongArch ASX (#11842)
MQ-mengqing Feb 14, 2025
864a125
llama : add completion for --chat-template-file (#11860)
danbev Feb 14, 2025
6398451
docker : drop to CUDA 12.4 (#11869)
ggerganov Feb 14, 2025
ba699f3
cuda : add ampere to the list of default architectures (#11870)
slaren Feb 14, 2025
4488428
opencl: Fix rope and softmax (#11833)
lhez Feb 14, 2025
05c7c60
llguidance build fixes for Windows (#11664)
mmoskal Feb 14, 2025
de1e3ca
vulkan: initial support for IQ1_S and IQ1_M quantizations (#11528)
remyoudompheng Feb 15, 2025
d502583
server: fix type promotion typo causing crashes w/ --jinja w/o tools …
ochafik Feb 15, 2025
edab2bd
repo : update links to new url (#11886)
ggerganov Feb 15, 2025
4fdf52b
readme : add notice about new package registry (#11890)
ggerganov Feb 15, 2025
66225e1
metal : optimize dequant q6_K kernel (#11892)
akretz Feb 15, 2025
72b3caf
examples: fix typo in imatrix/README.md (#11884)
708-145 Feb 15, 2025
cf220f8
scripts: fix compare-llama-bench commit hash logic (#11891)
JohannesGaessler Feb 15, 2025
b5ecc39
metal : fix the crash caused by the lack of residency set support on …
halechan Feb 16, 2025
8420ef4
vulkan: support multi/vision rope, and noncontiguous rope (#11902)
jeffbolznv Feb 16, 2025
fbd066c
ci : fix (again) arm64 build fails (#11895)
ngxson Feb 16, 2025
00772cd
common : Fix a typo in help (#11899)
standby24x7 Feb 16, 2025
1cd1f04
server : bump httplib to 0.19.0 (#11908)
ngxson Feb 16, 2025
f802799
vulkan: implement several ops relevant for ggml_opt (#11769)
remyoudompheng Feb 17, 2025
09420fe
server : fix divide-by-zero in metrics reporting (#11915)
aviallon Feb 17, 2025
7135cd9
update release requirements (#11897)
netrunnereve Feb 17, 2025
94c9e1d
CUDA: use async data loading for FlashAttention (#11894)
JohannesGaessler Feb 17, 2025
bc2dbff
docs : Fix duplicated file extension in test command (#11935)
xiaobing318 Feb 18, 2025
49c7a16
scripts: corrected encoding when getting chat template (#11866) (#11907)
MoonRide303 Feb 18, 2025
5379399
server : add TEI API format for /rerank endpoint (#11942)
ngxson Feb 18, 2025
862fb4f
tool-call: refactor common chat / tool-call api (+ tests / fixes) (#1…
ochafik Feb 18, 2025
b380791
server : (webui) Enable communication with parent html (if webui is i…
igardev Feb 18, 2025
dc93cb8
llama : fix indentation in llama-grammar [no ci] (#11943)
danbev Feb 19, 2025
d00a1ba
speculative : update default params (#11954)
ggerganov Feb 19, 2025
0f55fb1
common : add llama.vim preset for Qwen2.5 Coder (#11945)
danbev Feb 19, 2025
37c3a25
doc: add links to ggml examples [no ci] (#11958)
JohannesGaessler Feb 19, 2025
dc57856
run : add --chat-template-file (#11961)
engelmi Feb 20, 2025
50a7a06
ggml: aarch64: implement SVE kernels for q3_K_q8_K vector dot (#11917)
Vithulep Feb 20, 2025
5f0377e
ggml-cpu: Add CPU backend support for KleidiAI library (#11390)
chaxu01 Feb 20, 2025
9ee29d2
server (webui): Fix Premature Submission During IME Conversion (#11971)
mmngays Feb 20, 2025
c81507b
clip : fix visual encoders with no CLS (#11982)
alex-jw-brooks Feb 21, 2025
5cc0d24
MUSA: support ARM64 and enable dp4a .etc (#11843)
BodhiHu Feb 21, 2025
19b54ba
CUDA: correct the lowest Maxwell supported by CUDA 12 (#11984)
PureJourney Feb 21, 2025
f590441
doc: update contributing guidelines [no ci] (#11969)
JohannesGaessler Feb 21, 2025
822e451
llama : skip loading unused tensors (#12004)
ggerganov Feb 21, 2025
01f573a
llama.swiftui : add "Done" dismiss button to help view (#11998)
danbev Feb 22, 2025
6438b72
cuda: Add Q5_1, Q5_0, Q4_1 and Q4_0 to F32 conversion support. (#12000)
gcp Feb 22, 2025
6a05c4c
server : disable Nagle's algorithm (#12020)
ggerganov Feb 22, 2025
0b16703
ci : Build on Github-hosted arm64 runners (#12009)
Rohanjames1997 Feb 22, 2025
cded583
CUDA: optimize FA for GQA + large batches (#12014)
JohannesGaessler Feb 22, 2025
732014b
ci : fix arm upload artifacts (#12024)
ggerganov Feb 22, 2025
00179de
llava: build clip image from pixels (#11999)
tinglou Feb 22, 2025
4369168
CUDA: app option to compile without FlashAttention (#12025)
JohannesGaessler Feb 22, 2025
844e42c
ggml-cpu: Support s390x SIMD Instruction Set (#12019)
taronaeo Feb 22, 2025
edc4851
Some llama-run cleanups (#11973)
ericcurtin Feb 23, 2025
2c6f90d
run: allow to customize prompt by env var LLAMA_PROMPT_PREFIX (#12041)
benoitf Feb 23, 2025
7fecf7f
SYCL: Fix GGML_SYCL_DEBUG macro (#11995)
qnixsynapse Feb 24, 2025
89b48a8
gguf_convert_endian.py: implement byteswapping for q4_k and q6_k (#11…
AlekseiNikiforovIBM Feb 24, 2025
420e3b3
[SYCL] Optimize mul_mat for Q4_0 on Intel GPU (#12035)
NeoZhangJianyu Feb 24, 2025
4ccc7f1
llava : Add Granite Vision Support (#11794)
alex-jw-brooks Feb 24, 2025
6d4dab5
opencl: fix for small models (#11950)
lhez Feb 24, 2025
7ba151d
add new line at end of file
NeoZhangJianyu Feb 26, 2025
92 changes: 92 additions & 0 deletions .devops/cpu.Dockerfile
@@ -0,0 +1,92 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

ARG TARGETARCH

ARG GGML_CPU_ARM_ARCH=armv8-a

RUN apt-get update && \
apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "$TARGETARCH" = "amd64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \
fi && \
cmake --build build -j $(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
94 changes: 94 additions & 0 deletions .devops/cuda.Dockerfile
@@ -0,0 +1,94 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.4.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

WORKDIR /app

COPY . .

RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base

RUN apt-get update \
&& apt-get install -y libgomp1 curl\
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete


ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
33 changes: 0 additions & 33 deletions .devops/full-cuda.Dockerfile

This file was deleted.

33 changes: 0 additions & 33 deletions .devops/full-musa.Dockerfile

This file was deleted.

50 changes: 0 additions & 50 deletions .devops/full-rocm.Dockerfile

This file was deleted.

38 changes: 0 additions & 38 deletions .devops/full.Dockerfile

This file was deleted.
