Skip to content
6 changes: 5 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ add_compile_definitions(GGML_MAX_NAME=128)
# CUDA architectures: cover Turing to Blackwell for distributed binaries.
# Users can override with -DCMAKE_CUDA_ARCHITECTURES=native for local builds.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    # The 120a/121a (Blackwell) entries are only accepted by newer nvcc;
    # hence the version gate below. Older toolkits fall back to the
    # Turing..Ada list so configuration does not fail.
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
        set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;120a-real;121a-real")
    else()
        set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real")
    endif()
endif()

# ggml as subdirectory, inherits GGML_CUDA, GGML_METAL, etc. from cmake flags
Expand Down
188 changes: 163 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# acestep.cpp

Portable C++17 implementation of ACE-Step 1.5 music generation using GGML.
Text + lyrics in, stereo 48kHz WAV out. Runs on CPU, CUDA, Metal, Vulkan.
Text + lyrics in, stereo 48kHz WAV out. Runs on CPU, CUDA, ROCm, Metal, Vulkan.

## Build

Expand All @@ -16,6 +16,9 @@ cmake ..
# Linux with NVIDIA GPU
cmake .. -DGGML_CUDA=ON

# Linux with AMD GPU (ROCm)
cmake .. -DGGML_HIP=ON

# Linux with Vulkan
cmake .. -DGGML_VULKAN=ON

Expand Down Expand Up @@ -94,13 +97,13 @@ EOF
# LLM: request.json -> request0.json (enriched with lyrics + codes)
./build/ace-qwen3 \
--request /tmp/request.json \
--model models/acestep-5Hz-lm-4B-BF16.gguf
--model models/acestep-5Hz-lm-4B-Q8_0.gguf

# DiT+VAE: request0.json -> request00.wav
./build/dit-vae \
--request /tmp/request0.json \
--text-encoder models/Qwen3-Embedding-0.6B-BF16.gguf \
--dit models/acestep-v15-turbo-BF16.gguf \
--text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
--dit models/acestep-v15-turbo-Q8_0.gguf \
--vae models/vae-BF16.gguf
```

Expand All @@ -111,16 +114,16 @@ Generate multiple songs at once with `--batch`:
# -> request0.json, request1.json (different lyrics/codes, seeds auto+0, auto+1)
./build/ace-qwen3 \
--request /tmp/request.json \
--model models/acestep-5Hz-lm-4B-BF16.gguf \
--model models/acestep-5Hz-lm-4B-Q8_0.gguf \
--batch 2

# DiT+VAE: (2 DiT variations of LM output 1 and 2)
# -> request0.json -> request00.wav, request01.wav
# -> request1.json -> request10.wav, request11.wav
./build/dit-vae \
--request /tmp/request0.json /tmp/request1.json \
--text-encoder models/Qwen3-Embedding-0.6B-BF16.gguf \
--dit models/acestep-v15-turbo-BF16.gguf \
--text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
--dit models/acestep-v15-turbo-Q8_0.gguf \
--vae models/vae-BF16.gguf \
--batch 2
```
Expand Down Expand Up @@ -151,34 +154,43 @@ Empty field = "fill it". Filled = "don't touch".
All modes always output numbered files (`request0.json` .. `requestN-1.json`).
The input JSON is never modified.

**Caption only**: the LLM generates lyrics, metadata (bpm, key, time
signature, duration) and audio codes. With `--batch N`, each element
generates its own lyrics and metadata from a different seed, producing
N completely different songs. See `examples/simple.json`.
**Caption only** (`lyrics=""`): two LLM passes. Phase 1 uses the "Expand"
prompt to generate lyrics and metadata (bpm, keyscale, timesignature,
duration) via CoT. Phase 2 reinjects the CoT and generates audio codes using
the "Generate tokens" prompt. CFG is forced to 1.0 in phase 1 (free
sampling); `lm_cfg_scale` only applies in phase 2. With `--batch N`, each
element runs its own phase 1 from a different seed, producing N completely
different songs. See `examples/simple.json`.

**Caption + lyrics (+ optional metadata)**: the LLM fills missing
metadata via CoT, then generates audio codes. User provided fields
are preserved. See `examples/partial.json`.
**Caption + lyrics (+ optional metadata)**: single LLM pass. The "Generate
tokens" prompt is used directly. Missing metadata is filled via CoT, then
audio codes are generated. User-provided fields are never overwritten.
`lm_cfg_scale` applies to both CoT and code generation. See
`examples/partial.json`.

**Everything provided** (caption, lyrics, bpm, duration, keyscale,
timesignature): the LLM skips CoT and generates audio codes directly.
With `--batch N`, all elements share the same prompt (single prefill,
KV cache copied). See `examples/full.json`.

**Instrumental** (`lyrics="[Instrumental]"`): treated as "lyrics provided",
so the single-pass "Generate tokens" path is used. No lyrics generation.
The DiT was trained with this exact string as the no-vocal condition.

**Passthrough** (`audio_codes` present): LLM is skipped entirely.
Run `dit-vae` to decode existing codes. See `examples/dit-only.json`.

## Request JSON reference

All fields with defaults. Only `caption` is required.
Only `caption` is required. Every other field defaults to "unset"; an unset
field is either filled in by the LLM or given a sensible runtime default.

```json
{
"caption": "",
"lyrics": "",
"instrumental": false,
"bpm": 0,
"duration": -1,
"duration": 0,
"keyscale": "",
"timesignature": "",
"vocal_language": "unknown",
Expand All @@ -190,18 +202,98 @@ All fields with defaults. Only `caption` is required.
"lm_negative_prompt": "",
"audio_codes": "",
"inference_steps": 8,
"guidance_scale": 7.0,
"guidance_scale": 0.0,
"shift": 3.0
}
```

Key fields: `seed` -1 means random (resolved once, then +1 per batch
element). `audio_codes` is generated by ace-qwen3 and consumed by
dit-vae (comma separated FSQ token IDs). When present, the LLM is
skipped entirely.
### Text conditioning (ace-qwen3 + dit-vae)

**`caption`** (string, required)
Natural language description of the music style, mood, instruments, etc.
Fed to both the LLM and the DiT text encoder.

**`lyrics`** (string, default `""`)
Controls vocal generation. Three valid states:
- `""`: LLM generates lyrics from the caption (phase 1 "Expand" prompt).
- `"[Instrumental]"`: no vocals. Passed directly to the DiT, LLM skips lyrics generation.
- Any other string: user-provided lyrics used as-is, LLM only fills missing metadata.

There is no `instrumental` flag. This field is the single source of truth for
vocal content.

### Metadata (LLM-filled if unset)

**`bpm`** (int, default `0` = unset)
Beats per minute. LLM generates one if 0.

**`duration`** (float seconds, default `0` = unset)
Target audio duration. `0` means the LLM picks it; any positive value is
taken literally as seconds (so `1` means one second). Clamped to [1, 600]s
after generation.

**`keyscale`** (string, default `""` = unset)
Musical key and scale, e.g. `"C major"`, `"F# minor"`. LLM fills if empty.

**`timesignature`** (string, default `""` = unset)
Time signature numerator as a string, e.g. `"4"` for 4/4, `"3"` for 3/4.
LLM fills if empty.

**`vocal_language`** (string, default `"unknown"`)
BCP-47 language code for lyrics, e.g. `"en"`, `"fr"`, `"ja"`. When set and
lyrics are being generated, the FSM constrains the LLM output to that language.
`"unknown"` lets the LLM decide.

### Generation control

**`seed`** (int64, default `-1` = random)
RNG seed. Resolved once at startup to a random value if -1. Batch elements
use `seed+0`, `seed+1`, ... `seed+N-1`.

**`audio_codes`** (string, default `""`)
Comma-separated FSQ token IDs produced by ace-qwen3. When non-empty, the
entire LLM pass is skipped and dit-vae decodes these codes directly
(passthrough / cover mode).

### LM sampling (ace-qwen3)

**`lm_temperature`** (float, default `0.85`)
Sampling temperature for both phase 1 (lyrics/metadata) and phase 2 (audio
codes). Lower = more deterministic.

**`lm_cfg_scale`** (float, default `2.0`)
Classifier-Free Guidance scale for the LM. Only active in phase 2 (audio
code generation) and in phase 1 when lyrics are already provided. When
`lyrics` is empty, phase 1 always runs with `cfg=1.0` (free sampling).
`1.0` disables CFG.

**`lm_top_p`** (float, default `0.9`)
Nucleus sampling cutoff. `1.0` disables. When `top_k=0`, an internal
pre-filter of 256 tokens is applied before top_p for performance.

Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG).
SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`.
**`lm_top_k`** (int, default `0` = disabled)
Top-K sampling. `0` disables hard top-K (top_p still applies).

**`lm_negative_prompt`** (string, default `""`)
Negative caption for CFG in phase 2. Empty string falls back to a
caption-less unconditional prompt.

### DiT flow matching (dit-vae)

**`inference_steps`** (int, default `8`)
Number of diffusion denoising steps. Turbo preset: `8`. SFT preset: `50`.

**`guidance_scale`** (float, default `0.0` = auto)
CFG scale for the DiT. `0.0` is resolved at runtime:
- Turbo models: forced to `1.0` (CFG disabled, turbo was trained without it).
- SFT/base models: `7.0`.
Any value > 1.0 on a turbo model is overridden to 1.0 with a warning.

**`shift`** (float, default `3.0`)
Flow-matching schedule shift. Controls the timestep distribution.
`shift = s*t / (1 + (s-1)*t)`. Turbo preset: `3.0`. SFT preset: `6.0`.

Turbo preset: `inference_steps=8, shift=3.0` (guidance_scale auto-resolved to 1.0).
SFT preset: `inference_steps=50, guidance_scale=7.0, shift=6.0`.

## ace-qwen3 reference

Expand Down Expand Up @@ -278,6 +370,39 @@ dit-vae
WAV stereo 48kHz
```

## Roadmap

This project started from a simple idea: a Telegram bot using llama.cpp to
prompt a music generator, and the desire to make GGML sing. No more, no less.
No cloud, no black box, scriptable and nothing between you and the model.

### LLM modes
- [ ] Remaining modes: Understand, Rewrite (single-pass, no audio codes)
- [ ] Reference audio input: repaint and cover tasks (src_audio + cover_strength)

### Audio I/O
Current: raw PCM f32 WAV via hand-rolled writer, no external deps.
Trade-off to document:
- **Keep as-is**: zero dependencies, clean licensing, works everywhere
- **ffmpeg pipe**: trivial bash wrapper handles any codec/format, no C++ codec hell
- pro: MP3/FLAC/OGG out of the box, input resampling for reference audio
- con: runtime dependency, not embedded
Conclusion pending. Likely ffmpeg as optional external pipe, documented in README.

### API and interface
- [ ] JSON HTTP server (minimal, well-documented, stable contract)
- [ ] Web interface on top - vibecodeable by anyone, API stays simple
Goal: document the internals and how the model actually works,
not reproduce the Python spaghetti. Expert-first, no commercial fluff.

### Documentation
Current README is technical study + API reference, intentional.
- [ ] Split when a user-facing interface exists: README (user) + ARCHITECTURE.md (internals)

### Future models
- [ ] ACE-Step 2.0: evaluate architecture delta, add headers/weights as needed
No commitment; the codebase is easy to adapt by adding new headers or compilation units.

## LM specifics

ace-qwen3 is not a general-purpose chat engine. It is a two-phase autoregressive
Expand Down Expand Up @@ -318,7 +443,7 @@ python3 debug-dit-cossim.py # DiT: per-layer cossim GGML vs Python (turbo/
## Patched GGML fork

Uses a patched GGML fork (submodule) with two new ops, a Metal im2col optimization, and
a CUDA bugfix for the Oobleck VAE decoder. All backends: CPU, CUDA, Metal, Vulkan.
a CUDA bugfix for the Oobleck VAE decoder. All backends: CPU, CUDA, ROCm, Metal, Vulkan.
F32/F16/BF16 data types. The DiT uses only standard GGML ops and needs no patches.

The VAE reconstructs audio from latent space through 5 upsampling blocks (total 1920x),
Expand Down Expand Up @@ -373,6 +498,19 @@ Upstream `im2col_kernel` uses OW directly as grid dimension Y, which exceeds the
times per tile at output widths up to 491520. Fixed with a grid-stride loop on OW and
`MIN(OW, MAX_GRIDDIM_Z)` clamping.

### Upstream divergence

The GGML submodule diverges from upstream only by the addition of
`GGML_OP_SNAKE` and `GGML_OP_COL2IM_1D`. No existing upstream kernel is
modified. These ops are required; the VAE does not work without them.

An earlier approach patched the upstream naive ops instead of adding custom
ones. Those patches were dropped. They are documented here in case someone
wants to study the naive path:

- `conv_transpose_1d`: bounded loop replacing O(T_in) brute-force, CUDA and Metal
- `im2col`: grid-stride loop on OW to fix gridDim.y overflow for large tensors

## Acknowledgements

Independent implementation based on ACE-Step 1.5 by ACE Studio and StepFun.
Expand Down
16 changes: 15 additions & 1 deletion src/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
extern "C" int cudaDeviceGetAttribute(int *, int, int);
#endif
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <thread>

Expand Down Expand Up @@ -41,6 +42,10 @@ static BackendPair backend_init(const char * label) {
ggml_backend_load_all();
BackendPair bp = {};
bp.backend = ggml_backend_init_best();
if (!bp.backend) {
fprintf(stderr, "[Load] FATAL: no backend available\n");
exit(1);
}
int n_threads = (int)std::thread::hardware_concurrency() / 2;
if (n_threads < 1) n_threads = 1;
// [GGML] If best backend is already CPU, reuse it (avoid 2 CPU instances
Expand All @@ -51,6 +56,10 @@ static BackendPair backend_init(const char * label) {
ggml_backend_cpu_set_n_threads(bp.backend, n_threads);
} else {
bp.cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
if (!bp.cpu_backend) {
fprintf(stderr, "[Load] FATAL: failed to init CPU backend\n");
exit(1);
}
ggml_backend_cpu_set_n_threads(bp.cpu_backend, n_threads);
}
fprintf(stderr, "[Load] %s backend: %s (CPU threads: %d)\n",
Expand Down Expand Up @@ -87,5 +96,10 @@ static void backend_release(ggml_backend_t backend, ggml_backend_t cpu_backend)
// Create a ggml graph scheduler over the pair's primary backend plus its
// CPU backend. When both members of the pair are the same backend (the
// CPU-only case), only a single backend is registered with the scheduler.
// On allocation failure this logs and exits, so callers never receive a
// NULL scheduler — consistent with the FATAL handling in backend_init.
static ggml_backend_sched_t backend_sched_new(BackendPair bp, int max_nodes) {
    ggml_backend_t backends[2] = { bp.backend, bp.cpu_backend };
    int n = (bp.backend == bp.cpu_backend) ? 1 : 2;
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, n, max_nodes, false, true);
    if (!sched) {
        fprintf(stderr, "[Load] FATAL: failed to create scheduler\n");
        exit(1);
    }
    return sched;
}
5 changes: 4 additions & 1 deletion src/cond-enc.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,10 @@ static void cond_ggml_forward(CondGGML * m,
if (timbre_out) ggml_build_forward_expand(gf, timbre_out);

// Allocate and set inputs
ggml_backend_sched_alloc_graph(m->sched, gf);
if (!ggml_backend_sched_alloc_graph(m->sched, gf)) {
fprintf(stderr, "[CondEncoder] FATAL: failed to allocate graph\n");
exit(1);
}

ggml_backend_tensor_set(t_lyric_in, lyric_embed, 0, 1024 * S_lyric * sizeof(float));
ggml_backend_tensor_set(t_text_in, text_hidden, 0, 1024 * S_text * sizeof(float));
Expand Down
2 changes: 0 additions & 2 deletions src/debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
#include <cstdio>
#include <cstdint>
#include <cmath>
#include <cstring>
#include <string>
#include <vector>

struct DebugDumper {
Expand Down
3 changes: 0 additions & 3 deletions src/dit-graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,7 @@

#include "dit.h"

#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <vector>

// Helper: ensure tensor is f32 (cast if bf16/f16)
static struct ggml_tensor * dit_ggml_f32(
Expand Down
4 changes: 0 additions & 4 deletions src/dit-sampler.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,8 @@
#include "dit-graph.h"
#include "debug.h"

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-alloc.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <vector>
Expand Down
Loading