Commit eda6990

Merge remote-tracking branch 'ggerganov/master'
* ggerganov/master: (86 commits)
  server : fix building and simplify lib deps on Windows (ggerganov#1772)
  talk-llama : sync llama.cpp
  talk-llama : llama.cpp
  sync : ggml
  metal : correctly set SIMD support flags on iOS (llama/4923)
  2-bit quantizations (llama/4897)
  scripts : sync-ggml-am.sh add option to skip commits
  talk-llama : sync llama.cpp
  sync : ggml
  examples : adapt to metal API
  ggml: cache sin/cos for RoPE (llama/4908)
  metal : remove old API (llama/4919)
  metal : disable log for loaded kernels (llama/4794)
  gguf : fix potential infinite for-loop (llama/4600)
  metal : refactor kernel loading code (llama/4794)
  CUDA: faster q8_0 -> f16 dequantization (llama/4895)
  talk-llama : add optional CLI arg to set the bot name (ggerganov#1764)
  examples : add python example for transcription (ggerganov#1744)
  whisper : load the model into multiple buffers of max size 1GB (ggerganov#1763)
  talk-llama : sync llama.cpp
  ...
bygreencn committed Jan 16, 2024
2 parents b87e0b8 + f5f159c commit eda6990
Showing 56 changed files with 9,076 additions and 4,642 deletions.
4 changes: 4 additions & 0 deletions .devops/main-cuda.Dockerfile
@@ -20,6 +20,10 @@ RUN apt-get update && \
apt-get install -y build-essential \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

+# Ref: https://stackoverflow.com/a/53464012
+ENV CUDA_MAIN_VERSION=12.3
+ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH

COPY . .
RUN make
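
The compat directory carries NVIDIA's forward-compatibility libraries, so a container built against CUDA 12.3 can run on a host whose installed driver is older (see the Ref link above). A minimal sketch that prints the gap being bridged, assuming a CUDA toolchain:

    #include <cstdio>
    #include <cuda_runtime_api.h>

    int main() {
        int driver = 0, runtime = 0;
        cudaDriverGetVersion(&driver);   // highest CUDA version the host driver supports
        cudaRuntimeGetVersion(&runtime); // CUDA runtime this binary was built against
        // The compat libraries on LD_LIBRARY_PATH are what make runtime > driver workable.
        std::printf("driver: CUDA %d.%d, runtime: CUDA %d.%d\n",
                    driver / 1000, (driver % 1000) / 10,
                    runtime / 1000, (runtime % 1000) / 10);
        return 0;
    }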

22 changes: 21 additions & 1 deletion .github/workflows/build.yml
@@ -223,9 +223,12 @@ jobs:
- arch: Win32
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
s2arc: x86
+clblast: OFF
- arch: x64
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
s2arc: x64
+clblast: ON
+clver: 1.6.1
- sdl2: ON
s2ver: 2.28.5

@@ -252,13 +255,26 @@
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
+- name: Install OpenCL
+if: matrix.clblast == 'ON'
+run: vcpkg.exe --triplet=${{ matrix.arch }}-windows install opencl

+- name: Fetch CLBlast and set CLBlast_DIR
+if: matrix.clblast == 'ON'
+run: |
+C:/msys64/usr/bin/wget.exe -qO clblast.zip https://github.com/CNugteren/CLBlast/releases/download/${{ matrix.clver }}/CLBlast-${{ matrix.clver }}-windows-x64.zip
+7z x clblast.zip
+7z x CLBlast-${{ matrix.clver }}-windows-x64.7z
+echo "CLBlast_DIR=$env:GITHUB_WORKSPACE/CLBlast-${{ matrix.clver }}-windows-x64/lib/cmake/CLBlast" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_OPENBLAS=${{ matrix.blas }}
-DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
-DWHISPER_SDL2=${{ matrix.sdl2 }}
+-DWHISPER_CLBLAST=${{ matrix.clblast }}
- name: Build
run: |
@@ -273,11 +289,15 @@
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

+- name: Copy clblast.dll
+if: matrix.clblast == 'ON'
+run: copy "$env:CLBlast_DIR/../../clblast.dll" build/bin/${{ matrix.build }}

- name: Upload binaries
if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
uses: actions/upload-artifact@v1
with:
-name: whisper-blas-bin-${{ matrix.arch }}
+name: whisper-blas${{ matrix.clblast == 'ON' && '-clblast' || ''}}-bin-${{ matrix.arch }}
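# (note: 'cond && a || b' is the Actions-expression substitute for a ternary: '-clblast' when matrix.clblast is ON, empty otherwise)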
path: build/bin/${{ matrix.build }}

windows-cublas:
14 changes: 10 additions & 4 deletions CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.5)

-project(whisper.cpp VERSION 1.5.2)
+project(whisper.cpp VERSION 1.5.4)

# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -223,11 +223,17 @@ if (WHISPER_CUBLAS)
add_compile_definitions(GGML_USE_CUBLAS)

if (WHISPER_STATIC)
-set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+if (WIN32)
+# As of 12.3.1 the CUDA Toolkit for Windows does not offer a static cublas library
+set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+else ()
+set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+endif()
else()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

+set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
else()
message(FATAL_ERROR "cuBLAS not found")
endif()
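
CUDA::cuda_driver links libcuda (the driver API) alongside the runtime libraries, mirroring the -lcuda added to the Makefile below, presumably because the synced ggml-cuda code now uses driver-API entry points. A minimal sketch of such a call, which would fail to link without libcuda:

    #include <cstdio>
    #include <cuda.h> // driver API; lives in libcuda / nvcuda, not in cudart

    int main() {
        // cuInit and cuDeviceGetCount resolve against the driver library
        // (CUDA::cuda_driver in CMake, -lcuda in the Makefile).
        if (cuInit(0) != CUDA_SUCCESS) {
            std::fprintf(stderr, "no usable CUDA driver\n");
            return 1;
        }
        int ndev = 0;
        cuDeviceGetCount(&ndev);
        std::printf("%d CUDA device(s)\n", ndev);
        return 0;
    }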
@@ -343,8 +349,8 @@ else()
endif()
else()
if (EMSCRIPTEN)
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -s TOTAL_STACK=5242880")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
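# (note: 5242880 bytes = 5 MiB; newer Emscripten releases default to a much smaller stack, which threaded builds can overflow)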
else()
if(NOT WHISPER_NO_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
14 changes: 12 additions & 2 deletions Makefile
@@ -99,6 +99,16 @@ ifeq ($(filter $(UNAME_S),Linux Darwin DragonFly FreeBSD NetBSD OpenBSD Haiku),$
CXXFLAGS += -pthread
endif

+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+_WIN32 := 1
+endif
+
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+LWINSOCK2 := -lws2_32
+endif

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
@@ -206,7 +216,7 @@ ifdef WHISPER_CUBLAS

CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
+LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
WHISPER_OBJ += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
@@ -360,7 +370,7 @@ quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)

server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
-$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS)
+$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS) $(LWINSOCK2)

stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
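
The $(LWINSOCK2) flag on the server target matters because the server example uses sockets, and on Windows any socket call must be preceded by Winsock initialization from ws2_32. A minimal self-contained sketch of that requirement:

    #include <cstdio>
    #ifdef _WIN32
    #include <winsock2.h> // import library ws2_32 (the -lws2_32 added above)
    #endif

    int main() {
    #ifdef _WIN32
        WSADATA wsa;
        // Every Winsock program must call WSAStartup before any socket API.
        if (WSAStartup(MAKEWORD(2, 2), &wsa) != 0) {
            std::fprintf(stderr, "WSAStartup failed\n");
            return 1;
        }
    #endif
        // ... create sockets / serve HTTP here ...
    #ifdef _WIN32
        WSACleanup();
    #endif
        return 0;
    }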
10 changes: 4 additions & 6 deletions Package.swift
@@ -13,9 +13,13 @@ let package = Package(
products: [
.library(name: "whisper", targets: ["whisper"]),
],
+dependencies: [
+.package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
+],
targets: [
.target(
name: "whisper",
dependencies: ["ggml"],
path: ".",
exclude: [
"bindings",
@@ -32,14 +36,8 @@
"Makefile"
],
sources: [
"ggml.c",
"whisper.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m"
],
-resources: [.process("ggml-metal.metal")],
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
2 changes: 1 addition & 1 deletion README.md
@@ -6,7 +6,7 @@
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.5.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.2) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.5.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

6 changes: 6 additions & 0 deletions bindings/go/params.go
@@ -123,6 +123,11 @@ func (p *Params) SetAudioCtx(n int) {
p.audio_ctx = C.int(n)
}

+// Set initial prompt
+func (p *Params) SetInitialPrompt(prompt string) {
+p.initial_prompt = C.CString(prompt)
+}

///////////////////////////////////////////////////////////////////////////////
// PRIVATE METHODS

@@ -147,6 +152,7 @@ func (p *Params) String() string {
str += fmt.Sprintf(" offset_ms=%d", p.offset_ms)
str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
if p.translate {
str += " translate"
}
5 changes: 5 additions & 0 deletions bindings/go/pkg/whisper/context.go
@@ -130,6 +130,11 @@ func (context *context) SetAudioCtx(n uint) {
context.params.SetAudioCtx(int(n))
}

+// Set initial prompt
+func (context *context) SetInitialPrompt(prompt string) {
+context.params.SetInitialPrompt(prompt)
+}

// ResetTimings resets the mode timings. Should be called before processing
func (context *context) ResetTimings() {
context.model.ctx.Whisper_reset_timings()
23 changes: 12 additions & 11 deletions bindings/go/pkg/whisper/interface.go
@@ -38,17 +38,18 @@ type Context interface {
IsMultilingual() bool // Return true if the model is multilingual.
Language() string // Get language

SetOffset(time.Duration) // Set offset
SetDuration(time.Duration) // Set duration
SetThreads(uint) // Set number of threads to use
SetSpeedup(bool) // Set speedup flag
SetSplitOnWord(bool) // Set split on word flag
SetTokenThreshold(float32) // Set timestamp token probability threshold
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
SetMaxSegmentLength(uint) // Set max segment length in characters
SetTokenTimestamps(bool) // Set token timestamps flag
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
SetAudioCtx(uint) // Set audio encoder context
+SetInitialPrompt(prompt string) // Set initial prompt

// Process mono audio data and return any errors.
// If defined, newly generated segments are passed to the
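The three Go changes above plumb a new initial-prompt option through params.go, context.go, and the Context interface; at the native layer it populates the initial_prompt field of whisper_full_params, which biases decoding toward the prompt's vocabulary (note that the C string allocated by C.CString is held for the life of the params and never freed by the binding). A minimal C++ sketch of the equivalent native call, with the prompt text only as an illustration:

    #include "whisper.h"
    #include <vector>

    static int transcribe_with_prompt(struct whisper_context * ctx,
                                      const std::vector<float> & pcm) {
        struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
        wparams.initial_prompt = "GGML, cuBLAS, CLBlast"; // domain terms to bias spelling
        return whisper_full(ctx, wparams, pcm.data(), (int) pcm.size());
    }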
2 changes: 1 addition & 1 deletion bindings/javascript/package.json
@@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.5.2",
"version": "1.5.4",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {
2 changes: 1 addition & 1 deletion bindings/ruby/ext/ggml-backend-impl.h
@@ -70,7 +70,7 @@ extern "C" {
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

// compute graph without a plan
-void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

// check if the backend supports an operation
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
4 changes: 2 additions & 2 deletions bindings/ruby/ext/ggml-backend.c
@@ -156,8 +156,8 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
backend->iface.graph_plan_compute(backend, plan);
}

-void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-backend->iface.graph_compute(backend, cgraph);
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+return backend->iface.graph_compute(backend, cgraph);
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
2 changes: 1 addition & 1 deletion bindings/ruby/ext/ggml-backend.h
@@ -52,7 +52,7 @@ extern "C" {

GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);

// tensor copy between different backends
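The graph_compute signature change runs through all three ggml-backend files above: backends can now report failure instead of returning void. A minimal sketch of a caller that checks the result, assuming an already-initialized backend and built graph:

    #include <cstdio>
    #include "ggml-backend.h"

    static bool compute_checked(ggml_backend_t backend, struct ggml_cgraph * gf) {
        if (!ggml_backend_graph_compute(backend, gf)) { // false now signals failure
            std::fprintf(stderr, "graph compute failed\n");
            return false;
        }
        return true;
    }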
4 changes: 2 additions & 2 deletions coreml/whisper-encoder.mm
@@ -24,9 +24,9 @@

// select which device to run the Core ML model on
MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
-config.computeUnits = MLComputeUnitsCPUAndGPU;
+// config.computeUnits = MLComputeUnitsCPUAndGPU;
//config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
-//config.computeUnits = MLComputeUnitsAll;
+config.computeUnits = MLComputeUnitsAll;
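// (note: MLComputeUnitsAll lets Core ML dispatch work to the CPU, GPU, or Neural Engine as it sees fit)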

const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);

4 changes: 4 additions & 0 deletions examples/CMakeLists.txt
@@ -15,6 +15,10 @@ if (WHISPER_SDL2)
message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
endif()

+if (WHISPER_CLBLAST)
+find_package(CLBlast REQUIRED)
+endif()
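# (note: the Windows CI job above exports CLBlast_DIR so this find_package call can locate the package configuration)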

# common

set(TARGET common)
6 changes: 5 additions & 1 deletion examples/common-ggml.cpp
@@ -62,6 +62,8 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_ALL_F32:
case GGML_FTYPE_MOSTLY_F16:
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+case GGML_FTYPE_MOSTLY_IQ2_XXS:
+case GGML_FTYPE_MOSTLY_IQ2_XS:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
@@ -182,7 +184,7 @@
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
{
-cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
+cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
@@ -191,6 +193,8 @@
case GGML_TYPE_I32:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_K:
+case GGML_TYPE_IQ2_XXS:
+case GGML_TYPE_IQ2_XS:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));