In [None]:
# This is the only setting you need to change.
# Set it to the name of the Whisper model you want to convert. Valid names
# are the keys of _MODELS below: tiny, base, small, medium, large, large-v1,
# large-v2 (multilingual) and tiny.en, base.en, small.en, medium.en
# (English-only).
MODEL_NAME = "tiny"

# Model file URLs (update them here if upstream ever changes them):
# https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L27
_MODELS = {
    "tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
    "tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
    "base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
    "base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
    "small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
    "small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
    "medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
    "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
    "large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt",
    "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
    # "large" is an alias that points at the same file as "large-v2".
    "large": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
}

# Fail early with a readable message instead of a bare KeyError when
# MODEL_NAME is mistyped.
if MODEL_NAME not in _MODELS:
    raise ValueError(
        f"Unknown MODEL_NAME {MODEL_NAME!r}; expected one of: {', '.join(_MODELS)}"
    )
MODEL_URL = _MODELS[MODEL_NAME]

MEL_FILTERS_URL = "https://github.com/openai/whisper/raw/main/whisper/assets/mel_filters.npz"


import struct
from pathlib import Path

import torch
from torchsummary import summary
import numpy as np

# Output location of the converted model - download this file once the
# conversion has finished.
_out_name = f"./ggml-model-{MODEL_NAME}.bin"
FNAME_OUT = Path(_out_name)



In [None]:
%%capture
!rm -f $f"{MODEL_NAME}.pt"
!rm mel_filters.npz
!wget $MODEL_URL
!wget $MEL_FILTERS_URL

In [None]:
# Load the model checkpoint
checkpoint = torch.load(f"{MODEL_NAME}.pt", map_location="cpu")

# Keep only the hyper-parameters that describe the encoder
enc_pars = ('n_mels', 'n_audio_ctx', 'n_audio_state',
            'n_audio_head', 'n_audio_layer')
params = {p : checkpoint["dims"][p] for p in enc_pars}

# Keep only the weights whose name starts with 'encoder.'
encoder = {k:v for k,v in checkpoint["model_state_dict"].items()
            if k.startswith('encoder.') }

# Load MEL filters
# allow_pickle=False: the archive was just downloaded, and only a numeric
# array is read from it, so unpickling arbitrary objects is never needed.
# The handle gets a descriptive name so it does not leave a stray `f`
# in the notebook namespace.
with np.load("mel_filters.npz", allow_pickle=False) as mel_npz:
  filters = mel_npz[f'mel_{params["n_mels"]}']


In [None]:
# Display the encoder hyper-parameters selected from the checkpoint
params

{'n_mels': 80,
 'n_audio_ctx': 1500,
 'n_audio_state': 384,
 'n_audio_head': 6,
 'n_audio_layer': 4}

In [None]:
# Write the encoder-only model to FNAME_OUT in this order:
#   1. magic number
#   2. encoder hyper-parameters (one int each)
#   3. float16 flag
#   4. mel filter bank: rows, cols, then the values as float32 in row-major order
#   5. one record per encoder tensor: (n_dims, name length, ftype),
#      the dims in reverse order, the UTF-8 name, then the raw data
# Scalars are packed with struct's native byte order and size.

# Conv biases are reshaped from [n] to [n, 1] before being written.
conv_bias_names = ("encoder.conv1.bias", "encoder.conv2.bias")
# 2-D tensors that are still stored as float32 (tensors with fewer than
# two dims always are).
f32_names = conv_bias_names + ("encoder.positional_embedding",)

# The context manager closes the file even when a tensor fails to convert.
with FNAME_OUT.open("wb") as fout:
    fout.write(struct.pack("i", 0x67676d6c))  # magic: ggml in hex

    for k in (
        'n_audio_ctx',
        'n_audio_state',
        'n_audio_head',
        'n_audio_layer',
        'n_mels',
    ):
        # Index directly: a missing hyper-parameter should raise KeyError
        # here rather than a struct.error caused by packing None.
        fout.write(struct.pack("i", params[k]))
    fout.write(struct.pack("i", 1))  # signifies the use of float16

    # write mel filters
    n_rows, n_cols = filters.shape
    fout.write(struct.pack("i", n_rows))
    fout.write(struct.pack("i", n_cols))
    # One bulk write of the float32 values in C (row-major) order replaces
    # the element-by-element loop; the bytes written are the same.
    fout.write(filters.astype(np.float32).tobytes(order="C"))

    # write model blocks
    for name, tensor in encoder.items():
        data = tensor.squeeze().numpy()
        print("Processing variable: " , name ,  " with shape: ", data.shape)

        # reshape conv bias from [n] to [n, 1]
        if name in conv_bias_names:
            data = data.reshape(data.shape[0], 1)
            print(f"  Reshaped variable: {name} to shape: ", data.shape)

        n_dims = len(data.shape)

        # looks like the whisper models are in f16 by default
        # so we need to convert the small tensors to f32 until we fully support f16 in ggml
        # ftype == 0 -> float32, ftype == 1 -> float16
        if n_dims < 2 or name in f32_names:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype = 0
        else:
            # Cast explicitly so the payload always matches the declared
            # ftype, even if the checkpoint was not stored in float16.
            data = data.astype(np.float16)
            ftype = 1

        str_ = name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(str_), ftype))
        # dims are written last-to-first
        for dim in reversed(data.shape):
            fout.write(struct.pack("i", dim))
        fout.write(str_)

        # data
        data.tofile(fout)
