In [1]:
import shutil
from pathlib import Path
import os
os.environ["HF_HOME"] = "./cache/huggingface"
os.environ['TRANSFORMERS_CACHE'] = "./cache/huggingface/t_cache"
from ctranslate2.converters import TransformersConverter
from transformers.models.whisper.convert_openai_to_hf import (
    convert_openai_whisper_to_tfms,
)
from huggingface_hub import HfApi
from faster_whisper import WhisperModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import wandb

def download_model_from_wandb(run_path: str, file_path: str, save_dir: str) -> str:
    """
    Download a model file from Weights & Biases and return the local file path.

    Parameters:
    - run_path: str. Path to the W&B run, e.g., "i4ds/whisper4sg/runs/28z8x0k4".
    - file_path: str. Path to the file in the W&B run, e.g., "40569234_output/last_model.pt".
    - save_dir: str. Local directory to save the file.

    Returns:
    - str: The local path to the downloaded file.
    """
    # Initialize W&B API
    api = wandb.Api()

    # Fetch the run
    run = api.run(run_path)

    # File save path
    save_path = f"{save_dir}/{file_path.split('/')[-1]}"

    # Download the file
    run.file(file_path).download(root=save_dir, replace=True)

    return save_path

# Example usage
run_path = "i4ds/whisper4sg/runs/wmfl1o4x"
file_path = "51270024_output/last_model.pt"
save_dir = "./downloaded_models"
hu_model_path = "i4ds/whisper4sg-sg-corpus-timewarping"

model_local_path = download_model_from_wandb(run_path, file_path, save_dir)
print(model_local_path)

./downloaded_models/last_model.pt


In [4]:
hf_model_folder = Path(save_dir, 'hf_model')
hf_model_folder.mkdir(exist_ok=True)
ctranslate2_model_folder = Path('ct2_output')

# Convert to Huggingface Model
hf_model = convert_openai_whisper_to_tfms(os.path.join(save_dir, file_path), hf_model_folder)

encoder.positional_embedding -> encoder.embed_positions.weight
encoder.conv1.weight -> encoder.conv1.weight
encoder.conv1.bias -> encoder.conv1.bias
encoder.conv2.weight -> encoder.conv2.weight
encoder.conv2.bias -> encoder.conv2.bias
encoder.blocks.0.attn.query.weight -> encoder.layers.0.self_attn.q_proj.weight
encoder.blocks.0.attn.query.bias -> encoder.layers.0.self_attn.q_proj.bias
encoder.blocks.0.attn.key.weight -> encoder.layers.0.self_attn.k_proj.weight
encoder.blocks.0.attn.value.weight -> encoder.layers.0.self_attn.v_proj.weight
encoder.blocks.0.attn.value.bias -> encoder.layers.0.self_attn.v_proj.bias
encoder.blocks.0.attn.out.weight -> encoder.layers.0.self_attn.out_proj.weight
encoder.blocks.0.attn.out.bias -> encoder.layers.0.self_attn.out_proj.bias
encoder.blocks.0.attn_ln.weight -> encoder.layers.0.self_attn_layer_norm.weight
encoder.blocks.0.attn_ln.bias -> encoder.layers.0.self_attn_layer_norm.bias
encoder.blocks.0.mlp.0.weight -> encoder.layers.0.fc1.weight
encoder.b



In [5]:
hf_model[0].save_pretrained(hf_model_folder)

Non-default generation parameters: {'begin_suppress_tokens': [220, 50256]}


In [6]:
shutil.copyfile("cache/tokenizer.json", Path(hf_model_folder, "tokenizer.json"))
shutil.copyfile("cache/config.json", Path(hf_model_folder, "config.json"))

# Create readme
readme_content = f"""
# Model Information

This folder contains a converted model using ctranslate2.

## Wandb log
https://wandb.ai/{run_path}

## Files
- `tokenizer.json`: Tokenizer file.
- `config.json`: Configuration file.

## Conversion Details
The model was converted to ctranslate2 format with float16 quantization.

## Data
Model was trained on the full sg corpus, with part of the mozilla common voice 13.0 dataset and SRG data translated by faster-whisper-v2.
"""
with open(Path(hf_model_folder, "README.md"), 'w') as f:
    f.write(readme_content)

# Convert to ctranslate2
converter = TransformersConverter(
    hf_model_folder,
    copy_files=["tokenizer.json", "README.md"],
    load_as_float16=True 
)

converter.convert(output_dir=ctranslate2_model_folder, quantization="float16", force=True)

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.96s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


PosixPath('ct2_output')

In [8]:
api = HfApi()
api.upload_folder(
    folder_path="ct2_output",
    repo_id=hu_model_path,
    repo_type='model',
)

model.bin: 100%|██████████| 3.09G/3.09G [03:16<00:00, 15.7MB/s]


CommitInfo(commit_url='https://huggingface.co/i4ds/whisper4sg-sg-corpus-timewarping/commit/cdb5727589f4f59ab8cdb5af5d339a0556b5df03', commit_message='Upload folder using huggingface_hub', commit_description='', oid='cdb5727589f4f59ab8cdb5af5d339a0556b5df03', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
model = WhisperModel(hu_model_path, device="cuda", compute_type="float16")

vocabulary.json: 100%|██████████| 1.07M/1.07M [00:00<00:00, 2.77MB/s]
tokenizer.json: 100%|██████████| 2.48M/2.48M [00:01<00:00, 2.05MB/s]
config.json: 100%|██████████| 12.1k/12.1k [00:00<00:00, 2.73MB/s]
model.bin: 100%|██████████| 3.09G/3.09G [02:37<00:00, 19.6MB/s]


In [10]:
segments, info = model.transcribe("01d2eb96-4aa2-488d-ae29-22a57c3acc10_79311_109311.mp3", beam_size=5, language='de')

In [11]:
print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

Detected language 'de' with probability 1.000000
[0.00s -> 3.66s]  Jetzt passiert dasselbe auf einem neuen Kontinent.
[4.36s -> 9.26s]  Das ist logischerweise ein Gang in eine steigende Verschuldung.
[9.82s -> 13.70s]  Ich bitte Sie, diesem Geschäft zuzustimmen.
[15.86s -> 19.44s]  Dem Lithium wurde dabei Priorität eingeräumt.
[21.26s -> 24.02s]  Diese Produkte liefern wir auch ins Ausland.
[24.02s -> 27.78s]  Im Tennis gab es auch schon andere witzige Donner-Vorfälle.
