diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 0000000..e69de29
diff --git a/404.html b/404.html
new file mode 100644
index 0000000..3d37afc
--- /dev/null
+++ b/404.html
@@ -0,0 +1,349 @@
+🎞️ Subtitles generation tool (Web-UI + CLI + Python package) powered by OpenAI's Whisper and its variants 🎞️
+SubsAI: Subtitles AI
+Subtitles generation tool powered by OpenAI's Whisper and its variants.
+This program is free software: you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation, either version 3 of the License, or (at your option) any later +version. +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +You should have received a copy of the GNU General Public License along with +this program. If not, see https://www.gnu.org/licenses/.
+ + + +Subs AI class
+Example usage: +
file = './assets/test1.mp4'
+subs_ai = SubsAI()
+model = subs_ai.create_model('openai/whisper', {'model_type': 'base'})
+subs = subs_ai.transcribe(file, model)
+subs.save('test1.srt')
+
staticmethod
+
+
+available_models()
+
Returns the supported models
+ +Returns:
+list
+ –
+ list of available models
+src/subsai/main.py
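For example, a minimal sketch of listing the supported models, assuming the package exposes SubsAI at the top level as in the usage example above:

from subsai import SubsAI

for name in SubsAI.available_models():
    print(name)  # e.g. 'openai/whisper', 'guillaumekln/faster-whisper', ...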
staticmethod
+
+
+model_info(model)
+
Returns general info about the model (brief description and url)
+ +Parameters:
+str
)
+ –
+ model name
+Returns:
+dict
+ –
+ dict of infos
+src/subsai/main.py
staticmethod
+
+
+config_schema(model)
+
Returns the configs associated with a model
+ +Parameters:
+str
)
+ –
+ model name
+Returns:
+dict
+ –
+ dict of configs
+src/subsai/main.py
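A sketch of inspecting a model before creating it; model_info returns the description and url, while config_schema returns the per-option metadata (type, description, options, default) documented on this page. The import path is assumed as above:

from subsai import SubsAI

info = SubsAI.model_info('openai/whisper')
print(info['description'], info['url'])

schema = SubsAI.config_schema('openai/whisper')
for option, spec in schema.items():
    print(option, spec['type'], spec['default'])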
staticmethod
+
+
+create_model(model_name, model_config={})
+
Returns a model instance
+ +Parameters:
+str
)
+ –
+ the name of the model
+dict
)
+ –
+ the configuration dict
+Returns:
+AbstractModel
+ –
+ the model instance
+src/subsai/main.py
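A sketch of creating a model with a non-default configuration; the keys come from the model's config schema (see the Whisper Model section below), and options left out presumably fall back to the schema defaults:

from subsai import SubsAI

model = SubsAI.create_model('openai/whisper', {'model_type': 'small', 'device': 'cpu'})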
staticmethod
+
+
+transcribe(media_file, model, model_config={})
+
Takes the model instance (created by :func:`create_model`) or the model name.
+Returns a :class:`pysubs2.SSAFile` (https://pysubs2.readthedocs.io/en/latest/api-reference.html#ssafile-a-subtitle-file)
Parameters:
+str
)
+ –
+ path of the media file (video/audio)
+Union[AbstractModel, str]
)
+ –
+ model instance or model name
+dict
)
+ –
+ model configs' dict
+Returns:
+SSAFile
+ –
+ SSAFile: list of subtitles
+src/subsai/main.py
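Because transcribe also accepts a model name, the model can be created implicitly; a sketch (the media path is illustrative):

from subsai import SubsAI

subs = SubsAI.transcribe('./assets/test1.mp4', 'openai/whisper', {'model_type': 'base'})
subs.save('test1.srt')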
Tools()
+
Some tools related to subtitles processing (e.g., translation)
+ + +src/subsai/main.py
staticmethod
+
+
+available_translation_models()
+
Returns available translation models
+A simple link to :func:`utils.available_translation_models` for easy access
Returns:
+list
+ –
+ list of available models
+src/subsai/main.py
staticmethod
+
+
+available_translation_languages(model)
+
Returns the languages supported by the translation model
+ +Parameters:
+Union[str, TranslationModel]
)
+ –
+ the name of the model
+Returns:
+list
+ –
+ list of available languages
+src/subsai/main.py
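A sketch of discovering translation models and the languages one of them supports, assuming Tools is importable from the subsai package:

from subsai import Tools

models = Tools.available_translation_models()
print(models)
print(Tools.available_translation_languages(models[0]))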
staticmethod
+
+
+create_translation_model(
+ model_name="m2m100", model_family=None
+)
+
Creates and returns a translation model instance.
+ +Parameters:
+str
)
+ –
+ name of the model. To get available models use :func:`available_translation_models`
str
)
+ –
+ Either "mbart50" or "m2m100" (None by default); see the dl-translate docs
Returns:
+TranslationModel
+ –
+ A translation model instance
+src/subsai/main.py
staticmethod
+
+
+translate(
+ subs,
+ source_language,
+ target_language,
+ model="m2m100",
+ model_family=None,
+ translation_configs={},
+)
+
Translates a subtitles `SSAFile` object, such as the one returned by :func:`SubsAI.transcribe`
Parameters:
+SSAFile
)
+ –
+ SSAFile
object
str
)
+ –
+ the language of the subtitles
+str
)
+ –
+ the target language
+Union[str, TranslationModel]
)
+ –
+ the translation model, either a `str` or the model instance created by :func:`create_translation_model`
str
)
+ –
+ Either "mbart50" or "m2m100" (None by default); see the dl-translate docs
dict
)
+ –
+ dict of translation configs (see :attr:`configs.ADVANCED_TOOLS_CONFIGS`)
Returns:
+SSAFile
+ –
+ returns an SSAFile
subtitles translated to the target language
src/subsai/main.py
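A sketch of translating transcribed subtitles; the language names are illustrative and must be among those returned by available_translation_languages:

from subsai import SubsAI, Tools

subs = SubsAI.transcribe('./assets/test1.mp4', 'openai/whisper')
translated = Tools.translate(subs,
                             source_language='English',
                             target_language='French',
                             model='m2m100')
translated.save('test1.fr.srt')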
staticmethod
+
+
+auto_sync(subs, media_file, **kwargs)
+
Uses [ffsubsync](https://github.com/smacke/ffsubsync) to auto-sync subtitles to the media file
+ +Parameters:
+SSAFile
)
+ –
+ SSAFile
file
str
)
+ –
+ path of the media_file
+configs to pass to ffsubsync (see :attr:`configs.ADVANCED_TOOLS_CONFIGS`)
Returns:
+SSAFile
+ –
+ SSAFile
auto-synced
src/subsai/main.py
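A sketch of auto-syncing an existing subtitle file against its media; the extra keyword arguments map to the ffsubsync options listed under configs.ADVANCED_TOOLS_CONFIGS:

from subsai import Tools
import pysubs2

subs = pysubs2.load('test1.srt')
synced = Tools.auto_sync(subs, './assets/test1.mp4', vad='auditok')
synced.save('test1_synced.srt')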
Whisper Model
+See openai/whisper
+ + + +WhisperModel(model_config)
+
+ Bases: AbstractModel
src/subsai/models/whisper_model.py
instance-attribute
+ class-attribute
+
+
+model_name = 'openai/whisper'
+
instance-attribute
+ class-attribute
+
+
+config_schema = {
+ "model_type": {
+ "type": list,
+ "description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
+ "options": whisper.available_models(),
+ "default": "base",
+ },
+ "device": {
+ "type": list,
+ "description": "The PyTorch device to put the model into",
+ "options": [None, get_available_devices()],
+ "default": None,
+ },
+ "download_root": {
+ "type": str,
+ "description": "Path to download the model files; by default, it uses '~/.cache/whisper'",
+ "options": None,
+ "default": None,
+ },
+ "in_memory": {
+ "type": bool,
+ "description": "whether to preload the model weights into host memory",
+ "options": None,
+ "default": False,
+ },
+ "verbose": {
+ "type": bool,
+ "description": "Whether to display the text being decoded to the console. If True, displays all the details,If False, displays minimal details. If None, does not display anything",
+ "options": None,
+ "default": None,
+ },
+ "temperature": {
+ "type": Tuple,
+ "description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
+ "options": None,
+ "default": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
+ },
+ "compression_ratio_threshold": {
+ "type": float,
+ "description": "If the gzip compression ratio is above this value, treat as failed",
+ "options": None,
+ "default": 2.4,
+ },
+ "logprob_threshold": {
+ "type": float,
+ "description": "If the average log probability over sampled tokens is below this value, treat as failed",
+ "options": None,
+ "default": -1.0,
+ },
+ "no_speech_threshold": {
+ "type": float,
+ "description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
+ "options": None,
+ "default": 0.6,
+ },
+ "condition_on_previous_text": {
+ "type": bool,
+ "description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
+ "options": None,
+ "default": True,
+ },
+ "task": {
+ "type": list,
+ "description": "whether to perform X->X 'transcribe' or X->English 'translate'",
+ "options": ["transcribe", "translate"],
+ "default": "transcribe",
+ },
+ "language": {
+ "type": str,
+ "description": "language that the audio is in; uses detected language if None",
+ "options": None,
+ "default": None,
+ },
+ "sample_len": {
+ "type": int,
+ "description": "maximum number of tokens to sample",
+ "options": None,
+ "default": None,
+ },
+ "best_of": {
+ "type": int,
+ "description": "number of independent samples to collect, when t > 0",
+ "options": None,
+ "default": None,
+ },
+ "beam_size": {
+ "type": int,
+ "description": "number of beams in beam search, when t == 0",
+ "options": None,
+ "default": None,
+ },
+ "patience": {
+ "type": float,
+ "description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
+ "options": None,
+ "default": None,
+ },
+ "length_penalty": {
+ "type": float,
+ "description": "'alpha' in Google NMT, None defaults to length norm",
+ "options": None,
+ "default": None,
+ },
+ "prompt": {
+ "type": str,
+ "description": "text or tokens for the previous context",
+ "options": None,
+ "default": None,
+ },
+ "prefix": {
+ "type": str,
+ "description": "text or tokens to prefix the current context",
+ "options": None,
+ "default": None,
+ },
+ "suppress_blank": {
+ "type": bool,
+ "description": "this will suppress blank outputs",
+ "options": None,
+ "default": True,
+ },
+ "suppress_tokens": {
+ "type": str,
+ "description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
+ "options": None,
+ "default": "-1",
+ },
+ "without_timestamps": {
+ "type": bool,
+ "description": "use <|notimestamps|> to sample text tokens only",
+ "options": None,
+ "default": False,
+ },
+ "max_initial_timestamp": {
+ "type": float,
+ "description": "the initial timestamp cannot be later than this",
+ "options": None,
+ "default": 1.0,
+ },
+ "fp16": {
+ "type": bool,
+ "description": "use fp16 for most of the calculation",
+ "options": None,
+ "default": True,
+ },
+}
+
instance-attribute
+
+
+model_type = _load_config(
+ "model_type", model_config, self.config_schema
+)
+
instance-attribute
+
+
+device = _load_config(
+ "device", model_config, self.config_schema
+)
+
instance-attribute
+
+
+download_root = _load_config(
+ "download_root", model_config, self.config_schema
+)
+
instance-attribute
+
+
+in_memory = _load_config(
+ "in_memory", model_config, self.config_schema
+)
+
instance-attribute
+
+
+verbose = _load_config(
+ "verbose", model_config, self.config_schema
+)
+
instance-attribute
+
+
+temperature = _load_config(
+ "temperature", model_config, self.config_schema
+)
+
instance-attribute
+
+
+compression_ratio_threshold = _load_config(
+ "compression_ratio_threshold",
+ model_config,
+ self.config_schema,
+)
+
instance-attribute
+
+
+logprob_threshold = _load_config(
+ "logprob_threshold", model_config, self.config_schema
+)
+
instance-attribute
+
+
+no_speech_threshold = _load_config(
+ "no_speech_threshold", model_config, self.config_schema
+)
+
instance-attribute
+
+
+condition_on_previous_text = _load_config(
+ "condition_on_previous_text",
+ model_config,
+ self.config_schema,
+)
+
instance-attribute
+
+
+decode_options = {
+ config: _load_config(
+ config, model_config, self.config_schema
+ )
+ for config in self.config_schema
+ if not hasattr(self, config)
+}
+
instance-attribute
+
+
+model = whisper.load_model(
+ name=self.model_type,
+ device=self.device,
+ download_root=self.download_root,
+ in_memory=self.in_memory,
+)
+
transcribe(media_file)
+
src/subsai/models/whisper_model.py
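The model classes can also be instantiated directly; a sketch for this backend, using keys from the config schema above (the import path follows the source file shown here):

from subsai.models.whisper_model import WhisperModel

model = WhisperModel({'model_type': 'base', 'task': 'transcribe', 'language': 'en'})
subs = model.transcribe('./assets/test1.mp4')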
WhisperX Model
+See m-bain/whisperX
+ + + +WhisperXModel(model_config)
+
+ Bases: AbstractModel
src/subsai/models/whisperX_model.py
instance-attribute
+ class-attribute
+
+
+model_name = 'm-bain/whisperX'
+
instance-attribute
+ class-attribute
+
+
+config_schema = {
+ "model_type": {
+ "type": list,
+ "description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
+ "options": whisper.available_models(),
+ "default": "base",
+ },
+ "device": {
+ "type": list,
+ "description": "The PyTorch device to put the model into",
+ "options": ["cpu", "cuda"],
+ "default": "cpu",
+ },
+ "compute_type": {
+ "type": list,
+ "description": "change to 'int8' if low on GPU mem (may reduce accuracy)",
+ "options": ["default", "float16", "int8"],
+ "default": "default",
+ },
+ "download_root": {
+ "type": str,
+ "description": "Path to download the model files; by default, it uses '~/.cache/whisper'",
+ "options": None,
+ "default": None,
+ },
+ "language": {
+ "type": str,
+ "description": "language that the audio is in; uses detected language if None",
+ "options": None,
+ "default": None,
+ },
+ "segment_type": {
+ "type": list,
+ "description": "Word-level timestamps, Choose here between sentence-level and word-level",
+ "options": ["sentence", "word"],
+ "default": "sentence",
+ },
+ "batch_size": {
+ "type": int,
+ "description": "reduce if low on GPU mem",
+ "options": None,
+ "default": 16,
+ },
+ "return_char_alignments": {
+ "type": bool,
+ "description": "Whether to return char alignments",
+ "options": None,
+ "default": False,
+ },
+ "speaker_labels": {
+ "type": bool,
+ "description": "Run Diarization Pipeline",
+ "options": None,
+ "default": False,
+ },
+ "HF_TOKEN": {
+ "type": str,
+ "description": "if speaker labels is True, you will need Hugging Face access token to use the diarization models, https://github.com/m-bain/whisperX#speaker-diarization",
+ "options": None,
+ "default": None,
+ },
+ "min_speakers": {
+ "type": int,
+ "description": "min speakers",
+ "options": None,
+ "default": None,
+ },
+ "max_speakers": {
+ "type": int,
+ "description": "max speakers",
+ "options": None,
+ "default": None,
+ },
+}
+
instance-attribute
+
+
+model_type = _load_config(
+ "model_type", model_config, self.config_schema
+)
+
instance-attribute
+
+
+device = _load_config(
+ "device", model_config, self.config_schema
+)
+
instance-attribute
+
+
+compute_type = _load_config(
+ "compute_type", model_config, self.config_schema
+)
+
instance-attribute
+
+
+download_root = _load_config(
+ "download_root", model_config, self.config_schema
+)
+
instance-attribute
+
+
+language = _load_config(
+ "language", model_config, self.config_schema
+)
+
instance-attribute
+
+
+segment_type = _load_config(
+ "segment_type", model_config, self.config_schema
+)
+
instance-attribute
+
+
+batch_size = _load_config(
+ "batch_size", model_config, self.config_schema
+)
+
instance-attribute
+
+
+return_char_alignments = _load_config(
+ "return_char_alignments",
+ model_config,
+ self.config_schema,
+)
+
instance-attribute
+
+
+speaker_labels = _load_config(
+ "speaker_labels", model_config, self.config_schema
+)
+
instance-attribute
+
+
+HF_TOKEN = _load_config(
+ "HF_TOKEN", model_config, self.config_schema
+)
+
instance-attribute
+
+
+min_speakers = _load_config(
+ "min_speakers", model_config, self.config_schema
+)
+
instance-attribute
+
+
+max_speakers = _load_config(
+ "max_speakers", model_config, self.config_schema
+)
+
instance-attribute
+
+
+model = whisperx.load_model(
+ self.model_type,
+ device=self.device,
+ compute_type=self.compute_type,
+ download_root=self.download_root,
+ language=self.language,
+)
+
transcribe(media_file)
+
src/subsai/models/whisperX_model.py
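A sketch of a whisperX configuration with word-level segments and diarization enabled; the Hugging Face token value is a placeholder:

from subsai.models.whisperX_model import WhisperXModel

model = WhisperXModel({
    'model_type': 'base',
    'device': 'cpu',
    'segment_type': 'word',
    'speaker_labels': True,
    'HF_TOKEN': '<your-hugging-face-token>',  # needed only when speaker_labels is True
})
subs = model.transcribe('./assets/test1.mp4')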
Faster Whisper Model
+See guillaumekln/faster-whisper
+ + + +FasterWhisperModel(model_config)
+
+ Bases: AbstractModel
src/subsai/models/faster_whisper_model.py
instance-attribute
+ class-attribute
+
+
+model_name = 'guillaumekln/faster-whisper'
+
instance-attribute
+ class-attribute
+
+
+config_schema = {
+ "model_size_or_path": {
+ "type": list,
+ "description": 'Size of the model to use (e.g. "large-v2", "small", "tiny.en", etc.)or a path to a converted model directory. When a size is configured, the convertedmodel is downloaded from the Hugging Face Hub.',
+ "options": whisper.available_models(),
+ "default": "base",
+ },
+ "device": {
+ "type": list,
+ "description": 'Device to use for computation ("cpu", "cuda", "auto")',
+ "options": ["auto", "cpu", "cuda"],
+ "default": "auto",
+ },
+ "device_index": {
+ "type": int,
+ "description": "Device ID to use.The model can also be loaded on multiple GPUs by passing a list of IDs(e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallelwhen transcribe() is called from multiple Python threads (see also num_workers).",
+ "options": None,
+ "default": 0,
+ },
+ "compute_type": {
+ "type": str,
+ "description": "Type to use for computation.See https://opennmt.net/CTranslate2/quantization.html.",
+ "options": None,
+ "default": "default",
+ },
+ "cpu_threads": {
+ "type": int,
+ "description": "Number of threads to use when running on CPU (4 by default).A non zero value overrides the OMP_NUM_THREADS environment variable.",
+ "options": None,
+ "default": 0,
+ },
+ "num_workers": {
+ "type": int,
+ "description": "When transcribe() is called from multiple Python threads,having multiple workers enables true parallelism when running the model(concurrent calls to self.model.generate() will run in parallel).This can improve the global throughput at the cost of increased memory usage.",
+ "options": None,
+ "default": 1,
+ },
+ "temperature": {
+ "type": Tuple,
+ "description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
+ "options": None,
+ "default": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
+ },
+ "compression_ratio_threshold": {
+ "type": float,
+ "description": "If the gzip compression ratio is above this value, treat as failed",
+ "options": None,
+ "default": 2.4,
+ },
+ "log_prob_threshold": {
+ "type": float,
+ "description": "If the average log probability over sampled tokens is below this value, treat as failed",
+ "options": None,
+ "default": -1.0,
+ },
+ "no_speech_threshold": {
+ "type": float,
+ "description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
+ "options": None,
+ "default": 0.6,
+ },
+ "condition_on_previous_text": {
+ "type": bool,
+ "description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
+ "options": None,
+ "default": True,
+ },
+ "task": {
+ "type": list,
+ "description": "whether to perform X->X 'transcribe' or X->English 'translate'",
+ "options": ["transcribe", "translate"],
+ "default": "transcribe",
+ },
+ "language": {
+ "type": str,
+ "description": "language that the audio is in; uses detected language if None",
+ "options": None,
+ "default": None,
+ },
+ "best_of": {
+ "type": int,
+ "description": "number of independent samples to collect, when t > 0",
+ "options": None,
+ "default": 5,
+ },
+ "beam_size": {
+ "type": int,
+ "description": "number of beams in beam search, when t == 0",
+ "options": None,
+ "default": 5,
+ },
+ "patience": {
+ "type": float,
+ "description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
+ "options": None,
+ "default": 1.0,
+ },
+ "length_penalty": {
+ "type": float,
+ "description": "'alpha' in Google NMT, None defaults to length norm",
+ "options": None,
+ "default": 1.0,
+ },
+ "prefix": {
+ "type": str,
+ "description": "text or tokens to prefix the current context",
+ "options": None,
+ "default": None,
+ },
+ "suppress_blank": {
+ "type": bool,
+ "description": "this will suppress blank outputs",
+ "options": None,
+ "default": True,
+ },
+ "suppress_tokens": {
+ "type": Tuple,
+ "description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
+ "options": None,
+ "default": [-1],
+ },
+ "without_timestamps": {
+ "type": bool,
+ "description": "use <|notimestamps|> to sample text tokens only",
+ "options": None,
+ "default": False,
+ },
+ "max_initial_timestamp": {
+ "type": float,
+ "description": "the initial timestamp cannot be later than this",
+ "options": None,
+ "default": 1.0,
+ },
+ "initial_prompt": {
+ "type": str,
+ "description": "Optional text to provide as a prompt for the first window.",
+ "options": None,
+ "default": None,
+ },
+ "word_timestamps": {
+ "type": bool,
+ "description": "Extract word-level timestamps using the cross-attention patternand dynamic time warping, and include the timestamps for each word in each segment.",
+ "options": None,
+ "default": False,
+ },
+ "prepend_punctuations": {
+ "type": str,
+ "description": "If word_timestamps is True, merge these punctuation symbolswith the next word",
+ "options": None,
+ "default": "\"'“¿([{-",
+ },
+ "append_punctuations": {
+ "type": str,
+ "description": "If word_timestamps is True, merge these punctuation symbolswith the previous word",
+ "options": None,
+ "default": "\"'.。,,!!??::”)]}、",
+ },
+ "vad_filter": {
+ "type": bool,
+ "description": "If True, use the integrated Silero VAD model to filter out parts of the audio without speech.",
+ "options": None,
+ "default": False,
+ },
+ "vad_parameters": {
+ "type": dict,
+ "description": "Parameters for splitting long audios into speech chunks using silero VAD.",
+ "options": None,
+ "default": {
+ "threshold": 0.5,
+ "min_speech_duration_ms": 250,
+ "max_speech_duration_s": float("inf"),
+ "min_silence_duration_ms": 2000,
+ "window_size_samples": 1024,
+ "speech_pad_ms": 400,
+ },
+ },
+}
+
instance-attribute
+
+
+transcribe_configs = {
+ config: _load_config(
+ config, model_config, self.config_schema
+ )
+ for config in self.config_schema
+ if not hasattr(self, f"_{config}")
+}
+
instance-attribute
+
+
+model = WhisperModel(
+ model_size_or_path=self._model_size_or_path,
+ device=self._device,
+ device_index=self._device_index,
+ compute_type=self._compute_type,
+ cpu_threads=self._cpu_threads,
+ num_workers=self._num_workers,
+)
+
transcribe(media_file)
+
src/subsai/models/faster_whisper_model.py
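A sketch of selecting the faster-whisper backend through SubsAI, enabling the integrated Silero VAD filter from the schema above:

from subsai import SubsAI

model = SubsAI.create_model('guillaumekln/faster-whisper', {
    'model_size_or_path': 'base',
    'device': 'auto',
    'vad_filter': True,
})
subs = SubsAI.transcribe('./assets/test1.mp4', model)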
Whisper.cpp Model
+See whisper.cpp, +See pywhispercpp
+ + + +WhisperCppModel(model_config)
+
+ Bases: AbstractModel
src/subsai/models/whispercpp_model.py
instance-attribute
+ class-attribute
+
+
+model_name = 'ggerganov/whisper.cpp'
+
instance-attribute
+ class-attribute
+
+
+config_schema = {
+ "model_type": {
+ "type": list,
+ "description": "Available whisper.cpp models",
+ "options": AVAILABLE_MODELS,
+ "default": "base",
+ },
+ "n_threads": {
+ "type": int,
+ "description": "Number of threads to allocate for the inferencedefault to min(4, available hardware_concurrency)",
+ "options": None,
+ "default": 4,
+ },
+ "n_max_text_ctx": {
+ "type": int,
+ "description": "max tokens to use from past text as prompt for the decoder",
+ "options": None,
+ "default": 16384,
+ },
+ "offset_ms": {
+ "type": int,
+ "description": "start offset in ms",
+ "options": None,
+ "default": 0,
+ },
+ "duration_ms": {
+ "type": int,
+ "description": "audio duration to process in ms",
+ "options": None,
+ "default": 0,
+ },
+ "translate": {
+ "type": bool,
+ "description": "whether to translate the audio to English",
+ "options": None,
+ "default": False,
+ },
+ "no_context": {
+ "type": bool,
+ "description": "do not use past transcription (if any) as initial prompt for the decoder",
+ "options": None,
+ "default": False,
+ },
+ "single_segment": {
+ "type": bool,
+ "description": "force single segment output (useful for streaming)",
+ "options": None,
+ "default": False,
+ },
+ "print_special": {
+ "type": bool,
+ "description": "print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)",
+ "options": None,
+ "default": False,
+ },
+ "print_progress": {
+ "type": bool,
+ "description": "print progress information",
+ "options": None,
+ "default": True,
+ },
+ "print_realtime": {
+ "type": bool,
+ "description": "print results from within whisper.cpp (avoid it, use callback instead)",
+ "options": None,
+ "default": False,
+ },
+ "print_timestamps": {
+ "type": bool,
+ "description": "print timestamps for each text segment when printing realtime",
+ "options": None,
+ "default": True,
+ },
+ "token_timestamps": {
+ "type": bool,
+ "description": "enable token-level timestamps",
+ "options": None,
+ "default": False,
+ },
+ "thold_pt": {
+ "type": float,
+ "description": "timestamp token probability threshold (~0.01)",
+ "options": None,
+ "default": 0.01,
+ },
+ "thold_ptsum": {
+ "type": float,
+ "description": "timestamp token sum probability threshold (~0.01)",
+ "options": None,
+ "default": 0.01,
+ },
+ "max_len": {
+ "type": int,
+ "description": "max segment length in characters",
+ "options": None,
+ "default": 0,
+ },
+ "split_on_word": {
+ "type": bool,
+ "description": "split on word rather than on token (when used with max_len)",
+ "options": None,
+ "default": False,
+ },
+ "max_tokens": {
+ "type": int,
+ "description": "max tokens per segment (0 = no limit)",
+ "options": None,
+ "default": 0,
+ },
+ "speed_up": {
+ "type": bool,
+ "description": "speed-up the audio by 2x using Phase Vocoder",
+ "options": None,
+ "default": False,
+ },
+ "audio_ctx": {
+ "type": int,
+ "description": "overwrite the audio context size (0 = use default)",
+ "options": None,
+ "default": 0,
+ },
+ "prompt_n_tokens": {
+ "type": int,
+ "description": "tokens to provide to the whisper decoder as initial prompt",
+ "options": None,
+ "default": 0,
+ },
+ "language": {
+ "type": str,
+ "description": 'for auto-detection, set to None, "" or "auto"',
+ "options": None,
+ "default": "en",
+ },
+ "suppress_blank": {
+ "type": bool,
+ "description": "common decoding parameters",
+ "options": None,
+ "default": True,
+ },
+ "suppress_non_speech_tokens": {
+ "type": bool,
+ "description": "common decoding parameters",
+ "options": None,
+ "default": False,
+ },
+ "temperature": {
+ "type": float,
+ "description": "initial decoding temperature",
+ "options": None,
+ "default": 0.0,
+ },
+ "max_initial_ts": {
+ "type": float,
+ "description": "max_initial_ts",
+ "options": None,
+ "default": 1.0,
+ },
+ "length_penalty": {
+ "type": float,
+ "description": "length_penalty",
+ "options": None,
+ "default": -1.0,
+ },
+ "temperature_inc": {
+ "type": float,
+ "description": "temperature_inc",
+ "options": None,
+ "default": 0.2,
+ },
+ "entropy_thold": {
+ "type": float,
+ "description": 'similar to OpenAI\'s "compression_ratio_threshold"',
+ "options": None,
+ "default": 2.4,
+ },
+ "logprob_thold": {
+ "type": float,
+ "description": "logprob_thold",
+ "options": None,
+ "default": -1.0,
+ },
+ "no_speech_thold": {
+ "type": float,
+ "description": "no_speech_thold",
+ "options": None,
+ "default": 0.6,
+ },
+ "greedy": {
+ "type": dict,
+ "description": "greedy",
+ "options": None,
+ "default": {"best_of": -1},
+ },
+ "beam_search": {
+ "type": dict,
+ "description": "beam_search",
+ "options": None,
+ "default": {"beam_size": -1, "patience": -1.0},
+ },
+}
+
instance-attribute
+
+
+model_type = _load_config(
+ "model_type", model_config, self.config_schema
+)
+
instance-attribute
+
+
+params = {}
+
instance-attribute
+
+
+model = Model(model=self.model_type, **self.params)
+
transcribe(media_file)
+
src/subsai/models/whispercpp_model.py
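A sketch of using the whisper.cpp backend; per the schema above, language defaults to 'en' and can be set to 'auto' (or None / "") for language detection:

from subsai import SubsAI

model = SubsAI.create_model('ggerganov/whisper.cpp', {
    'model_type': 'base',
    'language': 'auto',
    'n_threads': 4,
})
subs = SubsAI.transcribe('./assets/test1.mp4', model)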
API that the transcription models should follow
+ + + +AbstractModel(model_name=None, model_config={})
+
+ Bases: ABC
Abstract Model class
+ + +src/subsai/models/abstract_model.py
instance-attribute
+
+
+model_name = model_name
+
instance-attribute
+
+
+model_config = model_config
+
abstractmethod
+
+
+transcribe(media_file)
+
Transcribe the media_file
to subtitles.
example use case from pysubs2.whisper:
+.. code-block:: python
+
+    subs = SSAFile()
+    for segment in segments:
+        event = SSAEvent(start=make_time(s=segment["start"]), end=make_time(s=segment["end"]))
+        event.plaintext = segment["text"].strip()
+        subs.append(event)
+ +Parameters:
+Path of the media file
+Returns:
+SSAFile
+ –
+ Collection of SSAEvent(s) (see :mod:`pysubs2.ssaevent`)
src/subsai/models/abstract_model.py
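A minimal sketch of a custom backend that follows this API: subclass AbstractModel, implement transcribe, and build the SSAFile exactly as in the pysubs2 example above (the hard-coded segment is a stand-in for real inference output):

from pysubs2 import SSAFile, SSAEvent, make_time
from subsai.models.abstract_model import AbstractModel


class MyModel(AbstractModel):
    model_name = 'my/model'

    def __init__(self, model_config={}):
        super().__init__(model_name=self.model_name, model_config=model_config)

    def transcribe(self, media_file) -> SSAFile:
        # a real backend would run inference on media_file here
        segments = [{'start': 0.0, 'end': 2.0, 'text': 'Hello world'}]
        subs = SSAFile()
        for segment in segments:
            event = SSAEvent(start=make_time(s=segment['start']), end=make_time(s=segment['end']))
            event.plaintext = segment['text'].strip()
            subs.append(event)
        return subs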
whisper_timestamped
+See linto-ai/whisper-timestamped
+ + + +WhisperTimeStamped(model_config={})
+
+ Bases: AbstractModel
src/subsai/models/whisper_timestamped_model.py
instance-attribute
+ class-attribute
+
+
+model_name = 'linto-ai/whisper-timestamped'
+
instance-attribute
+ class-attribute
+
+
+config_schema = {
+ "model_type": {
+ "type": list,
+ "description": "One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.",
+ "options": whisper_timestamped.available_models(),
+ "default": "base",
+ },
+ "segment_type": {
+ "type": list,
+ "description": "Whisper_timestamps gives the ability to have word-level timestamps, Choose here between sentence-level and word-level",
+ "options": ["sentence", "word"],
+ "default": "sentence",
+ },
+ "device": {
+ "type": list,
+ "description": "The PyTorch device to put the model into",
+ "options": [None, get_available_devices()],
+ "default": None,
+ },
+ "download_root": {
+ "type": str,
+ "description": "Path to download the model files; by default, it uses '~/.cache/whisper'",
+ "options": None,
+ "default": None,
+ },
+ "in_memory": {
+ "type": bool,
+ "description": "whether to preload the model weights into host memory",
+ "options": None,
+ "default": False,
+ },
+ "verbose": {
+ "type": bool,
+ "description": "Whether to display the text being decoded to the console. If True, displays all the details,If False, displays minimal details. If None, does not display anything",
+ "options": None,
+ "default": None,
+ },
+ "temperature": {
+ "type": Tuple,
+ "description": "Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.",
+ "options": None,
+ "default": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
+ },
+ "compression_ratio_threshold": {
+ "type": float,
+ "description": "If the gzip compression ratio is above this value, treat as failed",
+ "options": None,
+ "default": 2.4,
+ },
+ "logprob_threshold": {
+ "type": float,
+ "description": "If the average log probability over sampled tokens is below this value, treat as failed",
+ "options": None,
+ "default": -1.0,
+ },
+ "no_speech_threshold": {
+ "type": float,
+ "description": "If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent",
+ "options": None,
+ "default": 0.6,
+ },
+ "condition_on_previous_text": {
+ "type": bool,
+ "description": "if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.",
+ "options": None,
+ "default": True,
+ },
+ "task": {
+ "type": list,
+ "description": "whether to perform X->X 'transcribe' or X->English 'translate'",
+ "options": ["transcribe", "translate"],
+ "default": "transcribe",
+ },
+ "language": {
+ "type": str,
+ "description": "language that the audio is in; uses detected language if None",
+ "options": None,
+ "default": None,
+ },
+ "sample_len": {
+ "type": int,
+ "description": "maximum number of tokens to sample",
+ "options": None,
+ "default": None,
+ },
+ "best_of": {
+ "type": int,
+ "description": "number of independent samples to collect, when t > 0",
+ "options": None,
+ "default": None,
+ },
+ "beam_size": {
+ "type": int,
+ "description": "number of beams in beam search, when t == 0",
+ "options": None,
+ "default": None,
+ },
+ "patience": {
+ "type": float,
+ "description": "patience in beam search (https://arxiv.org/abs/2204.05424)",
+ "options": None,
+ "default": None,
+ },
+ "length_penalty": {
+ "type": float,
+ "description": "'alpha' in Google NMT, None defaults to length norm",
+ "options": None,
+ "default": None,
+ },
+ "suppress_tokens": {
+ "type": str,
+ "description": 'list of tokens ids (or comma-separated token ids) to suppress "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',
+ "options": None,
+ "default": "-1",
+ },
+ "fp16": {
+ "type": bool,
+ "description": "use fp16 for most of the calculation",
+ "options": None,
+ "default": True,
+ },
+ "remove_punctuation_from_words": {
+ "type": bool,
+ "description": "If False, words will be glued with the next punctuation mark (if any).If True, there will be no punctuation mark in the `words[:]['text']` list.It only affects these strings; This has no influence on the computation of the word confidence, whatever the value of `include_punctuation_in_confidence` is.",
+ "options": None,
+ "default": False,
+ },
+ "refine_whisper_precision": {
+ "type": float,
+ "description": "How much can we refine Whisper segment positions, in seconds. Must be a multiple of 0.02.",
+ "options": None,
+ "default": 0.5,
+ },
+ "min_word_duration": {
+ "type": float,
+ "description": "Minimum duration of a word, in seconds. If a word is shorter than this, timestamps will be adjusted.",
+ "options": None,
+ "default": 0.04,
+ },
+ "plot_word_alignment": {
+ "type": bool,
+ "description": "Whether to plot the word alignment for each segment. matplotlib must be installed to use this option.",
+ "options": None,
+ "default": False,
+ },
+ "seed": {
+ "type": int,
+ "description": "Random seed to use for temperature sampling, for the sake of reproducibility.Choose None for unpredictable randomness",
+ "options": None,
+ "default": 1234,
+ },
+ "vad": {
+ "type": bool,
+ "description": "Whether to perform voice activity detection (VAD) on the audio file, to remove silent parts before transcribing with Whisper model. This should decrease hallucinations from the Whisper model.",
+ "options": None,
+ "default": False,
+ },
+ "detect_disfluencies": {
+ "type": bool,
+ "description": 'Whether to detect disfluencies (i.e. hesitations, filler words, repetitions, corrections, etc.) that Whisper model might have omitted in the transcription. This should make the word timestamp prediction more accurate.And probable disfluencies will be marked as special words "[*]"',
+ "options": None,
+ "default": False,
+ },
+ "trust_whisper_timestamps": {
+ "type": bool,
+ "description": "Whether to rely on Whisper's timestamps to get approximative first estimate of segment positions (up to refine_whisper_precision).",
+ "options": None,
+ "default": True,
+ },
+ "naive_approach": {
+ "type": bool,
+ "description": "Force the naive approach that consists in decoding twice the audio file, once to get the transcription and once with the decoded tokens to get the alignment. Note that this approach is used anyway when beam_size is not None and/or when the temperature is a list with more than one element.",
+ "options": None,
+ "default": False,
+ },
+}
+
instance-attribute
+
+
+model_type = _load_config(
+ "model_type", model_config, self.config_schema
+)
+
instance-attribute
+
+
+segment_type = _load_config(
+ "segment_type", model_config, self.config_schema
+)
+
instance-attribute
+
+
+device = _load_config(
+ "device", model_config, self.config_schema
+)
+
instance-attribute
+
+
+download_root = _load_config(
+ "download_root", model_config, self.config_schema
+)
+
instance-attribute
+
+
+in_memory = _load_config(
+ "in_memory", model_config, self.config_schema
+)
+
instance-attribute
+
+
+verbose = _load_config(
+ "verbose", model_config, self.config_schema
+)
+
instance-attribute
+
+
+temperature = _load_config(
+ "temperature", model_config, self.config_schema
+)
+
instance-attribute
+
+
+compression_ratio_threshold = _load_config(
+ "compression_ratio_threshold",
+ model_config,
+ self.config_schema,
+)
+
instance-attribute
+
+
+logprob_threshold = _load_config(
+ "logprob_threshold", model_config, self.config_schema
+)
+
instance-attribute
+
+
+no_speech_threshold = _load_config(
+ "no_speech_threshold", model_config, self.config_schema
+)
+
instance-attribute
+
+
+condition_on_previous_text = _load_config(
+ "condition_on_previous_text",
+ model_config,
+ self.config_schema,
+)
+
instance-attribute
+
+
+decode_options = {
+ config: _load_config(
+ config, model_config, self.config_schema
+ )
+ for config in self.config_schema
+ if not hasattr(self, config)
+}
+
instance-attribute
+
+
+model = whisper_timestamped.load_model(
+ name=self.model_type,
+ device=self.device,
+ download_root=self.download_root,
+ in_memory=self.in_memory,
+)
+
transcribe(media_file)
+
src/subsai/models/whisper_timestamped_model.py
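A sketch enabling word-level timestamps and VAD with the whisper_timestamped backend, using the segment_type and vad options from the schema above:

from subsai import SubsAI

model = SubsAI.create_model('linto-ai/whisper-timestamped', {
    'model_type': 'base',
    'segment_type': 'word',
    'vad': True,
})
subs = SubsAI.transcribe('./assets/test1.mp4', model)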
Configurations file
+ + + +module-attribute
+
+
+AVAILABLE_MODELS = {
+ "openai/whisper": {
+ "class": WhisperModel,
+ "description": "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.",
+ "url": "https://github.com/openai/whisper",
+ "config_schema": WhisperModel.config_schema,
+ },
+ "linto-ai/whisper-timestamped": {
+ "class": WhisperTimeStamped,
+ "description": "Multilingual Automatic Speech Recognition with word-level timestamps and confidence.",
+ "url": "https://github.com/linto-ai/whisper-timestamped",
+ "config_schema": WhisperTimeStamped.config_schema,
+ },
+ "ggerganov/whisper.cpp": {
+ "class": WhisperCppModel,
+ "description": "High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model\n* Plain C/C++ implementation without dependencies\n* Runs on the CPU\n",
+ "url": "https://github.com/ggerganov/whisper.cpp\nhttps://github.com/abdeladim-s/pywhispercpp",
+ "config_schema": WhisperCppModel.config_schema,
+ },
+ "guillaumekln/faster-whisper": {
+ "class": FasterWhisperModel,
+ "description": "**faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.\nThis implementation is up to 4 times faster than [openai/whisper]( https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.",
+ "url": "https://github.com/guillaumekln/faster-whisper",
+ "config_schema": FasterWhisperModel.config_schema,
+ },
+ "m-bain/whisperX": {
+ "class": WhisperXModel,
+ "description": "**whisperX** is a fast automatic speech recognition (70x realtime with large-v2) with word-level timestamps and speaker diarization.",
+ "url": "https://github.com/m-bain/whisperX",
+ "config_schema": WhisperXModel.config_schema,
+ },
+}
+
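This registry backs SubsAI.available_models, model_info and config_schema; a sketch of querying it directly (the configs module path is an assumption):

from subsai.configs import AVAILABLE_MODELS

for name, entry in AVAILABLE_MODELS.items():
    print(name, '->', entry['url'])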
module-attribute
+
+
+BASIC_TOOLS_CONFIGS = {
+ "set time": {
+ "description": "Set time to a subtitle",
+ "config_schema": {
+ "h": {
+ "type": float,
+ "description": "hours: Integer or float values, may be positive or negative",
+ "options": None,
+ "default": 0,
+ },
+ "m": {
+ "type": float,
+ "description": "minutes: Integer or float values, may be positive or negative",
+ "options": None,
+ "default": 0,
+ },
+ "s": {
+ "type": float,
+ "description": "seconds: Integer or float values, may be positive or negative",
+ "options": None,
+ "default": 0,
+ },
+ "ms": {
+ "type": float,
+ "description": "milliseconds: Integer or float values, may be positive or negative",
+ "options": None,
+ "default": 0,
+ },
+ },
+ },
+ "shift": {
+ "description": "Shift all subtitles by constant time amount",
+ "config_schema": {
+ "h": {
+ "type": float,
+ "description": "hours: Integer or float values, may be positive or negative",
+ "options": None,
+ "default": 0,
+ },
+ "m": {
+ "type": float,
+ "description": "minutes: Integer or float values, may be positive or negative",
+ "options": None,
+ "default": 0,
+ },
+ "s": {
+ "type": float,
+ "description": "seconds: Integer or float values, may be positive or negative",
+ "options": None,
+ "default": 0,
+ },
+ "ms": {
+ "type": float,
+ "description": "milliseconds: Integer or float values, may be positive or negative",
+ "options": None,
+ "default": 0,
+ },
+ "frames": {
+ "type": int,
+ "description": "When specified, must be an integer number of frames",
+ "options": None,
+ "default": None,
+ },
+ "fps": {
+ "type": float,
+ "description": "When specified, must be a positive number.",
+ "options": None,
+ "default": None,
+ },
+ },
+ },
+}
+
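The parameters of these basic tools mirror the h/m/s/ms/frames/fps arguments of pysubs2's SSAFile.shift; a sketch of the same shift applied directly with pysubs2:

import pysubs2

subs = pysubs2.load('test1.srt')
subs.shift(s=2, ms=500)              # delay every subtitle by 2.5 seconds
subs.shift(frames=-12, fps=23.976)   # or move back by a frame count at a given fps
subs.save('test1_shifted.srt')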
module-attribute
+
+
+ADVANCED_TOOLS_CONFIGS = {
+ "ffsubsync": {
+ "description": "Language-agnostic automatic synchronization of subtitles with video, so that subtitles are aligned to the correct starting point within the video.",
+ "url": "https://github.com/smacke/ffsubsync",
+ "config_schema": {
+ "vad": {
+ "type": list,
+ "description": "Which voice activity detector to use for speech extraction (if using video / audio as a reference",
+ "options": [
+ "subs_then_webrtc",
+ "webrtc",
+ "subs_then_auditok",
+ "auditok",
+ "subs_then_silero",
+ "silero",
+ ],
+ "default": DEFAULT_VAD,
+ },
+ "max-subtitle-seconds": {
+ "type": float,
+ "description": "Maximum duration for a subtitle to appear on-screen",
+ "options": None,
+ "default": DEFAULT_MAX_SUBTITLE_SECONDS,
+ },
+ "start-seconds": {
+ "type": int,
+ "description": "Start time for processing",
+ "options": None,
+ "default": DEFAULT_START_SECONDS,
+ },
+ "max-offset-seconds": {
+ "type": float,
+ "description": "The max allowed offset seconds for any subtitle segment",
+ "options": None,
+ "default": DEFAULT_MAX_OFFSET_SECONDS,
+ },
+ "apply-offset-seconds": {
+ "type": float,
+ "description": "Apply a predefined offset in seconds to all subtitle segments",
+ "options": None,
+ "default": DEFAULT_APPLY_OFFSET_SECONDS,
+ },
+ "suppress-output-if-offset-less-than": {
+ "type": float,
+ "description": "Apply a predefined offset in seconds to all subtitle segments",
+ "options": None,
+ "default": None,
+ },
+ "frame-rate": {
+ "type": int,
+ "description": "Frame rate for audio extraction",
+ "options": None,
+ "default": DEFAULT_FRAME_RATE,
+ },
+ "output-encoding": {
+ "type": str,
+ "description": 'What encoding to use for writing output subtitles (default=utf-8). Can indicate "same" to use same encoding as that of the input.',
+ "options": None,
+ "default": "utf-8",
+ },
+ "skip-infer-framerate-ratio": {
+ "type": bool,
+ "description": "If set, do not try to infer framerate ratio based on duration ratio.",
+ "options": None,
+ "default": False,
+ },
+ "no-fix-framerate": {
+ "type": bool,
+ "description": "If specified, subsync will not attempt to correct a framerate",
+ "options": None,
+ "default": False,
+ },
+ "serialize-speech": {
+ "type": bool,
+ "description": "If specified, serialize reference speech to a numpy array.",
+ "options": None,
+ "default": False,
+ },
+ "gss": {
+ "type": bool,
+ "description": "If specified, use golden-section search to try to findthe optimal framerate ratio between video and subtitles.",
+ "options": None,
+ "default": False,
+ },
+ },
+ },
+ "Translation": {
+ "description": "Translate to different languages using AI",
+ "url": "https://github.com/xhluca/dl-translate",
+ "config_schema": {
+ "model": {
+ "type": list,
+ "description": "The model",
+ "options": available_translation_models(),
+ "default": available_translation_models()[
+ 0
+ ],
+ },
+ "device": {
+ "type": list,
+ "description": '"cpu", "gpu" or "auto". If it\'s set to "auto", will try to select a GPU when available or else fall back to CPU',
+ "options": [
+ "auto",
+ get_available_devices(),
+ ],
+ "default": "auto",
+ },
+ "batch_size": {
+ "type": int,
+ "description": "The number of samples to load at once. If set to `None`, it will process everything at once\nA smaller value is preferred for `batch_size` if your (video) RAM is limited",
+ "options": None,
+ "default": 32,
+ },
+ "verbose": {
+ "type": bool,
+ "description": "Whether to display the progress bar for every batch processed.",
+ "options": None,
+ "default": True,
+ },
+ },
+ },
+}
+
"},{"location":"#subsai.models.whisper_model.WhisperModel.model_name","title":"model_name instance-attribute
class-attribute
","text":"model_name = 'openai/whisper'\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.config_schema","title":"config_schema instance-attribute
class-attribute
","text":"config_schema = {\n \"model_type\": {\n \"type\": list,\n \"description\": \"One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.\",\n \"options\": whisper.available_models(),\n \"default\": \"base\",\n },\n \"device\": {\n \"type\": list,\n \"description\": \"The PyTorch device to put the model into\",\n \"options\": [None, get_available_devices()],\n \"default\": None,\n },\n \"download_root\": {\n \"type\": str,\n \"description\": \"Path to download the model files; by default, it uses '~/.cache/whisper'\",\n \"options\": None,\n \"default\": None,\n },\n \"in_memory\": {\n \"type\": bool,\n \"description\": \"whether to preload the model weights into host memory\",\n \"options\": None,\n \"default\": False,\n },\n \"verbose\": {\n \"type\": bool,\n \"description\": \"Whether to display the text being decoded to the console. If True, displays all the details,If False, displays minimal details. If None, does not display anything\",\n \"options\": None,\n \"default\": None,\n },\n \"temperature\": {\n \"type\": Tuple,\n \"description\": \"Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.\",\n \"options\": None,\n \"default\": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),\n },\n \"compression_ratio_threshold\": {\n \"type\": float,\n \"description\": \"If the gzip compression ratio is above this value, treat as failed\",\n \"options\": None,\n \"default\": 2.4,\n },\n \"logprob_threshold\": {\n \"type\": float,\n \"description\": \"If the average log probability over sampled tokens is below this value, treat as failed\",\n \"options\": None,\n \"default\": -1.0,\n },\n \"no_speech_threshold\": {\n \"type\": float,\n \"description\": \"If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent\",\n \"options\": None,\n \"default\": 0.6,\n },\n \"condition_on_previous_text\": {\n \"type\": bool,\n \"description\": \"if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.\",\n \"options\": None,\n \"default\": True,\n },\n \"task\": {\n \"type\": list,\n \"description\": \"whether to perform X->X 'transcribe' or X->English 'translate'\",\n \"options\": [\"transcribe\", \"translate\"],\n \"default\": \"transcribe\",\n },\n \"language\": {\n \"type\": str,\n \"description\": \"language that the audio is in; uses detected language if None\",\n \"options\": None,\n \"default\": None,\n },\n \"sample_len\": {\n \"type\": int,\n \"description\": \"maximum number of tokens to sample\",\n \"options\": None,\n \"default\": None,\n },\n \"best_of\": {\n \"type\": int,\n \"description\": \"number of independent samples to collect, when t > 0\",\n \"options\": None,\n \"default\": None,\n },\n \"beam_size\": {\n \"type\": int,\n \"description\": \"number of beams in beam search, when t == 0\",\n \"options\": None,\n \"default\": None,\n },\n \"patience\": {\n \"type\": float,\n \"description\": \"patience in beam search (https://arxiv.org/abs/2204.05424)\",\n \"options\": None,\n \"default\": None,\n },\n \"length_penalty\": {\n \"type\": float,\n 
\"description\": \"'alpha' in Google NMT, None defaults to length norm\",\n \"options\": None,\n \"default\": None,\n },\n \"prompt\": {\n \"type\": str,\n \"description\": \"text or tokens for the previous context\",\n \"options\": None,\n \"default\": None,\n },\n \"prefix\": {\n \"type\": str,\n \"description\": \"text or tokens to prefix the current context\",\n \"options\": None,\n \"default\": None,\n },\n \"suppress_blank\": {\n \"type\": bool,\n \"description\": \"this will suppress blank outputs\",\n \"options\": None,\n \"default\": True,\n },\n \"suppress_tokens\": {\n \"type\": str,\n \"description\": 'list of tokens ids (or comma-separated token ids) to suppress \"-1\" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',\n \"options\": None,\n \"default\": \"-1\",\n },\n \"without_timestamps\": {\n \"type\": bool,\n \"description\": \"use <|notimestamps|> to sample text tokens only\",\n \"options\": None,\n \"default\": False,\n },\n \"max_initial_timestamp\": {\n \"type\": float,\n \"description\": \"the initial timestamp cannot be later than this\",\n \"options\": None,\n \"default\": 1.0,\n },\n \"fp16\": {\n \"type\": bool,\n \"description\": \"use fp16 for most of the calculation\",\n \"options\": None,\n \"default\": True,\n },\n}\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.model_type","title":"model_type instance-attribute
","text":"model_type = _load_config(\n \"model_type\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.device","title":"device instance-attribute
","text":"device = _load_config(\n \"device\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.download_root","title":"download_root instance-attribute
","text":"download_root = _load_config(\n \"download_root\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.in_memory","title":"in_memory instance-attribute
","text":"in_memory = _load_config(\n \"in_memory\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.verbose","title":"verbose instance-attribute
","text":"verbose = _load_config(\n \"verbose\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.temperature","title":"temperature instance-attribute
","text":"temperature = _load_config(\n \"temperature\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.compression_ratio_threshold","title":"compression_ratio_threshold instance-attribute
","text":"compression_ratio_threshold = _load_config(\n \"compression_ratio_threshold\",\n model_config,\n self.config_schema,\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.logprob_threshold","title":"logprob_threshold instance-attribute
","text":"logprob_threshold = _load_config(\n \"logprob_threshold\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.no_speech_threshold","title":"no_speech_threshold instance-attribute
","text":"no_speech_threshold = _load_config(\n \"no_speech_threshold\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.condition_on_previous_text","title":"condition_on_previous_text instance-attribute
","text":"condition_on_previous_text = _load_config(\n \"condition_on_previous_text\",\n model_config,\n self.config_schema,\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.decode_options","title":"decode_options instance-attribute
","text":"decode_options = {\n config: _load_config(\n config, model_config, self.config_schema\n )\n for config in self.config_schema\n if not hasattr(self, config)\n}\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.model","title":"model instance-attribute
","text":"model = whisper.load_model(\n name=self.model_type,\n device=self.device,\n download_root=self.download_root,\n in_memory=self.in_memory,\n)\n
"},{"location":"#subsai.models.whisper_model.WhisperModel.transcribe","title":"transcribe","text":"transcribe(media_file)\n
Source code in src/subsai/models/whisper_model.py
def transcribe(self, media_file) -> str:\n audio = whisper.load_audio(media_file)\n result = self.model.transcribe(audio, verbose=self.verbose,\n temperature=self.temperature,\n compression_ratio_threshold=self.compression_ratio_threshold,\n logprob_threshold=self.logprob_threshold,\n no_speech_threshold=self.no_speech_threshold,\n condition_on_previous_text=self.condition_on_previous_text,\n **self.decode_options)\n subs = pysubs2.load_from_whisper(result)\n return subs\n
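A sketch of driving this model through SubsAI. The media path is a placeholder, and the overridden keys come from WhisperModel.config_schema above; unlisted keys keep their defaults:
from subsai import SubsAI

subs_ai = SubsAI()
# override a few schema entries; values are illustrative
model = subs_ai.create_model('openai/whisper', {
    'model_type': 'small',
    'device': 'cpu',
    'language': 'en',
})
subs = subs_ai.transcribe('episode.mp4', model)
subs.save('episode.srt')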
"},{"location":"#subsai.models.whisperX_model","title":"whisperX_model","text":"WhisperX Model
See m-bain/whisperX
"},{"location":"#subsai.models.whisperX_model.WhisperXModel","title":"WhisperXModel","text":"WhisperXModel(model_config)\n
Bases: AbstractModel
src/subsai/models/whisperX_model.py
def __init__(self, model_config):\n super(WhisperXModel, self).__init__(model_config=model_config,\n model_name=self.model_name)\n # config\n self.model_type = _load_config('model_type', model_config, self.config_schema)\n self.device = _load_config('device', model_config, self.config_schema)\n self.compute_type = _load_config('compute_type', model_config, self.config_schema)\n self.download_root = _load_config('download_root', model_config, self.config_schema)\n self.language = _load_config('language', model_config, self.config_schema)\n self.segment_type = _load_config('segment_type', model_config, self.config_schema)\n # transcribe config\n self.batch_size = _load_config('batch_size', model_config, self.config_schema)\n self.return_char_alignments = _load_config('return_char_alignments', model_config, self.config_schema)\n self.speaker_labels = _load_config('speaker_labels', model_config, self.config_schema)\n self.HF_TOKEN = _load_config('HF_TOKEN', model_config, self.config_schema)\n self.min_speakers = _load_config('min_speakers', model_config, self.config_schema)\n self.max_speakers = _load_config('max_speakers', model_config, self.config_schema)\n\n self.model = whisperx.load_model(self.model_type,\n device=self.device,\n compute_type=self.compute_type,\n download_root=self.download_root,\n language=self.language)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.model_name","title":"model_name instance-attribute
class-attribute
","text":"model_name = 'm-bain/whisperX'\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.config_schema","title":"config_schema instance-attribute
class-attribute
","text":"config_schema = {\n \"model_type\": {\n \"type\": list,\n \"description\": \"One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.\",\n \"options\": whisper.available_models(),\n \"default\": \"base\",\n },\n \"device\": {\n \"type\": list,\n \"description\": \"The PyTorch device to put the model into\",\n \"options\": [\"cpu\", \"cuda\"],\n \"default\": \"cpu\",\n },\n \"compute_type\": {\n \"type\": list,\n \"description\": \"change to 'int8' if low on GPU mem (may reduce accuracy)\",\n \"options\": [\"default\", \"float16\", \"int8\"],\n \"default\": \"default\",\n },\n \"download_root\": {\n \"type\": str,\n \"description\": \"Path to download the model files; by default, it uses '~/.cache/whisper'\",\n \"options\": None,\n \"default\": None,\n },\n \"language\": {\n \"type\": str,\n \"description\": \"language that the audio is in; uses detected language if None\",\n \"options\": None,\n \"default\": None,\n },\n \"segment_type\": {\n \"type\": list,\n \"description\": \"Word-level timestamps, Choose here between sentence-level and word-level\",\n \"options\": [\"sentence\", \"word\"],\n \"default\": \"sentence\",\n },\n \"batch_size\": {\n \"type\": int,\n \"description\": \"reduce if low on GPU mem\",\n \"options\": None,\n \"default\": 16,\n },\n \"return_char_alignments\": {\n \"type\": bool,\n \"description\": \"Whether to return char alignments\",\n \"options\": None,\n \"default\": False,\n },\n \"speaker_labels\": {\n \"type\": bool,\n \"description\": \"Run Diarization Pipeline\",\n \"options\": None,\n \"default\": False,\n },\n \"HF_TOKEN\": {\n \"type\": str,\n \"description\": \"if speaker labels is True, you will need Hugging Face access token to use the diarization models, https://github.com/m-bain/whisperX#speaker-diarization\",\n \"options\": None,\n \"default\": None,\n },\n \"min_speakers\": {\n \"type\": int,\n \"description\": \"min speakers\",\n \"options\": None,\n \"default\": None,\n },\n \"max_speakers\": {\n \"type\": int,\n \"description\": \"max speakers\",\n \"options\": None,\n \"default\": None,\n },\n}\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.model_type","title":"model_type instance-attribute
","text":"model_type = _load_config(\n \"model_type\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.device","title":"device instance-attribute
","text":"device = _load_config(\n \"device\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.compute_type","title":"compute_type instance-attribute
","text":"compute_type = _load_config(\n \"compute_type\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.download_root","title":"download_root instance-attribute
","text":"download_root = _load_config(\n \"download_root\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.language","title":"language instance-attribute
","text":"language = _load_config(\n \"language\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.segment_type","title":"segment_type instance-attribute
","text":"segment_type = _load_config(\n \"segment_type\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.batch_size","title":"batch_size instance-attribute
","text":"batch_size = _load_config(\n \"batch_size\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.return_char_alignments","title":"return_char_alignments instance-attribute
","text":"return_char_alignments = _load_config(\n \"return_char_alignments\",\n model_config,\n self.config_schema,\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.speaker_labels","title":"speaker_labels instance-attribute
","text":"speaker_labels = _load_config(\n \"speaker_labels\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.HF_TOKEN","title":"HF_TOKEN instance-attribute
","text":"HF_TOKEN = _load_config(\n \"HF_TOKEN\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.min_speakers","title":"min_speakers instance-attribute
","text":"min_speakers = _load_config(\n \"min_speakers\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.max_speakers","title":"max_speakers instance-attribute
","text":"max_speakers = _load_config(\n \"max_speakers\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.model","title":"model instance-attribute
","text":"model = whisperx.load_model(\n self.model_type,\n device=self.device,\n compute_type=self.compute_type,\n download_root=self.download_root,\n language=self.language,\n)\n
"},{"location":"#subsai.models.whisperX_model.WhisperXModel.transcribe","title":"transcribe","text":"transcribe(media_file)\n
Source code in src/subsai/models/whisperX_model.py
def transcribe(self, media_file) -> str:\n audio = whisperx.load_audio(media_file)\n result = self.model.transcribe(audio, batch_size=self.batch_size)\n model_a, metadata = whisperx.load_align_model(language_code=result[\"language\"], device=self.device)\n result = whisperx.align(result[\"segments\"], model_a, metadata, audio, self.device, return_char_alignments=self.return_char_alignments)\n self._clear_gpu()\n del model_a\n if self.speaker_labels:\n diarize_model = whisperx.DiarizationPipeline(use_auth_token=self.HF_TOKEN, device=self.device)\n diarize_segments = diarize_model(audio, min_speakers=self.min_speakers, max_speakers=self.max_speakers)\n result = whisperx.assign_word_speakers(diarize_segments, result)\n self._clear_gpu()\n del diarize_model\n\n subs = SSAFile()\n if self.segment_type == 'word': # word level timestamps\n for segment in result['segments']:\n for word in segment['words']:\n event = SSAEvent(start=pysubs2.make_time(s=word[\"start\"]), end=pysubs2.make_time(s=word[\"end\"]))\n event.plaintext = word[\"word\"].strip()\n subs.append(event)\n elif self.segment_type == 'sentence':\n for segment in result['segments']:\n event = SSAEvent(start=pysubs2.make_time(s=segment[\"start\"]), end=pysubs2.make_time(s=segment[\"end\"]))\n event.plaintext = segment[\"text\"].strip()\n subs.append(event)\n else:\n raise Exception(f'Unknown `segment_type` value, it should be one of the following: '\n f' {self.config_schema[\"segment_type\"][\"options\"]}')\n return subs\n
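A sketch of using whisperX through SubsAI with word-level segments. The media path is a placeholder; to enable diarization you would additionally set speaker_labels to True and provide HF_TOKEN, as described in the schema above:
from subsai import SubsAI

subs_ai = SubsAI()
model = subs_ai.create_model('m-bain/whisperX', {
    'model_type': 'base',
    'device': 'cpu',
    'segment_type': 'word',
})
subs = subs_ai.transcribe('episode.mp4', model)
subs.save('episode-words.srt')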
"},{"location":"#subsai.models.faster_whisper_model","title":"faster_whisper_model","text":"Faster Whisper Model
See guillaumekln/faster-whisper
"},{"location":"#subsai.models.faster_whisper_model.FasterWhisperModel","title":"FasterWhisperModel","text":"FasterWhisperModel(model_config)\n
Bases: AbstractModel
src/subsai/models/faster_whisper_model.py
def __init__(self, model_config):\n super(FasterWhisperModel, self).__init__(model_config=model_config,\n model_name=self.model_name)\n # config\n self._model_size_or_path = _load_config('model_size_or_path', model_config, self.config_schema)\n self._device = _load_config('device', model_config, self.config_schema)\n self._device_index = _load_config('device_index', model_config, self.config_schema)\n self._compute_type = _load_config('compute_type', model_config, self.config_schema)\n self._cpu_threads = _load_config('cpu_threads', model_config, self.config_schema)\n self._num_workers = _load_config('num_workers', model_config, self.config_schema)\n\n self.transcribe_configs = \\\n {config: _load_config(config, model_config, self.config_schema)\n for config in self.config_schema if not hasattr(self, f\"_{config}\")}\n\n self.model = WhisperModel(model_size_or_path=self._model_size_or_path,\n device=self._device,\n device_index=self._device_index,\n compute_type=self._compute_type,\n cpu_threads=self._cpu_threads,\n num_workers=self._num_workers)\n
"},{"location":"#subsai.models.faster_whisper_model.FasterWhisperModel.model_name","title":"model_name instance-attribute
class-attribute
","text":"model_name = 'guillaumekln/faster-whisper'\n
"},{"location":"#subsai.models.faster_whisper_model.FasterWhisperModel.config_schema","title":"config_schema instance-attribute
class-attribute
","text":"config_schema = {\n \"model_size_or_path\": {\n \"type\": list,\n \"description\": 'Size of the model to use (e.g. \"large-v2\", \"small\", \"tiny.en\", etc.)or a path to a converted model directory. When a size is configured, the convertedmodel is downloaded from the Hugging Face Hub.',\n \"options\": whisper.available_models(),\n \"default\": \"base\",\n },\n \"device\": {\n \"type\": list,\n \"description\": 'Device to use for computation (\"cpu\", \"cuda\", \"auto\")',\n \"options\": [\"auto\", \"cpu\", \"cuda\"],\n \"default\": \"auto\",\n },\n \"device_index\": {\n \"type\": int,\n \"description\": \"Device ID to use.The model can also be loaded on multiple GPUs by passing a list of IDs(e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallelwhen transcribe() is called from multiple Python threads (see also num_workers).\",\n \"options\": None,\n \"default\": 0,\n },\n \"compute_type\": {\n \"type\": str,\n \"description\": \"Type to use for computation.See https://opennmt.net/CTranslate2/quantization.html.\",\n \"options\": None,\n \"default\": \"default\",\n },\n \"cpu_threads\": {\n \"type\": int,\n \"description\": \"Number of threads to use when running on CPU (4 by default).A non zero value overrides the OMP_NUM_THREADS environment variable.\",\n \"options\": None,\n \"default\": 0,\n },\n \"num_workers\": {\n \"type\": int,\n \"description\": \"When transcribe() is called from multiple Python threads,having multiple workers enables true parallelism when running the model(concurrent calls to self.model.generate() will run in parallel).This can improve the global throughput at the cost of increased memory usage.\",\n \"options\": None,\n \"default\": 1,\n },\n \"temperature\": {\n \"type\": Tuple,\n \"description\": \"Temperature for sampling. 
It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.\",\n \"options\": None,\n \"default\": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],\n },\n \"compression_ratio_threshold\": {\n \"type\": float,\n \"description\": \"If the gzip compression ratio is above this value, treat as failed\",\n \"options\": None,\n \"default\": 2.4,\n },\n \"log_prob_threshold\": {\n \"type\": float,\n \"description\": \"If the average log probability over sampled tokens is below this value, treat as failed\",\n \"options\": None,\n \"default\": -1.0,\n },\n \"no_speech_threshold\": {\n \"type\": float,\n \"description\": \"If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent\",\n \"options\": None,\n \"default\": 0.6,\n },\n \"condition_on_previous_text\": {\n \"type\": bool,\n \"description\": \"if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.\",\n \"options\": None,\n \"default\": True,\n },\n \"task\": {\n \"type\": list,\n \"description\": \"whether to perform X->X 'transcribe' or X->English 'translate'\",\n \"options\": [\"transcribe\", \"translate\"],\n \"default\": \"transcribe\",\n },\n \"language\": {\n \"type\": str,\n \"description\": \"language that the audio is in; uses detected language if None\",\n \"options\": None,\n \"default\": None,\n },\n \"best_of\": {\n \"type\": int,\n \"description\": \"number of independent samples to collect, when t > 0\",\n \"options\": None,\n \"default\": 5,\n },\n \"beam_size\": {\n \"type\": int,\n \"description\": \"number of beams in beam search, when t == 0\",\n \"options\": None,\n \"default\": 5,\n },\n \"patience\": {\n \"type\": float,\n \"description\": \"patience in beam search (https://arxiv.org/abs/2204.05424)\",\n \"options\": None,\n \"default\": 1.0,\n },\n \"length_penalty\": {\n \"type\": float,\n \"description\": \"'alpha' in Google NMT, None defaults to length norm\",\n \"options\": None,\n \"default\": 1.0,\n },\n \"prefix\": {\n \"type\": str,\n \"description\": \"text or tokens to prefix the current context\",\n \"options\": None,\n \"default\": None,\n },\n \"suppress_blank\": {\n \"type\": bool,\n \"description\": \"this will suppress blank outputs\",\n \"options\": None,\n \"default\": True,\n },\n \"suppress_tokens\": {\n \"type\": Tuple,\n \"description\": 'list of tokens ids (or comma-separated token ids) to suppress \"-1\" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',\n \"options\": None,\n \"default\": [-1],\n },\n \"without_timestamps\": {\n \"type\": bool,\n \"description\": \"use <|notimestamps|> to sample text tokens only\",\n \"options\": None,\n \"default\": False,\n },\n \"max_initial_timestamp\": {\n \"type\": float,\n \"description\": \"the initial timestamp cannot be later than this\",\n \"options\": None,\n \"default\": 1.0,\n },\n \"initial_prompt\": {\n \"type\": str,\n \"description\": \"Optional text to provide as a prompt for the first window.\",\n \"options\": None,\n \"default\": None,\n },\n \"word_timestamps\": {\n \"type\": bool,\n \"description\": \"Extract word-level timestamps using the cross-attention patternand dynamic time warping, and include the 
timestamps for each word in each segment.\",\n \"options\": None,\n \"default\": False,\n },\n \"prepend_punctuations\": {\n \"type\": str,\n \"description\": \"If word_timestamps is True, merge these punctuation symbolswith the next word\",\n \"options\": None,\n \"default\": \"\\\"'\u201c\u00bf([{-\",\n },\n \"append_punctuations\": {\n \"type\": str,\n \"description\": \"If word_timestamps is True, merge these punctuation symbolswith the previous word\",\n \"options\": None,\n \"default\": \"\\\"'.\u3002,\uff0c!\uff01?\uff1f:\uff1a\u201d)]}\u3001\",\n },\n \"vad_filter\": {\n \"type\": bool,\n \"description\": \"If True, use the integrated Silero VAD model to filter out parts of the audio without speech.\",\n \"options\": None,\n \"default\": False,\n },\n \"vad_parameters\": {\n \"type\": dict,\n \"description\": \"Parameters for splitting long audios into speech chunks using silero VAD.\",\n \"options\": None,\n \"default\": {\n \"threshold\": 0.5,\n \"min_speech_duration_ms\": 250,\n \"max_speech_duration_s\": float(\"inf\"),\n \"min_silence_duration_ms\": 2000,\n \"window_size_samples\": 1024,\n \"speech_pad_ms\": 400,\n },\n },\n}\n
"},{"location":"#subsai.models.faster_whisper_model.FasterWhisperModel.transcribe_configs","title":"transcribe_configs instance-attribute
","text":"transcribe_configs = {\n config: _load_config(\n config, model_config, self.config_schema\n )\n for config in self.config_schema\n if not hasattr(self, f\"_{config}\")\n}\n
"},{"location":"#subsai.models.faster_whisper_model.FasterWhisperModel.model","title":"model instance-attribute
","text":"model = WhisperModel(\n model_size_or_path=self._model_size_or_path,\n device=self._device,\n device_index=self._device_index,\n compute_type=self._compute_type,\n cpu_threads=self._cpu_threads,\n num_workers=self._num_workers,\n)\n
"},{"location":"#subsai.models.faster_whisper_model.FasterWhisperModel.transcribe","title":"transcribe","text":"transcribe(media_file)\n
Source code in src/subsai/models/faster_whisper_model.py
def transcribe(self, media_file) -> str:\n segments, info = self.model.transcribe(media_file, **self.transcribe_configs)\n subs = SSAFile()\n if self.transcribe_configs['word_timestamps']: # word level timestamps\n for segment in segments:\n for word in segment.words:\n event = SSAEvent(start=pysubs2.make_time(s=word.start), end=pysubs2.make_time(s=word.end))\n event.plaintext = word.word.strip()\n subs.append(event)\n else:\n for segment in segments:\n event = SSAEvent(start=pysubs2.make_time(s=segment.start), end=pysubs2.make_time(s=segment.end))\n event.plaintext = segment.text.strip()\n subs.append(event)\n return subs\n
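A sketch of using faster-whisper through SubsAI. The media path is a placeholder; vad_filter and word_timestamps are ordinary keys from the schema above and are shown here only to illustrate passing transcription options:
from subsai import SubsAI

subs_ai = SubsAI()
model = subs_ai.create_model('guillaumekln/faster-whisper', {
    'model_size_or_path': 'base',
    'device': 'cpu',
    'vad_filter': True,
    'word_timestamps': False,
})
subs = subs_ai.transcribe('episode.mp4', model)
subs.save('episode.srt')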
"},{"location":"#subsai.models.whispercpp_model","title":"whispercpp_model","text":"Whisper.cpp Model
See whisper.cpp and pywhispercpp
"},{"location":"#subsai.models.whispercpp_model.WhisperCppModel","title":"WhisperCppModel","text":"WhisperCppModel(model_config)\n
Bases: AbstractModel
src/subsai/models/whispercpp_model.py
def __init__(self, model_config):\n super(WhisperCppModel, self).__init__(model_config=model_config,\n model_name=self.model_name)\n # config\n self.model_type = _load_config('model_type', model_config, self.config_schema)\n\n self.params = {}\n for config in self.config_schema:\n if not hasattr(self, config):\n config_value = _load_config(config, model_config, self.config_schema)\n if config_value is None:\n continue\n self.params[config] = config_value\n\n self.model = Model(model=self.model_type, **self.params)\n
"},{"location":"#subsai.models.whispercpp_model.WhisperCppModel.model_name","title":"model_name instance-attribute
class-attribute
","text":"model_name = 'ggerganov/whisper.cpp'\n
"},{"location":"#subsai.models.whispercpp_model.WhisperCppModel.config_schema","title":"config_schema instance-attribute
class-attribute
","text":"config_schema = {\n \"model_type\": {\n \"type\": list,\n \"description\": \"Available whisper.cpp models\",\n \"options\": AVAILABLE_MODELS,\n \"default\": \"base\",\n },\n \"n_threads\": {\n \"type\": int,\n \"description\": \"Number of threads to allocate for the inferencedefault to min(4, available hardware_concurrency)\",\n \"options\": None,\n \"default\": 4,\n },\n \"n_max_text_ctx\": {\n \"type\": int,\n \"description\": \"max tokens to use from past text as prompt for the decoder\",\n \"options\": None,\n \"default\": 16384,\n },\n \"offset_ms\": {\n \"type\": int,\n \"description\": \"start offset in ms\",\n \"options\": None,\n \"default\": 0,\n },\n \"duration_ms\": {\n \"type\": int,\n \"description\": \"audio duration to process in ms\",\n \"options\": None,\n \"default\": 0,\n },\n \"translate\": {\n \"type\": bool,\n \"description\": \"whether to translate the audio to English\",\n \"options\": None,\n \"default\": False,\n },\n \"no_context\": {\n \"type\": bool,\n \"description\": \"do not use past transcription (if any) as initial prompt for the decoder\",\n \"options\": None,\n \"default\": False,\n },\n \"single_segment\": {\n \"type\": bool,\n \"description\": \"force single segment output (useful for streaming)\",\n \"options\": None,\n \"default\": False,\n },\n \"print_special\": {\n \"type\": bool,\n \"description\": \"print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.)\",\n \"options\": None,\n \"default\": False,\n },\n \"print_progress\": {\n \"type\": bool,\n \"description\": \"print progress information\",\n \"options\": None,\n \"default\": True,\n },\n \"print_realtime\": {\n \"type\": bool,\n \"description\": \"print results from within whisper.cpp (avoid it, use callback instead)\",\n \"options\": None,\n \"default\": False,\n },\n \"print_timestamps\": {\n \"type\": bool,\n \"description\": \"print timestamps for each text segment when printing realtime\",\n \"options\": None,\n \"default\": True,\n },\n \"token_timestamps\": {\n \"type\": bool,\n \"description\": \"enable token-level timestamps\",\n \"options\": None,\n \"default\": False,\n },\n \"thold_pt\": {\n \"type\": float,\n \"description\": \"timestamp token probability threshold (~0.01)\",\n \"options\": None,\n \"default\": 0.01,\n },\n \"thold_ptsum\": {\n \"type\": float,\n \"description\": \"timestamp token sum probability threshold (~0.01)\",\n \"options\": None,\n \"default\": 0.01,\n },\n \"max_len\": {\n \"type\": int,\n \"description\": \"max segment length in characters\",\n \"options\": None,\n \"default\": 0,\n },\n \"split_on_word\": {\n \"type\": bool,\n \"description\": \"split on word rather than on token (when used with max_len)\",\n \"options\": None,\n \"default\": False,\n },\n \"max_tokens\": {\n \"type\": int,\n \"description\": \"max tokens per segment (0 = no limit)\",\n \"options\": None,\n \"default\": 0,\n },\n \"speed_up\": {\n \"type\": bool,\n \"description\": \"speed-up the audio by 2x using Phase Vocoder\",\n \"options\": None,\n \"default\": False,\n },\n \"audio_ctx\": {\n \"type\": int,\n \"description\": \"overwrite the audio context size (0 = use default)\",\n \"options\": None,\n \"default\": 0,\n },\n \"prompt_n_tokens\": {\n \"type\": int,\n \"description\": \"tokens to provide to the whisper decoder as initial prompt\",\n \"options\": None,\n \"default\": 0,\n },\n \"language\": {\n \"type\": str,\n \"description\": 'for auto-detection, set to None, \"\" or \"auto\"',\n \"options\": None,\n \"default\": \"en\",\n },\n \"suppress_blank\": {\n 
\"type\": bool,\n \"description\": \"common decoding parameters\",\n \"options\": None,\n \"default\": True,\n },\n \"suppress_non_speech_tokens\": {\n \"type\": bool,\n \"description\": \"common decoding parameters\",\n \"options\": None,\n \"default\": False,\n },\n \"temperature\": {\n \"type\": float,\n \"description\": \"initial decoding temperature\",\n \"options\": None,\n \"default\": 0.0,\n },\n \"max_initial_ts\": {\n \"type\": float,\n \"description\": \"max_initial_ts\",\n \"options\": None,\n \"default\": 1.0,\n },\n \"length_penalty\": {\n \"type\": float,\n \"description\": \"length_penalty\",\n \"options\": None,\n \"default\": -1.0,\n },\n \"temperature_inc\": {\n \"type\": float,\n \"description\": \"temperature_inc\",\n \"options\": None,\n \"default\": 0.2,\n },\n \"entropy_thold\": {\n \"type\": float,\n \"description\": 'similar to OpenAI\\'s \"compression_ratio_threshold\"',\n \"options\": None,\n \"default\": 2.4,\n },\n \"logprob_thold\": {\n \"type\": float,\n \"description\": \"logprob_thold\",\n \"options\": None,\n \"default\": -1.0,\n },\n \"no_speech_thold\": {\n \"type\": float,\n \"description\": \"no_speech_thold\",\n \"options\": None,\n \"default\": 0.6,\n },\n \"greedy\": {\n \"type\": dict,\n \"description\": \"greedy\",\n \"options\": None,\n \"default\": {\"best_of\": -1},\n },\n \"beam_search\": {\n \"type\": dict,\n \"description\": \"beam_search\",\n \"options\": None,\n \"default\": {\"beam_size\": -1, \"patience\": -1.0},\n },\n}\n
"},{"location":"#subsai.models.whispercpp_model.WhisperCppModel.model_type","title":"model_type instance-attribute
","text":"model_type = _load_config(\n \"model_type\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whispercpp_model.WhisperCppModel.params","title":"params instance-attribute
","text":"params = {}\n
"},{"location":"#subsai.models.whispercpp_model.WhisperCppModel.model","title":"model instance-attribute
","text":"model = Model(model=self.model_type, None=self.params)\n
"},{"location":"#subsai.models.whispercpp_model.WhisperCppModel.transcribe","title":"transcribe","text":"transcribe(media_file)\n
Source code in src/subsai/models/whispercpp_model.py
def transcribe(self, media_file) -> str:\n segments = self.model.transcribe(media=media_file)\n subs = SSAFile()\n for seg in segments:\n event = SSAEvent(start=seg.t0*10, end=seg.t1*10)\n event.plaintext = seg.text.strip()\n subs.append(event)\n return subs\n
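A sketch of using the whisper.cpp backend through SubsAI. The media path is a placeholder; n_threads is one of the pywhispercpp parameters listed in the schema above:
from subsai import SubsAI

subs_ai = SubsAI()
model = subs_ai.create_model('ggerganov/whisper.cpp', {
    'model_type': 'base',
    'n_threads': 4,
})
subs = subs_ai.transcribe('episode.mp4', model)
subs.save('episode.srt')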
"},{"location":"#subsai.models.abstract_model","title":"abstract_model","text":"API that the transcription models should follow
"},{"location":"#subsai.models.abstract_model.AbstractModel","title":"AbstractModel","text":"AbstractModel(model_name=None, model_config={})\n
Bases: ABC
Abstract Model class
Source code insrc/subsai/models/abstract_model.py
def __init__(self, model_name=None, model_config={}):\n self.model_name = model_name\n self.model_config = model_config\n
"},{"location":"#subsai.models.abstract_model.AbstractModel.model_name","title":"model_name instance-attribute
","text":"model_name = model_name\n
"},{"location":"#subsai.models.abstract_model.AbstractModel.model_config","title":"model_config instance-attribute
","text":"model_config = model_config\n
"},{"location":"#subsai.models.abstract_model.AbstractModel.transcribe","title":"transcribe abstractmethod
","text":"transcribe(media_file)\n
Transcribe the media_file
to subtitles.
Example use case from pysubs2.whisper:
subs = SSAFile()
for segment in segments:
    event = SSAEvent(start=make_time(s=segment[\"start\"]), end=make_time(s=segment[\"end\"]))
    event.plaintext = segment[\"text\"].strip()
    subs.append(event)
Parameters:
Path of the media file
Returns:
SSAFile
\u2013 Collection of SSAEvent(s) (see :mod:pysubs2.ssaevent
)
src/subsai/models/abstract_model.py
@abstractmethod\ndef transcribe(self, media_file) -> SSAFile:\n\"\"\"\n Transcribe the `media_file` to subtitles.\n\n example use case from pysubs2.whisper:\n\n .. code-block:: python\n :linenos:\n\n subs = SSAFile()\n for segment in segments:\n event = SSAEvent(start=make_time(s=segment[\"start\"]), end=make_time(s=segment[\"end\"]))\n event.plaintext = segment[\"text\"].strip()\n subs.append(event)\n\n :param media_file: Path of the media file\n :return: Collection of SSAEvent(s) (see :mod:`pysubs2.ssaevent`)\n \"\"\"\n pass\n
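A minimal sketch of implementing this API with a custom model. The class name EchoModel and its hard-coded output are hypothetical, only illustrating the required shape; a real model would run an ASR engine and could call model.transcribe(file) directly or be wired into the configs:
import pysubs2
from pysubs2 import SSAFile, SSAEvent
from subsai.models.abstract_model import AbstractModel

class EchoModel(AbstractModel):
    """Toy model: emits a single hard-coded segment to show the required API."""
    model_name = 'example/echo'

    def __init__(self, model_config={}):
        super().__init__(model_name=self.model_name, model_config=model_config)

    def transcribe(self, media_file) -> SSAFile:
        subs = SSAFile()
        event = SSAEvent(start=pysubs2.make_time(s=0), end=pysubs2.make_time(s=2))
        event.plaintext = f'placeholder transcript for {media_file}'
        subs.append(event)
        return subs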
"},{"location":"#subsai.models.whisper_timestamped_model","title":"whisper_timestamped_model","text":"whisper_timestamped
See linto-ai/whisper-timestamped
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped","title":"WhisperTimeStamped","text":"WhisperTimeStamped(model_config={})\n
Bases: AbstractModel
src/subsai/models/whisper_timestamped_model.py
def __init__(self, model_config={}):\n super(WhisperTimeStamped, self).__init__(model_config=model_config,\n model_name=self.model_name)\n # config\n self.model_type = _load_config('model_type', model_config, self.config_schema)\n self.segment_type = _load_config('segment_type', model_config, self.config_schema)\n self.device = _load_config('device', model_config, self.config_schema)\n self.download_root = _load_config('download_root', model_config, self.config_schema)\n self.in_memory = _load_config('in_memory', model_config, self.config_schema)\n\n self.verbose = _load_config('verbose', model_config, self.config_schema)\n self.temperature = _load_config('temperature', model_config, self.config_schema)\n self.compression_ratio_threshold = _load_config('compression_ratio_threshold', model_config, self.config_schema)\n self.logprob_threshold = _load_config('logprob_threshold', model_config, self.config_schema)\n self.no_speech_threshold = _load_config('no_speech_threshold', model_config, self.config_schema)\n self.condition_on_previous_text = _load_config('condition_on_previous_text', model_config, self.config_schema)\n\n self.decode_options = \\\n {config: _load_config(config, model_config, self.config_schema)\n for config in self.config_schema if not hasattr(self, config)}\n\n self.model = whisper_timestamped.load_model(name=self.model_type,\n device=self.device,\n download_root=self.download_root,\n in_memory=self.in_memory)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.model_name","title":"model_name instance-attribute
class-attribute
","text":"model_name = 'linto-ai/whisper-timestamped'\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.config_schema","title":"config_schema instance-attribute
class-attribute
","text":"config_schema = {\n \"model_type\": {\n \"type\": list,\n \"description\": \"One of the official model names listed by `whisper.available_models()`, or path to a model checkpoint containing the model dimensions and the model state_dict.\",\n \"options\": whisper_timestamped.available_models(),\n \"default\": \"base\",\n },\n \"segment_type\": {\n \"type\": list,\n \"description\": \"Whisper_timestamps gives the ability to have word-level timestamps, Choose here between sentence-level and word-level\",\n \"options\": [\"sentence\", \"word\"],\n \"default\": \"sentence\",\n },\n \"device\": {\n \"type\": list,\n \"description\": \"The PyTorch device to put the model into\",\n \"options\": [None, get_available_devices()],\n \"default\": None,\n },\n \"download_root\": {\n \"type\": str,\n \"description\": \"Path to download the model files; by default, it uses '~/.cache/whisper'\",\n \"options\": None,\n \"default\": None,\n },\n \"in_memory\": {\n \"type\": bool,\n \"description\": \"whether to preload the model weights into host memory\",\n \"options\": None,\n \"default\": False,\n },\n \"verbose\": {\n \"type\": bool,\n \"description\": \"Whether to display the text being decoded to the console. If True, displays all the details,If False, displays minimal details. If None, does not display anything\",\n \"options\": None,\n \"default\": None,\n },\n \"temperature\": {\n \"type\": Tuple,\n \"description\": \"Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.\",\n \"options\": None,\n \"default\": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),\n },\n \"compression_ratio_threshold\": {\n \"type\": float,\n \"description\": \"If the gzip compression ratio is above this value, treat as failed\",\n \"options\": None,\n \"default\": 2.4,\n },\n \"logprob_threshold\": {\n \"type\": float,\n \"description\": \"If the average log probability over sampled tokens is below this value, treat as failed\",\n \"options\": None,\n \"default\": -1.0,\n },\n \"no_speech_threshold\": {\n \"type\": float,\n \"description\": \"If the no_speech probability is higher than this value AND the average log probability over sampled tokens is below `logprob_threshold`, consider the segment as silent\",\n \"options\": None,\n \"default\": 0.6,\n },\n \"condition_on_previous_text\": {\n \"type\": bool,\n \"description\": \"if True, the previous output of the model is provided as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.\",\n \"options\": None,\n \"default\": True,\n },\n \"task\": {\n \"type\": list,\n \"description\": \"whether to perform X->X 'transcribe' or X->English 'translate'\",\n \"options\": [\"transcribe\", \"translate\"],\n \"default\": \"transcribe\",\n },\n \"language\": {\n \"type\": str,\n \"description\": \"language that the audio is in; uses detected language if None\",\n \"options\": None,\n \"default\": None,\n },\n \"sample_len\": {\n \"type\": int,\n \"description\": \"maximum number of tokens to sample\",\n \"options\": None,\n \"default\": None,\n },\n \"best_of\": {\n \"type\": int,\n \"description\": \"number of independent samples to collect, when t > 0\",\n \"options\": None,\n \"default\": None,\n },\n \"beam_size\": {\n \"type\": int,\n \"description\": \"number of beams in beam search, when t == 
0\",\n \"options\": None,\n \"default\": None,\n },\n \"patience\": {\n \"type\": float,\n \"description\": \"patience in beam search (https://arxiv.org/abs/2204.05424)\",\n \"options\": None,\n \"default\": None,\n },\n \"length_penalty\": {\n \"type\": float,\n \"description\": \"'alpha' in Google NMT, None defaults to length norm\",\n \"options\": None,\n \"default\": None,\n },\n \"suppress_tokens\": {\n \"type\": str,\n \"description\": 'list of tokens ids (or comma-separated token ids) to suppress \"-1\" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`',\n \"options\": None,\n \"default\": \"-1\",\n },\n \"fp16\": {\n \"type\": bool,\n \"description\": \"use fp16 for most of the calculation\",\n \"options\": None,\n \"default\": True,\n },\n \"remove_punctuation_from_words\": {\n \"type\": bool,\n \"description\": \"If False, words will be glued with the next punctuation mark (if any).If True, there will be no punctuation mark in the `words[:]['text']` list.It only affects these strings; This has no influence on the computation of the word confidence, whatever the value of `include_punctuation_in_confidence` is.\",\n \"options\": None,\n \"default\": False,\n },\n \"refine_whisper_precision\": {\n \"type\": float,\n \"description\": \"How much can we refine Whisper segment positions, in seconds. Must be a multiple of 0.02.\",\n \"options\": None,\n \"default\": 0.5,\n },\n \"min_word_duration\": {\n \"type\": float,\n \"description\": \"Minimum duration of a word, in seconds. If a word is shorter than this, timestamps will be adjusted.\",\n \"options\": None,\n \"default\": 0.04,\n },\n \"plot_word_alignment\": {\n \"type\": bool,\n \"description\": \"Whether to plot the word alignment for each segment. matplotlib must be installed to use this option.\",\n \"options\": None,\n \"default\": False,\n },\n \"seed\": {\n \"type\": int,\n \"description\": \"Random seed to use for temperature sampling, for the sake of reproducibility.Choose None for unpredictable randomness\",\n \"options\": None,\n \"default\": 1234,\n },\n \"vad\": {\n \"type\": bool,\n \"description\": \"Whether to perform voice activity detection (VAD) on the audio file, to remove silent parts before transcribing with Whisper model. This should decrease hallucinations from the Whisper model.\",\n \"options\": None,\n \"default\": False,\n },\n \"detect_disfluencies\": {\n \"type\": bool,\n \"description\": 'Whether to detect disfluencies (i.e. hesitations, filler words, repetitions, corrections, etc.) that Whisper model might have omitted in the transcription. This should make the word timestamp prediction more accurate.And probable disfluencies will be marked as special words \"[*]\"',\n \"options\": None,\n \"default\": False,\n },\n \"trust_whisper_timestamps\": {\n \"type\": bool,\n \"description\": \"Whether to rely on Whisper's timestamps to get approximative first estimate of segment positions (up to refine_whisper_precision).\",\n \"options\": None,\n \"default\": True,\n },\n \"naive_approach\": {\n \"type\": bool,\n \"description\": \"Force the naive approach that consists in decoding twice the audio file, once to get the transcription and once with the decoded tokens to get the alignment. Note that this approach is used anyway when beam_size is not None and/or when the temperature is a list with more than one element.\",\n \"options\": None,\n \"default\": False,\n },\n}\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.model_type","title":"model_type instance-attribute
","text":"model_type = _load_config(\n \"model_type\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.segment_type","title":"segment_type instance-attribute
","text":"segment_type = _load_config(\n \"segment_type\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.device","title":"device instance-attribute
","text":"device = _load_config(\n \"device\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.download_root","title":"download_root instance-attribute
","text":"download_root = _load_config(\n \"download_root\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.in_memory","title":"in_memory instance-attribute
","text":"in_memory = _load_config(\n \"in_memory\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.verbose","title":"verbose instance-attribute
","text":"verbose = _load_config(\n \"verbose\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.temperature","title":"temperature instance-attribute
","text":"temperature = _load_config(\n \"temperature\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.compression_ratio_threshold","title":"compression_ratio_threshold instance-attribute
","text":"compression_ratio_threshold = _load_config(\n \"compression_ratio_threshold\",\n model_config,\n self.config_schema,\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.logprob_threshold","title":"logprob_threshold instance-attribute
","text":"logprob_threshold = _load_config(\n \"logprob_threshold\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.no_speech_threshold","title":"no_speech_threshold instance-attribute
","text":"no_speech_threshold = _load_config(\n \"no_speech_threshold\", model_config, self.config_schema\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.condition_on_previous_text","title":"condition_on_previous_text instance-attribute
","text":"condition_on_previous_text = _load_config(\n \"condition_on_previous_text\",\n model_config,\n self.config_schema,\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.decode_options","title":"decode_options instance-attribute
","text":"decode_options = {\n config: _load_config(\n config, model_config, self.config_schema\n )\n for config in self.config_schema\n if not hasattr(self, config)\n}\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.model","title":"model instance-attribute
","text":"model = whisper_timestamped.load_model(\n name=self.model_type,\n device=self.device,\n download_root=self.download_root,\n in_memory=self.in_memory,\n)\n
"},{"location":"#subsai.models.whisper_timestamped_model.WhisperTimeStamped.transcribe","title":"transcribe","text":"transcribe(media_file)\n
Source code in src/subsai/models/whisper_timestamped_model.py
def transcribe(self, media_file) -> str:\n audio = whisper_timestamped.load_audio(media_file)\n results = whisper_timestamped.transcribe(self.model, audio,\n verbose=self.verbose,\n temperature=self.temperature,\n compression_ratio_threshold=self.compression_ratio_threshold,\n logprob_threshold=self.logprob_threshold,\n no_speech_threshold=self.no_speech_threshold,\n condition_on_previous_text=self.condition_on_previous_text,\n **self.decode_options\n )\n subs = SSAFile()\n if self.segment_type == 'word': # word level timestamps\n for segment in results['segments']:\n for word in segment['words']:\n event = SSAEvent(start=pysubs2.make_time(s=word[\"start\"]), end=pysubs2.make_time(s=word[\"end\"]))\n event.plaintext = word[\"text\"].strip()\n subs.append(event)\n elif self.segment_type == 'sentence':\n for segment in results['segments']:\n event = SSAEvent(start=pysubs2.make_time(s=segment[\"start\"]), end=pysubs2.make_time(s=segment[\"end\"]))\n event.plaintext = segment[\"text\"].strip()\n subs.append(event)\n else:\n raise Exception(f'Unknown `segment_type` value, it should be one of the following: '\n f' {self.config_schema[\"segment_type\"][\"options\"]}')\n return subs\n
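A sketch of using whisper-timestamped through SubsAI with word-level segments. The media path is a placeholder; vad is enabled here only to illustrate the schema option that trims silence before transcription:
from subsai import SubsAI

subs_ai = SubsAI()
model = subs_ai.create_model('linto-ai/whisper-timestamped', {
    'model_type': 'base',
    'segment_type': 'word',
    'vad': True,
})
subs = subs_ai.transcribe('episode.mp4', model)
subs.save('episode-words.srt')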
"},{"location":"#subsai.configs","title":"subsai.configs","text":"Configurations file
"},{"location":"#subsai.configs.AVAILABLE_MODELS","title":"AVAILABLE_MODELSmodule-attribute
","text":"AVAILABLE_MODELS = {\n \"openai/whisper\": {\n \"class\": WhisperModel,\n \"description\": \"Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.\",\n \"url\": \"https://github.com/openai/whisper\",\n \"config_schema\": WhisperModel.config_schema,\n },\n \"linto-ai/whisper-timestamped\": {\n \"class\": WhisperTimeStamped,\n \"description\": \"Multilingual Automatic Speech Recognition with word-level timestamps and confidence.\",\n \"url\": \"https://github.com/linto-ai/whisper-timestamped\",\n \"config_schema\": WhisperTimeStamped.config_schema,\n },\n \"ggerganov/whisper.cpp\": {\n \"class\": WhisperCppModel,\n \"description\": \"High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model\\n* Plain C/C++ implementation without dependencies\\n* Runs on the CPU\\n\",\n \"url\": \"https://github.com/ggerganov/whisper.cpp\\nhttps://github.com/abdeladim-s/pywhispercpp\",\n \"config_schema\": WhisperCppModel.config_schema,\n },\n \"guillaumekln/faster-whisper\": {\n \"class\": FasterWhisperModel,\n \"description\": \"**faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.\\nThis implementation is up to 4 times faster than [openai/whisper]( https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.\",\n \"url\": \"https://github.com/guillaumekln/faster-whisper\",\n \"config_schema\": FasterWhisperModel.config_schema,\n },\n \"m-bain/whisperX\": {\n \"class\": WhisperXModel,\n \"description\": \"**whisperX** is a fast automatic speech recognition (70x realtime with large-v2) with word-level timestamps and speaker diarization.\",\n \"url\": \"https://github.com/m-bain/whisperX\",\n \"config_schema\": WhisperXModel.config_schema,\n },\n}\n
"},{"location":"#subsai.configs.BASIC_TOOLS_CONFIGS","title":"BASIC_TOOLS_CONFIGS module-attribute
","text":"BASIC_TOOLS_CONFIGS = {\n \"set time\": {\n \"description\": \"Set time to a subtitle\",\n \"config_schema\": {\n \"h\": {\n \"type\": float,\n \"description\": \"hours: Integer or float values, may be positive or negative\",\n \"options\": None,\n \"default\": 0,\n },\n \"m\": {\n \"type\": float,\n \"description\": \"minutes: Integer or float values, may be positive or negative\",\n \"options\": None,\n \"default\": 0,\n },\n \"s\": {\n \"type\": float,\n \"description\": \"seconds: Integer or float values, may be positive or negative\",\n \"options\": None,\n \"default\": 0,\n },\n \"ms\": {\n \"type\": float,\n \"description\": \"milliseconds: Integer or float values, may be positive or negative\",\n \"options\": None,\n \"default\": 0,\n },\n },\n },\n \"shift\": {\n \"description\": \"Shift all subtitles by constant time amount\",\n \"config_schema\": {\n \"h\": {\n \"type\": float,\n \"description\": \"hours: Integer or float values, may be positive or negative\",\n \"options\": None,\n \"default\": 0,\n },\n \"m\": {\n \"type\": float,\n \"description\": \"minutes: Integer or float values, may be positive or negative\",\n \"options\": None,\n \"default\": 0,\n },\n \"s\": {\n \"type\": float,\n \"description\": \"seconds: Integer or float values, may be positive or negative\",\n \"options\": None,\n \"default\": 0,\n },\n \"ms\": {\n \"type\": float,\n \"description\": \"milliseconds: Integer or float values, may be positive or negative\",\n \"options\": None,\n \"default\": 0,\n },\n \"frames\": {\n \"type\": int,\n \"description\": \"When specified, must be an integer number of frames\",\n \"options\": None,\n \"default\": None,\n },\n \"fps\": {\n \"type\": float,\n \"description\": \"When specified, must be a positive number.\",\n \"options\": None,\n \"default\": None,\n },\n },\n },\n}\n
"},{"location":"#subsai.configs.ADVANCED_TOOLS_CONFIGS","title":"ADVANCED_TOOLS_CONFIGS module-attribute
","text":"ADVANCED_TOOLS_CONFIGS = {\n \"ffsubsync\": {\n \"description\": \"Language-agnostic automatic synchronization of subtitles with video, so that subtitles are aligned to the correct starting point within the video.\",\n \"url\": \"https://github.com/smacke/ffsubsync\",\n \"config_schema\": {\n \"vad\": {\n \"type\": list,\n \"description\": \"Which voice activity detector to use for speech extraction (if using video / audio as a reference\",\n \"options\": [\n \"subs_then_webrtc\",\n \"webrtc\",\n \"subs_then_auditok\",\n \"auditok\",\n \"subs_then_silero\",\n \"silero\",\n ],\n \"default\": DEFAULT_VAD,\n },\n \"max-subtitle-seconds\": {\n \"type\": float,\n \"description\": \"Maximum duration for a subtitle to appear on-screen\",\n \"options\": None,\n \"default\": DEFAULT_MAX_SUBTITLE_SECONDS,\n },\n \"start-seconds\": {\n \"type\": int,\n \"description\": \"Start time for processing\",\n \"options\": None,\n \"default\": DEFAULT_START_SECONDS,\n },\n \"max-offset-seconds\": {\n \"type\": float,\n \"description\": \"The max allowed offset seconds for any subtitle segment\",\n \"options\": None,\n \"default\": DEFAULT_MAX_OFFSET_SECONDS,\n },\n \"apply-offset-seconds\": {\n \"type\": float,\n \"description\": \"Apply a predefined offset in seconds to all subtitle segments\",\n \"options\": None,\n \"default\": DEFAULT_APPLY_OFFSET_SECONDS,\n },\n \"suppress-output-if-offset-less-than\": {\n \"type\": float,\n \"description\": \"Apply a predefined offset in seconds to all subtitle segments\",\n \"options\": None,\n \"default\": None,\n },\n \"frame-rate\": {\n \"type\": int,\n \"description\": \"Frame rate for audio extraction\",\n \"options\": None,\n \"default\": DEFAULT_FRAME_RATE,\n },\n \"output-encoding\": {\n \"type\": str,\n \"description\": 'What encoding to use for writing output subtitles (default=utf-8). Can indicate \"same\" to use same encoding as that of the input.',\n \"options\": None,\n \"default\": \"utf-8\",\n },\n \"skip-infer-framerate-ratio\": {\n \"type\": bool,\n \"description\": \"If set, do not try to infer framerate ratio based on duration ratio.\",\n \"options\": None,\n \"default\": False,\n },\n \"no-fix-framerate\": {\n \"type\": bool,\n \"description\": \"If specified, subsync will not attempt to correct a framerate\",\n \"options\": None,\n \"default\": False,\n },\n \"serialize-speech\": {\n \"type\": bool,\n \"description\": \"If specified, serialize reference speech to a numpy array.\",\n \"options\": None,\n \"default\": False,\n },\n \"gss\": {\n \"type\": bool,\n \"description\": \"If specified, use golden-section search to try to findthe optimal framerate ratio between video and subtitles.\",\n \"options\": None,\n \"default\": False,\n },\n },\n },\n \"Translation\": {\n \"description\": \"Translate to different languages using AI\",\n \"url\": \"https://github.com/xhluca/dl-translate\",\n \"config_schema\": {\n \"model\": {\n \"type\": list,\n \"description\": \"The model\",\n \"options\": available_translation_models(),\n \"default\": available_translation_models()[\n 0\n ],\n },\n \"device\": {\n \"type\": list,\n \"description\": '\"cpu\", \"gpu\" or \"auto\". If it\\'s set to \"auto\", will try to select a GPU when available or else fall back to CPU',\n \"options\": [\n \"auto\",\n get_available_devices(),\n ],\n \"default\": \"auto\",\n },\n \"batch_size\": {\n \"type\": int,\n \"description\": \"The number of samples to load at once. 
If set to `None`, it will process everything at once\\nA smaller value is preferred for `batch_size` if your (video) RAM is limited\",\n \"options\": None,\n \"default\": 32,\n },\n \"verbose\": {\n \"type\": bool,\n \"description\": \"Whether to display the progress bar for every batch processed.\",\n \"options\": None,\n \"default\": True,\n },\n },\n },\n}\n
"}]}
\ No newline at end of file
diff --git a/sitemap.xml b/sitemap.xml
new file mode 100644
index 0000000..0f8724e
--- /dev/null
+++ b/sitemap.xml
@@ -0,0 +1,3 @@
+
+