### Installs the required packages

In [None]:
# Installs packages
!pip install --q --upgrade pip flash-attn --no-build-isolation git+https://github.com/huggingface/transformers.git accelerate datasets[audio]

### Imports the required packages

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

### Sets up the device and data types

In [None]:
# Pick the compute device and the matching tensor precision in one step:
# half precision on the first CUDA GPU when one is available, otherwise
# full precision on the CPU.
device, torch_dtype = (
    ("cuda:0", torch.float16)
    if torch.cuda.is_available()
    else ("cpu", torch.float32)
)

### Specifies the model

In [None]:
# Identifier of the checkpoint that is passed to `from_pretrained` further down.
# Other available model variants can be found here: https://huggingface.co/openai/whisper-large-v3#:~:text=on%20the%20Hub%3A-,Size,%E2%9C%93,-Usage
model_id = "openai/whisper-large-v3"

### Initializes and configures the model and the processor

In [None]:
# Loads the checkpoint named by `model_id` in the precision chosen in the
# device/dtype cell and moves the model onto the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=False,
    use_safetensors=True,
    # Optional: use Flash Attention 2 if you have a GPU that supports it
    # (Ampere and newer). The former `use_flash_attention_2=True` flag is
    # deprecated in Transformers; uncomment the line below instead.
    # attn_implementation="flash_attention_2",
).to(device)

# The processor supplies the tokenizer and the feature extractor that are
# handed to the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

### Configures the pipeline

In [None]:
# Builds the speech-recognition pipeline around the already loaded model.
pipe = pipeline(
    "automatic-speech-recognition",
    # --- model components ---
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # --- placement and precision ---
    device=device,
    torch_dtype=torch_dtype,
    # --- chunking and decoding ---
    chunk_length_s=8,  # Length of each audio chunk in seconds; adjust to the type of audio content
    batch_size=8,  # Adjust to your hardware (8 is fine for a T4 GPU)
    max_new_tokens=420,
    # Set to False if you don't want/need timestamps. Note that the export cell
    # at the end of the notebook reads each chunk's timestamp.
    return_timestamps=True,
)

### Specifies the audio file path and filetype

In [None]:
# Upload your audio file to Google Colab's "/content/" directory, then replace
# "interview_mum" with your file's name and ".mp3" with its extension.
audio = "".join(("/content/", "interview_mum", ".mp3"))

### Sets the language and task (Transcription, Translation)

In [None]:
# Generation options passed through to the model. Change "german" to the
# language of your audio file.
#   "task": "transcribe" -> transcription (active)
#   "task": "translate"  -> translation to English (swap in the commented line)
generate_kwargs = {"language": "german", "task": "transcribe"}
# generate_kwargs = {"language": "german", "task": "translate"}

result = pipe(audio, generate_kwargs=generate_kwargs)

### Formats the output and saves it to a text file

In [None]:
# Formats the timestamps to be more readable
def format_time(seconds):
    """Convert a time offset in seconds to a zero-padded "HH:MM:SS" string.

    Args:
        seconds: Offset in seconds (int or float). May be ``None``: the
            speech-recognition pipeline can report a missing end timestamp
            for a chunk, e.g. when the audio is cut off.

    Returns:
        The offset as "HH:MM:SS" with fractional seconds truncated, or the
        placeholder "--:--:--" when ``seconds`` is ``None``.
    """
    # Without this guard a missing timestamp raises TypeError on the
    # arithmetic below and aborts the whole export.
    if seconds is None:
        return "--:--:--"

    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"

# Saves the model's output to a text file in Google Colab's "/content/" directory.
# Each chunk becomes one numbered entry: a header line, the formatted start and
# end times, a blank line, and then the recognised text.
with open("/content/whisper_output.txt", "w", encoding='utf-8') as out_file:
    for segment_number, chunk in enumerate(result['chunks'], start=1):
        start_time, end_time = chunk['timestamp']
        # Build the whole entry first so each segment is written in one call.
        entry = (
            f"Segment {segment_number}:\n"
            f"Start Time: {format_time(start_time)}, End Time: {format_time(end_time)}\n\n"
            f"Text: {chunk['text']}\n"
        )
        out_file.write(entry)