# Create transcripts with whisper-medium-et

In [None]:
import torch
import os
import librosa
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Build the processor and the speech-to-text model from the same checkpoint id,
# so the two can never drift apart if the checkpoint is swapped later.
MODEL_ID = "TalTechNLP/whisper-medium-et"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)

In [None]:
# Ask PyTorch to release GPU memory it has cached but is not currently using
# (see the torch.cuda.empty_cache documentation for exact semantics).
torch.cuda.empty_cache()

In [None]:
# Report whether a CUDA device is usable, then move the model onto the chosen
# device ("cuda" when available, otherwise "cpu").
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # get_device_name() raises when no CUDA device is present, so it is only
    # queried on the CUDA path; otherwise the CPU fallback below would never run.
    print(torch.cuda.get_device_name(0))
    torch.cuda.empty_cache()
device = torch.device("cuda" if cuda_available else "cpu")
model = model.to(device)
print(model.device)

In [None]:
# Create a directory
def create_directory(path, folder_name):
    """
    Ensure that the folder ``folder_name`` exists underneath ``path``.

    Missing intermediate directories are created as well, and an already
    existing directory is not treated as an error. Failures are reported on
    stdout instead of being raised, so the function always returns ``None``.

    Parameters:
    path (str): Base location (absolute or relative) for the new directory.
    folder_name (str): Name of the directory to create inside ``path``.
    """
    target_dir = os.path.join(path, folder_name)
    try:
        os.makedirs(target_dir, exist_ok=True)
        print(f"Directory '{target_dir}' created successfully.")
    except PermissionError:
        # Reported separately so the user gets a clearer hint than the raw error.
        print(f"Permission denied: Unable to create '{target_dir}'.")
    except Exception as err:
        print(f"An error occurred: {err}")

def create_transcript(input_audio_path, output_file_name, chunk_duration=20):
    """
    Transcribe an audio file chunk by chunk and write the text to a file.

    The audio is loaded with ``librosa`` at 16 kHz, cut into consecutive
    fixed-length chunks, and each chunk is passed through the module-level
    ``processor`` / ``model`` pair on ``device``. Each chunk's text is written
    to the output file as soon as it is decoded, followed by a single space,
    so a long run leaves partial output behind if it is interrupted.

    Note that chunks are cut at fixed sample offsets, so a word that straddles
    a chunk boundary may be split between two chunks.

    Parameters:
    input_audio_path (str): Path of the audio file to transcribe.
    output_file_name (str): Path of the text file to write. Its parent
        directory is created if it does not exist yet.
    chunk_duration (int | float): Length of one chunk in seconds (default 20).

    Raises:
    ValueError: If ``chunk_duration`` does not yield at least one sample.
    """
    audio, sr = librosa.load(input_audio_path, sr=16000)
    chunk_size = int(chunk_duration * sr)  # chunk length in samples
    if chunk_size <= 0:
        raise ValueError("chunk_duration must be positive")

    # Split the waveform into consecutive chunks; the last one may be shorter.
    chunks = [audio[start:start + chunk_size] for start in range(0, len(audio), chunk_size)]
    print(f"Number of chunks {len(chunks)}")

    # Make sure the destination folder exists so open() does not fail on a
    # fresh checkout where the transcript directory has not been created yet.
    output_dir = os.path.dirname(output_file_name)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the file even if transcription raises midway;
    # UTF-8 is set explicitly so the output does not depend on the platform
    # default encoding.
    with open(output_file_name, "w", encoding="utf-8") as f:
        for i, chunk in enumerate(chunks):
            if i % 10 == 0:
                print(f"Processing chunk {i}...")
            inputs = processor(chunk, return_tensors="pt", sampling_rate=sr).to(device)
            # Inference only: no gradients are needed.
            with torch.no_grad():
                generated_ids = model.generate(inputs["input_features"])
            transcription_chunk = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            f.write(transcription_chunk + " ")
    print("Transcript complete")

### Create transcripts without voice change

In [None]:
# Transcribe the unmodified recording of every doctor/patient pair
# (doctors 2-4, patients 1-10).
for doctor_index in range(2, 5):
    for patient_index in range(1, 11):
        print(doctor_index, patient_index)
        # Input and output file names share the same recording stem.
        recording = f"arsti_salvestus_orig_{doctor_index:02}_{patient_index:02}"
        source_dir = f"Arst_{doctor_index:03}/Patsient_{patient_index:03}/toorfailid"
        audiofile = f"{source_dir}/{recording}.WAV"
        save_path = f"whisper_medium_et_transcripts/{recording}-whisper-medium-et-transcript.txt"
        create_transcript(audiofile, save_path)

### Create transcripts with voice change

In [None]:
# Variants of each recording found under modified_audio/. The file names carry
# either a "step=<s>-rate=<r>" tag (steps and rates are paired by position) or
# a "formant-shift-ratio=<f>" tag.
rates = [0.8, 1.2]
steps = [2, -2]
formants = [0.8, 1.2]

# NOTE(review): only doctor 4 is processed here, unlike the 2-4 range used for
# the unmodified recordings - confirm this is intentional.
for doctor_index in range(4, 5):
    for patient_index in range(1, 11):
        print(doctor_index, patient_index)
        recording = f"arsti_salvestus_orig_{doctor_index:02}_{patient_index:02}"

        # Variants tagged with step/rate.
        for step, rate in zip(steps, rates):
            print(step, rate)
            variant = f"{recording}-step={step}-rate={rate}"
            audiofile = f"modified_audio/{variant}.WAV"
            save_path = f"whisper_medium_et_transcripts/{variant}-whisper-medium-et-transcript.txt"
            create_transcript(audiofile, save_path)

        # Variants tagged with a formant shift ratio.
        for formant in formants:
            print(formant)
            variant = f"{recording}-formant-shift-ratio={formant}"
            audiofile = f"modified_audio/{variant}.WAV"
            save_path = f"whisper_medium_et_transcripts/{variant}-whisper-medium-et-transcript.txt"
            create_transcript(audiofile, save_path)