In [None]:
!pip install git+https://github.com/openai/whisper.git
!pip install jiwer


In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab filesystem; the dataset archive is read
# from, and the evaluation report is written to, paths under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


In [None]:
!unzip "/content/drive/MyDrive/Projects/AudioFiles_final/voices_final_trimmed_2.zip" -d "/content"


In [None]:
import whisper
# Load the Whisper model once; `transcribe_audio` below uses this global `model`.
whisper_model_name = "small"
model = whisper.load_model(whisper_model_name)


In [None]:
# Function to transcribe audio
def transcribe_audio(audio_path):
    """Transcribe one audio file with the notebook-level `model`.

    Args:
        audio_path: Path of the audio file, passed unchanged to
            `model.transcribe`.

    Returns:
        The value stored under the "text" key of the transcription result.
    """
    return model.transcribe(audio_path)["text"]

In [None]:
# Load the evaluation manifest: one audio path and one reference transcript
# per line, matched by position (line k of one file belongs to line k of the other).
audio_paths_file = '/content/audio_paths_file.txt'
text_file = '/content/text_file.txt'

# Each manifest line is expected to look like "<first field> <audio path>";
# the path is the second whitespace-separated field.
# NOTE(review): paths containing spaces are not supported by this format — confirm
# that the manifest never contains them.
audio_paths = []
with open(audio_paths_file, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        fields = line.split()  # tolerates tabs and repeated spaces
        if not fields:
            continue  # skip blank lines instead of failing with IndexError
        if len(fields) < 2:
            raise ValueError(
                f"{audio_paths_file}:{line_no}: expected at least two fields, "
                f"got {line.strip()!r}"
            )
        audio_paths.append(fields[1])

# Reference transcripts; blank lines are skipped, mirroring the manifest above.
with open(text_file, 'r', encoding='utf-8') as f:
    texts = [line.strip() for line in f if line.strip()]

# Pairing is purely positional, so a count mismatch means the metrics would be
# computed against the wrong references. Stop before any transcription starts.
if len(audio_paths) != len(texts):
    raise ValueError(
        f"Manifest mismatch: {len(audio_paths)} audio paths in {audio_paths_file} "
        f"but {len(texts)} reference texts in {text_file}"
    )


In [None]:
from jiwer import wer, cer

# Per-file evaluation records: reference, hypothesis and both error rates.
detailed_results = []

# Running sums for the reported averages. These are unweighted means of the
# per-file scores, not corpus-level error rates.
total_wer, total_cer = 0, 0
num_files = len(audio_paths)

# Fail fast on unusable input instead of crashing (ZeroDivisionError / IndexError)
# only after the expensive transcription loop has already run.
if num_files == 0:
    raise ValueError("No audio files to evaluate: audio_paths is empty.")
if len(texts) < num_files:
    raise ValueError(
        f"Found {num_files} audio paths but only {len(texts)} reference texts."
    )
if len(texts) > num_files:
    print(f"Warning: {len(texts) - num_files} extra reference text(s) will be ignored.")

# Transcribe each file and score it against the reference at the same position.
for file_no, (audio_path, true_text) in enumerate(zip(audio_paths, texts), start=1):
    predicted_text = transcribe_audio(audio_path)  # audio_path is used as-is
    file_wer = wer(true_text, predicted_text)
    file_cer = cer(true_text, predicted_text)
    total_wer += file_wer
    total_cer += file_cer
    detailed_results.append({
        'Reference': true_text,
        'Predicted': predicted_text,
        'WER': file_wer,
        'CER': file_cer
    })
    print(f"Processed {file_no}/{num_files} files.")

# Average per-file scores, expressed as percentages.
avg_wer = total_wer / num_files * 100
avg_cer = total_cer / num_files * 100

# Output file path (on Drive, so the report survives the Colab session).
results_file_path = '/content/drive/MyDrive/transcription_evaluation_detailed_results_whisper.txt'

# Write the summary line followed by one record per file.
with open(results_file_path, 'w', encoding='utf-8') as f:
    # NOTE(review): this dataset label names a "vosk" directory that is not used
    # anywhere else in this notebook — confirm it is the intended label.
    f.write("Dataset: /content/output_data_directory/eval_dataset_vosk\n")
    f.write(f"WER: {avg_wer:.2f}%, CER: {avg_cer:.2f}%\n\n")
    for result in detailed_results:
        f.write(f'Reference: "{result["Reference"]}"\n')
        f.write(f'Predicted: "{result["Predicted"]}"\n')
        f.write(f'WER: {result["WER"]*100:.2f}%, CER: {result["CER"]*100:.2f}%\n\n')

print(f"Results saved to {results_file_path}")


In [None]:
import os


In [None]:
# Folder on Drive that will hold the Whisper evaluation reports.
new_folder_path = '/content/drive/MyDrive/TranscriptionResultsWhisper'

# Record whether the directory was already there (only to pick the message),
# then create it unconditionally. `exist_ok=True` removes the gap between
# "check" and "create", and `isdir` avoids reporting "already exists" when the
# path is actually a regular file — in that case makedirs raises FileExistsError.
folder_already_existed = os.path.isdir(new_folder_path)
os.makedirs(new_folder_path, exist_ok=True)

if folder_already_existed:
    print(f"Folder already exists: {new_folder_path}")
else:
    print(f"Created folder: {new_folder_path}")

In [None]:
import shutil
# Destination of the report inside the results folder (same file name).
new_results_file_path = os.path.join(new_folder_path, 'transcription_evaluation_detailed_results_whisper.txt')

# Move the report. The cell is safe to re-run: once the file has been moved the
# source no longer exists, so a second run reports that instead of raising.
if os.path.exists(results_file_path):
    shutil.move(results_file_path, new_results_file_path)
    print(f"Results moved to {new_results_file_path}")
elif os.path.exists(new_results_file_path):
    print(f"Results already present at {new_results_file_path}")
else:
    # Neither location has the report: the evaluation cell has not produced it.
    raise FileNotFoundError(
        f"Results file not found at {results_file_path} or {new_results_file_path}"
    )

