In [None]:
from jiwer import wer
import os
import re
import csv

# Function to remove all symbols (non-alphanumeric characters) from text
def remove_symbols(text):
    """Normalise *text* so two transcripts can be compared word by word.

    Every character that is not an ASCII letter, an ASCII digit, whitespace
    or one of the letters ö, ä, õ, ü (either case) is deleted. Each run of
    whitespace is then collapsed into a single space and the result is
    lower-cased. Leading and trailing whitespace is collapsed, not stripped.

    Parameters:
    text (str): The raw text to normalise.

    Returns:
    str: The lower-cased text without symbols.
    """
    without_symbols = re.sub(r'[^A-Za-z0-9\söäõüÖÄÕÜ]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.lower()



def open_txt(path, filename):
    """
    Read a UTF-8 text file located inside a directory.

    A message is printed and None is returned when either the directory or
    the file inside it cannot be found.

    Parameters:
    path (str): The directory expected to contain the file.
    filename (str): The name of the .txt file to read.

    Returns:
    str or None: The full file content, or None if the directory or the
    file is missing.
    """
    # Guard clauses: report the first thing that is missing and give up.
    if not os.path.isdir(path):
        print(f"Path '{path}' does not exist.")
        return None

    target = os.path.join(path, filename)
    if not os.path.isfile(target):
        print(f"File '{filename}' does not exist in '{path}'.")
        return None

    with open(target, 'r', encoding='utf-8') as handle:
        return handle.read()


In [None]:
# Directory that holds the ASR output transcripts, and the model tag that is
# embedded in every transcript filename.
transcripts_directory = "transcript_transcripts"
model = "est-asr"

# Results file; written with a tab delimiter despite the .csv extension.
csv_filename = "wer_transcript_all1.csv"

# Start a fresh results file that contains only the header row.
with open(csv_filename, mode="w", newline="") as file:
    writer = csv.writer(file, delimiter="\t")
    writer.writerow(["Doctor", "Patient", 
                     "WER unmodified", "WER step 2 rate 0.8",
                     "WER step -2 rate 1.2", "WER formant shift 0.8",
                     "WER formant shift 1.2"])

# Variants of each recording. steps[k] is paired with rates[k].
rates = [0.8, 1.2]
steps = [2, -2]
formants = [0.8, 1.2]


def _wer_cell(reference, transcript_filename):
    """Return the WER (rounded to 3 decimals) of one ASR transcript.

    Parameters:
    reference (str): The normalised reference text.
    transcript_filename (str): File inside ``transcripts_directory`` that
        holds the ASR output to score.

    Returns:
    float or str: The rounded WER, or "" (an empty cell) when the transcript
    is missing or the score cannot be computed, so that the remaining values
    of the row stay under their own column headers.
    """
    hypothesis = open_txt(transcripts_directory, transcript_filename)
    if hypothesis is None:
        # open_txt has already printed what is missing.
        return ""
    try:
        score = round(wer(reference, remove_symbols(hypothesis)), 3)
    except Exception as e:
        # Deliberately broad: one unscorable transcript must not abort the
        # whole batch.
        print(f"Could not compute WER for '{transcript_filename}': {e}")
        return ""
    print(f"WER {score}")
    return score


for doctor_index in range(1, 11):
    # NOTE(review): only patient 10 is visited here; widen the range if every
    # patient is meant to be scored.
    for patient_index in range(10, 11):
        row = [doctor_index, patient_index]
        print(f"Proccessing doctor: {doctor_index}, patient: {patient_index}.")
        directory = f"Arst_{doctor_index:03}/Patsient_{patient_index:03}"
        clean_directory = "parandatud"
        path = f"{directory}/{clean_directory}"
        if not os.path.isdir(path):
            print(f"Could not find path: {path}")
            continue

        # ASR transcript names in header order: unmodified, the step/rate
        # variants, then the formant-shift variants. The indices here are
        # zero-padded to two digits, unlike the three-digit directory names.
        prefix = f"arsti_salvestus_orig_{doctor_index:02}_{patient_index:02}"
        transcript_filenames = [f"{prefix}-{model}-transcript.txt"]
        transcript_filenames += [
            f"{prefix}-step={step}-rate={rate}-{model}-transcript.txt"
            for step, rate in zip(steps, rates)
        ]
        transcript_filenames += [
            f"{prefix}-formant-shift-ratio={formant}-{model}-transcript.txt"
            for formant in formants
        ]

        # NOTE(review): every matching reference file appends its own five
        # values to the same row; the header assumes exactly one match.
        # Sorting keeps the order deterministic if there are several.
        for filename in sorted(os.listdir(path)):
            lowered = filename.lower()
            if "parandatud" not in lowered or not lowered.endswith(".txt"):
                continue

            file_path = os.path.join(path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as reference_file:
                    clean_transcript = remove_symbols(reference_file.read())
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error reading file '{filename}': {e}")
                continue

            row.extend(
                _wer_cell(clean_transcript, name)
                for name in transcript_filenames
            )

        # Append immediately so finished rows survive a later failure.
        with open(csv_filename, mode="a", newline="") as file:
            writer = csv.writer(file, delimiter="\t")  # Use tab separator
            writer.writerow(row)
        