In [3]:
# Define a function that reads tur.out, checks the vowel harmony (frontness) of each plural suffix, and replaces the suffix vowel where needed.
# Bonus: compare the results against tur.dev (the correct version) to see whether the problem is solved.

def noun_correction(estimated_file, correct_file):
    """Score predicted inflections before and after a plural-suffix harmony fix.

    Both files hold one entry per line as three tab-separated fields:
    lemma, msd (morphological tags) and an inflected form.  In
    ``estimated_file`` the third field is the predicted form; in
    ``correct_file`` it is the reference form.  Blank lines are skipped and
    the remaining lines are paired by position (up to the shorter file).

    For entries tagged as plural nouns, when the part of the prediction that
    follows the first ``len(lemma)`` characters is exactly ``lar`` or ``ler``,
    it is replaced by the variant that agrees in frontness with the last
    vowel of the lemma.  Lemmas without a recognised vowel are left alone.

    Args:
        estimated_file: Path of the file with predicted forms.
        correct_file: Path of the file with reference forms.

    Returns:
        A tuple ``(accuracy_before, accuracy_after)`` of fractions in
        ``[0, 1]``; ``(0.0, 0.0)`` when there are no entries to compare.

    Raises:
        ValueError: If a non-blank line does not have exactly three
            tab-separated fields.
        AssertionError: If two paired lines disagree on lemma or msd.
    """
    # Function-scope import keeps the block self-contained.
    import re

    front_vowels = frozenset("eiöü")
    back_vowels = frozenset("aıou")
    plural_suffixes = ("lar", "ler")
    # Whole-tag matching: a substring test for "N" would also accept tags
    # such as "IND" or "NOM".  "PROPN" is kept because the previous substring
    # test accepted it as well.
    noun_tags = frozenset(("N", "PROPN"))

    def noun_pl_suffix(lemma):
        """Return the plural suffix matching the last vowel of *lemma*, or ''."""
        for char in reversed(lemma):
            if char in back_vowels:
                return "lar"
            if char in front_vowels:
                return "ler"
        return ""

    def noun_pl_correction(output, lemma):
        """Return *output* with a disharmonic plural suffix replaced."""
        expected_suffix = noun_pl_suffix(lemma)
        if not expected_suffix:
            # No vowel to harmonise with: keep the prediction instead of
            # stripping its suffix.
            return output
        suffix_start = len(lemma)
        suffix = output[suffix_start:]
        if suffix in plural_suffixes and suffix != expected_suffix:
            return output[:suffix_start] + expected_suffix
        return output

    def read_entries(path):
        """Read (lemma, msd, form) triples from *path*, skipping blank lines."""
        entries = []
        with open(path, 'r', encoding='utf8') as handle:
            for line_no, raw in enumerate(handle, start=1):
                if not raw.strip():
                    continue
                # Remove only the line terminator before splitting so that an
                # empty trailing field (an empty prediction) is preserved.
                fields = [part.strip() for part in raw.rstrip('\r\n').split('\t')]
                if len(fields) != 3:
                    raise ValueError(
                        f"{path}:{line_no}: expected 3 tab-separated fields, "
                        f"got {len(fields)}"
                    )
                entries.append(tuple(fields))
        return entries

    estimated_entries = read_entries(estimated_file)
    correct_entries = read_entries(correct_file)

    total = 0
    correct_before = 0
    correct_after = 0

    for estimated_entry, correct_entry in zip(estimated_entries, correct_entries):
        lemma, msd, predicted = estimated_entry
        lemma_correct, msd_correct, correct = correct_entry

        # Raised explicitly (rather than via ``assert``) so the alignment
        # check still runs under ``python -O``.
        if lemma != lemma_correct or msd != msd_correct:
            raise AssertionError(
                f"entry mismatch: {lemma!r}/{msd!r} vs "
                f"{lemma_correct!r}/{msd_correct!r}"
            )

        total += 1
        if predicted == correct:
            correct_before += 1

        tags = set(re.findall(r"[A-Za-z0-9]+", msd))
        corrected = predicted
        if "PL" in tags and not tags.isdisjoint(noun_tags):
            corrected = noun_pl_correction(predicted, lemma)

        if corrected == correct:
            correct_after += 1

    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0.0

    return correct_before / total, correct_after / total

# Score the predictions in data/tur.out against data/tur.dev, then report
# the accuracy before and after the plural-suffix correction.
accuracy_before, accuracy_after = noun_correction(
    estimated_file='data/tur.out',
    correct_file='data/tur.dev',
)
print(
    f"Accuracy Before Correction: {accuracy_before:.2%}",
    f"Accuracy After Correction: {accuracy_after:.2%}",
    sep="\n",
)

Accuracy Before Correction: 68.90%
Accuracy After Correction: 69.10%
