In [30]:
# FSA states and transitions for Turkish vowel harmony and suffixation
states = set([
    "pre-root",
    "1_-B, -R",
    "3_-B, +R",
    "2_+B, -R",
    "4_+B, +R",
    "5_PL",
    "6_PL",
    "7_PSS_non3PL",
    "8_PSS_non3PL",
    "9_PSS_3PL",
    "10_PSS_3PL",
    "11_str",
    "12_str",
    "13_obl",
    "14_obl"
])

sigma = ["i", "u", "ı", "ü", "e", "a", "o", "ö"]

start_state = "pre-root"

final_states = set([
    "11_str",
    "12_str",
    "13_obl",
    "14_obl"
])

# Transitions with output symbols (suffix characters)
dict_transitions = {
    ("pre-root", "e"): ("1_-B, -R", ""),
    ("pre-root", "i"): ("1_-B, -R", ""),
    ("pre-root", "ö"): ("3_-B, +R", ""),
    ("pre-root", "ü"): ("3_-B, +R", ""),
    ("pre-root", "a"): ("2_+B, -R", ""),
    ("pre-root", "ı"): ("2_+B, -R", ""),
    ("pre-root", "o"): ("4_+B, +R", ""),
    ("pre-root", "u"): ("4_+B, +R", ""),
    ("1_-B, -R", "e"): ("5_PL", "ler"),
    ("3_-B, +R", "e"): ("5_PL", "ler"),
    ("2_+B, -R", "a"): ("6_PL", "lar"),
    ("4_+B, +R", "a"): ("6_PL", "lar"),
    ("5_PL", "i"): ("7_PSS_non3PL", "n"),
    ("6_PL", "ı"): ("8_PSS_non3PL", "n"),
    ("1_-B, -R", "i"): ("7_PSS_non3PL", "n"),
    ("3_-B, +R", "ü"): ("7_PSS_non3PL", "n"),
    ("2_+B, -R", "ı"): ("8_PSS_non3PL", "n"),
    ("4_+B, +R", "u"): ("8_PSS_non3PL", "n"),
    ("5_PL", "e"): ("9_PSS_3PL", "n"),
    ("6_PL", "a"): ("10_PSS_3PL", "n"),
    ("1_-B, -R", "e"): ("9_PSS_3PL", "n"),
    ("3_-B, +R", "e"): ("9_PSS_3PL", "n"),
    ("2_+B, -R", "a"): ("10_PSS_3PL", "n"),
    ("4_+B, +R", "a"): ("10_PSS_3PL", "n"),
    ("7_PSS_non3PL", "i"): ("11_str", "imiz"),
    ("8_PSS_non3PL", "ı"): ("12_str", "ımız"),
    ("7_PSS_non3PL", "e"): ("13_obl", "imiz"),
    ("8_PSS_non3PL", "a"): ("14_obl", "ımız"),
    ("9_PSS_3PL", "i"): ("11_str", "leri"),
    ("10_PSS_3PL", "ı"): ("12_str", "ları"),
    ("9_PSS_3PL", "e"): ("13_obl", "leri"),
    ("10_PSS_3PL", "a"): ("14_obl", "ları"),
    ("5_PL", "i"): ("11_str", "i"),
    ("6_PL", "ı"): ("12_str", "ı"),
    ("5_PL", "e"): ("13_obl", "e"),
    ("6_PL", "a"): ("14_obl", "a"),
    ("1_-B, -R", "i"): ("11_str", "i"),
    ("3_-B, +R", "i"): ("11_str", "i"),
    ("1_-B, -R", "e"): ("13_obl", "e"),
    ("3_-B, +R", "e"): ("13_obl", "e"),
    ("2_+B, -R", "ı"): ("12_str", "ı"),
    ("2_+B, -R", "a"): ("14_obl", "a"),
    ("4_+B, +R", "ı"): ("12_str", "ı"),
    ("4_+B, +R", "a"): ("14_obl", "a"),
}

fsa = (states, sigma, start_state, final_states, dict_transitions)

In [31]:
def determine_suffix(lemma):
    for char in reversed(lemma):
        if char in "aeıioöuü":
            return char
    return ""

def is_str_case(msd):
    return any(case in msd for case in ["DAT", "LOC", "ABL"])

def is_obl_case(msd):
    return any(case in msd for case in ["ACC", "GEN"])

In [32]:
# function that is trying to go through the fsa and record all suffixes - not completed
def inflection_suffix(fsa, lemma, msd):
    curr_state = fsa[2]  
    dict_transitions = fsa[4] 
    symbol = determine_suffix(lemma)
    expected_suffix = ""

    if (curr_state, symbol) in dict_transitions:
        next_state, output = dict_transitions[(curr_state, symbol)]
        curr_state = next_state
    else:
        print("Vowel not listed in transitions.")
        return ""

    if "PL" in msd:
        if (curr_state, "e") in dict_transitions or (curr_state, "a") in dict_transitions:
            plural_symbol = "e" if curr_state in ["1_-B, -R", "3_-B, +R"] else "a"
            next_state, output = dict_transitions[(curr_state, plural_symbol)]
            curr_state = next_state
            expected_suffix += output  

    if "PSS(3,PL)" in msd:
        possessive_symbol = "e" if curr_state in ["1_-B, -R", "3_-B, +R"] else "a"
        if (curr_state, possessive_symbol) in dict_transitions:
            next_state, output = dict_transitions[(curr_state, possessive_symbol)]
            curr_state = next_state
            expected_suffix += output 

    if "PSS" in msd and "PSS(3,PL)" not in msd:
        possessive_symbol = "i" if curr_state in ["1_-B, -R", "3_-B, +R"] else "ı"
        if (curr_state, possessive_symbol) in dict_transitions:
            next_state, output = dict_transitions[(curr_state, possessive_symbol)]
            curr_state = next_state
            expected_suffix += output  

    if is_str_case(msd):
        case_symbol = "i" if curr_state in ["7_PSS_non3PL", "9_PSS_3PL"] else "ı"
        if (curr_state, case_symbol) in dict_transitions:
            next_state, output = dict_transitions[(curr_state, case_symbol)]
            curr_state = next_state
            expected_suffix += output  

    if is_obl_case(msd):
        case_symbol = "e" if curr_state in ["7_PSS_non3PL", "9_PSS_3PL"] else "a"
        if (curr_state, case_symbol) in dict_transitions:
            next_state, output = dict_transitions[(curr_state, case_symbol)]
            curr_state = next_state
            expected_suffix += output 

    return expected_suffix

In [33]:
# test words pulled from the error analysis examples
sample_data = [
    ("anahtarlık", "N;DAT(PL;PSS(1,PL))", "anahtarlıklerimize", "anahtarlıklarımıza"), # line 7
    ("asimptot", "N;ABL(PL;PSS(1,PL))", "asimptotlerimizden", "asimptotlarımızdan"), # line 8
    ("zavar", "N;ACC(PL;PSS(2,SG))", "zavarlerini", "zavarlarını") # line 271
]

for lemma, msd, predicted, correct in sample_data:
    expected_suffix = inflection_suffix(fsa, lemma, msd)
    suffix_start = len(lemma)
    suffix = predicted[suffix_start:]

    if suffix != expected_suffix:
        corrected_prediction = predicted[:suffix_start] + expected_suffix
    else:
        corrected_prediction = predicted

    print(f"Lemma: {lemma}, MSD: {msd}, Predicted: {predicted}, Corrected: {corrected_prediction}, Correct: {correct}")

Lemma: anahtarlık, MSD: N;DAT(PL;PSS(1,PL)), Predicted: anahtarlıklerimize, Corrected: anahtarlıka, Correct: anahtarlıklarımıza
Lemma: asimptot, MSD: N;ABL(PL;PSS(1,PL)), Predicted: asimptotlerimizden, Corrected: asimptota, Correct: asimptotlarımızdan
Lemma: zavar, MSD: N;ACC(PL;PSS(2,SG)), Predicted: zavarlerini, Corrected: zavara, Correct: zavarlarını


In [34]:
# test codes with the entire output file
with open('data/tur.out', 'r') as output, open('data/tur.dev', 'r') as correct:
    output_lines = [line.strip() for line in output if line.strip()]
    correct_lines = [line.strip() for line in correct if line.strip()]

total = 0
correct_before = 0
correct_after = 0

for output_line, correct_line in zip(output_lines, correct_lines):
    lemma, msd, predicted = output_line.split('\t')
    lemma_correct, msd_correct, correct_form = correct_line.split('\t')

    assert lemma == lemma_correct and msd == msd_correct

    if predicted == correct_form:
        correct_before += 1

    expected_suffix = inflection_suffix(fsa, lemma, msd)
    corrected_prediction = lemma + expected_suffix

    if corrected_prediction == correct_form:
        correct_after += 1

    total += 1

accuracy_before = correct_before / total
accuracy_after = correct_after / total

print(f"Accuracy Before Correction: {accuracy_before:.2%}")
print(f"Accuracy After Correction: {accuracy_after:.2%}")


Accuracy Before Correction: 68.90%
Accuracy After Correction: 1.70%
