<a href="https://colab.research.google.com/github/bogdanbabych/experiments_NLTK/blob/main/fst_script_70.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Converting the input/output format -- FST

In [None]:
!wget https://heibox.uni-heidelberg.de/f/7101da9baf2b4197b115/?dl=1

In [None]:
!mv index.html?dl=1 TbiLLC-data.tgz
!tar xvzf TbiLLC-data.tgz

In [3]:
!cp /content/TbiLLC-data/FST.txt /content/FST.txt

In [20]:
import sys, os


In [16]:
# alternative 1: write to file
fout = open("out70.txt", "w")

In [26]:
# alternative 2: write to standard output (screen)
fout = sys.stdout

In [23]:
#CHAT optimized SCRIPT

import re

def convert_string(input_string: str) -> str:
    """Rewrite a ``[case=... pos=...]`` description as a category string.

    A string that does not start with the ``[case=X pos=Y]`` pattern is
    returned unchanged. Otherwise the result depends on the ``pos`` tag:
    known tags are rendered through a fixed template, and any other tag
    yields the literal text ``"Invalid input string"``.
    """
    parsed = re.match(r"\[case=([^ ]*?) pos=([^ ]*?)\]", input_string)
    if parsed is None:
        # Not a case/pos description: hand the text back untouched.
        return input_string

    raw_case, tag = parsed.groups()

    # Keep only what follows the last dot,
    # e.g. `*.Stm` -> `Stm` and `Sg*.Inst` -> `Inst`.
    short_case = raw_case.split('.')[-1]

    # SF is HEAD for NN and MD for every other tag (the original author
    # notes that this grammar rule is made up).
    role = "HEAD" if tag == "NN" else "MD"

    # One output template per recognised tag; not every template uses
    # both the SF and the case value.
    templates = {
        "ADJ": "Adj1[SF={sf}, case={case}]",
        "PRN_DT": "PRN_DT[SF={sf}, case={case}]",
        "NN": "NN[SF={sf}, case={case}]",
        "TADV": "Adv1[SF={sf}]",
        "IADV": "Adv2[SF={sf}, case={case}]",
        "$.": "$.[case={case}]",
    }
    template = templates.get(tag)
    if template is None:
        return "Invalid input string"
    return template.format(sf=role, case=short_case)

def parse_ffst_file(file: str, out=None):
    """Read an FST text file and print one grammar rule per description.

    The file is treated as a sequence of sections: a line holding a word,
    followed by zero or more grammatical-description lines (lines that
    start with ``IV01[`` or ``[case``). Every description is passed through
    ``convert_string`` and printed as ``<description> -> '<word>'``.
    A description carrying ``SF=MD`` is printed a second time with
    ``SF=HEAD`` substituted. Descriptions without an ``[SF=HEAD``/``[SF=MD``
    feature are reported with an ``Invalid string format:`` line.

    Args:
        file: Path of the FST text file to read.
        out: Writable text stream that receives the rules. When omitted,
            the module-level ``fout`` is used, so existing calls behave
            as before.
    """
    if out is None:
        out = fout

    with open(file) as f:
        lines = [raw.strip() for raw in f]

    sections = []
    current_section = []
    for line in lines:
        if line.startswith(("IV01[", "[case")):
            # Grammatical description: belongs to the word currently open.
            current_section.append(line)
        else:
            # Any other line is a word and opens a new section.
            if current_section:
                sections.append(current_section)
            current_section = [line]
    # Flush the section still open at end of file; without this the last
    # word and its descriptions would never be emitted.
    if current_section:
        sections.append(current_section)

    # The third group includes the closing bracket so that the rebuilt
    # SF=HEAD variant is a complete, balanced feature structure.
    sf_pattern = re.compile(r'^(.*\[SF=)(HEAD|MD)(.*\])$')

    for section in sections:
        word = section[0]
        quoted_word = "'" + word + "'"
        for gram_desc in section[1:]:
            gram_desc = convert_string(gram_desc)
            match = sf_pattern.match(gram_desc)
            if not match:
                print("Invalid string format:", gram_desc, file=out)
                continue

            prefix, sf_value, suffix = match.groups()
            # The converted description is always printed as is.
            print(gram_desc, "->", quoted_word, file=out)
            if sf_value == "MD":
                # Also emit the same entry with SF=MD replaced by SF=HEAD.
                print(f"{prefix}HEAD{suffix}", "->", quoted_word, file=out)
            print('', file=out)



In [None]:
parse_ffst_file('FST.txt')

In [25]:
# Close the output file opened for alternative 1. When alternative 2 was
# chosen, fout is sys.stdout, which must stay open for later output.
if fout is not sys.stdout:
    fout.close()