## Course:     Data Science for Linguists
## Winter semester 23/24
## Assignment: Project (part 3)
## Student:    Alla Savinkina

In [None]:
import re
import itertools
import string
import unicodedata

In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive so that the
# project files stored on the drive can be accessed from this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
cd /content/gdrive/MyDrive/rus_phraseology

/content/gdrive/MyDrive/rus_phraseology


### **Note**
In order to extract idioms from a .pdf file, I applied filename; however, the extracted forms did not follow a consistent notation in the PDF file and contained redundant information. Therefore, I manually removed the redundant entries (those that do not have a register) and saved the result to rus_phraseology.txt.
If a phrase segment is in parentheses, the preceding element (a different lemma) can be substituted by the element in the parentheses. The exception is elements that start with "-": they represent possible inflectional endings and are ignored, since the inflected forms belong to the same lemma. If an element is in square brackets, it is optional, and the phrase can occur with or without it.

In [None]:
def variation_phrase(phrase):
    """
    Expand a phrase annotated with variants into all of its concrete variations.

    A group is a single ``(...)`` or ``[...]`` token; it applies to the words
    that precede it within the same segment:

    * ``(x)`` / ``(x, y z)`` -- each comma-separated alternative replaces as
      many preceding words as the alternative itself contains.
    * ``(-x)`` / ``[-x]`` -- a variant ending; it is ignored and only the
      preceding words are kept (an ending does not change the lemma).
    * ``[x]`` / ``[x, y]`` -- optional element(s); the segment is produced
      both without the element and with each alternative appended.

    Args:
        phrase: The annotated phrase.

    Returns:
        A list of unique variations in a deterministic (generation) order.
        Variations never contain leading, trailing or doubled spaces, and
        empty variations are omitted.
    """
    def split_segments(text):
        """Group tokens into segments, each ending at a token that closes with ) or ]."""
        tokens = re.findall(r'\([^()]*\)|\[[^\[\]]*\]|[^\s]+', text)
        segments, current = [], []
        for token in tokens:
            current.append(token)
            if token.endswith((")", "]")):
                segments.append(current)
                current = []
        # Words after the last group form a final, group-less segment.
        if current:
            segments.append(current)
        return segments

    def join_words(parts):
        """Join parts with single spaces, dropping empty parts and stray whitespace."""
        return " ".join(" ".join(parts).split())

    def segment_options(segment):
        """Return the alternative realisations of one segment, base form first."""
        group = segment[-1]
        in_round = group.startswith("(") and group.endswith(")")
        in_square = group.startswith("[") and group.endswith("]")
        if not (in_round or in_square):
            # No variation marker: the segment is used verbatim.
            return [join_words(segment)]

        options = [join_words(segment[:-1])]  # segment without the group
        inner = group[1:-1].strip()
        if inner.startswith("-"):
            # Variant ending: same lemma, so no extra variation is generated.
            # Bracketed endings are treated the same way instead of producing
            # no option at all, which used to discard the whole phrase.
            return options

        for alternative in inner.split(","):
            if in_round:
                # A substitute replaces as many preceding words as it has itself.
                width = len(alternative.split())
                kept = segment[:-(width + 1)]
            else:
                # An optional element is appended to the unchanged words.
                kept = segment[:-1]
            options.append(join_words(kept + [alternative]))
        # Drop repeated options while keeping their order.
        return list(dict.fromkeys(options))

    per_segment = [segment_options(segment) for segment in split_segments(phrase)]
    variations = (join_words(combo) for combo in itertools.product(*per_segment))
    # dict.fromkeys removes duplicates but, unlike set(), keeps a stable order.
    return list(dict.fromkeys(variation for variation in variations if variation))

def remove_accents_keep_cyrillic(text):
    """
    Remove combining accent marks and ASCII punctuation from text.

    The Cyrillic letters `ё`, `й` and `ў` (in either case) are preserved, even
    when they arrive in decomposed form (base letter followed by a combining
    mark). Hyphens (`-`) are kept because they are part of a word.

    Args:
        text: The string to clean.

    Returns:
        The text without combining marks and without the characters of
        `string.punctuation` other than the hyphen.
    """
    # Letters whose diacritic belongs to the letter itself and is not an accent.
    preserved_letters = 'ёйўЁЙЎ'
    # Recompose decomposed occurrences (e.g. "и" + U+0306) before filtering;
    # otherwise the combining mark would be stripped below and the letter
    # would silently turn into a different one ("й" -> "и").
    for letter in preserved_letters:
        composed = unicodedata.normalize('NFC', letter)
        decomposed = unicodedata.normalize('NFD', letter)
        text = text.replace(decomposed, composed)

    punct_to_remove = set(string.punctuation) - {'-'}
    return ''.join(
        c for c in text
        if not unicodedata.combining(c) and c not in punct_to_remove
    )

def process_file(input_file, output_file):
    """
    Generate all variations for every phrase in a file and write them out.

    Each input line is expected to be ``phrase<TAB>register``. For every such
    line, each variation of the phrase is cleaned of accents and punctuation
    and written to the output file as ``variation<TAB>register``.

    Lines without a tab, or with an empty phrase or an empty register, are
    skipped.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write (overwritten).
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Strip only the line ending before splitting: stripping all
            # whitespace would also remove a leading/trailing tab and make a
            # two-field unpack fail on lines with an empty field.
            text, separator, register = line.rstrip('\r\n').partition('\t')
            text, register = text.strip(), register.strip()
            if not (separator and text and register):
                continue
            outfile.writelines(
                f"{remove_accents_keep_cyrillic(variation)}\t{register}\n"
                for variation in variation_phrase(text)
            )


#### Use rus_phraseology.txt to generate all the variations.

In [None]:
# Read the phrases from rus_phraseology.txt and write every generated
# variation, with its register, to rus_phraseology_variations.txt.
process_file("rus_phraseology.txt", "rus_phraseology_variations.txt")