In [27]:
from random import choice

## Data generators

### First-last harmony

In [28]:
def first_last_generate(n = 10, length = 10, grammatical = True, 
                        vowels = None, transparent = None):
    """ Generates a collection words following the rule of first-last harmony.
    
    * n (int): number of strings that need to be generated;
    * length (int): length of every one of the generated strings;
    * grammatical (bool): if set to True, the correctly harmonizing
                          forms are generated, and if set to False,
                          the disharmonic forms are produced;
    * vowels (list): list of vowels among which the first-last
                     agreement is established;
    * transparent (list): list of irrelevant elements.
    """
    
    # initialization and sanity check for the list of vowels
    if vowels is None:
        vowels = ["a", "o"]
    elif len(vowels) < 2:
        raise IndexError("The vowel system needs to contain at least two distinct vowels.")
    
    # initialization and sanity check for the list of transparent elements
    if transparent is None:
        transparent = ["x"]
    elif [i for i in vowels if i in transparent]:
        raise ValueError("Lists of harmonizing vowels and transparent elements cannot overlap.")
    
    # generate the required number of harmonic strings
    strings = []
    for i in range(n):
        new = choice(vowels)
        new += "".join([choice(vowels + transparent) for j in range(length - 2)])
        if grammatical:
            new += new[0]
        else:
            new += choice([i for i in vowels if i != new[0]])
            
        strings.append(new)

    return strings

In [29]:
print(first_last_generate(n = 5, grammatical = True))
print(first_last_generate(n = 5, grammatical = False))

['oaaoxaxaxo', 'axoxaaoooa', 'oooaaoaaoo', 'oxooxxoxao', 'oaoaoaxxao']
['axaooxxxoo', 'ooxoaxaxxa', 'aaooaxxooo', 'aoaaoooaxo', 'oaxoxxxxoa']


### VC harmony

In [30]:
def vc_harmony_generate(n = 10, length = 10, grammatical = True,
                       vowels = None, consonants = None):
    """ Generates a collection words following the rule of vowel-consonant harmony.
    
    * n (int): number of strings that need to be generated;
    * length (int): length of every one of the generated strings;
    * grammatical (bool): if set to True, the correctly harmonizing
                          forms are generated, and if set to False,
                          the disharmonic forms are produced;
    * vowels (list): list of vowels among which the agreement
                     is established;
    * consonants (list): list of consonants among which the agreement
                         is established.
    """
    
    # initialization and sanity check for the list of vowels
    if vowels is None:
        vowels = ["a", "o"]
    elif len(vowels) < 2:
        raise IndexError("The vowel system needs to contain at least two distinct vowels.")
        
    # initialization and sanity check for the list of consonants
    if consonants is None:
        consonants = ["p", "b"]
    elif len(consonants) < 2:
        raise IndexError("The consonant system needs to contain at least two distinct consonants.")
    elif [i for i in vowels if i in consonants] or [i for i in consonants if i in vowels]:
        raise ValueError("Lists of harmonizing vowels and transparent elements cannot overlap.")
        
    # generate the required number of harmonic strings
    strings = []
    for i in range(n):
        v = choice(vowels)
        c = choice(consonants)
        new = "".join([choice([v, c]) for j in range(length)])
        
        # the ungrammatical forms are created by taking a random index
        # and rewriting it to the opposite vowel / consonant
        if not grammatical:
            ind = choice(range(length))
            if new[ind] == c:
                new = new[:ind] + choice([i for i in consonants if i != c]) + new[ind + 1:]
            else:
                new = new[:ind] + choice([i for i in vowels if i != v]) + new[ind + 1:]
                
        strings.append(new)
    
    return strings

In [31]:
print(vc_harmony_generate(n = 5, grammatical = True))
print(vc_harmony_generate(n = 5, grammatical = False))

['abbbbbabaa', 'aapaapaaap', 'aaababaabb', 'oooooobboo', 'aaabaaaabb']
['bopooooobo', 'aapppaoaaa', 'poappooopo', 'obbbobbpbo', 'ooboooboab']


### Structure sensitive muti-tier harmony

#### 1. Preparing a class to encode structure sensitive rules

In [32]:
class SSRule(object):
    """ A generic template for a structure-sensititve rule. 
    
    * symbols (tuple): list of tier symbols relevant for the generalization;
    * target (str): a target character context of which is important;
    * right_context (str): a context in which a target character
                           is projected on the tier;
    * can_follow (tuple): a list of tier symbols that are allowed after
                         the target character is projected.
    """
    def __init__(self, symbols, target, right_context, can_follow):
        self.symbols = symbols
        self.target = target
        self.right_context = right_context
        self.can_follow = can_follow

    def is_grammatical(self, string):
        """ Checks if the given form follows a rule that is encoded.
        
        * string (str): a string well-formedness of which needs to be checked.
        """
        
        # first, we get rid of all irrelevant symbols (not symbols and contexts)
        string = "".join([i for i in string if i in list(self.symbols) + [self.right_context]])
        
        # second, we construct a tier of that strings
        tier = ""
        for i in range(len(string)):
            if string[i] in self.symbols:
                if string[i] == self.target and i < len(string) - 1 and\
                    string[i + 1] == self.right_context:
                    tier += self.target
                elif string[i] != self.target:
                    tier += string[i]

        # third, we check if that tier is well-formed
        for t in range(len(tier)):
            if tier[t] == self.target and t < len(tier) - 1 and\
                tier[t + 1] not in self.can_follow:
                return False
        return True

In [33]:
R1 = SSRule(symbols = ("o", "e", "a"), target = "o", right_context = "x", can_follow = ("a", "o"))
R1.is_grammatical("oxoeaee")

False

In [34]:
# prose: vowels "o", "e", and "a" are projected on the tier, but "o" can only be projected
#        if it is immediately followed by "x". On the tier, after "o" we can only observe
#        "a" or "o", i.e. projected "o" cannot be followed by "e" over the tier.
R1 = SSRule(symbols = ("o", "e", "a"), target = "o", right_context = "x", can_follow = ("a", "o"))

# the tier is "aa", "o" is not projected because it is not followed by "x"
print(R1.is_grammatical("baboa"), "expected True") 
# the tier is "aoa", "o" is projected and followed by "a" (allowed)
print(R1.is_grammatical("baboxa"), "expected True")
# the tier is "ae", "o" is not projected because it is not followed by "x"
print(R1.is_grammatical("baboe"), "expected True")
# the tier is "aoe", "o" is projected and followed by "e" (NOT allowed)
print(R1.is_grammatical("baboxe"), "expected False")


# similarly, encoding another rule to make sure that we are dealing with multiple tiers
R2 = SSRule(symbols = ("b", "p", "d"), target = "b", right_context = "y", can_follow = ("b", "p"))

True expected True
True expected True
True expected True
False expected False


#### 2. Writing a generator of a sequence grammatical wrt the rule
TODO: make this function a method in the SSRule class above.

In [35]:
def generate_rule_sequence(rule, length = 7, grammatical = True):
    """ This function generates a sequence of symbols (un)grammatical 
        with respect to the given rule.
        
    * rule (SSRule): a rule describing a structure sensitive dependency;
    * length (int): length of the generated sequence;
    * grammatical (bool): produces correct form when set to True, and 
                          makes a mistake when set to False.
    """
    
    # the generation of the well-formed sequence is done by a simplistic FSA
    sequence = ""
    state = 0
    for i in range(length):
        
        # State 0: the target was not observed
        if state == 0:
            sequence += choice(list(rule.symbols) + [rule.right_context])
            if sequence[-1] == rule.target:
                state = 1
                
        # State 1: the target was observed
        elif state == 1:
            sequence += choice(list(rule.symbols) + [rule.right_context])
            if sequence[-1] == rule.right_context:
                state = 2
            elif sequence[-1] != rule.target:
                state = 0
                
        # State 2: the right context was observed
        elif state == 2:
            sequence += choice(list(rule.can_follow) + [rule.right_context])
            if sequence[-1] in rule.can_follow and sequence[-1] != rule.target:
                state = 0
                
    # if the ungrammatical form is needed, a violating sequence is generated
    # and inserted into a random position within the sequence
    if not grammatical:
        violate = rule.target + rule.right_context +\
            choice([i for i in list(rule.symbols) if i not in rule.can_follow])
        index_violate = choice(range(length - 3))
        sequence = sequence[:index_violate] + violate + sequence[index_violate + 3:]
        
    return sequence

In [36]:
examples1 = [generate_rule_sequence(R1) for i in range(1000)]
v1 = all(map(R1.is_grammatical, examples1))
examples2 = [generate_rule_sequence(R2) for i in range(1000)]
v2 = all(map(R2.is_grammatical, examples2))
examples3 = [generate_rule_sequence(R1, grammatical = False) for i in range(1000)]
v3 = not any(map(R1.is_grammatical, examples3))
examples4 = [generate_rule_sequence(R2, grammatical = False) for i in range(1000)]
v4 = not any(map(R2.is_grammatical, examples4))

assert all([v1, v2, v3, v4]) == True
print("The correctness of the sequence generator is verified.")

The correctness of the sequence generator is verified.


#### 3. Intertwine
Helper function for the next module, for taking two sequences that are grammatical wrt different rules and intertwining them.

In [37]:
def intertwine(str1, str2, r = (0, 3)):
    """ Intertwines two strings: str1 and str2. At every step, it takes
    some characters from one string, and then some characters from another.
    oxxooxa
    * str1 (str): the first string;
    * str2 (str): the second string;
    * r (tuple[int, int]): min and max+1 symbols to be taken.
    """
    new_string = ""
    current = choice([1, 2])
    while str1 or str2:
        if current == 1:
            cut = choice(range(r[0], r[1]))
            if len(str1) < cut:
                new = str1[:]
            else:
                new = str1[:cut]
            new_string += new
            str1 = str1[len(new):]
            current = 2
        elif current == 2:
            cut = choice(range(r[0], r[1]))
            if len(str2) < cut:
                new = str2[:]
            else:
                new = str2[:cut]
            new_string += new
            str2 = str2[len(new):]
            current = 1
    return new_string

In [38]:
intertwine("abcdefg", "12345678")

'ab1cd23e45f6g78'

#### 4. Generator for the IMTSL harmony

In [39]:
def imtsl_harmony_generate(n = 10, length = 10, grammatical = True,
                       rule_1 = None, rule_2 = None):
    """ Generates a collection words following the given rules of the structure-
        Takessensitive dependencies that involve several tiers.
    
    * n (int): number of strings that need to be generated;
    * length (int): length of every one of the generated strings;
    * grammatical (bool): if set to True, the correctly harmonizing
                          forms are generated, and if set to False,
                          the disharmonic forms are produced;
    * rule_1 (SSRule): the first rule describing a long-distant structure-
                      sensitive dependency;
    * rule_2 (SSRule): the second rule describing a long-distant structure-
                      sensitive dependency.
    """
    
    # set the rules to R1 and R2 shown above
    if rule_1 == None:
        rule_1 = SSRule(symbols = ("o", "e", "a"), target = "o",\
                        right_context = "x", can_follow = ("a", "o"))
    if rule_2 == None:
        rule_2 = SSRule(symbols = ("b", "p", "d"), target = "b",\
                        right_context = "y", can_follow = ("b", "p"))
    
    strings = []
    for i in range(n):
        # generate two tiers independently, and then intertwine them
        # WARNING: the tier alphabets of the two rules cannot overlap
        #          (required by both learner and generator)
        len_part_1 = length // 2
        len_part_2 = length - len_part_1

        part_1 = generate_rule_sequence(rule_1, len_part_1)
        part_2 = generate_rule_sequence(rule_2, len_part_2)

        if not grammatical:
            mistake = choice(["R1", "R2", "both"])
            if mistake == "R1":
                part_1 = generate_rule_sequence(rule_1, len_part_1, grammatical = False)
            elif mistake == "R2":
                part_2 = generate_rule_sequence(rule_2, len_part_2, grammatical = False)
            else:
                part_1 = generate_rule_sequence(rule_1, len_part_1, grammatical = False)
                part_2 = generate_rule_sequence(rule_2, len_part_2, grammatical = False)

        # intertwining the two generated sequences
        new_string = intertwine(part_1, part_2)
        strings.append(new_string)
    return strings

In [40]:
print(imtsl_harmony_generate(n = 15, length = 15))

['yboapaxdobappyo', 'dxapybapxebeopb', 'apxeaayadpaybdy', 'xyppaoeedeadppd', 'apxabxxpexdpddd', 'bybexbobbexoypx', 'ppoxdbaoyoxpodd', 'bapyeaobdooybbo', 'daepyxddeaaoddy', 'ebpoexebaeyybbb', 'boayyooeebbabyb', 'doeyaoeyodpodbb', 'opybeapodpxobpa', 'ypybxapdpxxdxoa', 'yxdbebpaybopooe']


In [41]:
good = imtsl_harmony_generate(n = 1000, length = 15)
good_evals = list(map(all, zip(map(R1.is_grammatical, good), map(R2.is_grammatical, good))))
bad = imtsl_harmony_generate(n = 1000, length = 15, grammatical = False)
bad_evals = list(map(all, zip(map(R1.is_grammatical, bad), map(R2.is_grammatical, bad))))
assert all(good_evals) == (not any(bad_evals)) == True
print("The correctness of the generator is verified.")

The correctness of the generator is verified.


## Getting data samples ready

In [42]:
def annotate(string):
    return ">>" + string + "<<"

In [170]:
import random
random.seed(0)
FL_harmony = first_last_generate(n = 200, length = 2)
FL_harmony.extend(first_last_generate(n = 200, length = 3))
FL_harmony.extend(first_last_generate(n = 200, length = 4))
FL_harmony.extend(first_last_generate(n = 200, length = 5))
FL_harmony.extend([">><<", ">>a<<", ">>o<<"])
FL_harmony = list(map(annotate, FL_harmony))
#FL_harmony_bad = first_last_generate(n = 150, length = 5, grammatical = False)
print(FL_harmony)

['>>oo<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>oo<<', '>>oo<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>oo<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>oo<<', '>>aa<<', '>>oo<<', '>>oo<<', '>>aa<<', '>>aa<<', '>>aa<<',

In [171]:
MTSL_harmony = vc_harmony_generate(n = 150, length = 7)
#print("F")MTSL_harmony_bad = vc_harmony_generate(n = 150, length = 7, grammatical = False)

In [172]:
IMTSL_harmony = imtsl_harmony_generate(n = 250, length = 15)
#IMTSL_harmony_bad = imtsl_harmony_generate(n = 250, length = 15, grammatical = False)

## Code of the learning algorithm

In [173]:
sigma1 = ["a", "o", "x"]
sigma2 = ["a", "o", "b", "p"]
sigma3 = ["a", "o", "e", "x", "b", "p", "d", "y"]

### Generating only well-formed n-grams

First, since we generate ngrams automatically, we need to have a checker that nothing like `a<<b`, `a><k`, or `fdh<` is generated.

In [174]:
def well_formed_ngram(ngram):
    """Tells if the given ngram is well-formed. An ngram is ill-formed if:
    * there is something in-between two start- or end-symbols
      ('>a>'), or
    * something is before start symbol or after the end symbol
       ('a>'), or
    * the ngram consists only of start- or end-symbols.
    Otherwise it is well-formed.
    Arguments:
        ngram (str): The ngram that needs to be evaluated.
    Returns:
        bool: well-formedness of the ngram.
    """
    start, end = [], []
    for i in range(len(ngram)):
        if ngram[i] == ">":
            start.append(i)
        elif ngram[i] == "<":
            end.append(i)

    start_len, end_len = len(start), len(end)
    if any([start_len == len(ngram), end_len == len(ngram)]):
        return False
    
    if start_len > 0:
        if ngram[0] != ">":
            return False
        if start_len > 1:
            for i in range(1, start_len):
                if start[i] - start[i - 1] != 1:
                    return False
    
    if end_len > 0:
        if ngram[-1] != "<":
            return False
        if end_len > 1:
            for i in range(1, end_len):
                if end[i] - end[i - 1] != 1:
                    return False
                
    # this part is different from the checker from the SigmaPie
    # to avoid passing fourgrams such as "><<<"
    if len(ngram) > 3 and (ngram[1] == "<" or ngram[2] == ">"):
        return False

    return True

In [175]:
# generate all possible 4-grams based on the alphabet

from itertools import product

def generate_all_ngrams(alphabet, size):
    all_of_them = ["".join(i) for i in product(alphabet + ["<", ">"], repeat = size)]
    return [i for i in all_of_them if well_formed_ngram(i)]

sigma1 = ["a", "o", "x"]
# print(generate_all_ngrams(sigma1, 4))

In [176]:
# see which 4-grams are not in data

def unattested_ngrams(data, all_ngrams):
    not_in_data = []
    for f in all_ngrams:
        found = False
        for string in data:
            if f in string:
                found = True
                break
        if not found:
            not_in_data.append(f)
    return not_in_data

unattested = unattested_ngrams(FL_harmony, generate_all_ngrams(sigma1, 4))
print(unattested)

['aax<', 'aox<', 'axx<', 'ax<<', 'oax<', 'oox<', 'oxx<', 'ox<<', 'xaax', 'xaox', 'xaxx', 'xax<', 'xoax', 'xoox', 'xoxx', 'xox<', 'xxax', 'xxox', 'xxxx', 'xxx<', 'xx<<', '>ao<', '>ax<', '>oa<', '>ox<', '>xaa', '>xao', '>xax', '>xa<', '>xoa', '>xoo', '>xox', '>xo<', '>xxa', '>xxo', '>xxx', '>xx<', '>x<<', '>>xa', '>>xo', '>>xx', '>>x<']


In [177]:
# find all paths from those those 4-grams


# NOTE: we cannot just look at all symbols when inserting/removing 
# from the set of possible tier alphabet items: to learn the whole class
# of IMTSL grammars, we need to look at all pairs of symbols.
# we check which other _pairs_ can be removed/added to every path X

# paths ("ab", [X], "cd") : set of tuples

import re
paths = {}

def string_paths(string, fourgram):
    ind_first = [s.start() for s in re.finditer(fourgram[:2], string)]
    ind_second = [s.start() for s in re.finditer(fourgram[2:], string)]
    
    if not (ind_first and ind_second):
        return []
    
    paths = set()
    
    for f in ind_first:
        for s in ind_second:
            if f >= s:
                continue
                
            middle = string[f+1:s+1]
            in_between = tuple(set(middle[i:i+2] for i in range(len(middle) - 1)))
            path = (fourgram[:2], in_between, fourgram[2:])
            paths.add(path)

    return paths

# string_paths(">>abcc<<", ">ac<")
# string_paths(">>oxao<<", "oxxa")
string_paths(">>aoxoao<<", ">ao<")

{('>a', ('ao', 'ox', 'xo', 'oa'), 'o<')}

In [178]:
# find all the paths for the unattested ngrams

# collect a list of paths for a particular (unattested) pair of bigrams
def data_paths(data, fourgram):
    total_paths = set()
    for d in data:
        total_paths = total_paths.union(string_paths(d, fourgram))
        
    return total_paths

# print(data_paths(FL_harmony, ">ao<"))

def find_relevant_paths(data, unattested_ngrams):
    relevant_paths = dict()
    for un in unattested_ngrams:
        relevant_paths[un] = data_paths(data, un)
    return relevant_paths

relevant_paths = find_relevant_paths(FL_harmony, unattested)
# print(relevant_paths)

In [214]:
import random
random.seed(0)
FL_harmony = first_last_generate(n = 2000, length = 2)
FL_harmony.extend(first_last_generate(n = 2000, length = 3))
FL_harmony.extend(first_last_generate(n = 2000, length = 4))
FL_harmony.extend(first_last_generate(n = 2000, length = 5))
FL_harmony.extend(first_last_generate(n = 2000, length = 6))
FL_harmony.extend(first_last_generate(n = 2000, length = 7))
FL_harmony.extend([">><<", ">>a<<", ">>o<<"])
FL_harmony = list(set(list(map(annotate, FL_harmony))))
#FL_harmony_bad = first_last_generate(n = 150, length = 5, grammatical = False)
print(FL_harmony)

['>>oaoaaao<<', '>>ooaxxxo<<', '>>aoxxa<<', '>>axoxxoa<<', '>>oxaaoxo<<', '>>oaoaxxo<<', '>>aaxaoaa<<', '>>oaxoaoo<<', '>>oaaaaxo<<', '>>aaxxoaa<<', '>>aaoaxxa<<', '>>oxxoaoo<<', '>>aaaxaxa<<', '>>aaoaa<<', '>>oxaaxxo<<', '>>oaaoxo<<', '>>oaxooao<<', '>>oooaooo<<', '>>oxaaxo<<', '>>axxaaa<<', '>>oxxooo<<', '>>axxxa<<', '>>ooooxo<<', '>>aaoaooa<<', '>>aaaoaaa<<', '>>axaaaa<<', '>>ooaaao<<', '>>axoooxa<<', '>>aoaxooa<<', '>>aooxoa<<', '>>ooxxaoo<<', '>>axaxooa<<', '>>oaooxxo<<', '>>ooaxoao<<', '>>oxxaooo<<', '>>aoaxxxa<<', '>>oaooaxo<<', '>>aaaaxxa<<', '>>oaoxo<<', '>>oaxo<<', '>>aaooa<<', '>>axaooaa<<', '>>aoaoa<<', '>>oxaoao<<', '>>aoxoxxa<<', '>>axxaooa<<', '>>oxoaoao<<', '>>oooxxxo<<', '>>aaoaaaa<<', '>>axoa<<', '>>axooxoa<<', '>>aoxooaa<<', '>>oaaxooo<<', '>>oxxoxao<<', '>>aoaxaaa<<', '>>aooxaoa<<', '>>ooaaoo<<', '>>aooaoxa<<', '>>oaooxao<<', '>>oaooao<<', '>>oxaoxao<<', '>>axxoaoa<<', '>>ooaaxoo<<', '>>aoaaaa<<', '>>aaaooxa<<', '>>ooxaxoo<<', '>>ooxoooo<<', '>>aoxaxoa<<', '>>aaxaoa

In [220]:
def learn_imtsl(data, alphabet, n = 2, context = 2, redacted=False):
    """
    A function that extracts IMTSL grammars. 
    
    Arguments:
    data (list): examples from the target language;
    alphabet (list or set): symbols of the language;
    n (int): size of the target ngrams (available for 2);
    context (int): size of the local context considered (available for 2).
    
    Returns:
    dict: keys are tier alphabets, values are tier grammars.
    """
    if n != 2 or context != 2:
        raise NotImplementedError("This algorithm does not support such values yet.")
        
    # collect a list of unattested ngrams (n * context = 2 * 2 = 4)
    unattested = unattested_ngrams(data, generate_all_ngrams(alphabet, 4))
    
    # build a look-up table with the relevant paths
    relevant_paths = find_relevant_paths(FL_harmony, unattested)
    
    # initialize the grammar and the maximum tier
    max_tier_guess = generate_all_ngrams(alphabet, context)
    grammar = dict()
    
    for un in unattested:
        
        local_tier = [un[:2], un[2:]]
        local_relevant = [(i[0], set(i[1]), i[2]) for i in relevant_paths[un]]
        
        for ss in max_tier_guess:
            if ss in [un[:2], un[2:]]:
                continue
                
            contain_ss = [i for i in local_relevant if ss in i[1]]
            
            added = False
            for pth in contain_ss:
                no_ss_in_path = (pth[0], set(i for i in pth[1] if i != ss), pth[2])
                if no_ss_in_path not in local_relevant:
                    local_tier.append(ss)
                    added = True
                    break
                if added:
                    continue
                    
        grammar[(un[:2], un[2:])] = local_tier[:]
        
    if not redacted:
        return grammar
    
    new_grammar = dict()
    for pair in grammar:
        if tuple(grammar[pair]) not in new_grammar:
            new_grammar[tuple(grammar[pair])] = [pair]
        else:
            new_grammar[tuple(grammar[pair])].append([pair])
    
    del grammar
    return new_grammar



from pprint import pprint
pprint(learn_imtsl(FL_harmony, sigma1, redacted=False))

# for all tier candidates
    # check the add condition
    # check the remove condition

{('>>', 'x<'): ['>>', 'x<'],
 ('>>', 'xa'): ['>>', 'xa', 'ao', 'ax', 'oa', 'ox', 'xo', '>a', '>o'],
 ('>>', 'xo'): ['>>', 'xo', 'ao', 'ax', 'oa', 'ox', 'xa', '>a', '>o'],
 ('>>', 'xx'): ['>>', 'xx', 'ao', 'ax', 'oa', 'ox', 'xa', 'xo', '>a', '>o'],
 ('>a', 'o<'): ['>a', 'o<'],
 ('>a', 'x<'): ['>a', 'x<'],
 ('>o', 'a<'): ['>o', 'a<'],
 ('>o', 'x<'): ['>o', 'x<'],
 ('>x', '<<'): ['>x', '<<'],
 ('>x', 'a<'): ['>x', 'a<'],
 ('>x', 'aa'): ['>x', 'aa'],
 ('>x', 'ao'): ['>x', 'ao'],
 ('>x', 'ax'): ['>x', 'ax'],
 ('>x', 'o<'): ['>x', 'o<'],
 ('>x', 'oa'): ['>x', 'oa'],
 ('>x', 'oo'): ['>x', 'oo'],
 ('>x', 'ox'): ['>x', 'ox'],
 ('>x', 'x<'): ['>x', 'x<'],
 ('>x', 'xa'): ['>x', 'xa'],
 ('>x', 'xo'): ['>x', 'xo'],
 ('>x', 'xx'): ['>x', 'xx'],
 ('aa', 'x<'): ['aa', 'x<'],
 ('ao', 'x<'): ['ao', 'x<'],
 ('ax', '<<'): ['ax', '<<', 'ao', 'a<', 'oa', 'ox', 'o<', 'xa', 'xo'],
 ('ax', 'x<'): ['ax', 'x<'],
 ('oa', 'x<'): ['oa', 'x<'],
 ('oo', 'x<'): ['oo', 'x<'],
 ('ox', '<<'): ['ox', '<<', 'ao', 'ax', 'a<