# TSL data helper

Let's examine a student's data...

In [28]:
import sigmapie
nawuri = sigmapie.TSL(polar="n")

# Symbol legend for the transcription used in `data`:
#   C = rounded consonant
#   c = unrounded non-labial consonant
#   l = unrounded labial consonant
#   v = unrounded vowel
#   V = rounded vowel

# The student's sample, exactly as provided (repeated words included).
data = [
    'cvlv', 'cvcvlv', 'cVcV', 'cvcv', 'cvcvcv', 'cVcV',
    'cVCv', 'cVCVcC', 'cvlV',
    'cvlVcV', 'cvlV', 'cvccv', 'cvcccv', 'cvclV', 'cvlcV',
    'cvclv', 'cvlcv', 'cVcCv', 'cVCcv', 'cc', 'ccc', 'c', '',
]
# Smaller alternative sample, kept for reference (not used):
# data = ['cvcv', 'cVcV','cVCv', 'cVCVcC', 'cvccv', 'cVCcv', #non-blocked patterns
#              'cvlcV', 'cvclv', 'cvlV', #blocked patterns
#               'c', '']#other

# Hand the sample to the learner, let it derive the alphabet from the
# data, then learn the tier and grammar.
nawuri.data = data
nawuri.extract_alphabet()
nawuri.learn()
#nawuri.switch_polarity()

# Report polarity (p), tier (T) and grammar (G), one per line.
print(
    f"p:\t {nawuri.check_polarity()}",
    f"T:\t {nawuri.tier}",
    f"G:\t {nawuri.grammar}",
    sep="\n",
)


p:	 n
T:	 ['C', 'V', 'c', 'l', 'v']
G:	 [('C', 'C'), ('C', 'l'), ('V', 'V'), ('V', 'l'), ('V', 'v'), ('l', 'C'), ('l', 'l'), ('l', '<'), ('v', 'C'), ('v', 'V'), ('v', 'v'), ('>', 'C'), ('>', 'V'), ('>', 'l'), ('>', 'v')]


**Problem**: According to Jardine 2016, `c` should be removed from the tier because it is in free distribution

***Actual* problem**: The `sigmapie` package implements a slightly different version of the TSL learning algorithm ([Jardine & McMullin 2016](https://adamjardine.net/files/jardinemcmullin2016tslk.pdf)), with stricter requirements on what can be removed from a tier (oops!)

## TSL data helper

The following functions test the first round of the learning algorithm (where $T = \Sigma$) to see what can be removed from the tier. This is the same thing `sigmapie` does, but the code below also prints what it finds.

In [8]:
def k_fac(s, k):
    '''Return the k-factors of ``s`` once it is wrapped in edge markers.

    The word is padded to ``'>' + s + '<'``.  Every contiguous window of
    length ``k`` over the padded word is returned as a tuple of symbols,
    in left-to-right order.  When the padded word is shorter than ``k``
    the result is a single tuple holding the whole padded word.
    '''
    padded = ''.join(('>', s, '<'))
    if len(padded) < k:
        # Too short for even one full window: keep the word as one factor.
        return [tuple(padded)]
    window_count = len(padded) - k + 1
    return [tuple(padded[start:start + k]) for start in range(window_count)]

def sample_factors(sample, k):
    '''Collect the k-factors of every word in ``sample``.

    The result is one flat list: the factors of the first word (as
    produced by ``k_fac``) followed by those of the second word, and so
    on.  Duplicates are kept.
    '''
    return [factor for word in sample for factor in k_fac(word, k)]

def inject(fac,sym):
    '''helper for Condition a for Theorem 2

    Build every tuple obtained by inserting ``sym`` at one position of the
    factor ``fac``: before the first element, between any two elements,
    and after the last element.  Results are ordered by insertion point,
    left to right.

    Insertions that would leave an edge marker in the interior of the
    factor are dropped: ``'>'`` may only be the first element and ``'<'``
    may only be the last.  The check looks at the whole tuple, so it is
    correct for factors of any length (the previous ``f[0] != '<'`` test
    only caught the right edge for length-1 factors).

    fac -- tuple of symbols, possibly containing edge markers
    sym -- the symbol to insert
    Returns a list of tuples, each one element longer than ``fac``.
    '''
    injected = [fac[:i] + (sym,) + fac[i:] for i in range(len(fac) + 1)]
    return [
        f for f in injected
        # '>' anywhere but first, or '<' anywhere but last, is impossible.
        if '>' not in f[1:] and '<' not in f[:-1]
    ]

def eject(fac,sym):
    '''helper for Condition b for Theorem 2

    Return a copy of the factor ``fac`` with the first occurrence of
    ``sym`` removed.  Returns ``None`` when ``sym`` does not occur in
    ``fac``.

    fac -- tuple of symbols
    sym -- the symbol to remove
    '''
    for i, item in enumerate(fac):
        if item == sym:
            # Only the first occurrence is removed.
            return fac[:i] + fac[i+1:]
    return None
    
def test_injections(less,kf,sym):
    '''Tests condition a for Theorem 2'''
    test_set = set()
    for f in less:
        for new in inject(f,sym):
            test_set.add(new)
    return test_set.difference(kf)

def test_ejections(more,kf,sym):
    '''Tests condition b for Theorem 2'''
    test_set = set()
    for f in more:
        if sym in f:
            for i in range(len(f)):
                if f[i] == sym:
                    new = f[:i] + f[i+1:]
                    test_set.add(new)
    return test_set.difference(kf)

def test_sample_set(sample, k):
    '''Print, for each symbol in ``sample``, whether it passes both tests.

    The alphabet is taken from the characters that occur in the sample.
    For each symbol the function runs ``test_injections`` (against the
    (k-1)-factors) and ``test_ejections`` (against the (k+1)-factors),
    both compared with the k-factors of the sample.  A symbol passes when
    both tests return an empty set; otherwise the offending factors are
    printed.

    sample -- iterable of words (strings)
    k      -- factor width
    Returns nothing; the report is written to stdout.
    '''
    # sorted() keeps the report order stable: the iteration order of a set
    # of strings can change from one interpreter run to the next.
    alphabet = sorted(set(''.join(sample)))
    k_factors = sample_factors(sample, k)
    less_factors = sample_factors(sample, k-1)
    more_factors = sample_factors(sample, k+1)
    for sym in alphabet:
        print(f"Testing distribution of '{sym}'...", end ="")
        i_result = test_injections(less_factors, k_factors, sym)
        e_result = test_ejections(more_factors, k_factors, sym)
        if not i_result and not e_result:
            print(f"PASS, '{sym}' will be removed from T")
            continue
        print("FAIL")
        if i_result:
            print(f"\t '{sym}' not freely distributed, missing factor(s) {i_result}")
        if e_result:
            print(f"\t '{sym}' as an intervener has the following dependent factor(s): {e_result}")

### Example 1 - (NoMoreThan)OneB

In [9]:
# Sample for the (NoMoreThan)OneB example: no word holds more than one 'b'.
sample = ['', 'a', 'ba', 'ab', 'aba', 'aa']
# `alphabet` is informational only; test_sample_set derives its own
# alphabet from the characters in the sample.
alphabet = list('ab')
k = 2

test_sample_set(sample=sample, k=k)

Testing distribution of 'a'...PASS, 'a' will be removed from T
Testing distribution of 'b'...FAIL
	 'b' not freely distributed, missing factor(s) {('b', 'b')}


In [10]:
# Run sigmapie's TSL learner on the same `sample` for comparison.
oneb = sigmapie.TSL(polar="n")
oneb.data = sample
oneb.extract_alphabet()
oneb.learn()

# Report polarity (p), tier (T) and grammar (G), one per line.
print(
    f"p:\t {oneb.check_polarity()}",
    f"T:\t {oneb.tier}",
    f"G:\t {oneb.grammar}",
    sep="\n",
)

p:	 n
T:	 ['b']
G:	 [('b', 'b')]


### Example 2 - back to Nawuri

In [29]:
# Show the sample as it stands before the extra words are added.
print(data,"\n")

# Additional words to add to the sample.
extra_words = ['v', 'V', 'vv', 'VV', 'Vlv', 'l', 'll']

# Rebind `data` to a new list instead of using `+=`:
#   * `+=` extends the existing list object in place, and that same object
#     was assigned to `nawuri.data` earlier, so it may be shared with the
#     first learner;
#   * skipping words already present makes the cell safe to re-run without
#     appending the extras a second time.
data = data + [word for word in extra_words if word not in data]

test_sample_set(data, 2)

['cvlv', 'cvcvlv', 'cVcV', 'cvcv', 'cvcvcv', 'cVcV', 'cVCv', 'cVCVcC', 'cvlV', 'cvlVcV', 'cvlV', 'cvccv', 'cvcccv', 'cvclV', 'cvlcV', 'cvclv', 'cvlcv', 'cVcCv', 'cVCcv', 'cc', 'ccc', 'c', ''] 

Testing distribution of 'l'...FAIL
	 'l' not freely distributed, missing factor(s) {('l', 'C'), ('C', 'l')}
	 'l' as an intervener has the following dependent factor(s): {('v', 'V'), ('V', 'v')}
Testing distribution of 'c'...PASS, 'c' will be removed from T
Testing distribution of 'C'...FAIL
	 'C' not freely distributed, missing factor(s) {('C', 'C'), ('v', 'C'), ('C', 'l'), ('>', 'C'), ('l', 'C')}
	 'C' as an intervener has the following dependent factor(s): {('V', 'v')}
Testing distribution of 'V'...FAIL
	 'V' not freely distributed, missing factor(s) {('v', 'V'), ('V', 'v')}
Testing distribution of 'v'...FAIL
	 'v' not freely distributed, missing factor(s) {('v', 'V'), ('V', 'v'), ('v', 'C')}


In [30]:
# Run sigmapie's TSL learner on the current contents of `data`.
new = sigmapie.TSL(polar="n")
new.data = data
new.extract_alphabet()
new.learn()

# Report polarity (p), tier (T) and grammar (G), one per line.
print(
    f"p:\t {new.check_polarity()}",
    f"T:\t {new.tier}",
    f"G:\t {new.grammar}",
    sep="\n",
)

p:	 n
T:	 ['C', 'V', 'l', 'v']
G:	 [('C', 'C'), ('C', 'l'), ('V', 'v'), ('l', 'C'), ('v', 'C'), ('v', 'V'), ('>', 'C')]


## One problem at a time

In [36]:
# Alphabet for the stress example.
# V = stressed vowel
# words must have exactly one stressed vowel
stress_alpha = list('cvV')

# The grammar is specified by hand instead of being learned: only 'V' is
# on the tier.  With polar="n" the listed pairs are presumably the
# forbidden tier bigrams (confirm against sigmapie): ('>', '<') would be a
# tier with no stressed vowel, ('V', 'V') a tier with two in a row.
stress_lang = sigmapie.TSL(polar="n")
stress_lang.tier = ['V']
stress_lang.grammar = [('>', '<'), ('V', 'V')]
stress_lang.alphabet = stress_alpha

# Report tier (T) and grammar (G), one per line.
print(
    f"T:\t {stress_lang.tier}",
    f"G:\t {stress_lang.grammar}",
    sep="\n",
)

T:	 ['V']
G:	 [('>', '<'), ('V', 'V')]


In [37]:
# Ask stress_lang for 10 strings; the cell displays the returned list.
# NOTE(review): `repeat=False` presumably suppresses duplicate strings —
# confirm against the sigmapie documentation.
stress_lang.generate_sample(10,repeat=False)

['vVcv', 'vvcVcc', 'cVv', 'cvV', 'cvcV', 'vvVcc', 'vvVc', 'vVv', 'cV', 'cVc']

In [38]:
# syllables must always have one onset and no codas
# This grammar is learned from a small sample with sigmapie's SL learner,
# not written by hand.
syl_lang = sigmapie.SL(polar="p")
syl_sample = ['cV', 'cv', 'cvcv', 'cVcV', '']
syl_lang.data = syl_sample
syl_lang.extract_alphabet()
syl_lang.learn()

# Report the learned grammar (G).
print(f"G:\t {syl_lang.grammar}")

G:	 [('v', 'c'), ('V', '<'), ('V', 'c'), ('c', 'V'), ('>', 'c'), ('>', '<'), ('c', 'v'), ('v', '<')]


In [39]:
# Ask syl_lang for 10 strings; the cell displays the returned list.
# NOTE(review): `repeat=False` presumably suppresses duplicate strings —
# confirm against the sigmapie documentation.
syl_lang.generate_sample(10,repeat=False)

['',
 'cvcvcV',
 'cv',
 'cVcV',
 'cVcVcV',
 'cvcV',
 'cVcvcvcVcvcVcvcVcvcvcVcvcv',
 'cvcVcv',
 'cVcv',
 'cV']

In [46]:
# intersect the two languages
# what is the complexity of this resulting language?
#
# NOTE: this intersects two samples of 100 strings each, not the two
# languages themselves, so the printed set only contains strings that
# happen to appear in both samples.
stress_set = set(stress_lang.generate_sample(100, repeat=False))
syl_set = set(syl_lang.generate_sample(100, repeat=False))

print(stress_set & syl_set)

{'cvcVcv', 'cvcV', 'cVcv', 'cV'}
