In [210]:
from itertools import product
from more_itertools import run_length

In [211]:
def vocabulary(letters, n):
    """Enumerate every word of length ``n`` over the alphabet ``letters``.

    Parameters
    ----------
    letters : iterable of str
        The symbols a word may be built from.
    n : int
        Number of symbols per word.

    Returns
    -------
    list of str
        All ``len(letters)**n`` words, in the order produced by
        ``itertools.product`` (last position varies fastest). For ``n == 0``
        the result is ``['']``.
    """
    # One copy of the alphabet per position in the word.
    pools = [letters for _ in range(n)]
    words = []
    for combo in product(*pools):
        words.append(''.join(combo))
    return words
    

def simplification_rules(Lmax, letters=('H', 'T')):
    """Build the rewrite table used to shorten run-length-encoded words.

    Keys of the form ``'<letter><count>'`` (e.g. ``'T3'``) map a run of
    ``count`` identical letters to its reduced spelling (e.g. ``'ST'``).
    A handful of multi-letter "conjugation" patterns are included as well.

    Parameters
    ----------
    Lmax : int
        Longest run length that must be covered. ``H`` rules are generated
        for counts ``1..Lmax``; ``T``, ``S`` and ``Z`` rules always cover at
        least one full period (8, 4 and 2 entries respectively), even when
        ``Lmax`` is smaller.
    letters : sequence of str, optional
        Alphabet the rules are requested for. Only ``['H', 'T']`` is
        supported. It is an explicit argument (rather than a notebook
        global) so the function works regardless of cell execution order.

    Returns
    -------
    dict of str -> str
        The rewrite table.

    Raises
    ------
    NotImplementedError
        If ``letters`` is anything other than ``H``, ``T``.
    """
    if list(letters) != ['H', 'T']:
        # Fail loudly instead of printing and returning None, which would
        # only resurface later as an AttributeError in the caller.
        raise NotImplementedError('Rules not implemented!')

    # reduced_powers[letter][k] is the reduced spelling of a run whose
    # length is congruent to k modulo len(reduced_powers[letter]).
    reduced_powers = {
        'H': ['', 'H'],
        'T': ['', 'T', 'S', 'ST', 'Z', 'ZT', 'ZS', 'ZST'],
        'S': ['', 'S', 'Z', 'ZS'],
        'Z': ['', 'Z'],
    }

    rules = {}
    for letter, forms in reduced_powers.items():
        period = len(forms)
        # H is tabulated strictly up to Lmax; the other letters always get
        # at least their first full period of entries.
        highest = Lmax if letter == 'H' else max(Lmax, period)
        for n in range(1, highest + 1):
            rules[f'{letter}{n}'] = forms[n % period]

    # conjugation rules
    # NOTE(review): these keys are multi-letter patterns, not the
    # '<letter><count>' form of the entries above, so a lookup keyed on
    # run-length syllables will never hit them -- confirm intended use.
    rules['HZH'] = 'X'
    rules['HSH'] = 'Sd'
    rules['HSdH'] = 'S'
    rules['HTH'] = 'Td'
    rules['HTdH'] = 'T'

    return rules

In [246]:
letters = ['H','T']
L = 20

# Enumerate every word over `letters` with length 1..L. `extend` grows the
# list in place instead of re-copying it on every iteration.
vocab = []
for length in range(1, L+1):
    vocab.extend(vocabulary(letters, length))

# A single call with the largest length is enough: the table built for a
# smaller Lmax is a subset of the one built for L, so merging the tables for
# every length 1..L yields exactly this dictionary.
simp = simplification_rules(L)

totalsize = len(vocab)
size = totalsize # will be updated

print(f'There are {size} words in the vocabulary with length up to {L}')

# Apply the rewrite table repeatedly until a pass no longer shrinks the set
# of distinct words (or the pass budget of 99 iterations is exhausted).
for i in range(1,100):

    # Split each word into maximal runs of one repeated letter, look up the
    # reduced spelling of every run under its '<letter><count>' key, and glue
    # the pieces back together. The empty word has no runs and stays ''.
    # Collecting into a set de-duplicates; the resulting list order is
    # arbitrary (it depends on string hashing).
    vocab = list({
        ''.join(simp[f'{letter}{count}'] for letter, count in run_length.encode(word))
        for word in vocab
    })
    print(f'\n---- simplification #{i}: reduced to {len(vocab)} unique words')
    if len(vocab) < size:
        size = len(vocab)
    else:
        print(f'\nDONE!\n\nCompressed to {round(100*size/totalsize,2)}% of the original vocabulary\n')
        break

print(f'Here are some of the unique words:\n\n{vocab[:50]}')

There are 2097150 words in the vocabulary with length up to 20

---- simplification #1: reduced to 235580 unique words

---- simplification #2: reduced to 125622 unique words

---- simplification #3: reduced to 104389 unique words

---- simplification #4: reduced to 102621 unique words

---- simplification #5: reduced to 102035 unique words

---- simplification #6: reduced to 101899 unique words

---- simplification #7: reduced to 101868 unique words

---- simplification #8: reduced to 101867 unique words

---- simplification #9: reduced to 101867 unique words

DONE!

Compressed to 4.86% of the original vocabulary

Here are some of the unique words:

['', 'HZSHSHTSHTH', 'THSHTHZHZSHT', 'THSHTHTHTHTHZTHT', 'ZTHTHTHZHTHT', 'SHTSHSHZHTH', 'HTHTHSHSTHTHSHSH', 'TZSTHZTS', 'THSHTHTHTHZHST', 'THTHTHZSHSTHTHT', 'THTHSTSHTHTH', 'SHTSHZHTHSHT', 'THTSHSTSTHTHT', 'HSHZHZTHZ', 'THSHTHTHSHZS', 'HTSTHZSHSTHT', 'HTHTHTHSHTHSHTHT', 'TSHZTHTHTHT', 'HTHSHSTHZHSHT', 'THTHSTSTHTHTHST', 'SHTSHSHTHTSH', 'STH