In [667]:
import numpy as np
import editdistance

In [668]:
# Load syllables.txt into a list, one entry per line with surrounding
# whitespace (including the trailing newline) stripped.
syllables = []
with open('syllables.txt', 'r') as fh:
    syllables.extend(map(str.strip, fh))

In [669]:
# Drop the ';' separators to recover the plain words, then collect the words
# that are exactly 4 and exactly 8 characters long.
wordss = [entry.replace(';', '') for entry in syllables]
wordss4 = list(filter(lambda word: len(word) == 4, wordss))
wordss8 = list(filter(lambda word: len(word) == 8, wordss))

In [670]:
# Split each ';'-separated entry into its parts (entries without ';' are
# skipped), keep those with at most 3 parts, and wrap every kept entry in
# <BOS> ... <EOS> markers.
# NOTE(review): a later cell rebinds `syllables` to the vocabulary list, whose
# items contain no ';' -- re-running this cell after that one yields an empty
# `data`.
data = [
    ['<BOS>', *parts, '<EOS>']
    for parts in (entry.split(';') for entry in syllables if ";" in entry)
    if len(parts) <= 3
]

In [671]:
# Build the vocabulary: the two markers first (so <BOS> is index 0 and <EOS>
# is index 1), followed by every distinct syllable that occurs in `data`.
# The distinct syllables are sorted so that index assignment is the same on
# every run; iterating a set of strings directly gives an order that can
# change between interpreter sessions because str hashes are randomized.
# NOTE(review): this rebinds `syllables`, which until here held the raw lines
# of syllables.txt; cells that expect the raw lines must run before this one.
syllables = ['<BOS>', '<EOS>'] + sorted(
    {item for entry in data for item in entry if item not in ('<BOS>', '<EOS>')}
)

In [672]:
# Vocabulary size; the count includes the <BOS> and <EOS> markers.
len(syllables)

5153

In [673]:
# Build a 2-gram (bigram) count matrix over the vocabulary:
# transition_matrix[i, j] counts how often syllables[j] directly follows
# syllables[i] in `data`.

# Resolve each syllable to its row/column once. Calling syllables.index()
# inside the loops would rescan the whole vocabulary for every lookup.
# (Vocabulary entries are distinct, so this matches list.index().)
syllable_index = {syllable: position for position, syllable in enumerate(syllables)}

# initialize matrix
transition_matrix = np.zeros((len(syllables), len(syllables)))

# fill matrix with the observed adjacent pairs of every entry
for entry in data:
    for current, following in zip(entry, entry[1:]):
        transition_matrix[syllable_index[current], syllable_index[following]] += 1

# add a constant to each row to avoid zero probabilities
transition_matrix += 1

# <BOS> only ever starts an entry, so nothing may transition *into* it.
# Without this, the smoothing above would let the sampler emit <BOS> in the
# middle of a word.
transition_matrix[:, syllable_index['<BOS>']] = 0

In [674]:
# Convert counts to probabilities: divide every row by its own total so each
# row sums to 1, then turn any NaN left by the division into 0.
transition_matrix = np.nan_to_num(
    np.divide(transition_matrix, transition_matrix.sum(axis=1, keepdims=True))
)

In [675]:
def generate_word(n, vocab=None, transitions=None, rng=None):
    """Sample a word by walking the syllable bigram model starting at <BOS>.

    Args:
        n: Maximum number of sampling steps, i.e. the largest number of
            syllables the word can contain.
        vocab: Sequence of syllable strings with '<BOS>' at index 0 and
            '<EOS>' at index 1. Defaults to the notebook-level `syllables`.
        transitions: Table whose row ``i`` holds the probabilities of each
            vocab entry following entry ``i`` (each row must sum to 1).
            Defaults to the notebook-level `transition_matrix`.
        rng: Optional object exposing a NumPy-style ``choice(a, p=...)``
            method, e.g. ``np.random.default_rng(seed)``, for reproducible
            sampling. Defaults to the global ``np.random`` state.

    Returns:
        The sampled syllables joined into one string, without the
        <BOS>/<EOS> markers. Sampling stops as soon as <EOS> is drawn, so
        the result may contain fewer than ``n`` syllables (possibly none).
    """
    # Fall back to the notebook globals only when no explicit model is given,
    # so existing generate_word(n) calls keep working unchanged.
    if vocab is None:
        vocab = syllables
    if transitions is None:
        transitions = transition_matrix
    choose = np.random.choice if rng is None else rng.choice

    bos_index, eos_index = 0, 1

    sampled = []
    current = bos_index
    for _ in range(n):
        # draw the next syllable conditioned on the current one
        current = choose(len(vocab), p=transitions[current])
        # <EOS> ends the word and is never part of the output
        if current == eos_index:
            break
        sampled.append(current)

    # Filter the marker by index instead of deleting the text '<BOS>' from the
    # joined string, so the result never depends on string matching.
    return ''.join(vocab[index] for index in sampled if index != bos_index)

In [701]:
# Draw 50000 words of at most 3 syllables each, keep the distinct ones that
# are exactly 8 letters long, and drop every word already present in wordss8.
# The known words are removed with a set difference (a per-word scan of the
# wordss8 list is avoided), and the result is sorted so its order does not
# depend on set iteration order.
words_8 = sorted(
    {word for word in (generate_word(3) for _ in range(50000)) if len(word) == 8}
    - set(wordss8)
)

In [702]:
# Draw 50000 words of at most 1 syllable each, keep the distinct ones that
# are exactly 4 letters long, and drop every word already present in wordss4.
# The known words are removed with a set difference (a per-word scan of the
# wordss4 list is avoided), and the result is sorted so its order does not
# depend on set iteration order.
words_4 = sorted(
    {word for word in (generate_word(1) for _ in range(50000)) if len(word) == 4}
    - set(wordss4)
)

In [739]:
# Pair every generated 8-letter word with every generated 4-letter word.
# words_8 and words_4 are each de-duplicated where they are built, so the
# cross product cannot contain a repeated pair; a set() round-trip here would
# only scramble the order and copy a potentially very large list.
pairs = [(long_word, short_word) for long_word in words_8 for short_word in words_4]

# Edit distance of each pair; edit_distances[k] belongs to pairs[k].
edit_distances = [editdistance.eval(long_word, short_word) for long_word, short_word in pairs]

In [740]:
# Rank the pairs from the largest edit distance to the smallest (ties are
# broken by comparing the pair tuples themselves).
# edit_distances is reordered together with pairs: sorting only `pairs` would
# leave the two lists misaligned, so running this cell a second time would
# rank pairs by distances that belong to other pairs.
ranked = sorted(zip(edit_distances, pairs), reverse=True)
edit_distances = [distance for distance, _ in ranked]
pairs = [pair for _, pair in ranked]
del ranked  # drop the temporary zipped copy; it can be very large

In [741]:
# Keep the first 80000 entries of `pairs` and render each one as a single
# string: the two words separated by one space.
pairs_ = [f'{first} {second}' for first, second in pairs[:80000]]

In [787]:
# Draw 5 distinct random words from words_8, then 5 distinct random words
# from words_4 (sampling without replacement, in that order).
words_8_, words_4_ = (
    np.random.choice(pool, size=5, replace=False) for pool in (words_8, words_4)
)

In [788]:
# Show the 5 words sampled from words_8.
words_8_

array(['grayosun', 'caugegat', 'biowclos', 'disaxlol', 'efvawick'],
      dtype='<U8')

In [789]:
# Show the 5 words sampled from words_4.
words_4_

array(['glan', 'clus', 'dane', 'asth', 'luke'], dtype='<U4')