In [1]:
import random as ra
import pandas as pd
import string
import hungarian_algorithm.algorithm as ha

**Import the table of visual similarities between lower-case letters.** For this step to work, you will need to download the file `13428_2012_271_MOESM1_ESM.xlsx`, which is available under "Electronic supplementary material" for the research paper
[A letter visual-similarity matrix for Latin-based alphabets](https://link.springer.com/article/10.3758/s13428-012-0271-4), and save it in the notebook's working directory.

As described in the paper, two forms of letter 'a' are included in the table; we average the similarities for the two forms of 'a'.

In [2]:
def read_similarity_table(input_filename="13428_2012_271_MOESM1_ESM.xlsx"):
    """Load the lower-case letter similarity ratings as one row per ordered pair.

    Reads columns C:E of the "List-Lower" sheet (the code expects them to be
    named ``Letter1``, ``Letter2`` and ``Value``), keeps only rows in which
    both letters are ASCII lower-case and differ from each other, and averages
    ``Value`` over every repeated (Letter1, Letter2) combination.

    Parameters
    ----------
    input_filename : str or path-like, optional
        Workbook to read. Defaults to the supplementary file's original name,
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``Letter1``, ``Letter2``, ``similarity`` (mean rating) and
        ``Key`` (the ``(Letter1, Letter2)`` tuple), sorted by letter pair.

    Raises
    ------
    AssertionError
        If the table does not contain exactly one row for each of the
        26 * 25 ordered pairs of distinct letters.
    """
    lowercase = list(string.ascii_lowercase)

    raw = pd.read_excel(input_filename, sheet_name="List-Lower", usecols="C:E")

    # One combined mask instead of re-assigning the frame three times.
    keep = (
        raw['Letter1'].isin(lowercase)
        & raw['Letter2'].isin(lowercase)
        & (raw['Letter1'] != raw['Letter2'])
    )

    # Averaging per pair collapses duplicated rows into a single rating.
    df = (
        raw.loc[keep]
        .groupby(['Letter1', 'Letter2'])
        .agg(similarity=('Value', 'mean'))
        .reset_index()
    )

    # Tuple key used later to look up a pair's similarity in a dict.
    df['Key'] = list(zip(df['Letter1'], df['Letter2']))

    expected_rows = 26 * 25
    assert df.shape[0] == expected_rows, (
        "expected %d ordered letter pairs, found %d" % (expected_rows, df.shape[0])
    )

    return df


In [3]:
# Load the pairwise ratings once; the following cells all work from this frame.
sim_df = read_similarity_table()

**Which letter pairs were judged the most similar? Which were judged the least similar?**

In [4]:
# The table holds both orders of every pair; keep only the alphabetical
# order so each unordered pair is listed once, then show the five highest.
(
    sim_df
    .loc[sim_df['Letter1'] < sim_df['Letter2']]
    .sort_values('similarity', ascending=False)
    .head(5)
)

Unnamed: 0,Letter1,Letter2,similarity,Key
210,i,l,6.133333,"(i, l)"
27,b,d,5.6,"(b, d)"
390,p,q,5.566667,"(p, q)"
187,h,n,5.533333,"(h, n)"
548,v,y,5.333333,"(v, y)"


In [5]:
# Same one-row-per-unordered-pair filter, sorted ascending to show the
# five lowest-rated pairs.
(
    sim_df
    .loc[sim_df['Letter1'] < sim_df['Letter2']]
    .sort_values('similarity')
    .head(5)
)

Unnamed: 0,Letter1,Letter2,similarity,Key
238,j,o,1.0,"(j, o)"
496,t,w,1.0,"(t, w)"
421,q,w,1.033333,"(q, w)"
446,r,w,1.033333,"(r, w)"
263,k,o,1.033333,"(k, o)"


In [6]:
# Lookup table: (letter1, letter2) tuple -> mean similarity rating.
sim_dict = (
    sim_df
    .set_index('Key')['similarity']
    .to_dict()
)

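We decree that every letter is more similar to itself than any two distinct letters are to each other: its self-similarity is set to one more than the highest rating in the table.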

In [7]:
# One above the largest observed rating, so no pair of distinct letters
# can score as high as a letter paired with itself.
self_similarity = sim_df['similarity'].max() + 1

In [8]:
# The table excludes identical-letter pairs, so add the (x, x) entries here;
# this mutates sim_dict in place.
for letter in string.ascii_lowercase:
    sim_dict[(letter, letter)] = self_similarity

**Function to measure the total similarity of a rearrangement of a word.**

In [9]:
def rearr_similarity(word, rearr):
    """Return the summed letter-by-letter similarity between two strings.

    Characters at the same index in `word` and `rearr` are paired and each
    pair's score is looked up in `sim_dict`. Pairing stops at the end of the
    shorter string, so surplus characters in the longer one are ignored.
    """
    return sum(sim_dict[pair] for pair in zip(word, rearr))

In [10]:
# Only the middle two letters are swapped; 'b' and 't' stay in place.
rearr_similarity("boat", "baot")

22.4

In [11]:
# The word reversed: every letter lands in a different position.
rearr_similarity("boat", "taob")

11.399999999999999

In [12]:
# Another arrangement in which no letter keeps its original position.
rearr_similarity("boat", "atob")

9.666666666666666

**Function to build the dictionary representation of the assignment problem needed by the `hungarian-algorithm` library.**

In [13]:
def mk_assignment_problem(word):
    """Describe `word` as a nested dict of position-to-position similarities.

    The outer keys are the source positions written as decimal strings
    ('0', '1', ...). Each maps to an inner dict whose keys are the candidate
    destination positions prefixed with an underscore ('_0', '_1', ...) and
    whose values are the `sim_dict` score between the letter at the source
    position and the letter at the destination position.
    """
    positions = range(len(word))

    problem = {}
    for src in positions:
        problem[str(src)] = {
            '_%d' % dst: sim_dict[(word[src], word[dst])]
            for dst in positions
        }
    return problem

**Function to build the assignment problem, solve it for the minimum total similarity, and translate the resulting matching into a rearranged word.**

In [14]:
def rearrange_word(word):
    """Return a rearrangement of `word` chosen to minimise total similarity.

    Builds the assignment problem for `word`, asks `ha.find_matching` for a
    minimum-weight matching, and reads each matched edge as "position `src`
    of the result takes the letter at position `dst` of `word`".

    Parameters
    ----------
    word : str
        The word whose letters are to be permuted.

    Returns
    -------
    str
        The letters of `word` in the order given by the matching. A word
        with fewer than two letters is returned unchanged.
    """
    # With zero or one letter there is only one arrangement, so there is
    # nothing to optimise and the solver is skipped.
    if len(word) < 2:
        return "".join(word)

    matching = ha.find_matching(mk_assignment_problem(word), matching_type = 'min')

    # Each entry's first element is the (source key, destination key) pair;
    # source keys look like '3' and destination keys like '_3'.
    permutation_dict = {}
    for entry in matching:
        src_key, dst_key = entry[0][0], entry[0][1]
        permutation_dict[int(src_key)] = int(dst_key[1:])

    return "".join(word[permutation_dict[i]] for i in range(len(word)))

In [15]:
# Rearrangement of the sample word picked by the minimum-similarity matching.
rearrange_word("boat")

'atbo'