In [29]:
import pandas as pd
from nltk.metrics import edit_distance, binary_distance
from pyphonetics import Soundex
import string
import numpy as np
import ast

In [2]:
df = pd.read_csv('tt_dataset.csv')
df

Unnamed: 0,text,phonemes,soundex
0,Peter Piper picked a peck of pickled peppers. ...,"[['P', 'IY1', 'T', 'ER0'], ['P', 'AY1', 'P', '...","['P360', 'P160', 'P230', 'A000', 'P200', 'O100..."
1,I saw Susie sitting in a shoe shine shop. Wher...,"[['AY1'], ['S', 'AO1'], ['S', 'UW1', 'Z', 'IY0...","['I000', 'S000', 'S200', 'S352', 'I500', 'A000..."
2,How many boards Could the Mongols hoard If the...,"[['HH', 'AW1'], ['M', 'EH1', 'N', 'IY0'], ['B'...","['H000', 'M500', 'B632', 'C430', 'T000', 'M524..."
3,How can a clam cram in a clean cream can?,"[['HH', 'AW1'], ['K', 'AE1', 'N'], ['AH0'], ['...","['H000', 'C500', 'A000', 'C450', 'C650', 'I500..."
4,Send toast to ten tense stout saints' ten tall...,"[['S', 'EH1', 'N', 'D'], ['T', 'OW1', 'S', 'T'...","['S530', 'T230', 'T000', 'T500', 'T520', 'S330..."
...,...,...,...
2671,Then step up mister and twist your tongue,"[['DH', 'EH1', 'N'], ['S', 'T', 'EH1', 'P'], [...","['T500', 'S310', 'U100', 'M236', 'A530', 'T230..."
2672,Now Kissle will whistle at busty Miss. Russell...,"[['N', 'AW1'], ['K', 'IH1', 'S', 'AH0', 'L'], ...","['N000', 'K240', 'W400', 'W234', 'A300', 'B230..."
2673,"Purple paper people, purple paper people, purp...","[['P', 'ER1', 'P', 'AH0', 'L'], ['P', 'EY1', '...","['P614', 'P160', 'P140', 'P614', 'P160', 'P140..."
2674,De doorgaans dappere Durgerdammer drukker Dirk...,"[['D', 'IY1'], ['D', 'AO1', 'R', 'G', 'AH0', '...","['D000', 'D625', 'D160', 'D626', 'D626', 'D620..."


## implement metrics

### for text

In [3]:
### necessary functions

# remove all punctuation from a string
def remove_punctuation(input_string):
    punctuation_chars = string.punctuation # Create a string containing all punctuation marks
    no_punct = ''.join(char for char in input_string if char not in punctuation_chars) # Create new string without punctuation
    
    return no_punct



In [4]:
# calculates levenshtein dist between every two words in a string
# args: input string, boolean of allowing transpositions
def levenshtein_dist(tongue_twister, transp):
    words = remove_punctuation(tongue_twister).split(' ') # split tt into indiv words
    levenshtein_arr = np.zeros((len(words), len(words))) # initialise distance array
    for i in range(len(words)):
        for j in range(i+1, len(words)):
            dist = edit_distance(words[i], words[j], substitution_cost=1, transpositions=transp) # Transposition True because of difficukty to distinguish both words
            levenshtein_arr[i, j] = dist
            levenshtein_arr[j, i] = dist
            
    return levenshtein_arr


# calculates binary dist between every two words in a string
# args: input string
def binary_dist(tongue_twister):
    words = remove_punctuation(tongue_twister).split(' ') # split tt into indiv words
    binary_dist_arr = np.zeros((len(words), len(words))) # initialise distance array
    for i in range(len(words)):
        for j in range(i, len(words)):
            dist = binary_distance(words[i], words[j]) 
            binary_dist_arr[i, j] = dist
            binary_dist_arr[j, i] = dist
            
    return binary_dist_arr

In [5]:
print(levenshtein_dist(df['text'][0], True))

[[0. 2. 5. ... 0. 2. 5.]
 [2. 0. 4. ... 2. 0. 4.]
 [5. 4. 0. ... 5. 4. 0.]
 ...
 [0. 2. 5. ... 0. 2. 5.]
 [2. 0. 4. ... 2. 0. 4.]
 [5. 4. 0. ... 5. 4. 0.]]


In [6]:
print(df['text'][0])
print(binary_dist(df['text'][0]))

Peter Piper picked a peck of pickled peppers. A peck of pickled peppers Peter Piper picked. If Peter Piper picked a peck of pickled peppers, Where's the peck of pickled peppers Peter Piper picked?
[[0. 1. 1. ... 0. 1. 1.]
 [1. 0. 1. ... 1. 0. 1.]
 [1. 1. 0. ... 1. 1. 0.]
 ...
 [0. 1. 1. ... 0. 1. 1.]
 [1. 0. 1. ... 1. 0. 1.]
 [1. 1. 0. ... 1. 1. 0.]]


### for phonemes

In [44]:
# Nimmt zwei Listen mit Phonemen (, die je ein Wort darstellen) und gibt den Levenshtein-Abstand zwischen den Phonemlisten zurück
def phoneme_levenshtein_dist(ph_list1, ph_list2):
    len_str1 = len(ph_list1) + 1
    len_str2 = len(ph_list2) + 1
    matrix = [[0] * len_str2 for _ in range(len_str1)] # Erstellung der Matrix, in der die Abstände gespeichert werden
    # Erste Zeile und Spalte füllen
    for i in range(len_str1):
        matrix[i][0] = i
    for j in range(len_str2):
        matrix[0][j] = j
    # Restliche Matrix füllen
    for i in range(1, len_str1):
        for j in range(1, len_str2):
            cost = 0 if ph_list1[i - 1] == ph_list2[j - 1] else 1
            matrix[i][j] = min(
                matrix[i - 1][j] + 1,      # Löschen
                matrix[i][j - 1] + 1,      # Einfügen
                matrix[i - 1][j - 1] + cost  # Ersetzen
            )

    # The final value in the matrix represents the Levenshtein distance
    return matrix[-1][-1]

In [50]:
ph_list = ast.literal_eval(df['phonemes'][0])
print(ph_list)

print(phoneme_levenshtein_dist(ph_list[0], ph_list[2]))

[['P', 'IY1', 'T', 'ER0'], ['P', 'AY1', 'P', 'ER0'], ['P', 'IH1', 'K', 'T'], ['AH0'], ['P', 'EH1', 'K'], ['AH1', 'V'], ['P', 'IH1', 'K', 'AH0', 'L', 'D'], ['P', 'EH1', 'P', 'ER0', 'Z'], ['AH0'], ['P', 'EH1', 'K'], ['AH1', 'V'], ['P', 'IH1', 'K', 'AH0', 'L', 'D'], ['P', 'EH1', 'P', 'ER0', 'Z'], ['P', 'IY1', 'T', 'ER0'], ['P', 'AY1', 'P', 'ER0'], ['P', 'IH1', 'K', 'T'], ['IH1', 'F'], ['P', 'IY1', 'T', 'ER0'], ['P', 'AY1', 'P', 'ER0'], ['P', 'IH1', 'K', 'T'], ['AH0'], ['P', 'EH1', 'K'], ['AH1', 'V'], ['P', 'IH1', 'K', 'AH0', 'L', 'D'], ['P', 'EH1', 'P', 'ER0', 'Z'], ['W', 'EH1', 'R', 'Z'], ['DH', 'AH0'], ['P', 'EH1', 'K'], ['AH1', 'V'], ['P', 'IH1', 'K', 'AH0', 'L', 'D'], ['P', 'EH1', 'P', 'ER0', 'Z'], ['P', 'IY1', 'T', 'ER0'], ['P', 'AY1', 'P', 'ER0'], ['P', 'IH1', 'K', 'T']]
3


### for soundex

In [40]:
soundex = Soundex()

sd = ast.literal_eval(df['soundex'][0])
sd2 = list(df['soundex'][0])
print(sd[3])
print(soundex.distance('test', 'rear', metric='levenshtein'))


# calculates levenshtein dist between every two words in a string
# args: input string, boolean of allowing transpositions
def soundex_levenshtein_dist(soundex_list):
    # words = remove_punctuation(tongue_twister).split(' ') # split tt into indiv words
    levenshtein_arr = np.zeros((len(soundex_list), len(soundex_list))) # initialise distance array
    for i in range(len(soundex_list)):
        for j in range(i+1, len(soundex_list)):
            dist = edit_distance(soundex_list[i], soundex_list[j], substitution_cost=1, transpositions=False) # Transposition True because of difficukty to distinguish both words
            levenshtein_arr[i, j] = dist
            levenshtein_arr[j, i] = dist
            
    return levenshtein_arr

A000
3


In [52]:
s1 = df['soundex'][0]
print(s1)
print(soundex_levenshtein_dist(s1))

['P360', 'P160', 'P230', 'A000', 'P200', 'O100', 'P243', 'P162', 'A000', 'P200', 'O100', 'P243', 'P162', 'P360', 'P160', 'P230', 'I100', 'P360', 'P160', 'P230', 'A000', 'P200', 'O100', 'P243', 'P162', 'W620', 'T000', 'P200', 'O100', 'P243', 'P162', 'P360', 'P160', 'P230']
[[0. 1. 1. ... 1. 1. 1.]
 [1. 0. 1. ... 1. 0. 1.]
 [1. 1. 0. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 0. 1. 1.]
 [1. 0. 1. ... 1. 0. 1.]
 [1. 1. 1. ... 1. 1. 0.]]
