In [1]:
import jellyfish as j
from fuzzywuzzy import fuzz

# Soundex Tests (only English)

In [2]:
j.soundex(u'beirut')

'B630'

In [3]:
j.soundex(u'beyrut')

'B630'

In [4]:
j.soundex(u'beyrouth')

'B630'

In [5]:
j.soundex(u'بيروت')

'ب000'

In [6]:
j.soundex(u'صيدا')

'ص000'

-----

# Metaphone Tests (only English)

In [7]:
j.metaphone(u'beirut')

'BRT'

In [8]:
j.metaphone(u'beyrut')

'BRT'

In [9]:
j.metaphone(u'beyrouth')

'BR0'

In [10]:
j.metaphone(u'بيروت')

''

In [11]:
j.metaphone(u'صيدا')

''

---

# NYSIIS Tests (only English)

In [12]:
j.nysiis(u'beirut')

'BARAT'

In [13]:
j.nysiis(u'beyrut')

'BAYRAT'

In [14]:
j.nysiis(u'beyrouth')

'BAYRAT'

In [15]:
j.nysiis(u'بيروت')

'بيروت'

In [16]:
j.nysiis(u'صيدا')

'صيدا'

---

# Match Rating Codex Tests

In [17]:
j.match_rating_codex(u'beirut')

'BRT'

In [18]:
j.match_rating_codex(u'beyrut')

'BYRT'

In [19]:
j.match_rating_codex(u'beyrouth')

'BYRTH'

In [20]:
j.match_rating_codex(u'بيروت')

'بيروت'

In [21]:
j.match_rating_codex(u'صيدا')

'صيدا'

---

# Levenshtein Distance Tests

In [22]:
def levenshtein_distance(a, b):
    """
    Return jellyfish's Levenshtein distance between two strings as a label.

    :param a: first string
    :param b: second string
    :return: the distance followed by ' edit(s)', e.g. '2 edit(s)'
    """
    edits = j.levenshtein_distance(a, b)
    return f'{edits} edit(s)'

In [23]:
levenshtein_distance(u'beirut', u'beyrut')

'1 edit(s)'

In [24]:
levenshtein_distance(u'beirut', u'beyrouth')

'3 edit(s)'

In [25]:
levenshtein_distance(u'beyrut', u'beyrouth')

'2 edit(s)'

In [26]:
levenshtein_distance(u'receive', u'recieve')

'2 edit(s)'

In [27]:
levenshtein_distance(u'بيروت', u'يبيروت')

'1 edit(s)'

In [28]:
levenshtein_distance(u'بيروت', u'لبيروت')

'1 edit(s)'

---

# Damerau-Levenshtein Distance Tests

In [29]:
def damerau_levenshtein_distance(a, b):
    """
    Return jellyfish's Damerau-Levenshtein distance between two strings as a label.

    :param a: first string
    :param b: second string
    :return: the distance followed by ' edit(s)', e.g. '1 edit(s)'
    """
    edits = j.damerau_levenshtein_distance(a, b)
    return f'{edits} edit(s)'

In [30]:
damerau_levenshtein_distance(u'beirut', u'beyrut')

'1 edit(s)'

In [31]:
damerau_levenshtein_distance(u'beirut', u'beyrouth')

'3 edit(s)'

In [32]:
damerau_levenshtein_distance(u'beyrut', u'beyrouth')

'2 edit(s)'

In [86]:
damerau_levenshtein_distance(u'receive', u'recieve')

'1 edit(s)'

In [33]:
damerau_levenshtein_distance(u'بيروت', u'يبيروت')

'1 edit(s)'

In [34]:
damerau_levenshtein_distance(u'بيروت', u'لبيروت')

'1 edit(s)'

---

# Hamming Distance Tests

In [35]:
def hamming_distance(a, b):
    """
    Return jellyfish's Hamming distance between two strings as a label.

    :param a: first string
    :param b: second string
    :return: the distance followed by ' different characters'
    """
    mismatches = j.hamming_distance(a, b)
    return f'{mismatches} different characters'

In [36]:
hamming_distance(u'beirut', u'beyrut')

'1 different characters'

In [37]:
hamming_distance(u'beirut', u'beyrouth')

'5 different characters'

In [38]:
hamming_distance(u'beyrut', u'beyrouth')

'4 different characters'

In [39]:
hamming_distance(u'بيروت', u'يبيروت')

'6 different characters'

In [40]:
hamming_distance(u'بيروت', u'لبيروت')

'6 different characters'

---

# Trigram Similarity (similar to the one in PostgreSQL's pg_trgm extension)

In [41]:
import re


def find_ngrams(text, number=3):
    """
    Build the set of character n-grams found in a string.

    The text is lower-cased and split on runs of non-word characters. Each
    resulting word is padded with two leading spaces and one trailing space
    before its n-grams are collected.

    :param text: the string to extract n-grams from
    :param number: the length of each n-gram. defaults to 3 (trigrams)
    :return: set of n-gram strings (empty when ``text`` is falsy)
    """
    if not text:
        return set()

    padded_words = (
        f'  {token} '
        for token in re.split(r'\W+', text.lower())
        if token.strip()
    )

    # Slide a window of `number` characters across every padded word; a word
    # shorter than the window yields an empty range and contributes nothing.
    return {
        padded[start:start + number]
        for padded in padded_words
        for start in range(len(padded) - number + 1)
    }


def similarity(text1, text2, number=3):
    """
    Finds the similarity between 2 strings using ngrams.

    The score is the number of n-grams the two strings share divided by the
    number of distinct n-grams across both: 0 means no n-gram in common and
    1 means both strings produce the same set of n-grams.

    :param text1: first string to compare
    :param text2: second string to compare
    :param number: n-gram length handed to ``find_ngrams``. defaults to 3
    :return: float between 0.0 and 1.0; 0.0 when neither string yields any
        n-gram (e.g. both are empty or contain only punctuation)
    """

    ngrams1 = find_ngrams(text1, number)
    ngrams2 = find_ngrams(text2, number)

    num_unique = len(ngrams1 | ngrams2)
    if num_unique == 0:
        # Neither input produced an n-gram; report "no similarity" rather
        # than dividing zero by zero.
        return 0.0

    num_equal = len(ngrams1 & ngrams2)

    return float(num_equal) / float(num_unique)

In [42]:
similarity(u'beirut', u'beyrut', 2)

0.6

In [43]:
similarity(u'beirut', u'beyrouth', 2)

0.2857142857142857

In [44]:
similarity(u'beyrut', u'beyrouth', 2)

0.5

In [45]:
similarity(u'بيروت', u'يبيروت', 2)

0.6666666666666666

In [46]:
similarity(u'بيروت', u'لبيروت', 2)

0.6666666666666666

---

# FuzzyWuzzy Ratio()

In [47]:
fuzz.ratio(u'beirut', u'beyrut')

83

In [48]:
fuzz.ratio(u'beirut', u'beyrouth')

71

In [49]:
fuzz.ratio(u'beyrut', u'beyrouth')

86

In [50]:
fuzz.ratio(u'بيروت', u'يبيروت')

91

In [51]:
fuzz.ratio(u'بيروت', u'لبيروت')

91

---

# FuzzyWuzzy Partial Ratio()

In [52]:
fuzz.partial_ratio(u'beirut', u'beyrut')

83

In [53]:
fuzz.partial_ratio(u'beirut', u'beyrouth')

67

In [54]:
fuzz.partial_ratio(u'beyrut', u'beyrouth')

83

In [55]:
fuzz.partial_ratio(u'بيروت', u'يبيروت')

100

In [56]:
fuzz.partial_ratio(u'بيروت', u'لبيروت')

100

In [57]:
fuzz.partial_ratio('and', 'aandkit')

100

---

# FuzzyWuzzy Token Sort Ratio()

In [58]:
fuzz.token_sort_ratio(u'beirut', u'beyrut')

83

In [59]:
fuzz.token_sort_ratio(u'beirut', u'beyrouth')

71

In [60]:
fuzz.token_sort_ratio(u'beyrut', u'beyrouth')

86

In [61]:
fuzz.token_sort_ratio(u'بيروت', u'يبيروت')

91

In [62]:
fuzz.token_sort_ratio(u'بيروت', u'لبيروت')

91

---

# FuzzyWuzzy Token Set Ratio()

In [63]:
fuzz.token_set_ratio(u'beirut', u'beyrut')

83

In [64]:
fuzz.token_set_ratio(u'beirut', u'beyrouth')

71

In [65]:
fuzz.token_set_ratio(u'beyrut', u'beyrouth')

86

In [66]:
fuzz.token_set_ratio(u'بيروت', u'يبيروت')

91

In [67]:
fuzz.token_set_ratio(u'بيروت', u'لبيروت')

91

---

# Combination

In [68]:
a = 'This is beirut we are talking about'
b = 'hyde byrout le 3m n7ke 3nna'
c = "بعدو محمد عطوي بين الحياة و الموت بسبب بهايم قرروا ينهوا حياة شاب بدفن شاب ، و اليوم بهايم جدد بدفن بالشويفات عم يطلقوا النار بشكل مخيف و يهددوا حياة الناس مجددا و البهايم بدولتنا متل العادة او مش عارفين او ما خلوهم #انا_خط_احمر #كلن_يعني_كلن #لبنان_ينتفض"

locations = ['beirut', 'beyrouth', 'شويفات']
text = [a, b, c]

# Score every sample text against every location spelling and print both the
# fuzz.ratio and fuzz.partial_ratio results, grouped per sample text.
for index, sample in enumerate(text):
    label = f'[{index}] Location:'
    for place in locations:
        full_score = fuzz.ratio(sample, place)
        partial_score = fuzz.partial_ratio(sample, place)
        print(label, place, '==> Ratio:', full_score)
        print(label, place, '==> Partial Ratio:', partial_score)
        print('---')
    print('-------------------------')

[0] Location: beirut ==> Ratio: 29
[0] Location: beirut ==> Partial Ratio: 100
---
[0] Location: beyrouth ==> Ratio: 28
[0] Location: beyrouth ==> Partial Ratio: 62
---
[0] Location: شويفات ==> Ratio: 0
[0] Location: شويفات ==> Partial Ratio: 0
---
-------------------------
[1] Location: beirut ==> Ratio: 24
[1] Location: beirut ==> Partial Ratio: 67
---
[1] Location: beyrouth ==> Ratio: 34
[1] Location: beyrouth ==> Partial Ratio: 75
---
[1] Location: شويفات ==> Ratio: 0
[1] Location: شويفات ==> Partial Ratio: 0
---
-------------------------
[2] Location: beirut ==> Ratio: 0
[2] Location: beirut ==> Partial Ratio: 0
---
[2] Location: beyrouth ==> Ratio: 0
[2] Location: beyrouth ==> Partial Ratio: 0
---
[2] Location: شويفات ==> Ratio: 5
[2] Location: شويفات ==> Partial Ratio: 100
---
-------------------------


In [69]:
fuzz.ratio('and', 'aandkit')

60

In [70]:
fuzz.partial_ratio('and', 'aandkit')

100

In [71]:
fuzz.partial_ratio('aandkit', 'and')

100

In [72]:
(fuzz.partial_ratio('and', 'aandkit') + fuzz.ratio('and', 'aandkit')) / 2

80.0

In [73]:
fuzz.partial_ratio('شويفات', 'شو')

100

In [74]:
fuzz.ratio('شويفات', 'شو')

50

---

In [75]:
a = 'شو'
b = 'شويفات'
c = 'شوي'
d = 'بلا'
e = 'بلاط'

In [76]:
fuzz.ratio(a, b)

50

In [77]:
fuzz.partial_ratio(a, b)

100

In [78]:
fuzz.ratio(c, b)

67

In [79]:
fuzz.partial_ratio(c, b)

100

In [80]:
fuzz.ratio(d, e)

86

In [81]:
fuzz.partial_ratio(d, e)

100

In [82]:
(fuzz.partial_ratio(a, b) + fuzz.ratio(a, b)) / 2

75.0

In [83]:
(fuzz.partial_ratio(c, b) + fuzz.ratio(c, b)) / 2

83.5

In [84]:
(fuzz.partial_ratio(d, e) + fuzz.ratio(d, e)) / 2

93.0

---

In [93]:
j.soundex('beirut')

'B630'

In [91]:
j.soundex('beyr')

'B600'

In [92]:
damerau_levenshtein_distance('beirut', 'beyrouth')

'3 edit(s)'