In [1]:
import textdistance as td

In [2]:
# Sanity check: identical strings give a Levenshtein distance of 0.
td.levenshtein("Test","Test")

0

In [3]:

# The strings differ in a single character ("s" vs "x"), so the distance is 1.
td.levenshtein("Test","Text")

1

In [4]:
# Jaro-Winkler yields a fractional score (about 0.81 for this pair), not an edit count.
td.jaro_winkler("this test","that test")

0.8052910052910053

In [5]:
# Jaccard score for two four-letter words that share only 't' and 'h' (about 0.33).
td.jaccard('this','that')

0.3333333333333333

In [6]:
# 'ppale' is a reordering of the letters of 'apple'; the pair still scores 1.0,
# so character order does not influence this measure for these inputs.
td.cosine('apple','ppale')

1.0

In [7]:
# MRA comparison of two similar names (returns 2 for this pair).
# NOTE(review): presumably the Match Rating Approach; confirm how the score
# should be interpreted in the textdistance documentation.
td.mra('alex','alice')

2

In [8]:
# Editex value for a pair of similar names (5 for this pair).
td.editex('alex','alice')

5

In [9]:
# A clearly unrelated pair produces a much larger Editex value (14 for this pair).
td.editex('tie','euphemism')

14

In [10]:
# Position-by-position comparison: only the leading 'a' and 'l' line up, which is
# consistent with the 2/5 = 0.4 result for strings of length 4 and 5.
td.hamming.normalized_similarity('alex','alice')

0.4

In [11]:
# Identical strings give a normalized similarity of 1.0.
td.hamming.normalized_similarity('tie','tie')

1.0

In [12]:
# Iterative Levenshtein Function

def iterative_levenshtein(s, t):
    """Return the Levenshtein (edit) distance between ``s`` and ``t``.

    The distance is the minimum number of single-character deletions,
    insertions and substitutions needed to turn ``s`` into ``t``.

    A table ``dist`` is filled so that ``dist[i][j]`` holds the distance
    between the first ``i`` characters of ``s`` and the first ``j``
    characters of ``t``; the answer is the bottom-right entry.

    Parameters
    ----------
    s, t : str
        The strings to compare. Either (or both) may be empty, in which
        case the distance is the length of the other string.

    Returns
    -------
    int
        The edit distance between ``s`` and ``t``.
    """
    rows = len(s) + 1
    cols = len(t) + 1
    dist = [[0] * cols for _ in range(rows)]

    # Source prefixes can be transformed into the empty string by deletions.
    for i in range(1, rows):
        dist[i][0] = i

    # Target prefixes can be built from an empty source by insertions.
    for j in range(1, cols):
        dist[0][j] = j

    for col in range(1, cols):
        for row in range(1, rows):
            cost = 0 if s[row - 1] == t[col - 1] else 1
            dist[row][col] = min(dist[row - 1][col] + 1,         # deletion
                                 dist[row][col - 1] + 1,         # insertion
                                 dist[row - 1][col - 1] + cost)  # substitution

    # Index by the table size rather than the loop variables: when either
    # string is empty the loops above never bind ``row``/``col``.
    return dist[rows - 1][cols - 1]

f"Distance is {iterative_levenshtein('Alex', 'Alice')}"

'Distance is 3'

In [23]:
# Fuzzy Joins
# https://python-bloggers.com/2020/12/fuzzy-joins-tutorial/

import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [24]:
# Left-hand side of the fuzzy join: the strings we want to find a partner for.
df1 = pd.DataFrame(
    [
        '12 Estias Street, Ampelokipi',
        'Georgios Pipis',
        'fuzzy much in python',
        'Today is Friday! TGIF',
        'This is a partial text',
    ],
    columns=['Text_A'],
)

# Right-hand side: the candidate strings to match against.
df2 = pd.DataFrame(
    [
        'Predictive Hacks is a Data Science Blog',
        'Abelokipi, Estias Str 12',
        'Fuzzy matching in Python',
        'George P. Pipis',
        'partial text',
    ],
    columns=['Text_B'],
)

In [26]:
df1

Unnamed: 0,Text_A
0,"12 Estias Street, Ampelokipi"
1,Georgios Pipis
2,fuzzy much in python
3,Today is Friday! TGIF
4,This is a partial text


In [27]:
df2

Unnamed: 0,Text_B
0,Predictive Hacks is a Data Science Blog
1,"Abelokipi, Estias Str 12"
2,Fuzzy matching in Python
3,George P. Pipis
4,partial text


In [28]:
# Cross join: pair every Text_A with every Text_B by merging on a constant key.
# The key is added to temporary copies via ``assign`` so that df1 and df2 are
# left untouched (no stray 'dummy' column) and the cell can be re-run safely.
# On pandas >= 1.2 the same result is available as ``df1.merge(df2, how='cross')``.
df = (
    df1.assign(dummy=True)
    .merge(df2.assign(dummy=True), on='dummy')
    .drop('dummy', axis=1)
)
df

Unnamed: 0,Text_A,Text_B
0,"12 Estias Street, Ampelokipi",Predictive Hacks is a Data Science Blog
1,"12 Estias Street, Ampelokipi","Abelokipi, Estias Str 12"
2,"12 Estias Street, Ampelokipi",Fuzzy matching in Python
3,"12 Estias Street, Ampelokipi",George P. Pipis
4,"12 Estias Street, Ampelokipi",partial text
5,Georgios Pipis,Predictive Hacks is a Data Science Blog
6,Georgios Pipis,"Abelokipi, Estias Str 12"
7,Georgios Pipis,Fuzzy matching in Python
8,Georgios Pipis,George P. Pipis
9,Georgios Pipis,partial text


In [29]:
# FuzzyWuzzy Library

# We will calculate the following ratios between the two columns of our data frame:

#     Ratio: It refers to the Levenshtein Distance Ratio.
#     Partial Ratio: Assume that we are dealing with two strings of different lengths, L1 and L2, where L1 is less than L2. The algorithm then seeks the score of the best-matching substring of length L1 within the longer string.
#     Token Sort Ratio: First it removes punctuation and converts the text to lower case, and then it tokenizes it. Then it sorts the tokens alphabetically and joins them into a single string before comparing.
#     Token Set Ratio: Similar to the Token Sort Ratio, but it takes into consideration the unique tokens.

# Below we will return these four measures by adding them as columns to the joined data frame.

In [30]:
# Score every (Text_A, Text_B) pair with each of the four fuzzywuzzy measures,
# storing one integer score column per measure.
df['Ratio'] = [fuzz.ratio(a, b) for a, b in zip(df['Text_A'], df['Text_B'])]
df['Partial_Ratio'] = [fuzz.partial_ratio(a, b) for a, b in zip(df['Text_A'], df['Text_B'])]
df['Token_Sort_Ratio'] = [fuzz.token_sort_ratio(a, b) for a, b in zip(df['Text_A'], df['Text_B'])]
df['Token_Set_Ratio'] = [fuzz.token_set_ratio(a, b) for a, b in zip(df['Text_A'], df['Text_B'])]

df

Unnamed: 0,Text_A,Text_B,Ratio,Partial_Ratio,Token_Sort_Ratio,Token_Set_Ratio
0,"12 Estias Street, Ampelokipi",Predictive Hacks is a Data Science Blog,33,39,39,39
1,"12 Estias Street, Ampelokipi","Abelokipi, Estias Str 12",46,50,88,88
2,"12 Estias Street, Ampelokipi",Fuzzy matching in Python,23,25,27,27
3,"12 Estias Street, Ampelokipi",George P. Pipis,28,40,29,29
4,"12 Estias Street, Ampelokipi",partial text,35,58,41,41
5,Georgios Pipis,Predictive Hacks is a Data Science Blog,23,36,34,34
6,Georgios Pipis,"Abelokipi, Estias Str 12",32,43,38,38
7,Georgios Pipis,Fuzzy matching in Python,21,29,21,21
8,Georgios Pipis,George P. Pipis,76,71,79,79
9,Georgios Pipis,partial text,23,25,23,23


In [31]:
# Fuzzy Joins

# Since we have calculated the pairwise similarities of the text, we can join the two string columns by keeping the most similar pair. Let’s assume that we want to match df1 on df2. We can group the joined df on Text_A, rank the similarities within each group, and then keep the most similar pair (i.e. Rank=1). We have to choose a measure, and for this example we will keep the Token_Set_Ratio.

In [32]:
# Dense-rank the candidates inside each Text_A group, highest Token_Set_Ratio
# first, so the best candidate(s) for every Text_A get rank 1.
df['Rank_Token_Set_Ratio'] = (
    df.groupby('Text_A')['Token_Set_Ratio']
    .rank(method='dense', ascending=False)
)

# Keep only the top-ranked candidate rows and the columns of interest.
df.loc[df['Rank_Token_Set_Ratio'] == 1, ['Text_A', 'Text_B', 'Token_Set_Ratio']]

Unnamed: 0,Text_A,Text_B,Token_Set_Ratio
1,"12 Estias Street, Ampelokipi","Abelokipi, Estias Str 12",88
8,Georgios Pipis,George P. Pipis,79
12,fuzzy much in python,Fuzzy matching in Python,86
17,Today is Friday! TGIF,Fuzzy matching in Python,45
24,This is a partial text,partial text,100


In [33]:
# As we can see, it captured all the matches, but it also matched two strings that are not related. The reason is that the text “Today is Friday! TGIF” cannot be matched with any text from Text_B. For that reason, it makes sense to add a threshold. A good threshold is around 70.

In [39]:
# Best candidate per Text_A, kept only when its score clears the 70 threshold.
df.query('Rank_Token_Set_Ratio == 1 and Token_Set_Ratio > 70')[['Text_A', 'Text_B', 'Token_Set_Ratio']]

Unnamed: 0,Text_A,Text_B,Token_Set_Ratio
1,"12 Estias Street, Ampelokipi","Abelokipi, Estias Str 12",88
8,Georgios Pipis,George P. Pipis,79
12,fuzzy much in python,Fuzzy matching in Python,86
24,This is a partial text,partial text,100
