In [1]:
import textdistance as td

In [2]:
# Baseline: Levenshtein on two identical strings (the recorded output is 0).
td.levenshtein("Test","Test")

0

In [3]:

# The inputs differ in a single character ('s' vs 'x'); the recorded output is 1.
td.levenshtein("Test","Text")

1

In [4]:
# Jaro-Winkler on two phrases that differ only in their first word; the
# recorded output is a fractional score (about 0.805), not an edit count.
td.jaro_winkler("this test","that test")

0.8052910052910053

In [5]:
# Jaccard on two four-letter words that share the letters 't' and 'h'.
td.jaccard('this','that')

0.3333333333333333

In [6]:
# Cosine on two anagrams (same letters, different order); the recorded
# output is 1.0.
td.cosine('apple','ppale')

1.0

In [7]:
# MRA on two similar-sounding names.
# NOTE(review): presumably the Match Rating Approach phonetic comparison -
# confirm against the textdistance documentation.
td.mra('alex','alice')

2

In [8]:
# Editex on the name pair 'alex' / 'alice'.
td.editex('alex','alice')

5

In [9]:
# Editex on two unrelated words of very different length, for contrast with
# a pair of similar names.
td.editex('tie','euphemism')

14

In [10]:
# Hamming through the explicit normalized_similarity method, on inputs of
# unequal length (4 vs 5 characters).
td.hamming.normalized_similarity('alex','alice')

0.4

In [11]:
# Identical inputs: the recorded normalized similarity is 1.0.
td.hamming.normalized_similarity('tie','tie')

1.0

In [12]:
# Iterative Levenshtein Function

def iterative_levenshtein(s, t):
    """Return the Levenshtein distance between the sequences ``s`` and ``t``.

    The distance is the minimum number of single-element deletions,
    insertions and substitutions needed to turn ``s`` into ``t``. It is
    computed bottom-up with a full (len(s)+1) x (len(t)+1) table in which
    ``dist[i][j]`` holds the distance between the first ``i`` elements of
    ``s`` and the first ``j`` elements of ``t``.

    Args:
        s: Source sequence (typically a string); must support ``len`` and
            integer indexing.
        t: Target sequence with the same requirements.

    Returns:
        int: The edit distance. Empty inputs are handled: the distance to or
        from an empty sequence is the length of the other one.
    """
    rows = len(s) + 1
    cols = len(t) + 1
    dist = [[0] * cols for _ in range(rows)]

    # Source prefixes can be transformed into the empty string by deletions.
    for i in range(1, rows):
        dist[i][0] = i

    # Target prefixes can be built from an empty source by insertions.
    for j in range(1, cols):
        dist[0][j] = j

    for col in range(1, cols):
        for row in range(1, rows):
            cost = 0 if s[row - 1] == t[col - 1] else 1
            dist[row][col] = min(dist[row - 1][col] + 1,         # deletion
                                 dist[row][col - 1] + 1,         # insertion
                                 dist[row - 1][col - 1] + cost)  # substitution

    # Address the bottom-right cell explicitly. Relying on the loop variables
    # after the loops fails when either input is empty, because the
    # corresponding loop body never runs and the variable stays unbound.
    return dist[rows - 1][cols - 1]

# Sanity check of the hand-written implementation on a sample name pair; the
# f-string is the cell's last expression, so the notebook displays it.
f"Distance is {iterative_levenshtein('Alex', 'Alice')}"

'Distance is 3'

In [16]:
# Fuzzy Joins
# https://python-bloggers.com/2020/12/fuzzy-joins-tutorial/

import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [17]:
# Left-hand table for the fuzzy join: a single free-text column, Text_A.
df1 = pd.DataFrame(
    [
        '12 Estias Street, Ampelokipi',
        'Georgios Pipis',
        'fuzzy much in python',
        'Today is Friday! TGIF',
        'This is a partial text',
    ],
    columns=['Text_A'],
)

# Right-hand table: a single free-text column, Text_B, to be matched
# against Text_A.
df2 = pd.DataFrame(
    [
        'Predictive Hacks is a Data Science Blog',
        'Abelokipi, Estias Str 12',
        'Fuzzy matching in Python',
        'George P. Pipis',
        'partial text',
    ],
    columns=['Text_B'],
)

In [18]:
# Display the left-hand table to confirm its contents before joining.
df1

Unnamed: 0,Text_A
0,"12 Estias Street, Ampelokipi"
1,Georgios Pipis
2,fuzzy much in python
3,Today is Friday! TGIF
4,This is a partial text


In [20]:
# Display the right-hand table to confirm its contents before joining.
df2

Unnamed: 0,Text_B
0,Predictive Hacks is a Data Science Blog
1,"Abelokipi, Estias Str 12"
2,Fuzzy matching in Python
3,George P. Pipis
4,partial text


In [22]:
# Build every (Text_A, Text_B) pairing - a Cartesian product - by joining on
# a constant helper key. The key is attached to temporary copies via
# .assign(), so df1 and df2 keep their original columns, and it is dropped
# again in the same chain, so no in-place mutation is needed.
# NOTE(review): on pandas >= 1.2, pd.merge(df1, df2, how='cross') expresses
# the same join directly - confirm the pinned pandas version before switching.
df = (
    df1.assign(dummy=True)
    .merge(df2.assign(dummy=True), on='dummy')
    .drop(columns='dummy')
)
df

Unnamed: 0,Text_A,Text_B
0,"12 Estias Street, Ampelokipi",Predictive Hacks is a Data Science Blog
1,"12 Estias Street, Ampelokipi","Abelokipi, Estias Str 12"
2,"12 Estias Street, Ampelokipi",Fuzzy matching in Python
3,"12 Estias Street, Ampelokipi",George P. Pipis
4,"12 Estias Street, Ampelokipi",partial text
5,Georgios Pipis,Predictive Hacks is a Data Science Blog
6,Georgios Pipis,"Abelokipi, Estias Str 12"
7,Georgios Pipis,Fuzzy matching in Python
8,Georgios Pipis,George P. Pipis
9,Georgios Pipis,partial text
