In [2]:
import functools
from collections import Counter

import numpy as np

#### Implementing a simple Levenshtein distance calculation, using a memory dict (memoization) to avoid repeated recursive calls

In [9]:
def call_counter(func):
    """Decorate ``func`` so that the number of times it is invoked is recorded.

    The returned wrapper exposes the running total as its ``calls`` attribute,
    which starts at 0 and is incremented before every delegated call.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns its result unchanged.
    """
    # functools.wraps copies __name__, __doc__, __qualname__ and __module__
    # from ``func`` (and sets __wrapped__), so the decorated function keeps
    # its identity for help(), tracebacks and introspection.
    @functools.wraps(func)
    def helper(*args, **kwargs):
        helper.calls += 1
        return func(*args, **kwargs)

    # Initialise the counter after wrapping so it always starts from zero,
    # even if ``func`` itself already carried a ``calls`` attribute.
    helper.calls = 0
    return helper

# Cache of already-solved sub-problems: maps an (s, t) argument pair to the
# distance computed for it.
memory = {}

@call_counter
def levenshtein(s, t):
    """Return the Levenshtein (edit) distance between the strings ``s`` and ``t``.

    The distance is computed recursively from the three shorter sub-problems
    (drop the last character of ``s``, of ``t``, or of both). Sub-problem
    results are stored in the module-level ``memory`` dict so each one is
    computed at most once; the result of the outermost call itself is not
    stored by this function.

    Args:
        s: First string.
        t: Second string.

    Returns:
        The minimum number of single-character insertions, deletions and
        substitutions needed to turn ``s`` into ``t``.
    """
    # Base cases: against an empty string the distance is the length of the
    # other string.
    if s == "":
        return len(t)
    if t == "":
        return len(s)

    # Substituting the last character is free when both strings end alike.
    substitution_cost = 1 if s[-1] != t[-1] else 0

    drop_from_s = (s[:-1], t)
    drop_from_t = (s, t[:-1])
    drop_from_both = (s[:-1], t[:-1])

    # Solve each missing sub-problem once, in this fixed order.
    for subproblem in (drop_from_s, drop_from_t, drop_from_both):
        if subproblem not in memory:
            memory[subproblem] = levenshtein(*subproblem)

    return min(
        memory[drop_from_s] + 1,
        memory[drop_from_t] + 1,
        memory[drop_from_both] + substitution_cost,
    )

In [10]:
# Compute one example distance, then report how many times the memoised
# function was actually invoked to obtain it.
distance = levenshtein("Python", "Pethno")
print(distance)
call_total = str(levenshtein.calls)
print("The function was called " + call_total + " times!")

3
The function was called 49 times!


#### Simple implementation of DBSCAN-style clustering to form clusters based on Levenshtein distance (Ld)

In [11]:
# Sample vocabulary used for the clustering demonstration.
words = [
    'cow', 'now', 'bow',
    'apple', 'sos', 'combs',
    'andrew', 'syther', 'instan',
]

##### Helper functions

In [13]:
def return_list_ld(list_words,tword,ld):
    """Return the positions of the words in ``list_words`` closer than ``ld`` to ``tword``.

    Args:
        list_words: Sequence of candidate words.
        tword: Target word that every candidate is compared against.
        ld: Exclusive upper bound on the Levenshtein distance; a candidate is
            kept only when its distance to ``tword`` is strictly smaller.

    Returns:
        A list of integer positions into ``list_words``, in ascending order.
    """
    # enumerate() yields each word's own position. Looking the position up
    # with list.index() would report the first occurrence for every repeated
    # word and rescan the list on each match.
    return [
        position
        for position, word in enumerate(list_words)
        if levenshtein(word, tword) < ld
    ]

In [14]:
# One flag per entry of `words`, all initialised to 0.
visited = np.zeros(len(words))

##### Cluster-forming function (to be optimized later on)

In [15]:
def grow(words,ld):
    """Label every word with the position of a nearby word in ``words``.

    Each word is taken in list order, and every word closer to it than ``ld``
    (as reported by ``return_list_ld``) receives the current word's position
    as its label. Later words overwrite labels written by earlier ones, so a
    word's final label is the position of the last word in ``words`` that
    lists it as a neighbour.

    Args:
        words: List of words to cluster.
        ld: Exclusive distance threshold forwarded to ``return_list_ld``.

    Returns:
        A NumPy array of floats with one label per word. Entries that no word
        claims keep their initial value of 0.
    """
    cluster = np.zeros(len(words))
    # The loop is driven by ``words`` alone rather than by any array defined
    # outside the function, so every word is processed whatever the list's
    # length, and each word is labelled with its own position even when the
    # same word occurs more than once.
    for position, word in enumerate(words):
        for neighbour in return_list_ld(words, word, ld):
            cluster[neighbour] = position
    return cluster

In [16]:
# Cluster the sample words with a distance threshold of 3, converting the
# float labels returned by grow() into plain ints so they can index `words`.
clusters = [int(label) for label in grow(words, 3)]

In [17]:
# Group the words by label: each key is the word sitting at the label's
# position, each value is the list of words carrying that label.
cluster_dict = {
    words[ids]: [words[i] for i, x in enumerate(clusters) if x == ids]
    for ids in set(clusters)
}

In [18]:
# Display the mapping from each cluster's label word to its member words.
cluster_dict

{'apple': ['apple'],
 'sos': ['cow', 'now', 'bow', 'sos'],
 'combs': ['combs'],
 'andrew': ['andrew'],
 'syther': ['syther'],
 'instan': ['instan']}

#### Next step: query the clustered words when a new word arrives