In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from Levenshtein import distance
from nltk.corpus import wordnet
stmr = PorterStemmer() 
lmtzr = WordNetLemmatizer()

In [2]:
skip_punc = [",",".","'"]

In [3]:
# LEXICAL SIMILARTY HELPER FUNCTIONS
def pre_lex(sent):
    '''
        Preprocessing a sentence 
        - word tokeinize
        - stemming
        - lemmatizing
        - remove stop words
    '''
    def pre_lex_aux(word):
        return 'not' if word == "n't" else word
    stp_words = stopwords.words('english') + skip_punc
    tokens = [lmtzr.lemmatize(stmr.stem(pre_lex_aux(_word))) for _word in word_tokenize(sent) 
              if _word not in stp_words]
    return(tokens)


In [4]:
def word_similarity(w1,w2):
    '''
       similarity between two words
       - LevenshteinDistance between two words w1 and w2 is number of swaps(insertion,deletion) 
            done to transform one word to another
       - LevSimilarity = 1.0 − (LevenshteinDistance(word 1 , word 2 )/maxLength(word 1 , word 2 ))
       - Word net Distance(Path measure) is the distance between the vectors of two words
       - Here
           if Path measure(word 1 , word 2 ) < 0.1 then
            similarity = LevSimilarity(word 1 , word 2 )
            else
            similarity = Path measure(word 1 , word 2 )
            end if
    '''
    
    w1_syns = wordnet.synsets(w1)
    w2_syns = wordnet.synsets(w2)
    
    # wnet_distance is taken as 0 for initial purpose 
    # considering the distance between words to be zero
    wnet_distance = 0
    
    # distance is the LevenshteinDistance
    _distance = distance(w1,w2)
    rev_sim = 1-(_distance/max(len(w1),len(w2)))
    
    if w1_syns and w2_syns:
        # If at all the the word is found in word net we 
        # considering the first matching
        wnet_distance = w1_syns[0].path_similarity(w2_syns[0]) or 0
    if wnet_distance < 0.1:
        return rev_sim
    else:
        return wnet_distance
    

In [5]:
def get_similarity_matrix(str1,str2):
    """
        Generate a similarity matrix for two strings
        based on similarty mesurement for each word of 
        one matrix to another and vice-versa
    """
    
    sim_matrix = []
    str_1 = pre_lex(str1)
    str_2 = pre_lex(str2)
    
    for _word1 in str_1:
        sim = []
        for _word2 in str_2:
            sim.append(word_similarity(_word1,_word2))
        sim_matrix.append(sim)
        
    return np.matrix(sim_matrix)

In [6]:
def get_max(df):
    """
        delets the row and column where
        maximun value is found
    """
    ctr = 0 
    max_all= df.values.max()
    for _idx,_max in df.max().items():
        ctr +=1
        if _max == max_all:
            for _idx2,_val in df[_idx].items():
                if _val == max_all:
                    df = df.drop(_idx2)
                    break
            df = df.drop(_idx,axis=1)
            return df,max_all

In [7]:
def lex_similarity(str1,str2):
    """
        Lexical similarity of two strings
        -   matrix = newmatrix(size(A)xsize(B))
            total similarity = 0
            iteration = 0
            for bu i ∈ A do
                for bu  j ∈ B do
                    matrix(i, j) = similarity(ti,tj)
                end for
            end for
            for has line(matrix) and has column(matrix) do
                total similarity = total similarity + larger similarity(matrix)
                remove line(matrix, larger similarity(matrix))
                remove column(matrix, larger similarity(matrix))
                iteration++
            end for
            partial similarity = total similarity/iteration
            return partial similarity
        - From the similarty matrix it find the maximum 
        - deletes the respective rows and column
        - 
    """
    sim_matrix = get_similarity_matrix(str1,str2)
    str_1 = pre_lex(str1)
    str_2 = pre_lex(str2)
    df = pd.DataFrame(data=sim_matrix[0:,0:])
    sum_total = 0
    iter = 0
    for i in range(max(df.shape)):
        df,max_val = get_max(df)
        sum_total += max_val
        iter +=1
        if df.shape[0] == 0 or df.shape[1] == 0:
            break
    partial_similarity = sum_total/iter
    
    #size difference penalization coeff
    sdpc = (abs(len(str_1)-len(str_2)))*partial_similarity/max(len(str_1),len(str_2))
    return {
            "partial_similarity":partial_similarity,
            "sdpc":sdpc,
            "similarity":partial_similarity-sdpc
    }

In [8]:
str1 = "Jane was unhappy because she had lost her job."
str2 = "As Jane had lost her job she was unhappy."
str3 = "James decided to quit smoking but it was not an easy decision."
str4 = "Though it was not an easy decision, James decided to quit smoking."
str5 = "James decided to quit smoking. However, it was not an easy decision."
str6 = "In spite of it not being an easy decision, James decided to quit smoking."
str7 = "I love Jupyter Notebook";
str8 = "Jupyter Notebook is awesome"
lex_similarity(str1,str2)

{'partial_similarity': 1.0, 'sdpc': 0.2, 'similarity': 0.8}