In [1]:
import re
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from tqdm import tqdm
import sparse_dot_topn.sparse_dot_topn as ct
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [3]:
def awesome_cossim_top(A, B, ntop, threshold):
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape

    idx_dtype = np.int32

    nnz_max = M*ntop
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        threshold,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [4]:
def return_match_pair(match,employee_vendor_vector,vendor_vector,init_length):
    non_zero = match.nonzero()
    sparse_rows = non_zero[0]
    sparse_columns = non_zero[1]
    key1 = list()
    key2 = list()
    score = list()
    for i, j in zip(sparse_rows, sparse_columns):
        if i == init_length:
            break
        score_array = match[i,:].toarray()
        key1.append(name_vector[i]['orderId'])
        key2.append(dict_vendor[j]['orderId'])
        score.append(int(score_array[0][j]*100))

    dataframe = pd.DataFrame({
        "Refernce_id_one":key1,
        "Refernce_id_two":key2,
        "similarity_score":score,
    })
    return dataframe

In [5]:
employee = pd.read_csv("employee.csv")
vendor = pd.read_csv("vendor_.csv")

In [6]:
employee.dropna(subset=['name'], inplace=True)
vendor.dropna(subset=['name'], inplace=True)

In [7]:
ename = employee['name'].tolist()
vname = vendor['name'].tolist()

In [8]:
vectorizer = TfidfVectorizer(min_df = 0,token_pattern='(?u)\\b\\w+\\b', analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(ename + vname)
query = vectorizer.transform(vname)

In [9]:
(tf_idf_matrix * query.T).toarray()

array([[1.       , 0.4717245, 0.       , 0.       ],
       [0.4717245, 1.       , 0.       , 0.       ],
       [0.       , 0.       , 1.       , 1.       ],
       [0.       , 0.       , 1.       , 1.       ],
       [1.       , 0.4717245, 0.       , 0.       ],
       [0.4717245, 1.       , 0.       , 0.       ],
       [0.       , 0.       , 1.       , 1.       ],
       [0.       , 0.       , 1.       , 1.       ]])

In [10]:
matches = awesome_cossim_top(tf_idf_matrix, query.transpose(), 10, 50)

In [11]:
matches.toarray()

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [None]:
matches_df = return_match_pair(matches, (ename+vname), dict_vendor, len(ename))

In [None]:
matches.nonzero()