# Daltix Data Science Test

In [52]:
import pandas as pd
import numpy as np
import os
#import sparse_dot_topn
import re

### Load file

In [53]:
# Folder that contains hackathon_dataset.json and y_true.csv.
# Set the DALTIX_DATA_DIR environment variable to point somewhere else; it
# defaults to the current working directory so the notebook runs as-is when it
# is started next to the data files.
# (The original line, os.chdir("C:\\), was an unterminated string literal.)
os.chdir(os.environ.get("DALTIX_DATA_DIR", os.getcwd()))

In [54]:
# lines=True: the file is read as one JSON object per line.
df = pd.read_json(path_or_buf='hackathon_dataset.json', lines=True)

### Process and clean the features

In [55]:
# Build a combined "<id>@<name><id>" string per product. It is used both as
# the text that gets tokenized and as a key from which the id is recovered later.

id_text = df["DALTIX_ID"].map(str)
df['aux'] = id_text + '@' + df["NAME"] + id_text

In [64]:
# Function to clean and tokenize the product names

def ngrams(string, n=7):
    """Clean a product key and split it into overlapping character n-grams.

    Cleaning steps, in order:
      1. lower-case the text,
      2. remove all whitespace,
      3. drop everything that precedes the last '@' (the leading id prefix),
      4. remove digits,
      5. remove non-word characters (this also removes the '@' itself).

    Parameters
    ----------
    string : str
        Text to tokenize.
    n : int, default 7
        Length of each n-gram.

    Returns
    -------
    list of str
        Every run of ``n`` consecutive characters of the cleaned text, in
        order. Empty when the cleaned text is shorter than ``n``.
    """
    # Fix: the original wrote `string.lower` (no call, no assignment), so the
    # text was never lower-cased and "Cola" / "cola" produced different n-grams.
    string = string.lower()
    string = re.sub(r'\s+', '', string)
    string = re.sub(r'^(.*)(?=@)', '', string)
    string = re.sub(r'[0-9]', r'', string)
    string = re.sub(r'[\W*]+', r'', string)
    # zip over n shifted copies of the text yields one tuple per window.
    windows = zip(*[string[i:] for i in range(n)])
    return [''.join(window) for window in windows]

In [66]:
# TF-IDF: method to generate features from text

from sklearn.feature_extraction.text import TfidfVectorizer

# The custom analyzer replaces the vectorizer's own tokenization: each product
# key is turned into character n-grams by `ngrams`.
product_names = df['aux']
vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
tf_idf_matrix = vectorizer.fit_transform(raw_documents=product_names)

### Functions and Measures

In [None]:
# To calculate the similarity between two vectors of TF-IDF, we will use Cosine Similarity

import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Sparse product of ``A`` and ``B`` computed through ``ct.sparse_dot_topn``.

    Output buffers sized for at most ``ntop`` entries per row of ``A`` are
    allocated here and handed to ``ct.sparse_dot_topn`` to be filled in place.
    NOTE(review): the helper presumably keeps, for every row, the ``ntop``
    largest products above ``lower_bound`` - confirm against the
    sparse_dot_topn documentation.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Left and right operands; both are converted to CSR.
    ntop : int
        Number of result slots reserved per row of ``A``.
    lower_bound : float, default 0
        Threshold forwarded to ``ct.sparse_dot_topn``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])`` built from the filled buffers.
    """
    # tocsr() costs nothing when the operand is already CSR.
    A = A.tocsr()
    B = B.tocsr()
    n_rows = A.shape[0]
    n_cols = B.shape[1]

    idx_dtype = np.int32

    # Output buffers: room for ntop entries per row.
    capacity = n_rows * ntop
    out_indptr = np.zeros(n_rows + 1, dtype=idx_dtype)
    out_indices = np.zeros(capacity, dtype=idx_dtype)
    out_data = np.zeros(capacity, dtype=A.dtype)

    # Index arrays are passed as int32, the dtype of the output index buffers.
    a_indptr = np.asarray(A.indptr, dtype=idx_dtype)
    a_indices = np.asarray(A.indices, dtype=idx_dtype)
    b_indptr = np.asarray(B.indptr, dtype=idx_dtype)
    b_indices = np.asarray(B.indices, dtype=idx_dtype)

    ct.sparse_dot_topn(
        n_rows, n_cols,
        a_indptr, a_indices, A.data,
        b_indptr, b_indices, B.data,
        ntop,
        lower_bound,
        out_indptr, out_indices, out_data)

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

In [None]:
# Function to get matches

def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn a sparse similarity matrix into a table of matched name pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Entry (i, j) is the similarity between item i and item j.
    name_vector : sequence of str (list, array or pandas Series)
        Names of the items. Row/column number k of ``sparse_matrix`` is looked
        up as the k-th element by position, regardless of any pandas index.
    top : int or None, default 100
        Maximum number of pairs to return, taken in the matrix's storage
        order. A falsy value (None or 0) returns every non-zero entry.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (column name
        spelling kept, since callers filter on it).
    """
    # Take coordinates and values from the same COO view so they stay aligned.
    # The original paired nonzero() coordinates (which skip stored zeros) with
    # raw .data[index], which shifts the scores when explicit zeros are stored.
    coo = sparse_matrix.tocoo()
    keep = coo.data != 0
    rows = coo.row[keep]
    cols = coo.col[keep]
    scores = coo.data[keep]

    # Never ask for more pairs than exist; the original raised IndexError
    # when `top` was larger than the number of non-zero entries.
    nr_matches = rows.size
    if top:
        nr_matches = min(top, nr_matches)

    # Positional lookup: matrix rows are positions in the fitted corpus.
    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({'left_side': names[rows[:nr_matches]],
                         'right_side': names[cols[:nr_matches]],
                         'similairity': scores[:nr_matches].astype(np.float64)})

In [None]:
# Calculate the best matches

# The TF-IDF matrix is multiplied by its own transpose, so entry (i, j) scores
# product i against product j. ntop=2 reserves two result slots per product,
# and 0.3 is the lower bound forwarded to sparse_dot_topn.
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.T, ntop=2, lower_bound=0.3)

In [None]:
# look at our matches

# NOTE(review): 101813 is hard-coded; presumably it is the number of stored
# matches for this dataset - confirm whenever the data or thresholds change.
matches_df = (
    get_matches_df(matches, product_names, top=101813)
    .loc[lambda pairs: pairs['similairity'] < 0.99999]  # Remove all exact matches
)
matches_df

### Output

In [None]:
# Creating a new dataset with the values

# Each side of a match is a "<id>@<name><id>" key; the Daltix id is everything
# before the first '@'.
# This replaces the original regex r'(@[a-zA-z0-9\W]+)', which stopped at
# non-ASCII word characters (e.g. accented letters) and so left part of the name
# glued to the id, and it drops the whitespace-stripping assignments whose
# results were overwritten on the very next line.
submission_set = pd.DataFrame({
    'daltix_id_1': matches_df['left_side'].str.split('@', n=1).str[0],
    'daltix_id_2': matches_df['right_side'].str.split('@', n=1).str[0],
})

### Validation

In [None]:
# Evaluation dataset provided by the company

# Load the ground-truth pairs and rename the columns '0' and '1' to the
# submission's column names so the two frames can be merged on them.
validation_set = (
    pd.read_csv('y_true.csv')
    .rename(columns={'0': "daltix_id_1", '1': "daltix_id_2"})
)

In [None]:
# F1 score

len_validation = validation_set.shape[0]
len_submission = submission_set.shape[0]

# True positives: submitted pairs that also appear in the validation set
# (inner merge on the shared columns).
# NOTE(review): a pair submitted as (a, b) does not match a validation row
# stored as (b, a) - confirm the pair ordering used in y_true.csv.
tp = pd.merge(submission_set, validation_set, how='inner').shape[0]

# False positives are submitted pairs that are not true matches.
# (The original used len_validation - len_submission, which is not a count of
# wrong predictions and can even be negative.)
fp = len_submission - tp

# Guard every ratio so an empty set or zero overlap reports 0 instead of
# raising ZeroDivisionError.
recall = tp / len_validation if len_validation else 0.0
precision = tp / len_submission if len_submission else 0.0
# Share of submitted pairs that are wrong (variable name kept from the original).
fpr = fp / len_submission if len_submission else 0.0

# Harmonic mean of precision and recall; defined as 0 when there is no overlap.
f1 = 2 * precision * recall / (precision + recall) if tp else 0.0

print("f1 score: {0:.3f}%".format(f1 * 100))