### Most of the code is self-explanatory

In [1]:
import re

import distance
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import tensorflow as tf
from fuzzywuzzy import fuzz
from tensorflow.keras.layers import Input, LSTM, Embedding, Dot, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Load the spaCy pipeline once at start-up; replace_persons_cities() calls it to find entities.
nlp=spacy.load('en_core_web_lg')

  import pandas.util.testing as tm


In [2]:
# !python -m spacy download en_core_web_lg

In [3]:
# !pip install Distance
# !pip install fuzzywuzzy
# !pip install python-Levenshtein

In [4]:
import pickle

# Restore the object stored in 'tokenizer.pickle'; it is used later via t.texts_to_sequences().
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load a trusted pickle.
with open('tokenizer.pickle', 'rb') as handle:
    t = pickle.load(handle)

In [5]:
# Open the workbook once and read both sheets through the same handle.
with pd.ExcelFile('entity_links.xlsx') as workbook:
    data_eval = pd.read_excel(workbook, sheet_name='eval')
    data_items = pd.read_excel(workbook, sheet_name='canonical_line_item_table')

In [6]:
# Preview the first rows of the 'eval' sheet as loaded.
data_eval.head()

Unnamed: 0,line_item_name,line_item_description,canonical_vendor_name,canonical_line_item_name
0,Management Services,May 2019 Services,10 Minute Ventures,
1,Acrobat Pro DC,,Adobe,
2,AIEX 96 Pieces Adhesive Poster Tacky Putty Sti...,,Amazon Business,
3,AmazonBasics AAA 1.5 Volt Performance Alkaline...,,Amazon Business,
4,AmazonBasics Mesh Trash Can Waste Basket,1,Amazon Business,


In [7]:
def get_token_features(q1, q2):
    """Return ten whitespace-token overlap features for a pair of strings.

    Both inputs are split on whitespace. If either side has no tokens, the
    all-zero vector is returned. Otherwise the positions hold:

        0, 1  shared distinct tokens / (min, max) of the distinct-token counts
        2, 3  the same values as 0 and 1 -- no stop-word list is applied in
              this function, so the "word" and "stop-word" ratios coincide
        4, 5  shared distinct tokens / (min, max) of the raw token counts
        6     1 if the last tokens are equal, else 0
        7     1 if the first tokens are equal, else 0
        8     absolute difference of the raw token counts
        9     mean of the two raw token counts

    Every ratio denominator is padded with 0.0001.
    """
    SAFE_DIV = 0.0001
    features = [0.0] * 10

    left = q1.split()
    right = q2.split()
    if not left or not right:
        return features

    left_vocab, right_vocab = set(left), set(right)
    overlap = len(left_vocab & right_vocab)

    vocab_sizes = (len(left_vocab), len(right_vocab))
    token_counts = (len(left), len(right))

    # Ratios over distinct tokens.
    features[0] = overlap / (min(vocab_sizes) + SAFE_DIV)
    features[1] = overlap / (max(vocab_sizes) + SAFE_DIV)
    # The "stop-word" ratios are computed on the very same sets.
    features[2] = features[0]
    features[3] = features[1]
    # Ratios over raw (possibly repeated) tokens.
    features[4] = overlap / (min(token_counts) + SAFE_DIV)
    features[5] = overlap / (max(token_counts) + SAFE_DIV)

    features[6] = int(left[-1] == right[-1])
    features[7] = int(left[0] == right[0])
    features[8] = abs(token_counts[0] - token_counts[1])
    features[9] = sum(token_counts) / 2
    return features

# get the Longest Common sub string

def get_longest_substr_ratio(a, b):
    """Length of a longest common substring of `a` and `b`, relative to the shorter input.

    The denominator is min(len(a), len(b)) + 1. Returns 0 when
    distance.lcsubstrings() yields no common substring.
    """
    candidates = list(distance.lcsubstrings(a, b))
    if not candidates:
        return 0
    shorter = min(len(a), len(b))
    return len(candidates[0]) / (shorter + 1)

def extract_features(df, f1, f2):
    """Append pairwise text-similarity columns to `df` for the string columns `f1` and `f2`.

    `df` is modified in place and the same object is returned. Columns are
    added in this order: the ten values from get_token_features(), four
    fuzzywuzzy scores, longest_substr_ratio, then word_Common, word_Total and
    word_share.
    """
    per_row = df.apply(lambda row: get_token_features(row[f1], row[f2]), axis=1)

    token_columns = (
        "cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min", "ctc_max",
        "last_word_eq", "first_word_eq", "abs_len_diff", "mean_len",
    )
    for position, column in enumerate(token_columns):
        df[column] = [features[position] for features in per_row]

    # Fuzzy string scores; background reading:
    # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    # https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings
    # https://github.com/seatgeek/fuzzywuzzy
    print("fuzzy features..")

    scorers = (
        ("token_set_ratio", fuzz.token_set_ratio),
        ("token_sort_ratio", fuzz.token_sort_ratio),
        ("fuzz_ratio", fuzz.QRatio),
        ("fuzz_partial_ratio", fuzz.partial_ratio),
        ("longest_substr_ratio", get_longest_substr_ratio),
    )
    for column, scorer in scorers:
        df[column] = df.apply(lambda row, fn=scorer: fn(row[f1], row[f2]), axis=1)

    def _lowered_words(row):
        # Split on single spaces (so an empty piece counts as a word), then lower-case and trim.
        first = {word.lower().strip() for word in row[f1].split(" ")}
        second = {word.lower().strip() for word in row[f2].split(" ")}
        return first, second

    def _shared_count(row):
        first, second = _lowered_words(row)
        return 1.0 * len(first & second)

    def _total_count(row):
        first, second = _lowered_words(row)
        return 1.0 * (len(first) + len(second))

    def _shared_fraction(row):
        # Shared words relative to the number of distinct words in column f2.
        first, second = _lowered_words(row)
        return 1.0 * len(first & second) / len(second)

    df['word_Common'] = df.apply(_shared_count, axis=1)
    df['word_Total'] = df.apply(_total_count, axis=1)
    df['word_share'] = df.apply(_shared_fraction, axis=1)

    return df

In [8]:
def remove_more_than_6(text):
    """Drop digit-bearing tokens that match either rule below; join the rest with single spaces.

    A whitespace-separated token is removed when either
      * it contains a digit and is longer than 6 characters, or
      * it contains both a digit run followed by a non-digit and a non-digit
        run followed by a digit (for example "a1b").
    """
    kept = []
    for word in text.split():
        # Raw-string patterns: '\d' in a normal literal is an invalid escape sequence.
        if re.search(r'\d', word) and len(word) > 6:
            continue
        if re.search(r'(\d+)(\D+)', word) and re.search(r'(\D+)(\d+)', word):
            continue
        kept.append(word)
    return " ".join(kept)

In [9]:
def replace_persons_cities(txt):
    """Replace every PERSON and GPE entity found by `nlp` with its label padded by spaces.

    Each matching entity span is swapped for " PERSON " or " GPE "; all other
    text is kept as is. The entity's own character offsets are used, so the
    replacement always lands on the recognised span rather than on the first
    textual occurrence of the same string.
    """
    doc = nlp(txt)
    pieces = []
    cursor = 0
    # doc.ents is iterated in document order; offsets refer to the original txt.
    for ent in doc.ents:
        if ent.label_ in ("PERSON", "GPE"):
            pieces.append(txt[cursor:ent.start_char])
            pieces.append(" " + ent.label_ + " ")
            cursor = ent.end_char
    pieces.append(txt[cursor:])
    return "".join(pieces)

In [10]:
import re
def preprocessing(text, replace=True):
    """Normalise a raw text field into a lower-case, digit-masked token string.

    The rules run in a fixed order (later rules rely on what earlier ones
    leave behind):

      1. Spell out or drop symbols: "@" -> at, "%" -> percent, "&" -> and,
         "->" -> to, "+" -> sum, "=" -> equals_to, "x" between digits or
         " x " -> times, a "/" between a digit and a non-digit -> per;
         other punctuation becomes a space or is removed.
      2. Three-character tokens containing "&" or "." (for example "R&D",
         "1.5") become first-char + "_and_" + third-char.
      3. Non-ASCII characters are removed.
      4. Slash dates (d/d/yy(yy), dd/dd/yy(yy)) become "date_string"; dotted
         quads followed by "/nn" become "ip_address".
      5. If `replace` is true, replace_persons_cities() is applied.
      6. Whitespace is collapsed, the text is lower-cased, one trailing "."
         is dropped, every digit becomes "#", and one-character tokens are
         discarded.

    Args:
        text: the string to normalise.
        replace: whether to run replace_persons_cities() on the text (step 5).

    Returns:
        The normalised string; may be empty.
    """
    def _join_three_char_tokens(value, symbol):
        # Rewrites e.g. "R&D" -> "R_and_D"; re-joining also collapses whitespace runs.
        return " ".join(
            tok[0] + "_and_" + tok[2] if (symbol in tok and len(tok) == 3) else tok
            for tok in value.split()
        )

    # All patterns with backslashes are raw strings: "\$" or "\d" in a plain
    # literal is an invalid escape sequence.
    text = re.sub("@", " at ", text)
    text = re.sub("%", " percent ", text)
    text = _join_three_char_tokens(text, "&")
    text = re.sub("&", " and ", text)
    text = re.sub(r"\$", "", text)
    text = re.sub("->", " to ", text)
    text = re.sub("-", " ", text)
    text = re.sub(",", " ", text)
    text = re.sub(r"(?<=\D)/(?=\D)", " ", text)
    text = re.sub(r"(?<=\d)/(?=\D)", " per ", text)
    text = re.sub('"', "", text)
    text = _join_three_char_tokens(text, ".")
    text = re.sub(r"(?<=\D)\.(?=\D)", " ", text)
    text = re.sub(r"(?<=\d)\.(?=\D)", " ", text)
    text = re.sub(r"\+", " sum ", text)
    text = re.sub(":", " ", text)
    text = re.sub(";", " ", text)
    text = re.sub(r"\|", " ", text)
    text = re.sub(r"\*", "", text)
    text = re.sub("#", "", text)
    text = re.sub(" x ", " times ", text)
    text = re.sub(r"(?<=\d)x(?=\d)", " times ", text)
    text = re.sub("=", " equals_to ", text)

    # Quotes and brackets are removed outright.
    for symbol in "'[]{}()":
        text = text.replace(symbol, "")

    # Keep ASCII characters only.
    text = "".join(ch for ch in text if ord(ch) < 128)
    text = re.sub(r"\?", "", text)
    text = re.sub(r"\!", "", text)

    # Dates must be handled before the remaining "/" characters are blanked.
    text = re.sub(r"\d{1}/\d{1}/\d{4}", " date_string ", text)
    text = re.sub(r"\d{1}/\d{1}/\d{2}", " date_string ", text)
    text = re.sub(r"\d{2}/\d{2}/\d{4}", " date_string ", text)
    text = re.sub(r"\d{2}/\d{2}/\d{2}", " date_string ", text)
    text = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/\d{1,2}", " ip_address ", text)
    text = re.sub("/", " ", text)

    # Entity replacement runs before lower-casing, on the original casing.
    if replace:
        text = replace_persons_cities(text)

    text = re.sub(" +", " ", text)
    text = text.lower()
    text = text.strip()
    if text.endswith("."):
        text = text[:-1]
    text = re.sub(r"\d", "#", text)
    return " ".join(tok for tok in text.split() if len(tok) > 1)

In [11]:
def concat(x):
    """Combine a (name, description) pair into one text field.

    `x` is any two-item sequence or a two-element pandas row; its first two
    values are read by position, not by label.

    * Description with fewer than 11 whitespace tokens: returns
      "<name> <description>", each part stripped. The joining space is always
      present, even when one part is empty.
    * Longer description: returns the stripped description if the name is
      empty, otherwise the stripped name alone.
    """
    # list() yields the values in order for lists, tuples and pandas Series alike,
    # avoiding integer-key lookup on a label-indexed Series.
    values = list(x)
    name, description = values[0], values[1]

    if len(description.split()) < 11:
        return name.strip() + " " + description.strip()
    if len(name) == 0:
        return description.strip()
    return name.strip()

In [12]:
from sklearn.metrics import roc_auc_score
def auroc(y_true, y_pred):
    """ROC-AUC metric built on sklearn's roc_auc_score.

    `y_true` is flattened to one dimension. If it holds a single distinct
    value, the shape is printed and 0.5 is returned; otherwise roc_auc_score
    is invoked through tf.py_function and its result returned as tf.double.
    """
    flat_true = tf.reshape(y_true, shape=(-1,))
    distinct = tf.unique(flat_true)[0]
    if distinct.shape[0] != 1:
        return tf.py_function(roc_auc_score, (flat_true, y_pred), tf.double)
    print(distinct.shape, "hello")
    return 0.5

In [13]:
# Preprocess every column of the eval sheet. Only the two canonical_* columns
# go through entity replacement (replace=True).
# NOTE(review): the catalogue columns below are all processed with replace=False,
# including canonical_vendor_name, which is later used as a merge key -- confirm
# that the two sides are meant to be normalised differently.
datap_eval = pd.DataFrame()
for column in data_eval.columns:
    # The raw frame is cast to str in place before preprocessing.
    data_eval[column] = data_eval[column].astype('str')
    use_ner = column in ['canonical_vendor_name', 'canonical_line_item_name']
    datap_eval[column] = data_eval[column].apply(preprocessing, replace=use_ner)

# Cells whose whole value is 'nan' become empty strings.
datap_eval = datap_eval.replace('nan', "")

datap_items = pd.DataFrame()
for column in data_items.columns:
    datap_items[column] = data_items[column].apply(preprocessing, replace=False)

In [14]:
print(datap_eval.shape)
# Merge name and description into one text column, row by row, then drop the source columns.
datap_eval['line_item_nd'] = datap_eval[['line_item_name', 'line_item_description']].apply(concat, axis=1)
datap_eval = datap_eval.drop(columns=['line_item_name', 'line_item_description', 'canonical_line_item_name'])
datap_eval.head()

(337, 4)


Unnamed: 0,canonical_vendor_name,line_item_nd
0,## minute ventures,management services may #### services
1,adobe,acrobat pro dc
2,amazon business,aiex ## pieces adhesive poster tacky putty sti...
3,amazon business,amazonbasics aaa #_and_# volt performance alka...
4,amazon business,amazonbasics mesh trash can waste basket


In [15]:
# Pair each eval line item with every catalogue row that has the same vendor name.
validation = datap_eval.merge(datap_items, on='canonical_vendor_name')
print("data points in validation", validation.shape)
validation.head()

data points in validation (6228, 3)


Unnamed: 0,canonical_vendor_name,line_item_nd,canonical_line_item_name
0,## minute ventures,management services may #### services,management services
1,adobe,acrobat pro dc,acrobat pro dc
2,adobe,acrobat pro dc,creative cloud all apps
3,amazon business,aiex ## pieces adhesive poster tacky putty sti...,anti theft adjustable tablet security stand
4,amazon business,aiex ## pieces adhesive poster tacky putty sti...,apple ipad


In [16]:
# extract_features() adds the feature columns to `validation` in place and returns
# the same object, so `with_featres` and `validation` refer to one DataFrame.
with_featres = extract_features(validation, "line_item_nd", "canonical_line_item_name")

fuzzy features..


In [17]:
# Numeric feature matrix: everything except the three text columns.
# The prediction columns are dropped too when present, so re-running this cell
# after the scoring cell still yields the same feature columns.
x3 = (
    with_featres
    .drop(['canonical_vendor_name', 'line_item_nd', 'canonical_line_item_name'], axis=1)
    .drop(columns=['predicted', 'predictions'], errors='ignore')
    .values
)
with_featres.head()

Unnamed: 0,canonical_vendor_name,line_item_nd,canonical_line_item_name,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio,word_Common,word_Total,word_share
0,## minute ventures,management services may #### services,management services,0.99995,0.499988,0.99995,0.499988,0.99995,0.399992,1,1,3,3.5,100,75,68,100,0.95,2.0,6.0,1.0
1,adobe,acrobat pro dc,acrobat pro dc,0.999967,0.999967,0.999967,0.999967,0.999967,0.999967,1,1,0,3.0,100,100,100,100,0.933333,3.0,7.0,1.0
2,adobe,acrobat pro dc,creative cloud all apps,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,3.5,32,32,38,53,0.125,0.0,8.0,0.0
3,amazon business,aiex ## pieces adhesive poster tacky putty sti...,anti theft adjustable tablet security stand,0.0,0.0,0.0,0.0,0.0,0.0,0,0,18,15.0,34,33,33,51,0.113636,0.0,28.0,0.0
4,amazon business,aiex ## pieces adhesive poster tacky putty sti...,apple ipad,0.0,0.0,0.0,0.0,0.0,0.0,0,0,22,13.0,11,10,12,60,0.272727,0.0,24.0,0.0


In [18]:
# Encode both text columns with the loaded tokenizer and pad each sequence at the end to length 25.
x1, x2 = (
    pad_sequences(t.texts_to_sequences(with_featres[column]), maxlen=25, padding='post')
    for column in ('line_item_nd', 'canonical_line_item_name')
)

In [19]:
# Sanity check: the three model inputs must have the same number of rows.
x1.shape, x2.shape, x3.shape

((6228, 25), (6228, 25), (6228, 18))

In [20]:
# Load the saved Keras model; the custom `auroc` function is passed in by name
# (presumably the metric the model was compiled with -- confirm against the training notebook).
model = tf.keras.models.load_model('auc_0_9713.h5', custom_objects={'auroc': auroc})

In [24]:
# Score every (line item, canonical item) pair, then binarise the score at a fixed cut-off.
DECISION_THRESHOLD = 0.015
match_scores = model.predict([x1, x2, x3])
# ravel() hands pandas a flat vector rather than an (n, 1) array.
validation['predicted'] = match_scores.ravel()
# astype() converts the boolean mask to 0/1 values; assigning to .dtype would only
# reinterpret the existing buffer and relies on bool and int8 having the same item size.
predictions = (match_scores > DECISION_THRESHOLD).astype(np.int8)
validation['predictions'] = predictions.ravel()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [25]:
# Keep only the columns needed for inspection. .copy() makes the selection an
# independent DataFrame, so later column assignments on `validation` do not
# trigger SettingWithCopyWarning.
validation = validation[['canonical_vendor_name','line_item_nd','canonical_line_item_name','predicted', 'predictions']].copy()
validation.head(10)

Unnamed: 0,canonical_vendor_name,line_item_nd,canonical_line_item_name,predicted,predictions
0,## minute ventures,management services may #### services,management services,0.9999009,1
1,adobe,acrobat pro dc,acrobat pro dc,0.07122597,1
2,adobe,acrobat pro dc,creative cloud all apps,0.5821036,1
3,amazon business,aiex ## pieces adhesive poster tacky putty sti...,anti theft adjustable tablet security stand,1.054113e-08,0
4,amazon business,aiex ## pieces adhesive poster tacky putty sti...,apple ipad,0.008977413,0
5,amazon business,aiex ## pieces adhesive poster tacky putty sti...,apple ipad with retina display,3.668864e-09,0
6,amazon business,aiex ## pieces adhesive poster tacky putty sti...,apple ipad with retina display md###ll ##gb wi...,5.042089e-08,0
7,amazon business,aiex ## pieces adhesive poster tacky putty sti...,oval plastic storage tubs with handle,3.384124e-06,0
8,amazon business,aiex ## pieces adhesive poster tacky putty sti...,sunland mesh dish cloths for washing dishes no...,1.718738e-07,0
9,amazon business,aiex ## pieces adhesive poster tacky putty sti...,usb charger nekteck ##w type wall charger usb ...,7.73398e-08,0


In [28]:
# First ten pairs flagged as matches (predictions == 1).
validation[validation['predictions']==1].head(10)
# Some of these are clearly false positives: low-scoring pairs of different items still clear the cut-off.

Unnamed: 0,canonical_vendor_name,line_item_nd,canonical_line_item_name,predicted,predictions
0,## minute ventures,management services may #### services,management services,0.999901,1
1,adobe,acrobat pro dc,acrobat pro dc,0.071226,1
2,adobe,acrobat pro dc,creative cloud all apps,0.582104,1
17,amazon business,aiex ## pieces adhesive poster tacky putty sti...,aiex ## pieces adhesive poster tacky putty sti...,0.850372,1
159,amazon business,amazonbasics aaa #_and_# volt performance alka...,amazonbasics aaa #_and_# volt performance alka...,0.144146,1
232,amazon business,amazonbasics mesh trash can waste basket,sunland mesh dish cloths for washing dishes no...,0.032568,1
273,amazon business,amazonbasics mesh trash can waste basket,amazonbasics mesh trash can waste basket,0.989626,1
274,amazon business,amazonbasics mesh trash can waste basket,amazonbasics multipurpose copy printer paper w...,0.042189,1
316,amazon business,amazonbasics mesh trash can waste basket,philonext halloween decorations spider webs sp...,0.042278,1
354,amazon business,amazonfresh mediterranean extra virgin olive o...,amazonfresh mediterranean extra virgin olive o...,0.99924,1
