In [1]:
import csv
import re
import numpy as np
import pandas as pd 
import math
import fasttext
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

In [2]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and pandas warnings;
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings("ignore")

In [3]:
# Load Facebook FastText Pre-trained Model
# `model` is a module-level global; vectorize() reads it directly rather than
# receiving it as an argument, so this cell must run before any vectorization.
# NOTE(review): 'wiki.en.bin' is resolved relative to the working directory and is
# presumably the English Wikipedia FastText vectors - confirm source and dimension.
model = fasttext.load_model('wiki.en.bin')




## Function Definitions
#### Preprocessing Functions

In [4]:
def read_file(filename):
    """Read a comma-separated file into a DataFrame of strings.

    The first row supplies the column names and every remaining row becomes a
    record. No type inference is performed, so every cell stays a ``str``.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        One column per header field. A file with no rows at all yields an
        empty DataFrame instead of raising ``IndexError``.
    """
    # newline='' hands line-ending handling to the csv module, which its
    # documentation requires so that line breaks inside quoted fields survive.
    with open(filename, 'r', newline='') as handle:
        rows = list(csv.reader(handle, delimiter=','))
    # An empty file has no header row to split off.
    if not rows:
        return pd.DataFrame()
    header, records = rows[0], rows[1:]
    return pd.DataFrame(records, columns=header)

In [5]:
def tokenize(data,clean):
    """Clean and tokenize the 'name' and 'description' columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain string columns 'name' and 'description'.
    clean : str
        'Raw'    - tokenize the text unchanged;
        'Nspace' - delete every character that is neither a word character
                   nor whitespace (nothing is inserted in its place);
        'Wspace' - replace each such character with a single space.
        Any other value is treated like 'Raw'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose 'name' and 'description' cells are lists of
        tokens. ``data`` itself is left untouched.
    """
    if clean == 'Nspace':
        replacement = ''
    elif clean == 'Wspace':
        replacement = ' '
    else:
        # 'Raw' and unrecognised options: leave the text as it is.
        replacement = None

    def to_tokens(text):
        if replacement is not None:
            text = re.sub(r'[^\w\s]', replacement, text)
        # Drop every empty-string token (the previous code removed only the first).
        return [token for token in word_tokenize(text) if token != '']

    df = data.copy()
    # Assign whole columns at once. Element-wise chained assignment
    # (df['name'][i] = ...) may write to a temporary and is a silent no-op
    # under pandas copy-on-write.
    for column in ('name', 'description'):
        df[column] = pd.Series(
            [to_tokens(text) for text in data[column]],
            index=df.index,
            dtype=object,
        )
    return df

In [6]:
def vectorize(df):
    """Replace each tokenized name with the mean of its word vectors.

    Reads the module-level FastText ``model``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'name' column whose cells are lists of tokens.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every 'name' cell is a 1-D vector: the
        element-wise average of ``model.get_word_vector`` over the tokens.
        A name with no tokens gets an all-zero vector of the model's
        dimension, so downstream concatenation still receives a 1-D array.
    """
    sentence_vectors = []
    for tokens in df['name']:
        if len(tokens) == 0:
            # Averaging an empty list yields a scalar NaN, which cannot be
            # concatenated with another vector later on.
            sentence_vectors.append(np.zeros(model.get_dimension(), dtype=np.float32))
            continue
        word_vectors = [model.get_word_vector(token) for token in tokens]
        sentence_vectors.append(np.average(word_vectors, axis=0))
    df_vectorize = df.copy()
    df_vectorize['name'] = sentence_vectors
    return df_vectorize

In [7]:
def vec_concat(df1,df2):
    """Join the 'name' vectors of two frames row by row.

    For every row label ``0 .. len(df1)-1`` the vector in ``df1.name`` is
    followed by the vector in ``df2.name``.

    Returns a copy of ``df1`` whose 'name' column holds the joined vectors;
    neither input is modified.
    """
    merged = df1.copy()
    merged['name'] = [
        np.concatenate((df1.name[row], df2.name[row]), axis=0)
        for row in range(len(df1))
    ]
    return merged

In [8]:
def _parse_price(raw):
    """Convert a raw price string to float; a blank price becomes 0.0."""
    cleaned = raw.replace('$','').replace(',','').replace('gbp','').strip()
    # A value that is empty once symbols and spaces are removed counts as missing.
    return float(cleaned) if cleaned else float(0)


def file_to_vector(file,clean):
    """Load a product file and turn it into a frame of name vectors.

    Steps: read the CSV, copy 'title' into 'name' when a 'title' column
    exists, tokenize the name with the given ``clean`` option, normalise the
    'manufacturer' and 'price' columns, then vectorize the name.

    Parameters
    ----------
    file : str
        Path of the CSV file; it needs 'description' and 'price' columns and
        either 'name' or 'title'.
    clean : str
        Cleaning option forwarded to ``tokenize``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``vectorize``: 'name' holds one vector per
        row, 'manufacturer' is lower-cased text and 'price' is a float
        (0.0 when the source value was blank).
    """
    df = read_file(file)
    if 'title' in df.columns:
        df['name'] = df['title'].copy()
    # Only the tokenized name is kept; 'description' stays as raw text.
    df['name'] = tokenize(df, clean)['name']
    if 'manufacturer' not in df.columns:
        # No manufacturer column: use the first name token as a stand-in.
        # A name without tokens yields '' instead of raising IndexError.
        df['manufacturer'] = [tokens[0].lower() if tokens else '' for tokens in df['name']]
    else:
        df['manufacturer'] = [maker.lower() for maker in df['manufacturer']]
    df['price'] = [_parse_price(raw) for raw in df['price']]
    return vectorize(df)

#### TF-IDF

In [9]:
def create_corpus(df,df2,clean):
    """Build the text corpus used to fit the TF-IDF model.

    Side effects on BOTH input frames: a 'name' column is written from
    'title' when 'title' exists, and a 'combine' column holding
    ``name + ' ' + description`` is added.

    ``clean`` selects the treatment of each combined string:
    'Raw' keeps it as is, 'Nspace' deletes characters that are neither word
    characters nor whitespace, 'Wspace' replaces them with a space. Any other
    value leaves the text unchanged.

    Returns a list with the strings of ``df`` followed by those of ``df2``.
    """
    frames = (df, df2)
    # Use 'name' as the common column whether the file calls it 'name' or 'title'.
    for frame in frames:
        if 'title' in frame.columns:
            frame['name'] = frame['title'].copy()
    # Combine name and description text.
    for frame in frames:
        frame['combine'] = frame['name'] + ' ' + frame['description']
    corpus = []
    for frame in frames:
        combined = frame['combine']
        for row in range(len(combined)):
            text = combined[row]
            if clean == 'Nspace':
                text = re.sub(r'[^\w\s]','',text)
            elif clean == 'Wspace':
                text = re.sub(r'[^\w\s]',' ',text)
            corpus.append(text)
    return corpus

In [10]:
def tf_idf_vect(file1,file2,clean):
    """Fit one TF-IDF model on both files and vectorize every product name.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    clean : str
        Cleaning option forwarded to ``create_corpus``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        For each file a frame with columns 'id' and 'name', where 'name'
        holds the TF-IDF vector of the product name (at most 300 features).
    """
    df = read_file(file1)
    df2 = read_file(file2)
    # create_corpus also writes the 'name' column for files that only have 'title'.
    corpus = create_corpus(df,df2,clean)
    # Fit TF-IDF Model
    vectorizer = TfidfVectorizer(max_features=300)
    tfidf_model = vectorizer.fit(corpus)

    def to_vectors(texts):
        # One batched transform per file replaces one sparse transform per row;
        # rows are weighted and normalised independently, so values are unchanged.
        if len(texts) == 0:
            return []
        return list(tfidf_model.transform(texts).toarray())

    # NOTE(review): `clean` only shapes the corpus the model is fitted on; the
    # names transformed here are the uncleaned text - confirm this is intended.
    df['name'] = to_vectors(df['name'])
    df2['name'] = to_vectors(df2['name'])

    return df[['id','name']], df2[['id','name']]

#### Create Ranked List Functions

In [11]:
def vector_length(a):
    """Return the Euclidean (L2) norm of ``a`` as a Python float."""
    return float(math.sqrt(sum(x*x for x in a)))
def dot_product(a,b):
    """Return the dot product of two equal-length vectors.

    Raises
    ------
    ValueError
        If the vectors differ in length. The previous version returned an
        error string, which the caller then tried to divide.
    """
    if len(a) != len(b):
        raise ValueError("Vector Length Different")
    return sum(x*y for (x,y) in zip(a,b))
def cos_similarity(a,b):
    """Return the cosine similarity of ``a`` and ``b``.

    A zero-length (all-zero) vector has no direction, so the similarity is
    defined here as 0.0 rather than dividing by zero.

    Raises
    ------
    ValueError
        If the vectors differ in length.
    """
    # Computed first so that a length mismatch is always reported.
    numerator = dot_product(a,b)
    denominator = vector_length(a)*vector_length(b)
    if denominator == 0:
        return 0.0
    return numerator/denominator

In [12]:
def top_10_ranking(df1,df2,penalty=0.25,price_gap=0.50):
    """Rank the rows of ``df2`` against every row of ``df1``.

    The score of a pair is the cosine similarity of the two 'name' vectors,
    reduced by ``penalty`` for each of these soft-blocking rules that fires:

    * both manufacturers are non-empty and neither string contains the other;
    * both prices are present and non-zero, and their difference relative to
      the larger price is at least ``price_gap``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with 'id', 'name' (vector), 'manufacturer' and 'price' columns.
    penalty : float, optional
        Amount subtracted per failed rule (default 0.25).
    price_gap : float, optional
        Relative price difference that triggers the penalty (default 0.50).

    Returns
    -------
    pandas.DataFrame
        One row per ``df1`` row with columns 'df1_id', '1' ... '10': the
        ``df1`` id followed by the ``df2`` ids of the ten best scores, best
        first. Rows are padded with '' when ``df2`` has fewer than ten rows.
    """
    col_names = ['df1_id','1','2','3','4','5','6','7','8','9','10']
    top_n = len(col_names) - 1
    # Pull the columns into plain lists once; indexing a Series inside the
    # double loop is far slower than indexing a list.
    ids1, names1 = list(df1['id']), list(df1['name'])
    makers1, prices1 = list(df1['manufacturer']), list(df1['price'])
    ids2, names2 = list(df2['id']), list(df2['name'])
    makers2, prices2 = list(df2['manufacturer']), list(df2['price'])

    rows = []
    for id1, name1, maker1, price1 in zip(ids1, names1, makers1, prices1):
        scores = []
        for name2, maker2, price2 in zip(names2, makers2, prices2):
            score = cos_similarity(name1, name2)
            # Manufacturer rule: only applied when both sides are known.
            if maker1 != '' and maker2 != '':
                if not (maker1 in maker2 or maker2 in maker1):
                    score = score - penalty
            # Price rule: '' and 0 both mean "price unknown".
            if price1 != '' and price2 != '' and price1 != 0 and price2 != 0:
                if price1 >= price2:
                    high, low = price1, price2
                else:
                    high, low = price2, price1
                if float(high - low)/high >= price_gap:
                    score = score - penalty
            scores.append(score)
        # Positions of the best scores, highest first; a Series also sorts
        # cleanly when df2 is empty.
        best = pd.Series(scores, dtype=float).sort_values(ascending=False).index.values[0:top_n]
        ranked = [id1] + [ids2[position] for position in best]
        # Pad short rankings so every row has one cell per column.
        ranked.extend([''] * (len(col_names) - len(ranked)))
        rows.append(ranked)
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=col_names)

In [13]:
def avgcos_top_10_ranking(df1,df2,df3,df4):
    """Rank ``df2`` rows against ``df1`` rows by the mean of two cosine scores.

    ``df1``/``df2`` hold the 'name' vectors of the first model and
    ``df3``/``df4`` those of the second model for the same rows. The score of
    a pair is the average of the two models' cosine similarities.

    Returns
    -------
    pandas.DataFrame
        One row per ``df1`` row with columns 'df1_id', '1' ... '10': the
        ``df1`` id followed by the ``df2`` ids of the ten best scores, best
        first. Rows are padded with '' when ``df2`` has fewer than ten rows.
    """
    col_names = ['df1_id','1','2','3','4','5','6','7','8','9','10']
    top_n = len(col_names) - 1
    # Plain lists avoid repeated Series lookups inside the double loop.
    ids1, ids2 = list(df1['id']), list(df2['id'])
    names1, names2 = list(df1['name']), list(df2['name'])
    names3, names4 = list(df3['name']), list(df4['name'])

    rows = []
    for i in range(len(names1)):
        scores = []
        for j in range(len(names2)):
            score1 = cos_similarity(names1[i], names2[j])
            score2 = cos_similarity(names3[i], names4[j])
            scores.append((score1 + score2)/2)
        # Positions of the best scores, highest first.
        best = pd.Series(scores, dtype=float).sort_values(ascending=False).index.values[0:top_n]
        ranked = [ids1[i]] + [ids2[position] for position in best]
        # Pad short rankings so the row always matches col_names.
        ranked.extend([''] * (len(col_names) - len(ranked)))
        rows.append(ranked)
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=col_names)

In [14]:
def true_ranking_loc(df_ranking,mapping,col1,col2):
    """Locate the ground-truth match of every ranked query.

    Parameters
    ----------
    df_ranking : pandas.DataFrame
        Ranking table with a 'df1_id' column followed by the ranked
        candidate ids (best first).
    mapping : pandas.DataFrame
        Ground-truth pairs.
    col1, col2 : str
        Names of the ``mapping`` columns holding the query id and the
        matching candidate id. When a query id occurs several times, the
        first pair is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'df1_id' and 'Ranked_Location'. The location is the 1-based
        rank of the true match, or 0 when it is not among the candidates.
        Queries without a ground-truth pair are left out.
    """
    col_names = ['df1_id','Ranked_Location']
    # One pass over the mapping; setdefault keeps the first pair per query id.
    truth = {}
    for query_id, match_id in zip(mapping[col1], mapping[col2]):
        truth.setdefault(query_id, match_id)

    query_ids = list(df_ranking['df1_id'])
    # Search the candidate cells only: including the 'df1_id' cell would
    # report rank 0 whenever a candidate id equals the query id.
    candidate_rows = df_ranking.drop(columns=['df1_id']).values.tolist()

    records = []
    for look_up_id, candidates in zip(query_ids, candidate_rows):
        if look_up_id not in truth:
            continue
        mapped_id = truth[look_up_id]
        if mapped_id in candidates:
            ranking = candidates.index(mapped_id) + 1
        else:
            ranking = 0
        records.append([look_up_id, ranking])
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=col_names)

#### Scoring Metrics Functions

In [15]:
def hits_at_n(df_rank_loc,n):
    """Return the share of queries whose true match is ranked within ``n``.

    Parameters
    ----------
    df_rank_loc : pandas.DataFrame
        Needs a 'Ranked_Location' column; 0 means "true match not ranked".
    n : int
        Rank cut-off.

    Returns
    -------
    float
        Number of rows with ``1 <= Ranked_Location <= n`` divided by the
        number of rows (not by ``n``); 0.0 for an empty frame.
    """
    ranks = list(df_rank_loc['Ranked_Location'])
    # Nothing to score: avoid dividing by zero.
    if not ranks:
        return 0.0
    hits = sum(1 for rank in ranks if rank != 0 and rank <= n)
    return hits/len(ranks)

In [16]:
def mean_reciprocal_rank(df_rank_loc):
    """Return the mean reciprocal rank over all queries.

    Parameters
    ----------
    df_rank_loc : pandas.DataFrame
        Needs a 'Ranked_Location' column; 0 means "true match not ranked"
        and contributes nothing to the sum.

    Returns
    -------
    float
        Sum of ``1 / Ranked_Location`` over the ranked rows divided by the
        total number of rows; 0.0 for an empty frame.
    """
    ranks = list(df_rank_loc['Ranked_Location'])
    # Nothing to score: avoid dividing by zero.
    if not ranks:
        return 0.0
    reciprocal_sum = sum(1/rank for rank in ranks if rank != 0)
    return reciprocal_sum/len(ranks)

In [17]:
# Source: https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k):
    """Return the discounted cumulative gain of the first ``k`` relevances.

    Each relevance ``rel`` at 1-based position ``p`` contributes
    ``(2**rel - 1) / log2(p + 1)``.

    Parameters
    ----------
    r : sequence of numbers
        Relevance scores in ranked order (best-ranked item first).
    k : int
        Number of leading positions to score.

    Returns
    -------
    float
        The DCG value, or 0.0 when there is nothing to score.
    """
    # np.asfarray was removed in NumPy 2.0; this is the equivalent conversion.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    """Return the DCG of ``r`` at ``k`` divided by the ideal DCG at ``k``.

    The ideal DCG is the DCG of the same relevances sorted in descending
    order. When it is zero the result is 0.0.
    """
    ideal = dcg_at_k(sorted(r, reverse=True), k)
    return dcg_at_k(r, k) / ideal if ideal else 0.

def ndcg(df_rank_loc,k):
    """Return the mean NDCG at ``k`` over all queries.

    Every query has a single relevant item, located at 'Ranked_Location'
    (1-based; 0 means it was not ranked). Its relevance list is therefore
    all zeros with a single 1 at that position.

    Parameters
    ----------
    df_rank_loc : pandas.DataFrame
        Needs a 'Ranked_Location' column.
    k : int
        Rank cut-off passed to ``ndcg_at_k``.

    Returns
    -------
    float
        Sum of the per-query NDCG values divided by the number of rows;
        0.0 for an empty frame.
    """
    # int() matters: `[0] * numpy_int` is evaluated by NumPy as an array
    # product, not as list repetition.
    ranks = [int(rank) for rank in df_rank_loc['Ranked_Location']]
    # Nothing to score: avoid dividing by zero.
    if not ranks:
        return 0.0
    sum_NDCG = 0
    for rank in ranks:
        # Build the one-hot relevance list on the fly instead of reading a
        # fixed 11-entry table, so ranks beyond 10 no longer raise KeyError.
        # Trailing zeros add nothing to the DCG, so the list stops at `rank`.
        relevance = [0] * rank
        if rank > 0:
            relevance[rank - 1] = 1
        sum_NDCG += ndcg_at_k(relevance, k)
    return sum_NDCG/len(ranks)

#### All in One Function

In [18]:
def all_in_one(file1,file2,mapping,clean):
    """Run the full pipeline for one pair of product files.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    mapping : str
        Path of the ground-truth mapping CSV; its first two columns are used
        as the query id and the matching id.
    clean : str
        Cleaning option passed to both vectorizers.

    Returns
    -------
    pandas.DataFrame
        The output of ``true_ranking_loc``: where each query's true match
        landed in its top-10 ranking.
    """
    # TF-IDF vectors for both files (one model fitted on both).
    tfidf_left, tfidf_right = tf_idf_vect(file1,file2,clean)
    # Pre-trained FastText vectors, one file at a time.
    fasttext_left = file_to_vector(file1,clean)
    fasttext_right = file_to_vector(file2,clean)
    # Per file: FastText vector followed by the TF-IDF vector.
    left = vec_concat(fasttext_left,tfidf_left)
    right = vec_concat(fasttext_right,tfidf_right)
    # Ground truth pairs.
    truth = read_file(mapping)
    # Rank, then look up where the true match ended up.
    ranked = top_10_ranking(left,right)
    return true_ranking_loc(ranked,truth,truth.columns[0],truth.columns[1])

## Code

### Abt vs Buy (Raw)

In [19]:
# Abt vs Buy with the 'Raw' option: run the full pipeline, then report the
# three ranking metrics at cut-off 10. Rebinds the global `df_rank_loc`.
df_rank_loc = all_in_one('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw')
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.9010175763182239
Mean Reciprocal Rank:  0.67933681335624
NDCG at 10 Score:  0.7330225625581259


### Abt vs Buy (Cleaned No Space)

In [20]:
# Abt vs Buy with the 'Nspace' option (symbols deleted): run the full pipeline,
# then report the three ranking metrics at cut-off 10. Rebinds `df_rank_loc`.
df_rank_loc = all_in_one('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace')
print("No space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No space
Hit Rate at 10:  0.910268270120259
Mean Reciprocal Rank:  0.7021467483077102
NDCG at 10 Score:  0.7526898239597221


### Abt vs Buy (Cleaned With Space)

In [21]:
# Abt vs Buy with the 'Wspace' option (symbols replaced by spaces): run the full
# pipeline, then report the three ranking metrics at cut-off 10. Rebinds `df_rank_loc`.
df_rank_loc = all_in_one('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace')
print("With space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With space
Hit Rate at 10:  0.849213691026827
Mean Reciprocal Rank:  0.6245789465955979
NDCG at 10 Score:  0.678885481384402


### Amazon vs Google (Raw)

In [22]:
# Amazon vs Google with the 'Raw' option: run the full pipeline, then report the
# three ranking metrics at cut-off 10. Rebinds the global `df_rank_loc`.
df_rank_loc = all_in_one('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw')
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.8032345013477089
Mean Reciprocal Rank:  0.6075311256578108
NDCG at 10 Score:  0.6552468257693013


### Amazon vs Google (Cleaned No Space)

In [23]:
# Amazon vs Google with the 'Nspace' option (symbols deleted): run the full pipeline,
# then report the three ranking metrics at cut-off 10. Rebinds `df_rank_loc`.
df_rank_loc = all_in_one('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace')
print("No space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No space
Hit Rate at 10:  0.8571428571428571
Mean Reciprocal Rank:  0.6593430453942589
NDCG at 10 Score:  0.7077453673757608


### Amazon vs Google (Cleaned With Space)

In [24]:
# Amazon vs Google with the 'Wspace' option (symbols replaced by spaces): run the full
# pipeline, then report the three ranking metrics at cut-off 10. Rebinds `df_rank_loc`.
df_rank_loc = all_in_one('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace')
print("With space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With space
Hit Rate at 10:  0.8247978436657682
Mean Reciprocal Rank:  0.64305359460346
NDCG at 10 Score:  0.6876296949577001


## Conclusion
### Using the TF-IDF + Pre-Trained (SkipGram) Vector Concatenation + Soft Blocking (-0.25)

|   Vector Concatenation    | Hits Rate at 10 | Mean Reciprocal Rank | NDCG Score at 10 |
| ------------------------     | ---   | ---     |   --- |
| Abt_buy (Raw)                         | 0.901 |  0.679  | 0.733 |
| Abt_buy (Cleaned No Space)            | 0.910 |  0.702  | 0.753 |
| Abt_buy (Cleaned With Space)          | 0.849 |  0.625  | 0.679 |
| Amazon_Google (Raw)                   | 0.803 |  0.608  | 0.655 |
| Amazon_Google (Cleaned No Space)      | 0.857 |  0.659  | 0.708 |
| Amazon_Google (Cleaned With Space)    | 0.825 |  0.643  | 0.688 |