In [21]:
import csv
import re
import numpy as np
import pandas as pd 
import math
from random import randint
import fasttext
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

## Functions Definitions
#### Preprocesing Functions

In [2]:
def read_file(filename):
    # Read in CSV File into DataFrame
    list_var = []
    data = pd.DataFrame()
    with open(filename,'r') as f:
        reader = csv.reader(f, delimiter=',')
        for i in reader:
            list_var.append(i)
    # Header & Data
    col_name = list_var[0]
    data_val = list_var[1:]
    # Convert to DataFrame
    data = pd.DataFrame(data_val, columns=col_name)
    return data

In [238]:
def tokenize(data,clean):
    # Clean and Tokenize both Name and Description Field
    temp_name = data['name'].copy()
    temp_description = data['description'].copy()
    df = data.copy()
    # Raw
    if clean=='Raw':
        pass
    # Semi-Cleaned 
    elif clean=='Nspace':
        # Remove Symbols in Name and Description Column (no space)
        temp_name = [re.sub(r'[^\w\s]','',x) for x in temp_name]
        temp_description = [re.sub(r'[^\w\s]','',x) for x in temp_description]
    elif clean=='Wspace':
        # Remove Symbols in Name and Description Column (replace with space)
        temp_name = [re.sub(r'[^\w\s]',' ',x) for x in temp_name]
        temp_description = [re.sub(r'[^\w\s]',' ',x) for x in temp_description]
    # Tokenize: Split single sentence into list of words
    for i in range(0,len(temp_name),1):
        # Tokenize Name and Description Columns into words
        df['name'][i] = word_tokenize(temp_name[i])
        df['description'][i] = word_tokenize(temp_description[i])
        try:
            df['name'][i].remove('')
        except ValueError:
            pass  # do nothing!
        try:
            df['description'][i].remove('')
        except ValueError:
            pass  # do nothing!
    return df

In [4]:
def vectorize(df):
    # Vectorize the list of words and then average the vectors into one sentence vector
    df_vectorize = df.copy()
    counter = 0
    for i in df['name']:
        array = [model.get_word_vector(x) for x in i]
        avg = np.average(array, axis=0)
        df_vectorize['name'][counter] = avg
        counter += 1
    df_concat = pd.concat([df.id,df_vectorize.name],axis=1)
    return df_concat

In [5]:
def file_to_vector(file,clean):
    # Read in Abt File
    df = read_file(file)
    if 'title' in df.columns:
        df['name']=df['title'].copy()
    # Clean & Tokenize
    temp = tokenize(df,clean)
    df.name = temp.name
    # Vectorize name column
    data = vectorize(df)
    return data

#### TF-IDF

In [6]:
def create_corpus(df,df2,clean):
    # Change Column names to 'name' instead of 'title'
    if 'title' in df.columns:
        df['name']=df['title'].copy()
    if 'title' in df2.columns:
        df2['name']=df2['title'].copy()  
    # Combine name and description text
    df['combine'] = df['name'] + ' ' + df['description']
    df2['combine'] = df2['name'] + ' ' + df2['description']
    # Create Corpus to train TFIDF
    # With option for Raw vs Cleaned
    corpus = []
    for i in range(0,len(df['combine']),1):
        temp = df['combine'][i]
        if clean=='Raw':
            pass
        elif clean=='Nspace':
            temp = re.sub(r'[^\w\s]','',temp)
        elif clean=='Wspace':
            temp = re.sub(r'[^\w\s]',' ',temp)
        corpus.append(temp)
    for i in range(0,len(df2['combine']),1):
        temp = df2['combine'][i]
        if clean=='Raw':
            pass
        elif clean=='Nspace':
            temp = re.sub(r'[^\w\s]','',temp)
        elif clean=='Wspace':
            temp = re.sub(r'[^\w\s]',' ',temp)
        corpus.append(temp)
    return corpus

In [7]:
def tf_idf_vect(file1,file2,clean):
    # Read in Files
    df = read_file(file1)
    df2 = read_file(file2)
    corpus = create_corpus(df,df2,clean)
    # Fit TF-IDF Model
    vectorizer = TfidfVectorizer(max_features=300)
    tfidf_model = vectorizer.fit(corpus)
    # Generate TF-IDF Vectors for each product
    arr = []
    for i in range(0,len(df),1):
        text = df['name'][i]
        arr.append(tfidf_model.transform([text]).toarray()[0])
    df['name'] = arr
    arr = []
    for i in range(0,len(df2),1):
        text = df2['name'][i]
        arr.append(tfidf_model.transform([text]).toarray()[0])
    df2['name'] = arr
    
    return df[['id','name']], df2[['id','name']]

#### Random Indexing

In [157]:
def random_index_vectors(file1,file2,clean,k):
    # Read in Files
    df = read_file(file1)
    df2 = read_file(file2)
    # Create Corpus
    corpus = create_corpus(df,df2,clean)
    # Get Vocabulary of the Corpus
    vocab = []
    for i in corpus:
        words = word_tokenize(i)
        for j in words:
            vocab.append(j)
    vocabulary = set(vocab)
    # Create DF
    df = pd.DataFrame()
    # Insert Vocabulary Column
    df['vocab'] = list(vocabulary)
    # Insert Word Vector Column
    arr = []
    for i in df.vocab:
        # Generate Random index vector of +1 or -1 using Gaussian Distribution 
        random_vec = np.array([1 if i>=0 else 0 for i in np.random.normal(size=(300))])
        arr.append(random_vec)
    # Initialize fixed and current vector to be vectors generated from above
    df['fixed_vector'] = arr
    df['current_vector'] = arr
    # Window Size of k
    k = k
    for i in corpus:
        # Tokenize Product Name + Description into words
        words = word_tokenize(i)
        # For every word in the line, update word vector
        for j in range(0,len(words),1):
            # Obtain the current word vector
            current_index = df[df.vocab==words[j]].index[0]
            current_vector = df.loc[current_index].current_vector
            # Set up upper and lower bound
            lowerbound = j-k
            upperbound = j+k
            for x in range(lowerbound,upperbound+1,1):
                # if word within upper and lower bound and the index can be accessable and does not equal current index
                if (x>=0) and (x<=(len(words)-1)):
                    if x!=j:
                        # Obtain fixed vectors of context word 
                        context_index = df[df.vocab==words[x]].index[0]
                        context_vector = df.loc[context_index].fixed_vector
                        # Update current word vector by addint fixed vectors of context word
                        current_vector = current_vector + context_vector 
                else:
                    # Out of Bounds
                    pass    
            # Update current word vector into the df
            df.loc[current_index].current_vector = current_vector
    return df

In [206]:
def vectorize_indexing(df,df_vec):
    # Vectorize the list of words and then average the vectors into one sentence vector
    df_vectorize = df.copy()
    counter = 0
    for i in df['name']:
        array = np.zeros(300)
        avg_counter = 0
        for x in i:
            array += df_vec.current_vector[df_vec.vocab==x].values[0]
            avg_counter += 1
        avg = array/avg_counter
        df_vectorize['name'][counter] = avg
        counter += 1   
    df_concat = pd.concat([df.id,df_vectorize.name],axis=1)
    return df_concat

In [209]:
def file_to_RI_vector(file,df_vec,clean):
    # Read in File
    df = read_file(file)
    if 'title' in df.columns:
        df['name']=df['title'].copy()
    # Clean & Tokenize
    temp = tokenize(df,clean)
    df.name = temp.name
    # Vectorize
    data = vectorize_indexing(df,df_vec)
    data.head()
    return data

#### Create Ranked List Functions

In [8]:
def vector_length(a):
    return float(math.sqrt(sum(x*x for x in a)))
def dot_product(a,b):
    if len(a) == len(b):
        return sum([x*y for (x,y) in zip(a,b)])
    else:
        return "Vector Length Different"
def cos_similarity(a,b):
    return dot_product(a,b)/(vector_length(a)*vector_length(b))

In [9]:
def top_10_ranking(df1,df2):
    # Initialize Ranking Chart
    col_names = ['df1_id','1','2','3','4','5','6','7','8','9','10']
    df_ranking = pd.DataFrame(columns=col_names);
    # Loop for every Abt_id element
    for i in range(0,len(df1.name),1):
        cos_sim = []
        # Loop for computing buy_id ranking for current Abt_id
        for j in range(0,len(df2.name),1):
            # Compute Cosine Similarity for every buy_id element compared to current Abt_id
            cos_sim.append(cos_similarity(df1.name[i],df2.name[j]))
        # Sort list and pick top 10
        temp_df = pd.DataFrame(cos_sim)
        temp_sorted = temp_df.sort_values(by=0,ascending=False)
        row_ranking = temp_sorted.index.values[0:10]
        rank_buy_id = []
        # Convert row # into buy_id
        for k in row_ranking:
            rank_buy_id.append(df2.id.iloc[k])
        rank_buy_id.insert(0,df1.id[i])
        # Concat into Abt Ranking Chart
        combine_df = pd.DataFrame([rank_buy_id],columns=col_names)
        df_ranking = pd.concat([df_ranking,combine_df],names=col_names,sort=False,ignore_index=True)
    return df_ranking

In [10]:
def true_ranking_loc(df_ranking,mapping,col1,col2):
    # Find where ground truth is ranked 
    col_names = ['df1_id','Ranked_Location']
    df_rank_loc = pd.DataFrame(columns=col_names)
    # Loop for every element of Abt_id from df_ranking
    for i in range(0,len(df_ranking.df1_id),1):
        # Look for ground truth from True Mapping
        look_up_id = df_ranking.df1_id[i]
        # Check if product is in Perfect Mapping
        if len(mapping[col2][mapping[col1]==look_up_id])!=0:
            mapped_id = mapping[col2][mapping[col1]==look_up_id].values[0]
            # Check if ground truth in ranking list and note its location
            if mapped_id in list(df_ranking.iloc[i]):
                ranking = list(df_ranking.iloc[i]).index(mapped_id)
            else:
                ranking = 0
            # Concat into Abt Ranking Location df
            temp_list = list([look_up_id,ranking])
            temp_df2 = pd.DataFrame([temp_list],columns=col_names)
            df_rank_loc = pd.concat([df_rank_loc,temp_df2],names=col_names,sort=False,ignore_index=True)
    return df_rank_loc

#### Scoring Metrics Functions

In [11]:
def hits_at_n(df_rank_loc,n):
    # Sum the correct ranks at ranks 1 to n divide by n number of ranks
    HR_n_sum = 0
    for i in range(0,len(df_rank_loc),1):
        if df_rank_loc.Ranked_Location[i]!=0:
            if df_rank_loc.Ranked_Location[i]<=n:
                HR_n_sum += 1
    Hits_Rate_at_n = HR_n_sum/len(df_rank_loc)
    return Hits_Rate_at_n

In [12]:
def mean_reciprocal_rank(df_rank_loc):
    # Sum the Reciprocal Rank of Abt Ranking Location df and divide by total number element
    MRR_sum = 0
    for i in range(0,len(df_rank_loc),1):
        if df_rank_loc.Ranked_Location[i]!=0:
            MRR_sum += 1/df_rank_loc.Ranked_Location[i]
    mean_reciprocal_rank = MRR_sum/len(df_rank_loc)
    return mean_reciprocal_rank

In [13]:
# Source: https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

def ndcg(df_rank_loc,k):
    # Sum up all NDCG at each Queries and divided by total # of Queries
    # Initalize Ranking Encoding
    rank_encoding = {0:[0,0,0,0,0,0,0,0,0,0],
                     1:[1,0,0,0,0,0,0,0,0,0],
                     2:[0,1,0,0,0,0,0,0,0,0],
                     3:[0,0,1,0,0,0,0,0,0,0],
                     4:[0,0,0,1,0,0,0,0,0,0],
                     5:[0,0,0,0,1,0,0,0,0,0],
                     6:[0,0,0,0,0,1,0,0,0,0],
                     7:[0,0,0,0,0,0,1,0,0,0],
                     8:[0,0,0,0,0,0,0,1,0,0],
                     9:[0,0,0,0,0,0,0,0,1,0],
                     10:[0,0,0,0,0,0,0,0,0,1]}
    sum_NDCG = 0
    for i in df_rank_loc.Ranked_Location:
        sum_NDCG += ndcg_at_k(rank_encoding[i], k)
    NDCG = sum_NDCG/len(df_rank_loc)
    return NDCG

#### All in One Function

In [14]:
def all_in_one(file1,file2,mapping,clean):
    # Read File 1 & Vectorize
    data1 = file_to_vector(file1,clean)
    # Read File 2 & Vectorize
    data2 = file_to_vector(file2,clean)
    # Read Perfect Mapping
    mapping = read_file(mapping)
    # Create Ranking List
    df_ranking = top_10_ranking(data1,data2)
    # Find Location where True Ranking Occured 
    df_rank_loc = true_ranking_loc(df_ranking,mapping,mapping.columns[0],mapping.columns[1])
    return df_rank_loc

In [15]:
def all_in_one_tfidf(file1,file2,mapping,clean):
    # Generate TF-IDF Vectors
    data1, data2 = tf_idf_vect(file1,file2,clean)
    # Read Perfect Mapping
    mapping = read_file(mapping)
    # Create Ranking List
    df_ranking = top_10_ranking(data1,data2)
    # Find Location where True Ranking Occured 
    df_rank_loc = true_ranking_loc(df_ranking,mapping,mapping.columns[0],mapping.columns[1])
    return df_rank_loc

In [212]:
def all_in_one_random_index(file1,file2,mapping,clean,k):
    # Generate Random Indexing vectors
    df_vec = random_index_vectors(file1,file2,clean,k)
    # Vectorize File 1
    data1 = file_to_RI_vector(file1,df_vec,clean)
    # Vectorize File 2
    data2 = file_to_RI_vector(file2,df_vec,clean)
    # Read Perfect Mapping
    mapping = read_file(mapping)
    # Create Ranking List
    df_ranking = top_10_ranking(data1,data2)
    # Find Location where True Ranking Occured 
    df_rank_loc = true_ranking_loc(df_ranking,mapping,mapping.columns[0],mapping.columns[1])
    return df_rank_loc

## Code

### Abt vs Buy (Raw)

In [213]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',1)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.23496762257169287
Mean Reciprocal Rank:  0.14526709542898256
NDCG at 10 Score:  0.16680724615219517


In [214]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',2)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.24421831637372804
Mean Reciprocal Rank:  0.1493979707208199
NDCG at 10 Score:  0.1721242666985971


In [215]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',3)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.25716928769657726
Mean Reciprocal Rank:  0.15571340469582834
NDCG at 10 Score:  0.1798372369475266


In [216]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',4)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.26919518963922295
Mean Reciprocal Rank:  0.15583674727985553
NDCG at 10 Score:  0.18273682808834393


In [218]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',5)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.2664199814986124
Mean Reciprocal Rank:  0.1595458349852429
NDCG at 10 Score:  0.18469696878408343


In [219]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',6)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.2627197039777983
Mean Reciprocal Rank:  0.1575055430744607
NDCG at 10 Score:  0.18249660283447106


### Abt vs Buy (Cleaned No Space)

In [220]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',1)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.35892691951896394
Mean Reciprocal Rank:  0.23572456426295457
NDCG at 10 Score:  0.2654305972497467


In [221]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',2)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.3894542090656799
Mean Reciprocal Rank:  0.25466866364183655
NDCG at 10 Score:  0.2867944880387043


In [222]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',3)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.41350601295097134
Mean Reciprocal Rank:  0.2660044344595684
NDCG at 10 Score:  0.3009518220123374


In [223]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',4)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.4181313598519889
Mean Reciprocal Rank:  0.264700894233734
NDCG at 10 Score:  0.3009406348351362


In [224]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',5)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.42368177613321
Mean Reciprocal Rank:  0.2620317753990277
NDCG at 10 Score:  0.30017060591746364


In [225]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',6)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.42645698427382056
Mean Reciprocal Rank:  0.2733616727603774
NDCG at 10 Score:  0.3095154555482274


### Abt vs Buy (Cleaned With Space)

In [239]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',1)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.3404255319148936
Mean Reciprocal Rank:  0.21237830932558024
NDCG at 10 Score:  0.2428549380079372


In [240]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',2)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.37187789084181316
Mean Reciprocal Rank:  0.22607667797307
NDCG at 10 Score:  0.2604755413244143


In [241]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',3)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.3829787234042553
Mean Reciprocal Rank:  0.23812607374124467
NDCG at 10 Score:  0.27247599958388125


In [242]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',4)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.393154486586494
Mean Reciprocal Rank:  0.24015498583028635
NDCG at 10 Score:  0.2764464145223661


In [243]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',5)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.4061054579093432
Mean Reciprocal Rank:  0.24461881268079222
NDCG at 10 Score:  0.28281528715000054


In [244]:
df_rank_loc = all_in_one_random_index('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',6)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.4061054579093432
Mean Reciprocal Rank:  0.2470669427191165
NDCG at 10 Score:  0.28492975510973106


### Amazon vs Google (Raw)

In [227]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',1)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.252470799640611
Mean Reciprocal Rank:  0.19182996049572865
NDCG at 10 Score:  0.2064249228668872


In [228]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',2)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.2398921832884097
Mean Reciprocal Rank:  0.1817385444743935
NDCG at 10 Score:  0.19569326701589718


In [229]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',3)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.22911051212938005
Mean Reciprocal Rank:  0.1798107502959255
NDCG at 10 Score:  0.1916029866591657


In [230]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',4)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.23000898472596587
Mean Reciprocal Rank:  0.181813060654031
NDCG at 10 Score:  0.19340679226806248


In [231]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',5)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.23450134770889489
Mean Reciprocal Rank:  0.18263951282819205
NDCG at 10 Score:  0.194948965804312


In [232]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',6)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.23360287511230907
Mean Reciprocal Rank:  0.1796645702306079
NDCG at 10 Score:  0.1924750537854518


### Amazon vs Google (Cleaned No Space)

In [233]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',1)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.344115004492363
Mean Reciprocal Rank:  0.25352828762532265
NDCG at 10 Score:  0.27514763940099246


In [234]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',2)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.3477088948787062
Mean Reciprocal Rank:  0.25950455653959686
NDCG at 10 Score:  0.2804888186504555


In [235]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',3)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.3557951482479784
Mean Reciprocal Rank:  0.2661097562714813
NDCG at 10 Score:  0.2873688464609486


In [245]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',4)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.35759209344115006
Mean Reciprocal Rank:  0.2693492491336156
NDCG at 10 Score:  0.29030509940465593


In [246]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',5)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.3692722371967655
Mean Reciprocal Rank:  0.2707376032173874
NDCG at 10 Score:  0.2942132839589285


In [247]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',6)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.36298292902066487
Mean Reciprocal Rank:  0.27511765712574326
NDCG at 10 Score:  0.296033401720416


### Amazon vs Google (Cleaned With Space)

In [248]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',1)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.35938903863432164
Mean Reciprocal Rank:  0.25614205850054894
NDCG at 10 Score:  0.2805683729235091


In [249]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',2)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.353099730458221
Mean Reciprocal Rank:  0.2585415507922247
NDCG at 10 Score:  0.28096257204629094


In [250]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',3)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.3468104222821204
Mean Reciprocal Rank:  0.25914801979492
NDCG at 10 Score:  0.2798934541986852


In [251]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',4)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.35130278526504943
Mean Reciprocal Rank:  0.26354803976097774
NDCG at 10 Score:  0.2842731068125859


In [252]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',5)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.3477088948787062
Mean Reciprocal Rank:  0.2639926410815898
NDCG at 10 Score:  0.2839327859029024


In [253]:
df_rank_loc = all_in_one_random_index('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',6)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.35669362084456424
Mean Reciprocal Rank:  0.2659311313623982
NDCG at 10 Score:  0.28745931850479994


### Conclusion
#### Random Indexing

|  Hits Rate at 10      | K = 1 | K = 2 | K = 3  | K = 4 | K = 5 | K = 6 |
| ------------------------     | ---   | ---     |   --- | --- | --- | --- |
| Abt_buy (Raw)                         | 0.235 |  0.244  | 0.257 | 0.269 | 0.266 | 0.263 |
| Abt_buy (Cleaned No Space)            | 0.359 |  0.389  | 0.414 | 0.418 | 0.424 | 0.426 |
| Abt_buy (Cleaned With Space)          | 0.340 |  0.372  | 0.383 | 0.393 | 0.406 | 0.406 |
| Amazon_Google (Raw)                   | 0.252 |  0.240  | 0.229 | 0.230 | 0.235 | 0.234 |
| Amazon_Google (Cleaned No Space)      | 0.344 |  0.348  | 0.356 | 0.358 | 0.369 | 0.363 |
| Amazon_Google (Cleaned With Space)    | 0.359 |  0.353  | 0.347 | 0.351 | 0.348 | 0.357 |

|  Mean Reciprocal Rank      | K = 1 | K = 2 | K = 3  | K = 4 | K = 5 | K = 6 |
| ------------------------     | ---   | ---     |   --- | --- | --- | --- |
| Abt_buy (Raw)                         | 0.145 |  0.149  | 0.156 | 0.156 | 0.160 | 0.158 |
| Abt_buy (Cleaned No Space)            | 0.236 |  0.255  | 0.266 | 0.265 | 0.262 | 0.273 |
| Abt_buy (Cleaned With Space)          | 0.212 |  0.226  | 0.238 | 0.240 | 0.245 | 0.247 |
| Amazon_Google (Raw)                   | 0.192 |  0.182  | 0.180 | 0.182 | 0.183 | 0.180 |
| Amazon_Google (Cleaned No Space)      | 0.254 |  0.260  | 0.266 | 0.269 | 0.271 | 0.275 |
| Amazon_Google (Cleaned With Space)    | 0.256 |  0.259  | 0.259 | 0.264 | 0.264 | 0.266 |

|  NDCG Score at 10      | K = 1 | K = 2 | K = 3  | K = 4 | K = 5 | K = 6 |
| ------------------------     | ---   | ---     |   --- | --- | --- | --- |
| Abt_buy (Raw)                         | 0.167 |  0.172  | 0.180 | 0.183 | 0.185 | 0.182 |
| Abt_buy (Cleaned No Space)            | 0.265 |  0.287  | 0.301 | 0.301 | 0.300 | 0.310 |
| Abt_buy (Cleaned With Space)          | 0.243 |  0.260  | 0.272 | 0.276 | 0.283 | 0.285 |
| Amazon_Google (Raw)                   | 0.206 |  0.196  | 0.192 | 0.193 | 0.195 | 0.192 |
| Amazon_Google (Cleaned No Space)      | 0.275 |  0.280  | 0.287 | 0.290 | 0.294 | 0.296 |
| Amazon_Google (Cleaned With Space)    | 0.281 |  0.281  | 0.280 | 0.284 | 0.284 | 0.287 |