In [1]:
import csv
import re
import numpy as np
import pandas as pd 
import math
from random import randint
import fasttext
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

### Pre-Trained SKIPGRAM

In [20]:
# Load Facebook FastText Pre-trained Model
# Binds the fastText binary 'wiki.en.bin' (path relative to the working directory) to the
# module-level name `model`. vectorize() and random_index_pretrain() read this global, so
# this cell has to be executed before either of them is called.
model = fasttext.load_model('wiki.en.bin')




## Functions Definitions
#### Preprocessing Functions

In [2]:
def read_file(filename):
    """Read a comma-separated file into a DataFrame of strings.

    The first row of the file supplies the column names and every following
    row becomes a data row. Values are kept exactly as ``csv.reader`` yields
    them (strings; no type inference).

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        One column per header field. An empty DataFrame is returned when the
        file contains no rows at all (not even a header).
    """
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded in quoted fields are kept as written instead of being rewritten
    # by universal-newline translation.
    with open(filename, 'r', newline='') as f:
        rows = list(csv.reader(f, delimiter=','))
    if not rows:
        return pd.DataFrame()
    # Header & Data
    header, records = rows[0], rows[1:]
    return pd.DataFrame(records, columns=header)

In [85]:
def tokenize(data,clean):
    """Clean and word-tokenize the 'name' and 'description' columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain string columns 'name' and 'description'. Not modified.
    clean : str
        'Raw'    - text is tokenized as-is.
        'Nspace' - every character that is neither a word character nor
                   whitespace is deleted before tokenizing.
        'Wspace' - those characters are replaced by a single space instead.
        Any other value is treated like 'Raw'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which each 'name' / 'description' cell holds the
        list of tokens produced by ``word_tokenize``, with empty-string tokens
        dropped.
    """
    names = data['name'].tolist()
    descriptions = data['description'].tolist()
    if clean=='Nspace':
        # Remove symbols (no space left behind)
        names = [re.sub(r'[^\w\s]','',text) for text in names]
        descriptions = [re.sub(r'[^\w\s]','',text) for text in descriptions]
    elif clean=='Wspace':
        # Remove symbols (replace each with a space)
        names = [re.sub(r'[^\w\s]',' ',text) for text in names]
        descriptions = [re.sub(r'[^\w\s]',' ',text) for text in descriptions]
    df = data.copy()
    # Whole columns are assigned at once. Element-wise chained assignment
    # (df['name'][i] = ...) may write to a temporary and is a no-op under
    # pandas Copy-on-Write; it also assumed a 0..n-1 integer index.
    df['name'] = pd.Series(
        [[token for token in word_tokenize(text) if token != ''] for text in names],
        index=df.index, dtype=object)
    df['description'] = pd.Series(
        [[token for token in word_tokenize(text) if token != ''] for text in descriptions],
        index=df.index, dtype=object)
    return df

In [4]:
def vectorize(df):
    """Turn each tokenized name into one averaged word-embedding vector.

    Every token of a 'name' cell is embedded with the module-level fastText
    ``model`` and the token vectors are averaged component-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'id' column and a 'name' column whose cells are lists of
        tokens. Not modified.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'id' (unchanged) and 'name' (the averaged vector per row).

    Notes
    -----
    A row with an empty token list has nothing to average, so its entry is
    whatever ``np.average`` yields for an empty input rather than a usable
    vector.
    """
    sentence_vectors = []
    for tokens in df['name']:
        word_vectors = [model.get_word_vector(token) for token in tokens]
        sentence_vectors.append(np.average(word_vectors, axis=0))
    df_vectorize = df.copy()
    # Assign the whole column in one step; writing cell by cell through
    # df_vectorize['name'][i] is chained assignment and is not guaranteed to
    # reach the frame.
    df_vectorize['name'] = pd.Series(sentence_vectors, index=df.index, dtype=object)
    df_concat = pd.concat([df.id,df_vectorize.name],axis=1)
    return df_concat

In [5]:
def file_to_vector(file,clean):
    """Load a product CSV and return one averaged word vector per product name.

    Steps: read the file, mirror a 'title' column into 'name' when present,
    clean and tokenize with ``tokenize`` using mode ``clean``, then embed the
    token lists with ``vectorize``.

    Returns the frame produced by ``vectorize`` (columns 'id' and 'name').
    """
    frame = read_file(file)
    # Some inputs call the product name 'title'; expose it as 'name'
    has_title = 'title' in frame.columns
    if has_title:
        frame['name'] = frame['title'].copy()
    # Only the tokenized name is carried forward; the description is left as text
    tokenized = tokenize(frame, clean)
    frame['name'] = tokenized['name']
    return vectorize(frame)

#### TF-IDF

In [6]:
def create_corpus(df,df2,clean):
    """Build the list of "name description" strings for both product tables.

    Side effects (callers rely on them): both frames are modified in place.
    A 'title' column, when present, is copied to 'name', and a 'combine'
    column holding ``name + ' ' + description`` is added.

    ``clean`` selects the symbol handling applied to every combined string:
    'Raw' leaves it unchanged, 'Nspace' deletes characters matching
    ``[^\w\s]``, 'Wspace' replaces each of them with a space. Any other value
    leaves the text unchanged.

    Returns a list with the strings of ``df`` followed by those of ``df2``.
    """
    frames = (df, df2)
    # Expose 'title' as 'name' where needed
    for frame in frames:
        if 'title' in frame.columns:
            frame['name'] = frame['title'].copy()
    # Combine name and description text
    for frame in frames:
        frame['combine'] = frame['name'] + ' ' + frame['description']
    # Decide once what symbols are replaced with (None = keep text untouched)
    if clean=='Nspace':
        replacement = ''
    elif clean=='Wspace':
        replacement = ' '
    else:
        replacement = None
    corpus = []
    for frame in frames:
        combined = frame['combine']
        for row in range(len(combined)):
            text = combined[row]
            if replacement is not None:
                text = re.sub(r'[^\w\s]', replacement, text)
            corpus.append(text)
    return corpus

In [7]:
def tf_idf_vect(file1,file2,clean):
    """Represent every product name of two files as a TF-IDF vector.

    A single ``TfidfVectorizer`` limited to 300 features is fitted on the
    combined "name description" corpus of both files (built by
    ``create_corpus`` with cleaning mode ``clean``) and then applied to the
    product names of each file.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    clean : str
        Cleaning mode forwarded to ``create_corpus``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df[['id','name']], df2[['id','name']])`` where each 'name' cell is
        the 1-D TF-IDF array of that product name.
    """
    # Read in Files
    df = read_file(file1)
    df2 = read_file(file2)
    # create_corpus also adds the 'name' column to frames that only have 'title'
    corpus = create_corpus(df,df2,clean)
    # Fit TF-IDF Model
    vectorizer = TfidfVectorizer(max_features=300)
    tfidf_model = vectorizer.fit(corpus)
    # NOTE(review): the names transformed below are the uncleaned strings;
    # `clean` only affects the corpus the model was fitted on.
    for frame in (df, df2):
        if len(frame) == 0:
            frame['name'] = []
            continue
        # One batched transform per file instead of one call per row;
        # the rows of the dense matrix are the per-product vectors.
        frame['name'] = list(tfidf_model.transform(frame['name']).toarray())

    return df[['id','name']], df2[['id','name']]

#### Random Indexing

In [8]:
def random_index_vectors(file1,file2,clean,k,seed=None):
    """Learn random-indexing word vectors from the corpus of two product files.

    Every vocabulary word gets a fixed 300-dimensional 0/1 index vector
    (1 where a standard-normal draw is >= 0). A word's context vector starts
    as its own index vector and, for every occurrence of the word, the fixed
    vectors of the words at most ``k`` positions to its left and right in the
    same corpus line are added to it.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    clean : str
        Cleaning mode forwarded to ``create_corpus``.
    k : int
        Context window size on each side of the focus word.
    seed : int, optional
        When given, index vectors are drawn from ``np.random.RandomState(seed)``
        so runs are repeatable. The default (None) keeps using NumPy's global
        random state.

    Returns
    -------
    pandas.DataFrame
        Columns 'vocab', 'fixed_vector' and 'current_vector', one row per
        distinct token, ordered by token.
    """
    # Read in Files
    df = read_file(file1)
    df2 = read_file(file2)
    # Create Corpus
    corpus = create_corpus(df,df2,clean)
    # Tokenize every line once; reused for the vocabulary and the context pass
    tokenized_corpus = [word_tokenize(line) for line in corpus]
    # Sorted so that the word -> random vector pairing is deterministic for a given seed
    vocabulary = sorted({word for words in tokenized_corpus for word in words})
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Random index vector of 1 or 0 per word, thresholding a Gaussian draw at 0
    fixed = {word: (rng.normal(size=300) >= 0).astype(int) for word in vocabulary}
    # Context vectors start out as the index vectors. Updates below rebind
    # entries to new arrays, so the fixed vectors are never modified.
    current = dict(fixed)
    for words in tokenized_corpus:
        last = len(words) - 1
        for j, word in enumerate(words):
            updated = current[word]
            # Add the fixed vector of every in-bounds neighbour within the window
            for x in range(max(0, j - k), min(last, j + k) + 1):
                if x != j:
                    updated = updated + fixed[words[x]]
            # Plain dict write-back. The earlier `df.loc[i].current_vector = v`
            # assigned to a temporary row and only reached the frame when
            # pandas happened to return a view.
            current[word] = updated
    result = pd.DataFrame({'vocab': vocabulary})
    result['fixed_vector'] = pd.Series([fixed[word] for word in vocabulary], dtype=object)
    result['current_vector'] = pd.Series([current[word] for word in vocabulary], dtype=object)
    return result

In [36]:
def random_index_pretrain(file1,file2,clean,k):
    """Random indexing seeded with pretrained fastText vectors.

    Same accumulation scheme as ``random_index_vectors``, but each word's
    fixed vector is ``model.get_word_vector(word)`` from the module-level
    fastText ``model`` instead of a random 0/1 vector. A word's context vector
    starts as its fixed vector and, for every occurrence of the word, the
    fixed vectors of the words at most ``k`` positions away in the same corpus
    line are added to it.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    clean : str
        Cleaning mode forwarded to ``create_corpus``.
    k : int
        Context window size on each side of the focus word.

    Returns
    -------
    pandas.DataFrame
        Columns 'vocab', 'fixed_vector' and 'current_vector', one row per
        distinct token, ordered by token.
    """
    # Read in Files
    df = read_file(file1)
    df2 = read_file(file2)
    # Create Corpus
    corpus = create_corpus(df,df2,clean)
    # Tokenize every line once; reused for the vocabulary and the context pass
    tokenized_corpus = [word_tokenize(line) for line in corpus]
    vocabulary = sorted({word for words in tokenized_corpus for word in words})
    # Get word vector from pretrained model
    fixed = {word: model.get_word_vector(word) for word in vocabulary}
    # Context vectors start out as the pretrained vectors. Updates below rebind
    # entries to new arrays, so the fixed vectors are never modified.
    current = dict(fixed)
    for words in tokenized_corpus:
        last = len(words) - 1
        for j, word in enumerate(words):
            updated = current[word]
            # Add the fixed vector of every in-bounds neighbour within the window
            for x in range(max(0, j - k), min(last, j + k) + 1):
                if x != j:
                    updated = updated + fixed[words[x]]
            # Plain dict write-back. The earlier `df.loc[i].current_vector = v`
            # assigned to a temporary row and only reached the frame when
            # pandas happened to return a view.
            current[word] = updated
    result = pd.DataFrame({'vocab': vocabulary})
    result['fixed_vector'] = pd.Series([fixed[word] for word in vocabulary], dtype=object)
    result['current_vector'] = pd.Series([current[word] for word in vocabulary], dtype=object)
    return result

In [9]:
def vectorize_indexing(df,df_vec):
    """Average learned word vectors into one vector per tokenized name.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'id' column and a 'name' column whose cells are lists of
        tokens. Not modified.
    df_vec : pandas.DataFrame
        Word-vector table with columns 'vocab' and 'current_vector'. If a
        word appears more than once, its first row is used.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'id' (unchanged) and 'name' (mean of the tokens'
        ``current_vector`` values). A row with no tokens gets an all-NaN
        vector, which is what 0/0 produced before.

    Raises
    ------
    KeyError
        If a token is not present in ``df_vec['vocab']``.
    """
    # Hash lookup built once, instead of a boolean mask over the whole
    # vocabulary for every token.
    lookup = {}
    for word, vector in zip(df_vec['vocab'], df_vec['current_vector']):
        lookup.setdefault(word, vector)
    # Take the dimensionality from the table; fall back to the historical 300
    dim = len(next(iter(lookup.values()))) if lookup else 300
    sentence_vectors = []
    for tokens in df['name']:
        if len(tokens) == 0:
            sentence_vectors.append(np.full(dim, np.nan))
            continue
        total = np.zeros(dim)
        for token in tokens:
            total += lookup[token]
        sentence_vectors.append(total / len(tokens))
    df_vectorize = df.copy()
    # Whole-column assignment; cell-wise df_vectorize['name'][i] = ... is
    # chained assignment and is not guaranteed to reach the frame.
    df_vectorize['name'] = pd.Series(sentence_vectors, index=df.index, dtype=object)
    df_concat = pd.concat([df['id'], df_vectorize['name']], axis=1)
    return df_concat

In [10]:
def file_to_RI_vector(file,df_vec,clean):
    """Load a product CSV and embed each product name with random-indexing vectors.

    Parameters
    ----------
    file : str
        Path of the product CSV file.
    df_vec : pandas.DataFrame
        Word-vector table passed through to ``vectorize_indexing``.
    clean : str
        Cleaning mode forwarded to ``tokenize``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``vectorize_indexing`` (columns 'id' and 'name').
    """
    # Read in File
    df = read_file(file)
    # Some inputs call the product name 'title'; expose it as 'name'
    if 'title' in df.columns:
        df['name']=df['title'].copy()
    # Clean & Tokenize (only the tokenized name is carried forward)
    temp = tokenize(df,clean)
    df.name = temp.name
    # Vectorize (the stray, result-discarding `data.head()` call was removed)
    data = vectorize_indexing(df,df_vec)
    return data

#### Create Ranked List Functions

In [11]:
def vector_length(a):
    """Return the Euclidean (L2) norm of vector ``a`` as a Python float."""
    return float(np.linalg.norm(np.asarray(a, dtype=float)))
def dot_product(a,b):
    """Return the dot product of two equal-length vectors as a Python float.

    Raises
    ------
    ValueError
        If the vectors differ in length. This used to return the string
        "Vector Length Different", which made ``cos_similarity`` fail later
        with an unrelated TypeError.
    """
    if len(a) != len(b):
        raise ValueError("Vector Length Different")
    # Computed in float64 with numpy rather than a Python-level loop
    return float(np.dot(np.asarray(a, dtype=float), np.asarray(b, dtype=float)))
def cos_similarity(a,b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    If either vector has zero length the similarity is undefined; 0.0 is
    returned in that case instead of dividing by zero.
    """
    numerator = dot_product(a,b)
    norm_product = vector_length(a)*vector_length(b)
    if norm_product == 0:
        return 0.0
    return numerator/norm_product

In [12]:
def top_10_ranking(df1,df2):
    """Rank, for every row of ``df1``, the ten most cosine-similar rows of ``df2``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Each needs an 'id' column and a 'name' column holding one numeric
        vector per row. All vectors must have the same length.

    Returns
    -------
    pandas.DataFrame
        Columns 'df1_id', '1', ..., '10'. Column '1' holds the id of the most
        similar ``df2`` row, '2' the next, and so on. When ``df2`` has fewer
        than ten rows the remaining columns are filled with None.

    Notes
    -----
    Similarities are computed with numpy in float64. A pair involving a
    zero-length vector is given similarity 0.0. NaN similarities sort last.
    Ties keep ``df2`` order.
    """
    top_n = 10
    col_names = ['df1_id'] + [str(rank) for rank in range(1, top_n + 1)]
    query_ids = df1['id'].tolist()
    candidate_ids = df2['id'].tolist()
    queries = [np.asarray(vector, dtype=float) for vector in df1['name']]
    candidates = [np.asarray(vector, dtype=float) for vector in df2['name']]
    if candidates:
        # Stack once so each query is scored with a single matrix-vector product
        candidate_matrix = np.vstack(candidates)
        candidate_norms = np.linalg.norm(candidate_matrix, axis=1)
    rows = []
    for query_id, query in zip(query_ids, queries):
        ranked = []
        if candidates:
            dots = candidate_matrix @ query
            denominators = candidate_norms * np.linalg.norm(query)
            similarities = np.zeros(len(candidates))
            # Entries with a zero denominator keep the preset 0.0
            np.divide(dots, denominators, out=similarities, where=denominators != 0)
            # Descending order; the stable sort keeps df2 order among ties
            best = np.argsort(-similarities, kind='stable')[:top_n]
            ranked = [candidate_ids[position] for position in best]
        # Pad so every row has exactly ten ranking slots
        ranked += [None] * (top_n - len(ranked))
        rows.append([query_id] + ranked)
    # Build the frame once instead of concatenating one row per iteration
    df_ranking = pd.DataFrame(rows, columns=col_names, dtype=object)
    return df_ranking

In [13]:
def true_ranking_loc(df_ranking,mapping,col1,col2):
    """Find at which rank the ground-truth match of each query was returned.

    Parameters
    ----------
    df_ranking : pandas.DataFrame
        Output of ``top_10_ranking``: a 'df1_id' column followed by the ranked
        candidate ids, best first.
    mapping : pandas.DataFrame
        Ground-truth pairs.
    col1, col2 : label
        Columns of ``mapping`` holding the query-side id and the matching
        candidate-side id. When a query id occurs several times, its first
        pair is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'df1_id' and 'Ranked_Location', one row per query that has a
        ground-truth pair. 'Ranked_Location' is the 1-based rank of the true
        match, or 0 when the match is not among the ranked candidates.
        Queries without a ground-truth pair are left out.
    """
    col_names = ['df1_id','Ranked_Location']
    # First ground-truth partner per query id, built once
    truth = {}
    for source_id, target_id in zip(mapping[col1], mapping[col2]):
        truth.setdefault(source_id, target_id)
    query_ids = df_ranking['df1_id'].tolist()
    # Only the ranked candidates are searched. Including the 'df1_id' cell
    # would report rank 0 whenever a query id equals its target id.
    ranked_lists = df_ranking.drop(columns='df1_id').values.tolist()
    rows = []
    for look_up_id, ranked in zip(query_ids, ranked_lists):
        if look_up_id not in truth:
            continue
        mapped_id = truth[look_up_id]
        ranking = ranked.index(mapped_id) + 1 if mapped_id in ranked else 0
        rows.append([look_up_id, ranking])
    # Build the frame once instead of concatenating one row per iteration
    df_rank_loc = pd.DataFrame(rows, columns=col_names)
    return df_rank_loc

#### Scoring Metrics Functions

In [14]:
def hits_at_n(df_rank_loc,n):
    """Fraction of queries whose true match was ranked within the top ``n``.

    Parameters
    ----------
    df_rank_loc : pandas.DataFrame
        Needs a 'Ranked_Location' column: the 1-based rank of the true match,
        or 0 when it was not retrieved.
    n : int
        Rank cut-off.

    Returns
    -------
    float
        Number of rows with ``1 <= rank <= n`` (rank 0 never counts) divided
        by the total number of rows. 0.0 for an empty frame.
    """
    ranks = df_rank_loc['Ranked_Location'].tolist()
    if not ranks:
        return 0.0
    hits = sum(1 for rank in ranks if rank != 0 and rank <= n)
    return hits/len(ranks)

In [15]:
def mean_reciprocal_rank(df_rank_loc):
    """Mean reciprocal rank of the true matches.

    Parameters
    ----------
    df_rank_loc : pandas.DataFrame
        Needs a 'Ranked_Location' column: the 1-based rank of the true match,
        or 0 when it was not retrieved.

    Returns
    -------
    float
        Sum of ``1 / rank`` over rows with a non-zero rank, divided by the
        total number of rows (rank 0 contributes nothing). 0.0 for an empty
        frame.
    """
    ranks = df_rank_loc['Ranked_Location'].tolist()
    if not ranks:
        return 0.0
    reciprocal_sum = sum(1/rank for rank in ranks if rank != 0)
    return reciprocal_sum/len(ranks)

In [16]:
# Source: https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Parameters
    ----------
    r : sequence of numbers
        Relevance scores in rank order (first item = rank 1).
    k : int
        Number of leading results to consider.

    Returns
    -------
    float
        ``sum((2**rel_i - 1) / log2(i + 1))`` over ranks ``i = 1..k``;
        0.0 when there are no scores.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is the
    # documented replacement and behaves the same here.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    """Normalised DCG at ``k``: the DCG of ``r`` divided by the DCG of ``r``
    sorted best-first. Returns 0.0 when that ideal DCG is zero."""
    ideal = dcg_at_k(sorted(r, reverse=True), k)
    return dcg_at_k(r, k) / ideal if ideal else 0.

def ndcg(df_rank_loc,k):
    """Mean NDCG@k over all queries, with one relevant item per query.

    Each 'Ranked_Location' value is expanded into a 0/1 relevance list that
    has a single 1 at the rank of the true match (all zeros for rank 0, i.e.
    not retrieved) and is scored with ``ndcg_at_k``.

    Parameters
    ----------
    df_rank_loc : pandas.DataFrame
        Needs a 'Ranked_Location' column: the 1-based rank of the true match,
        or 0 when it was not retrieved.
    k : int
        Rank cut-off passed to ``ndcg_at_k``.

    Returns
    -------
    float
        Sum of the per-query NDCG values divided by the number of rows;
        0.0 for an empty frame.
    """
    ranks = df_rank_loc['Ranked_Location'].tolist()
    if not ranks:
        return 0.0
    sum_NDCG = 0
    for rank in ranks:
        rank = int(rank)
        # One-hot relevance list generated on demand. It has at least ten
        # slots, as the former lookup table did, and grows when a rank lies
        # beyond ten instead of raising KeyError.
        length = max(10, rank)
        relevance = [1 if position == rank else 0 for position in range(1, length + 1)]
        sum_NDCG += ndcg_at_k(relevance, k)
    NDCG = sum_NDCG/len(ranks)
    return NDCG

#### All in One Function

In [17]:
def all_in_one(file1,file2,mapping,clean):
    """Run the pretrained-embedding pipeline end to end.

    Embeds the product names of ``file1`` and ``file2`` with
    ``file_to_vector`` (cleaning mode ``clean``), ranks the ``file2`` products
    for every ``file1`` product, and looks up where the ground-truth match
    from the CSV at path ``mapping`` landed.

    Returns the frame produced by ``true_ranking_loc``. The first two columns
    of the mapping file are used as the query-side and candidate-side ids.
    """
    # Vectorize both product files, file1 first
    vectors_1, vectors_2 = [file_to_vector(path, clean) for path in (file1, file2)]
    # Ground-truth pairs
    truth = read_file(mapping)
    # Top-10 candidates per file1 product
    ranking = top_10_ranking(vectors_1, vectors_2)
    # Rank at which each true match was returned
    return true_ranking_loc(ranking, truth, truth.columns[0], truth.columns[1])

In [18]:
def all_in_one_tfidf(file1,file2,mapping,clean):
    """Run the TF-IDF pipeline end to end.

    Builds TF-IDF vectors for the product names of both files with
    ``tf_idf_vect``, ranks the ``file2`` products for every ``file1`` product,
    and looks up where the ground-truth match from the CSV at path
    ``mapping`` landed.

    Returns the frame produced by ``true_ranking_loc``. The first two columns
    of the mapping file are used as the query-side and candidate-side ids.
    """
    # TF-IDF vectors for both files
    vectors_1, vectors_2 = tf_idf_vect(file1, file2, clean)
    # Ground-truth pairs
    truth = read_file(mapping)
    # Top-10 candidates per file1 product
    ranking = top_10_ranking(vectors_1, vectors_2)
    # Rank at which each true match was returned
    return true_ranking_loc(ranking, truth, truth.columns[0], truth.columns[1])

In [19]:
def all_in_one_random_index(file1,file2,mapping,clean,k):
    """Run the random-indexing pipeline end to end.

    Learns word vectors with ``random_index_vectors`` (window ``k``, cleaning
    mode ``clean``), embeds the product names of both files with them, ranks
    the ``file2`` products for every ``file1`` product, and looks up where the
    ground-truth match from the CSV at path ``mapping`` landed.

    Returns the frame produced by ``true_ranking_loc``. The first two columns
    of the mapping file are used as the query-side and candidate-side ids.
    """
    # Word vectors learned from both files
    word_vectors = random_index_vectors(file1, file2, clean, k)
    # Vectorize both product files, file1 first
    vectors_1, vectors_2 = [file_to_RI_vector(path, word_vectors, clean) for path in (file1, file2)]
    # Ground-truth pairs
    truth = read_file(mapping)
    # Top-10 candidates per file1 product
    ranking = top_10_ranking(vectors_1, vectors_2)
    # Rank at which each true match was returned
    return true_ranking_loc(ranking, truth, truth.columns[0], truth.columns[1])

In [39]:
def all_in_one_random_index_pretrain(file1,file2,mapping,clean,k):
    """Run the pretrained-seeded random-indexing pipeline end to end.

    Learns word vectors with ``random_index_pretrain`` (window ``k``, cleaning
    mode ``clean``), embeds the product names of both files with them, ranks
    the ``file2`` products for every ``file1`` product, and looks up where the
    ground-truth match from the CSV at path ``mapping`` landed.

    Returns the frame produced by ``true_ranking_loc``. The first two columns
    of the mapping file are used as the query-side and candidate-side ids.
    """
    # Word vectors learned from both files, starting from pretrained embeddings
    word_vectors = random_index_pretrain(file1, file2, clean, k)
    # Vectorize both product files, file1 first
    vectors_1, vectors_2 = [file_to_RI_vector(path, word_vectors, clean) for path in (file1, file2)]
    # Ground-truth pairs
    truth = read_file(mapping)
    # Top-10 candidates per file1 product
    ranking = top_10_ranking(vectors_1, vectors_2)
    # Rank at which each true match was returned
    return true_ranking_loc(ranking, truth, truth.columns[0], truth.columns[1])

## Code

### Abt vs Buy (Raw)

In [46]:
# Abt vs Buy, clean='Raw', window k=1: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',1)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.21369102682701202
Mean Reciprocal Rank:  0.13071964524323446
NDCG at 10 Score:  0.15025430174610463


In [47]:
# Abt vs Buy, clean='Raw', window k=2: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',2)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.23126734505087881
Mean Reciprocal Rank:  0.1382167452241458
NDCG at 10 Score:  0.1602369537689421


In [48]:
# Abt vs Buy, clean='Raw', window k=3: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',3)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.23866790009250693
Mean Reciprocal Rank:  0.14107858684639438
NDCG at 10 Score:  0.16406702278040936


In [49]:
# Abt vs Buy, clean='Raw', window k=4: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',4)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.24421831637372804
Mean Reciprocal Rank:  0.14285016812768886
NDCG at 10 Score:  0.16680394945336943


In [50]:
# Abt vs Buy, clean='Raw', window k=5: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',5)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.2432932469935245
Mean Reciprocal Rank:  0.13990646520711264
NDCG at 10 Score:  0.16430802213966478


In [51]:
# Abt vs Buy, clean='Raw', window k=6: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw',6)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.2506938020351526
Mean Reciprocal Rank:  0.14471278798290815
NDCG at 10 Score:  0.16970186344309948


### Abt vs Buy (Cleaned No Space)

In [52]:
# Abt vs Buy, clean='Nspace' (symbols deleted), window k=1: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',1)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.3524514338575393
Mean Reciprocal Rank:  0.2312254966741553
NDCG at 10 Score:  0.26026497025946704


In [53]:
# Abt vs Buy, clean='Nspace' (symbols deleted), window k=2: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',2)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.3792784458834413
Mean Reciprocal Rank:  0.24607689822768442
NDCG at 10 Score:  0.27753623912822534


In [54]:
# Abt vs Buy, clean='Nspace' (symbols deleted), window k=3: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',3)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.3885291396854764
Mean Reciprocal Rank:  0.24693699249665924
NDCG at 10 Score:  0.2806232914248817


In [55]:
# Abt vs Buy, clean='Nspace' (symbols deleted), window k=4: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',4)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.393154486586494
Mean Reciprocal Rank:  0.2511078807101008
NDCG at 10 Score:  0.2850047501671032


In [56]:
# Abt vs Buy, clean='Nspace' (symbols deleted), window k=5: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',5)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.4024051803885291
Mean Reciprocal Rank:  0.2574526819670203
NDCG at 10 Score:  0.29198838227407803


In [57]:
# Abt vs Buy, clean='Nspace' (symbols deleted), window k=6: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace',6)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.40795559666975023
Mean Reciprocal Rank:  0.2624245627945903
NDCG at 10 Score:  0.29713167824644104


### Abt vs Buy (Cleaned With Space)

In [89]:
# Abt vs Buy, clean='Wspace' (symbols replaced by spaces), window k=1: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',1)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.3098982423681776
Mean Reciprocal Rank:  0.2006148774650161
NDCG at 10 Score:  0.2266414870284118


In [90]:
# Abt vs Buy, clean='Wspace' (symbols replaced by spaces), window k=2: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',2)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.33117483811285847
Mean Reciprocal Rank:  0.21369065973598797
NDCG at 10 Score:  0.24155926491584073


In [91]:
# Abt vs Buy, clean='Wspace' (symbols replaced by spaces), window k=3: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',3)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.34875115633672527
Mean Reciprocal Rank:  0.2141722831593321
NDCG at 10 Score:  0.24598225967247275


In [92]:
# Abt vs Buy, clean='Wspace' (symbols replaced by spaces), window k=4: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',4)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.36262719703977797
Mean Reciprocal Rank:  0.2241076017209226
NDCG at 10 Score:  0.2567785660812225


In [93]:
# Abt vs Buy, clean='Wspace' (symbols replaced by spaces), window k=5: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',5)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.3774283071230342
Mean Reciprocal Rank:  0.23092741876275624
NDCG at 10 Score:  0.2657246978446282


In [94]:
# Abt vs Buy, clean='Wspace' (symbols replaced by spaces), window k=6: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace',6)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.3894542090656799
Mean Reciprocal Rank:  0.2341666299575641
NDCG at 10 Score:  0.2710223447968756


### Amazon vs Google (Raw)

In [95]:
# Amazon vs GoogleProducts, clean='Raw', window k=1: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',1)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.24438454627133874
Mean Reciprocal Rank:  0.18469815599195652
NDCG at 10 Score:  0.1988859082009485


In [96]:
# Amazon vs GoogleProducts, clean='Raw', window k=2: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',2)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.22821203953279426
Mean Reciprocal Rank:  0.17704473823072203
NDCG at 10 Score:  0.18935002785098914


In [97]:
# Amazon vs GoogleProducts, clean='Raw', window k=3: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',3)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.22641509433962265
Mean Reciprocal Rank:  0.1718496413240348
NDCG at 10 Score:  0.1848659295164695


In [98]:
# Amazon vs GoogleProducts, clean='Raw', window k=4: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',4)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.22641509433962265
Mean Reciprocal Rank:  0.17272137366476983
NDCG at 10 Score:  0.1853699276046824


In [99]:
# Amazon vs GoogleProducts, clean='Raw', window k=5: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',5)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.22192273135669363
Mean Reciprocal Rank:  0.17173626263922756
NDCG at 10 Score:  0.18369354066261206


In [100]:
# Amazon vs GoogleProducts, clean='Raw', window k=6: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw',6)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.22461814914645103
Mean Reciprocal Rank:  0.17109164420485168
NDCG at 10 Score:  0.18377377043815704


### Amazon vs Google (Cleaned No Space)

In [101]:
# Amazon vs GoogleProducts, clean='Nspace' (symbols deleted), window k=1: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',1)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.3423180592991914
Mean Reciprocal Rank:  0.25425348336399545
NDCG at 10 Score:  0.27528841076667154


In [102]:
# Amazon vs GoogleProducts, clean='Nspace' (symbols deleted), window k=2: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',2)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.344115004492363
Mean Reciprocal Rank:  0.2539949942241047
NDCG at 10 Score:  0.2753615542729341


In [103]:
# Amazon vs GoogleProducts, clean='Nspace' (symbols deleted), window k=3: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',3)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.3468104222821204
Mean Reciprocal Rank:  0.25312754032430584
NDCG at 10 Score:  0.27533791770382293


In [104]:
# Amazon vs GoogleProducts, clean='Nspace' (symbols deleted), window k=4: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',4)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.3387241689128482
Mean Reciprocal Rank:  0.2508503401360543
NDCG at 10 Score:  0.27192096047660025


In [105]:
# Amazon vs GoogleProducts, clean='Nspace' (symbols deleted), window k=5: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',5)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.33692722371967654
Mean Reciprocal Rank:  0.25223441577889016
NDCG at 10 Score:  0.27249428795534053


In [106]:
# Amazon vs GoogleProducts, clean='Nspace' (symbols deleted), window k=6: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace',6)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.33692722371967654
Mean Reciprocal Rank:  0.2507940073303954
NDCG at 10 Score:  0.2713362521921128


### Amazon vs Google (Cleaned With Space)

In [107]:
# Amazon vs GoogleProducts, clean='Wspace' (symbols replaced by spaces), window k=1: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',1)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.3279424977538185
Mean Reciprocal Rank:  0.23884111296510216
NDCG at 10 Score:  0.2600963861448355


In [108]:
# Amazon vs GoogleProducts, clean='Wspace' (symbols replaced by spaces), window k=2: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',2)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.3189577717879605
Mean Reciprocal Rank:  0.23888389737446342
NDCG at 10 Score:  0.2579485358475297


In [109]:
# Amazon vs GoogleProducts, clean='Wspace' (symbols replaced by spaces), window k=3: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',3)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.31805929919137466
Mean Reciprocal Rank:  0.2366865614170196
NDCG at 10 Score:  0.255988127282281


In [110]:
# Amazon vs GoogleProducts, clean='Wspace' (symbols replaced by spaces), window k=4: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',4)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.3054806828391734
Mean Reciprocal Rank:  0.2344567806158102
NDCG at 10 Score:  0.25134531442323005


In [111]:
# Amazon vs GoogleProducts, clean='Wspace' (symbols replaced by spaces), window k=5: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',5)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.29649595687331537
Mean Reciprocal Rank:  0.23158131176999097
NDCG at 10 Score:  0.24713229044115934


In [112]:
# Amazon vs GoogleProducts, clean='Wspace' (symbols replaced by spaces), window k=6: run the pretrained random-indexing pipeline and print Hit Rate@10, MRR and NDCG@10.
df_rank_loc = all_in_one_random_index_pretrain('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace',6)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.2973944294699012
Mean Reciprocal Rank:  0.23025820391049495
NDCG at 10 Score:  0.24637395256534306


### Conclusion 

|  Hit Rate at 10      | K = 1 | K = 2 | K = 3  | K = 4 | K = 5 | K = 6 |
| ------------------------     | ---   | ---     |   --- | --- | --- | --- |
| Abt_buy (Raw)                         | 0.214 |  0.231  | 0.239 | 0.244 | 0.243 | 0.251 |
| Abt_buy (Cleaned No Space)            | 0.352 |  0.379  | 0.389 | 0.393 | 0.402 | 0.408 |
| Abt_buy (Cleaned With Space)          | 0.310 |  0.331  | 0.349 | 0.363 | 0.377 | 0.389 |
| Amazon_Google (Raw)                   | 0.244 |  0.228  | 0.226 | 0.226 | 0.222 | 0.225 |
| Amazon_Google (Cleaned No Space)      | 0.342 |  0.344  | 0.347 | 0.339 | 0.337 | 0.337 |
| Amazon_Google (Cleaned With Space)    | 0.328 |  0.319  | 0.318 | 0.305 | 0.296 | 0.297 |

|  Mean Reciprocal Rank      | K = 1 | K = 2 | K = 3  | K = 4 | K = 5 | K = 6 |
| ------------------------     | ---   | ---     |   --- | --- | --- | --- |
| Abt_buy (Raw)                         | 0.131 |  0.138  | 0.141 | 0.143 | 0.140 | 0.145 |
| Abt_buy (Cleaned No Space)            | 0.231 |  0.246  | 0.247 | 0.251 | 0.257 | 0.262 |
| Abt_buy (Cleaned With Space)          | 0.201 |  0.214  | 0.214 | 0.224 | 0.231 | 0.234 |
| Amazon_Google (Raw)                   | 0.185 |  0.177  | 0.172 | 0.173 | 0.172 | 0.171 |
| Amazon_Google (Cleaned No Space)      | 0.254 |  0.254  | 0.253 | 0.251 | 0.252 | 0.251 |
| Amazon_Google (Cleaned With Space)    | 0.239 |  0.239  | 0.237 | 0.234 | 0.232 | 0.230 |

|  NDCG Score at 10      | K = 1 | K = 2 | K = 3  | K = 4 | K = 5 | K = 6 |
| ------------------------     | ---   | ---     |   --- | --- | --- | --- |
| Abt_buy (Raw)                         | 0.150 |  0.160  | 0.164 | 0.167 | 0.164 | 0.170 |
| Abt_buy (Cleaned No Space)            | 0.260 |  0.278  | 0.281 | 0.285 | 0.292 | 0.297 |
| Abt_buy (Cleaned With Space)          | 0.227 |  0.242  | 0.246 | 0.257 | 0.266 | 0.271 |
| Amazon_Google (Raw)                   | 0.199 |  0.189  | 0.185 | 0.185 | 0.184 | 0.184 |
| Amazon_Google (Cleaned No Space)      | 0.275 |  0.275  | 0.275 | 0.272 | 0.272 | 0.271 |
| Amazon_Google (Cleaned With Space)    | 0.260 |  0.258  | 0.256 | 0.251 | 0.247 | 0.246 |