In [1]:
import csv
import re
import numpy as np
import pandas as pd 
import math
from random import randint
import fasttext
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

### Pre-Trained CBOW

In [2]:
# Load Facebook FastText Pre-trained Model
# The path is relative, so 'cc.en.300.bin' must be in the working directory.
# `model` is a notebook-level global: vectorize() and random_index_pretrain()
# read it directly instead of taking it as a parameter.
# NOTE(review): the file name suggests 300-dimensional vectors, which matches
# the 300 hard-coded elsewhere in this notebook - confirm if the model changes.
model = fasttext.load_model('cc.en.300.bin')




## Function Definitions
#### Preprocessing Functions

In [3]:
def read_file(filename):
    """Load a comma-separated file into a DataFrame of strings.

    The first row supplies the column names and every later row becomes one
    record. Values are kept exactly as ``csv.reader`` yields them (strings).

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        One column per header field; empty (but with columns) when the file
        holds only a header row.

    Raises
    ------
    ValueError
        If the file contains no rows at all, not even a header.
    """
    # newline='' hands newline handling to the csv module, so line breaks
    # embedded inside quoted fields are preserved instead of being translated.
    with open(filename,'r',newline='') as f:
        rows = list(csv.reader(f, delimiter=','))
    if not rows:
        raise ValueError("CSV file has no header row: {}".format(filename))
    # Header & Data
    col_name = rows[0]
    data_val = rows[1:]
    # Convert to DataFrame
    return pd.DataFrame(data_val, columns=col_name)

In [4]:
def tokenize(data,clean):
    """Clean and tokenize the 'name' and 'description' columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain string columns 'name' and 'description'. Not modified.
    clean : str
        'Raw'    - tokenize the text as-is.
        'Nspace' - delete every character that is neither a word character
                   nor whitespace before tokenizing.
        'Wspace' - replace each such character with a single space instead.
        Any other value is treated like 'Raw'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which 'name' and 'description' hold lists of
        tokens produced by ``word_tokenize``.
    """
    if clean=='Nspace':
        replacement = ''
    elif clean=='Wspace':
        replacement = ' '
    else:
        # 'Raw' (and any unrecognized mode): leave the text untouched
        replacement = None

    def _to_tokens(text):
        # Strip/replace symbols if requested, then split into words
        if replacement is not None:
            text = re.sub(r'[^\w\s]',replacement,text)
        # Drop every empty token, not just the first one
        return [token for token in word_tokenize(text) if token!='']

    df = data.copy()
    # Build each column in one go. Element-wise chained assignment
    # (df['name'][i] = ...) writes to a temporary under pandas Copy-on-Write
    # and would leave the column unchanged.
    df['name'] = pd.Series([_to_tokens(x) for x in data['name']],
                           index=df.index, dtype=object)
    df['description'] = pd.Series([_to_tokens(x) for x in data['description']],
                                  index=df.index, dtype=object)
    return df

In [5]:
def vectorize(df):
    """Turn each tokenized name into one averaged word-embedding vector.

    Every token of ``df['name']`` is looked up in the notebook-level FastText
    ``model`` and the resulting word vectors are averaged per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'id' column and a 'name' column holding lists of tokens.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'id' (unchanged) and 'name' (one numpy vector per row).
        A row with no tokens gets an all-zero vector of the model dimension.
    """
    sentence_vectors = []
    for tokens in df['name']:
        if len(tokens) > 0:
            word_vectors = [model.get_word_vector(x) for x in tokens]
            sentence_vectors.append(np.average(word_vectors, axis=0))
        else:
            # np.average([]) would yield a scalar NaN, not a vector
            sentence_vectors.append(np.zeros(model.get_dimension()))
    # Assemble the column at once; element-wise chained assignment
    # (frame['name'][i] = ...) is not reliable under pandas Copy-on-Write.
    vectors = pd.Series(sentence_vectors, index=df.index, dtype=object, name='name')
    return pd.concat([df['id'], vectors], axis=1)

In [6]:
def file_to_vector(file,clean):
    """Read a product CSV and return one averaged embedding per product name.

    Steps: load the file, expose a 'title' column under the name 'name' when
    present, tokenize with the requested cleaning mode, then vectorize.

    Parameters
    ----------
    file : str
        Path of the CSV file.
    clean : str
        Cleaning mode forwarded to ``tokenize``.

    Returns
    -------
    The DataFrame produced by ``vectorize`` for the tokenized names.
    """
    frame = read_file(file)
    # Some files call the product name 'title'; mirror it into 'name'
    if 'title' in frame.columns:
        frame['name'] = frame['title'].copy()
    # Only the tokenized names are carried over; the other columns stay raw
    frame['name'] = tokenize(frame,clean)['name']
    return vectorize(frame)

#### TF-IDF

In [7]:
def create_corpus(df,df2,clean):
    """Build a text corpus from the names and descriptions of two frames.

    Both frames are modified in place: a 'name' column is created from
    'title' when 'title' exists, and a 'combine' column holding
    "<name> <description>" is added.

    Parameters
    ----------
    df, df2 : pandas.DataFrame
        Frames with 'description' and either 'name' or 'title'.
    clean : str
        'Nspace' removes symbols, 'Wspace' replaces them with a space,
        anything else leaves the text untouched.

    Returns
    -------
    list of str
        Combined texts of ``df`` followed by those of ``df2``.
    """
    frames = (df, df2)
    # Expose 'title' under the name 'name'
    for frame in frames:
        if 'title' in frame.columns:
            frame['name'] = frame['title'].copy()
    # Combine name and description text
    for frame in frames:
        frame['combine'] = frame['name'] + ' ' + frame['description']
    # Pick the replacement used for symbols (None means: keep text raw)
    if clean=='Nspace':
        fill = ''
    elif clean=='Wspace':
        fill = ' '
    else:
        fill = None
    # Collect the (optionally cleaned) texts, first frame first
    corpus = []
    for frame in frames:
        for row in range(len(frame['combine'])):
            text = frame['combine'][row]
            if fill is not None:
                text = re.sub(r'[^\w\s]',fill,text)
            corpus.append(text)
    return corpus

In [8]:
def tf_idf_vect(file1,file2,clean):
    """Represent the product names of two files as TF-IDF vectors.

    A ``TfidfVectorizer`` limited to 300 features is fitted on the combined
    name + description corpus of both files, then used to transform the
    product names of each file.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    clean : str
        Cleaning mode ('Raw', 'Nspace' or 'Wspace'). It is applied both to
        the training corpus and to the names being transformed, so the two
        are tokenized consistently.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(frame1, frame2)``, each with columns 'id' and 'name', where
        'name' holds one dense TF-IDF vector per product.
    """
    # Read in Files
    df = read_file(file1)
    df2 = read_file(file2)
    # create_corpus also adds the 'name' column when a file uses 'title'
    corpus = create_corpus(df,df2,clean)
    # Fit TF-IDF Model
    vectorizer = TfidfVectorizer(max_features=300)
    tfidf_model = vectorizer.fit(corpus)

    def _clean(text):
        # Same symbol handling that create_corpus applied to the corpus
        if clean=='Nspace':
            return re.sub(r'[^\w\s]','',text)
        if clean=='Wspace':
            return re.sub(r'[^\w\s]',' ',text)
        return text

    def _embed(frame):
        texts = [_clean(text) for text in frame['name']]
        if not texts:
            return []
        # One batched transform instead of one call per row
        return list(tfidf_model.transform(texts).toarray())

    # Generate TF-IDF Vectors for each product
    df['name'] = _embed(df)
    df2['name'] = _embed(df2)

    return df[['id','name']], df2[['id','name']]

#### Random Indexing

In [9]:
def random_index_vectors(file1,file2,clean,k,seed=None):
    """Train random-indexing word vectors on the corpus of two product files.

    Every vocabulary word gets a fixed 300-dimensional index vector of 0/1
    entries (1 where a standard-normal draw is >= 0). A word's context vector
    starts as its own index vector; each time the word occurs, the index
    vectors of the words within ``k`` positions on either side are added.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    clean : str
        Cleaning mode forwarded to ``create_corpus``.
    k : int
        Context window size (words to the left and to the right).
    seed : int, optional
        Seed for the index-vector draws. ``None`` (default) uses numpy's
        global random state, as before.

    Returns
    -------
    pandas.DataFrame
        Columns 'vocab', 'fixed_vector' (index vectors) and 'current_vector'
        (accumulated context vectors), one row per vocabulary word.
    """
    # Read in Files
    df = read_file(file1)
    df2 = read_file(file2)
    # Create Corpus
    corpus = create_corpus(df,df2,clean)
    # Tokenize each line once; reused for the vocabulary and the training pass
    tokenized = [word_tokenize(line) for line in corpus]
    # Sorted so that row order, and the draw each word receives, do not
    # depend on set iteration order
    vocabulary = sorted(set(word for words in tokenized for word in words))
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Fixed index vectors, kept in a dict for O(1) lookup per token
    fixed = {}
    for word in vocabulary:
        fixed[word] = np.array([1 if g>=0 else 0 for g in rng.normal(size=(300))])
    # Context vectors start out as the index vectors
    current = dict(fixed)
    for words in tokenized:
        last = len(words)-1
        for j, word in enumerate(words):
            updated = current[word]
            # Neighbours inside the window, clipped to the line boundaries
            for x in range(max(0,j-k), min(last,j+k)+1):
                if x!=j:
                    # '+' allocates a new array, so fixed vectors stay intact
                    updated = updated + fixed[words[x]]
            # Store the result explicitly. Writing through df.loc[i].col = ...
            # only updates a temporary row and can be lost.
            current[word] = updated
    # Assemble the result frame
    result = pd.DataFrame()
    result['vocab'] = vocabulary
    result['fixed_vector'] = [fixed[word] for word in vocabulary]
    result['current_vector'] = [current[word] for word in vocabulary]
    return result

In [10]:
def random_index_pretrain(file1,file2,clean,k):
    """Random indexing seeded with pre-trained FastText word vectors.

    Each vocabulary word's fixed vector is its embedding from the
    notebook-level ``model``. A word's context vector starts as that
    embedding; each time the word occurs, the fixed vectors of the words
    within ``k`` positions on either side are added.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    clean : str
        Cleaning mode forwarded to ``create_corpus``.
    k : int
        Context window size (words to the left and to the right).

    Returns
    -------
    pandas.DataFrame
        Columns 'vocab', 'fixed_vector' (pre-trained embeddings) and
        'current_vector' (accumulated context vectors), one row per word.
    """
    # Read in Files
    df = read_file(file1)
    df2 = read_file(file2)
    # Create Corpus
    corpus = create_corpus(df,df2,clean)
    # Tokenize each line once; reused for the vocabulary and the training pass
    tokenized = [word_tokenize(line) for line in corpus]
    # Sorted so the row order of the result is deterministic
    vocabulary = sorted(set(word for words in tokenized for word in words))
    # Fixed vectors from the pre-trained model, in a dict for O(1) lookup
    fixed = {}
    for word in vocabulary:
        fixed[word] = model.get_word_vector(word)
    # Context vectors start out as the pre-trained vectors
    current = dict(fixed)
    for words in tokenized:
        last = len(words)-1
        for j, word in enumerate(words):
            updated = current[word]
            # Neighbours inside the window, clipped to the line boundaries
            for x in range(max(0,j-k), min(last,j+k)+1):
                if x!=j:
                    # '+' allocates a new array, so fixed vectors stay intact
                    updated = updated + fixed[words[x]]
            # Store the result explicitly. Writing through df.loc[i].col = ...
            # only updates a temporary row and can be lost.
            current[word] = updated
    # Assemble the result frame
    result = pd.DataFrame()
    result['vocab'] = vocabulary
    result['fixed_vector'] = [fixed[word] for word in vocabulary]
    result['current_vector'] = [current[word] for word in vocabulary]
    return result

In [11]:
def vectorize_indexing(df,df_vec):
    """Average the random-indexing vectors of each tokenized product name.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'id' column and a 'name' column holding lists of tokens.
    df_vec : pandas.DataFrame
        Word table with columns 'vocab' and 'current_vector'.

    Returns
    -------
    pandas.DataFrame
        Columns 'id' and 'name', where 'name' is the mean of the
        'current_vector' entries of the row's tokens. Tokens missing from
        ``df_vec`` are skipped; a row with no known token gets a zero vector.
    """
    # Word -> vector table built once (first occurrence wins), replacing a
    # boolean-mask scan of df_vec for every single token
    lookup = {}
    for word, vector in zip(df_vec['vocab'], df_vec['current_vector']):
        lookup.setdefault(word, vector)
    # Vector size comes from the table; 300 is the fallback for an empty table
    dim = len(next(iter(lookup.values()))) if lookup else 300
    sentence_vectors = []
    for tokens in df['name']:
        total = np.zeros(dim)
        found = 0
        for token in tokens:
            vector = lookup.get(token)
            if vector is None:
                # Token was never seen when the word table was built
                continue
            total += vector
            found += 1
        # Avoid 0/0 for empty or fully out-of-vocabulary names
        sentence_vectors.append(total/found if found else total)
    # Assemble the column at once; element-wise chained assignment
    # (frame['name'][i] = ...) is not reliable under pandas Copy-on-Write.
    vectors = pd.Series(sentence_vectors, index=df.index, dtype=object, name='name')
    return pd.concat([df['id'], vectors], axis=1)

In [12]:
def file_to_RI_vector(file,df_vec,clean):
    """Read a product CSV and embed each name with random-indexing vectors.

    Parameters
    ----------
    file : str
        Path of the CSV file.
    df_vec : pandas.DataFrame
        Word-vector table passed on to ``vectorize_indexing``.
    clean : str
        Cleaning mode forwarded to ``tokenize``.

    Returns
    -------
    The DataFrame produced by ``vectorize_indexing`` for the tokenized names.
    """
    # Read in File
    df = read_file(file)
    # Some files call the product name 'title'; mirror it into 'name'
    if 'title' in df.columns:
        df['name']=df['title'].copy()
    # Clean & Tokenize
    temp = tokenize(df,clean)
    df.name = temp.name
    # Vectorize (the stray data.head() call, whose result was discarded,
    # has been dropped)
    data = vectorize_indexing(df,df_vec)
    return data

#### Create Ranked List Functions

In [13]:
def vector_length(a):
    """Return the Euclidean length of vector ``a`` as a float."""
    if not hasattr(a, '__len__'):
        # Accept plain iterables (e.g. generators) as well
        a = list(a)
    arr = np.asarray(a, dtype=float).ravel()
    return float(math.sqrt(np.dot(arr, arr)))
def dot_product(a,b):
    """Return the dot product of two equally long vectors.

    Raises
    ------
    ValueError
        If the vectors differ in length. (A string used to be returned here,
        which made callers fail later with an unrelated TypeError.)
    """
    x = np.asarray(a, dtype=float).ravel()
    y = np.asarray(b, dtype=float).ravel()
    if x.shape != y.shape:
        raise ValueError("Vector Length Different: {} vs {}".format(x.size, y.size))
    return float(np.dot(x, y))
def cos_similarity(a,b):
    """Return the cosine similarity of ``a`` and ``b``.

    Returns 0.0 when either vector has zero length, since the cosine is
    undefined there and 0 ranks such a pair as unrelated.
    """
    # Computed first so mismatched lengths are always reported
    numerator = dot_product(a,b)
    denominator = vector_length(a)*vector_length(b)
    if denominator == 0:
        return 0.0
    return numerator/denominator

In [14]:
def top_10_ranking(df1,df2):
    """Rank the ten most similar ``df2`` products for every ``df1`` product.

    Similarity is the cosine between the vectors stored in the 'name'
    columns. Zero-length vectors score 0 against everything.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Each needs an 'id' column and a 'name' column of equally sized
        numeric vectors.

    Returns
    -------
    pandas.DataFrame
        Columns 'df1_id', '1', ..., '10'. Column 'n' holds the id of the
        n-th most similar ``df2`` row; ties keep ``df2`` order. When ``df2``
        has fewer than ten rows the remaining columns are None.
    """
    # Initialize Ranking Chart
    col_names = ['df1_id','1','2','3','4','5','6','7','8','9','10']
    top_n = len(col_names)-1
    # Unit-normalize both sides once; cosine similarity is then a dot product
    queries = _unit_rows(df1['name'])
    candidates = _unit_rows(df2['name'])
    candidate_ids = list(df2['id'])
    rows = []
    for query_id, query in zip(df1['id'], queries):
        if candidate_ids:
            # Similarity of this query against every candidate at once
            sims = candidates.dot(query)
            # Descending order; stable so equal scores keep df2 order
            best = np.argsort(-sims, kind='stable')[:top_n]
            ranked_ids = [candidate_ids[pos] for pos in best]
        else:
            ranked_ids = []
        # Pad short rankings so every row has exactly ten slots
        ranked_ids += [None]*(top_n-len(ranked_ids))
        rows.append([query_id]+ranked_ids)
    # Build the frame once instead of concatenating inside the loop
    return pd.DataFrame(rows, columns=col_names)

def _unit_rows(vectors):
    # Stack vectors into a 2-D float array and scale every row to unit length
    stacked = [np.asarray(vector, dtype=float).ravel() for vector in vectors]
    if not stacked:
        return np.zeros((0, 0))
    matrix = np.vstack(stacked)
    norms = np.linalg.norm(matrix, axis=1)
    # Zero rows stay zero (similarity 0) instead of dividing by zero
    norms[norms == 0] = 1.0
    return matrix/norms[:, None]

In [15]:
def true_ranking_loc(df_ranking,mapping,col1,col2):
    """Find where each product's ground-truth match sits in its top-10 list.

    Parameters
    ----------
    df_ranking : pandas.DataFrame
        Output of ``top_10_ranking``: a 'df1_id' column plus the ranked ids.
    mapping : pandas.DataFrame
        Ground-truth pairs.
    col1, col2 : str
        Names of the ``mapping`` columns holding the df1 ids and their
        matching ids. When a df1 id appears more than once, only its first
        pair is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'df1_id' and 'Ranked_Location' (1-based rank of the true
        match, or 0 when it is not in the list). Products that have no
        ground-truth pair are left out.
    """
    col_names = ['df1_id','Ranked_Location']
    # First mapped id per df1 id, built once instead of masking per row
    first_match = {}
    for source_id, target_id in zip(mapping[col1], mapping[col2]):
        first_match.setdefault(source_id, target_id)
    # Search only the ranked ids. Including the 'df1_id' column would report
    # "not found" whenever a product's own id equals its mapped id.
    ranked_lists = df_ranking.drop(columns=['df1_id']).values.tolist()
    records = []
    for look_up_id, ranked_ids in zip(df_ranking['df1_id'], ranked_lists):
        # Check if product is in Perfect Mapping
        if look_up_id not in first_match:
            continue
        mapped_id = first_match[look_up_id]
        # Check if ground truth in ranking list and note its location
        if mapped_id in ranked_ids:
            ranking = ranked_ids.index(mapped_id)+1
        else:
            ranking = 0
        records.append([look_up_id, ranking])
    # Build the frame once instead of concatenating inside the loop
    return pd.DataFrame(records, columns=col_names)

#### Scoring Metrics Functions

In [16]:
def hits_at_n(df_rank_loc,n):
    """Fraction of queries whose true match is ranked within the top ``n``.

    Parameters
    ----------
    df_rank_loc : pandas.DataFrame
        Needs a 'Ranked_Location' column; 0 means "not in the ranked list".
    n : int
        Rank cut-off.

    Returns
    -------
    float
        Hits divided by the number of rows; 0.0 for an empty frame.
    """
    # Iterate by position so a non-default index cannot break the lookup
    ranks = list(df_rank_loc['Ranked_Location'])
    if not ranks:
        # No queries: avoid dividing by zero
        return 0.0
    hits = sum(1 for rank in ranks if rank!=0 and rank<=n)
    return hits/len(ranks)

In [17]:
def mean_reciprocal_rank(df_rank_loc):
    """Mean of 1/rank over all queries, counting unranked matches as 0.

    Parameters
    ----------
    df_rank_loc : pandas.DataFrame
        Needs a 'Ranked_Location' column; 0 means "not in the ranked list".

    Returns
    -------
    float
        Sum of reciprocal ranks divided by the number of rows; 0.0 for an
        empty frame.
    """
    # Iterate by position so a non-default index cannot break the lookup
    ranks = list(df_rank_loc['Ranked_Location'])
    if not ranks:
        # No queries: avoid dividing by zero
        return 0.0
    reciprocal_sum = sum(1/rank for rank in ranks if rank!=0)
    return reciprocal_sum/len(ranks)

In [18]:
# Source: https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Uses gain ``2**rel - 1`` and discount ``log2(position + 1)`` with
    1-based positions. Returns 0. for an empty list.
    """
    # np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is equivalent
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    """DCG at ``k`` divided by the DCG of the ideal (descending) ordering.

    Returns 0. when the ideal DCG is 0, i.e. nothing relevant is present.
    """
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

def ndcg(df_rank_loc,k):
    """Mean NDCG at ``k`` over all queries.

    Each query has at most one relevant item: its 'Ranked_Location' (1-based,
    0 meaning "not ranked") is expanded into a one-hot relevance list before
    scoring.

    Parameters
    ----------
    df_rank_loc : pandas.DataFrame
        Needs a 'Ranked_Location' column.
    k : int
        Rank cut-off passed to ``ndcg_at_k``.

    Returns
    -------
    float
        Sum of per-query NDCG divided by the number of rows; 0.0 for an
        empty frame.
    """
    ranks = list(df_rank_loc['Ranked_Location'])
    if not ranks:
        # No queries: avoid dividing by zero
        return 0.0
    sum_NDCG = 0
    for rank in ranks:
        rank = int(rank)
        # One-hot relevance list, at least ten slots long as before, built
        # on demand so ranks above 10 no longer raise KeyError
        relevance = [0]*max(10, rank)
        if rank>0:
            relevance[rank-1] = 1
        sum_NDCG += ndcg_at_k(relevance, k)
    return sum_NDCG/len(ranks)

#### All in One Function

In [19]:
def all_in_one(file1,file2,mapping,clean):
    """Run the pre-trained-embedding matching pipeline end to end.

    Both product files are vectorized with ``file_to_vector``, every product
    of ``file1`` gets a top-10 list of ``file2`` products, and the position
    of the ground-truth match in that list is looked up.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    mapping : str
        Path of the ground-truth CSV; its first two columns are used as the
        file1 id and the matching file2 id.
    clean : str
        Cleaning mode forwarded to ``file_to_vector``.

    Returns
    -------
    The DataFrame returned by ``true_ranking_loc``.
    """
    # Vectorize both files, first file first
    vectors_a = file_to_vector(file1,clean)
    vectors_b = file_to_vector(file2,clean)
    # Ground-truth pairs
    truth = read_file(mapping)
    # Top-10 candidates per product, then locate the true match
    ranked = top_10_ranking(vectors_a,vectors_b)
    id_col, match_col = truth.columns[0], truth.columns[1]
    return true_ranking_loc(ranked,truth,id_col,match_col)

In [20]:
def all_in_one_tfidf(file1,file2,mapping,clean):
    """Run the TF-IDF matching pipeline end to end.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    mapping : str
        Path of the ground-truth CSV; its first two columns are used as the
        file1 id and the matching file2 id.
    clean : str
        Cleaning mode forwarded to ``tf_idf_vect``.

    Returns
    -------
    The DataFrame returned by ``true_ranking_loc``.
    """
    # TF-IDF vectors for both files
    vectors_a, vectors_b = tf_idf_vect(file1,file2,clean)
    # Ground-truth pairs
    truth = read_file(mapping)
    # Top-10 candidates per product, then locate the true match
    ranked = top_10_ranking(vectors_a,vectors_b)
    id_col, match_col = truth.columns[0], truth.columns[1]
    return true_ranking_loc(ranked,truth,id_col,match_col)

In [21]:
def all_in_one_random_index(file1,file2,mapping,clean,k):
    """Run the random-indexing matching pipeline end to end.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    mapping : str
        Path of the ground-truth CSV; its first two columns are used as the
        file1 id and the matching file2 id.
    clean : str
        Cleaning mode used for training and for vectorizing.
    k : int
        Context window size forwarded to ``random_index_vectors``.

    Returns
    -------
    The DataFrame returned by ``true_ranking_loc``.
    """
    # Word vectors trained on both files
    word_table = random_index_vectors(file1,file2,clean,k)
    # Product vectors, first file first
    vectors_a = file_to_RI_vector(file1,word_table,clean)
    vectors_b = file_to_RI_vector(file2,word_table,clean)
    # Ground-truth pairs
    truth = read_file(mapping)
    # Top-10 candidates per product, then locate the true match
    ranked = top_10_ranking(vectors_a,vectors_b)
    id_col, match_col = truth.columns[0], truth.columns[1]
    return true_ranking_loc(ranked,truth,id_col,match_col)

In [22]:
def all_in_one_random_index_pretrain(file1,file2,mapping,clean,k):
    """Run the pre-trained + random-indexing matching pipeline end to end.

    Parameters
    ----------
    file1, file2 : str
        Paths of the two product CSV files.
    mapping : str
        Path of the ground-truth CSV; its first two columns are used as the
        file1 id and the matching file2 id.
    clean : str
        Cleaning mode used for training and for vectorizing.
    k : int
        Context window size forwarded to ``random_index_pretrain``.

    Returns
    -------
    The DataFrame returned by ``true_ranking_loc``.
    """
    # Word vectors: pre-trained embeddings refined on both files
    word_table = random_index_pretrain(file1,file2,clean,k)
    # Product vectors, first file first
    vectors_a = file_to_RI_vector(file1,word_table,clean)
    vectors_b = file_to_RI_vector(file2,word_table,clean)
    # Ground-truth pairs
    truth = read_file(mapping)
    # Top-10 candidates per product, then locate the true match
    ranked = top_10_ranking(vectors_a,vectors_b)
    id_col, match_col = truth.columns[0], truth.columns[1]
    return true_ranking_loc(ranked,truth,id_col,match_col)

## Code

### Abt vs Buy (Raw)

In [23]:
# Abt vs Buy | raw text | context window k = 1
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Raw', k=1)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.21554116558741906
Mean Reciprocal Rank:  0.12241751464693183
NDCG at 10 Score:  0.1444482652599682


In [24]:
# Abt vs Buy | raw text | context window k = 2
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Raw', k=2)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.2303422756706753
Mean Reciprocal Rank:  0.13156395459818213
NDCG at 10 Score:  0.1550386913359213


In [25]:
# Abt vs Buy | raw text | context window k = 3
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Raw', k=3)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.23311748381128586
Mean Reciprocal Rank:  0.13028684492606782
NDCG at 10 Score:  0.1545811542285501


In [26]:
# Abt vs Buy | raw text | context window k = 4
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Raw', k=4)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.23496762257169287
Mean Reciprocal Rank:  0.13429180799671084
NDCG at 10 Score:  0.15807660453177627


In [27]:
# Abt vs Buy | raw text | context window k = 5
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Raw', k=5)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.2414431082331175
Mean Reciprocal Rank:  0.13503993950339924
NDCG at 10 Score:  0.16004600786900808


In [28]:
# Abt vs Buy | raw text | context window k = 6
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Raw', k=6)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.23959296947271044
Mean Reciprocal Rank:  0.13886245833516875
NDCG at 10 Score:  0.16269632531220182


### Abt vs Buy (Cleaned No Space)

In [29]:
# Abt vs Buy | symbols removed | context window k = 1
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Nspace', k=1)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.34875115633672527
Mean Reciprocal Rank:  0.2196052303129083
NDCG at 10 Score:  0.25029561347592183


In [30]:
# Abt vs Buy | symbols removed | context window k = 2
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Nspace', k=2)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.3709528214616096
Mean Reciprocal Rank:  0.22834860431992693
NDCG at 10 Score:  0.262204992793192


In [31]:
# Abt vs Buy | symbols removed | context window k = 3
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Nspace', k=3)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.3700277520814061
Mean Reciprocal Rank:  0.240146175645713
NDCG at 10 Score:  0.2710568248611075


In [32]:
# Abt vs Buy | symbols removed | context window k = 4
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Nspace', k=4)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.37650323774283073
Mean Reciprocal Rank:  0.2435204763373126
NDCG at 10 Score:  0.2750985882079796


In [33]:
# Abt vs Buy | symbols removed | context window k = 5
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Nspace', k=5)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.38020351526364476
Mean Reciprocal Rank:  0.24481190255935856
NDCG at 10 Score:  0.27693141204912747


In [34]:
# Abt vs Buy | symbols removed | context window k = 6
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Nspace', k=6)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.3885291396854764
Mean Reciprocal Rank:  0.24669030732860497
NDCG at 10 Score:  0.28033666791737266


### Abt vs Buy (Cleaned With Space)

In [35]:
# Abt vs Buy | symbols replaced by spaces | context window k = 1
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Wspace', k=1)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.31267345050878814
Mean Reciprocal Rank:  0.2009830697619781
NDCG at 10 Score:  0.22760965130198804


In [36]:
# Abt vs Buy | symbols replaced by spaces | context window k = 2
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Wspace', k=2)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.32562442183163737
Mean Reciprocal Rank:  0.2108849830403946
NDCG at 10 Score:  0.23807736115098171


In [37]:
# Abt vs Buy | symbols replaced by spaces | context window k = 3
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Wspace', k=3)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.3385753931544866
Mean Reciprocal Rank:  0.211325492269063
NDCG at 10 Score:  0.24157249835559927


In [38]:
# Abt vs Buy | symbols replaced by spaces | context window k = 4
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Wspace', k=4)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.3422756706753006
Mean Reciprocal Rank:  0.21188934408175847
NDCG at 10 Score:  0.2428913312681477


In [39]:
# Abt vs Buy | symbols replaced by spaces | context window k = 5
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Wspace', k=5)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.3515263644773358
Mean Reciprocal Rank:  0.21454818436779577
NDCG at 10 Score:  0.24701368179134


In [40]:
# Abt vs Buy | symbols replaced by spaces | context window k = 6
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Abt.csv', file2='Buy.csv', mapping='abt_buy_perfectMapping.csv', clean='Wspace', k=6)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.36262719703977797
Mean Reciprocal Rank:  0.21819229696195463
NDCG at 10 Score:  0.2523664481956567


### Amazon vs Google (Raw)

In [41]:
# Amazon vs Google | raw text | context window k = 1
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Raw', k=1)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.22551662174303683
Mean Reciprocal Rank:  0.18069353527574547
NDCG at 10 Score:  0.19130153618089055


In [42]:
# Amazon vs Google | raw text | context window k = 2
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Raw', k=2)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.2183288409703504
Mean Reciprocal Rank:  0.17113728090817035
NDCG at 10 Score:  0.18227449278278623


In [43]:
# Amazon vs Google | raw text | context window k = 3
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Raw', k=3)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.20575022461814915
Mean Reciprocal Rank:  0.1652947132731499
NDCG at 10 Score:  0.17490867654749825


In [44]:
# Amazon vs Google | raw text | context window k = 4
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Raw', k=4)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.20395327942497754
Mean Reciprocal Rank:  0.1620413154779731
NDCG at 10 Score:  0.17209660727691536


In [45]:
# Amazon vs Google | raw text | context window k = 5
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Raw', k=5)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.20215633423180593
Mean Reciprocal Rank:  0.16456666524051966
NDCG at 10 Score:  0.17353894154667876


In [46]:
# Amazon vs Google | raw text | context window k = 6
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Raw', k=6)
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

Raw
Hit Rate at 10:  0.20035938903863432
Mean Reciprocal Rank:  0.16313909211483332
NDCG at 10 Score:  0.17210318875378694


### Amazon vs Google (Cleaned No Space)

In [47]:
# Amazon vs Google | symbols removed | context window k = 1
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Nspace', k=1)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.3270440251572327
Mean Reciprocal Rank:  0.2521139063591893
NDCG at 10 Score:  0.2699846093017196


In [48]:
# Amazon vs Google | symbols removed | context window k = 2
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Nspace', k=2)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.31626235399820307
Mean Reciprocal Rank:  0.24360194811677285
NDCG at 10 Score:  0.2609411060692383


In [49]:
# Amazon vs Google | symbols removed | context window k = 3
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Nspace', k=3)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.3045822102425876
Mean Reciprocal Rank:  0.24022162324049112
NDCG at 10 Score:  0.25558166117859266


In [50]:
# Amazon vs Google | symbols removed | context window k = 4
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Nspace', k=4)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.3036837376460018
Mean Reciprocal Rank:  0.23830666438483145
NDCG at 10 Score:  0.253923333740992


In [51]:
# Amazon vs Google | symbols removed | context window k = 5
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Nspace', k=5)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.3000898472596586
Mean Reciprocal Rank:  0.23525221409318445
NDCG at 10 Score:  0.250837197684911


In [52]:
# Amazon vs Google | symbols removed | context window k = 6
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Nspace', k=6)
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

No Space
Hit Rate at 10:  0.29829290206648695
Mean Reciprocal Rank:  0.23437549023802387
NDCG at 10 Score:  0.2496254887892268


### Amazon vs Google (Cleaned With Space)

In [53]:
# Amazon vs Google | symbols replaced by spaces | context window k = 1
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Wspace', k=1)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.3018867924528302
Mean Reciprocal Rank:  0.23190290791368956
NDCG at 10 Score:  0.24853949062632794


In [54]:
# Amazon vs Google | symbols replaced by spaces | context window k = 2
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Wspace', k=2)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.2884097035040431
Mean Reciprocal Rank:  0.22482066201742743
NDCG at 10 Score:  0.23972110778194156


In [55]:
# Amazon vs Google | symbols replaced by spaces | context window k = 3
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Wspace', k=3)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.27672955974842767
Mean Reciprocal Rank:  0.21911143912491615
NDCG at 10 Score:  0.23275379202810426


In [56]:
# Amazon vs Google | symbols replaced by spaces | context window k = 4
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Wspace', k=4)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.2749326145552561
Mean Reciprocal Rank:  0.21795804275588643
NDCG at 10 Score:  0.23153259015146263


In [57]:
# Amazon vs Google | symbols replaced by spaces | context window k = 5
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Wspace', k=5)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.27583108715184185
Mean Reciprocal Rank:  0.21596749811035523
NDCG at 10 Score:  0.230214825228943


In [58]:
# Amazon vs Google | symbols replaced by spaces | context window k = 6
df_rank_loc = all_in_one_random_index_pretrain(
    file1='Amazon.csv', file2='GoogleProducts.csv',
    mapping='Amzon_GoogleProducts_perfectMapping.csv', clean='Wspace', k=6)
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc, n=10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc, k=10))

With Space
Hit Rate at 10:  0.2776280323450135
Mean Reciprocal Rank:  0.2161293657924385
NDCG at 10 Score:  0.2307814742033369


### Conclusion 
#### FastText Pre-Trained CBOW Model + Random Indexing

|  Hit Rate at 10      | K = 1 | K = 2 | K = 3  | K = 4 | K = 5 | K = 6 |
| ------------------------     | ---   | ---     |   --- | --- | --- | --- |
| Abt_buy (Raw)                         | 0.216 |  0.230  | 0.233 | 0.235 | 0.241 | 0.240 |
| Abt_buy (Cleaned No Space)            | 0.349 |  0.371  | 0.370 | 0.377 | 0.380 | 0.389 |
| Abt_buy (Cleaned With Space)          | 0.313 |  0.326  | 0.339 | 0.342 | 0.352 | 0.363 |
| Amazon_Google (Raw)                   | 0.226 |  0.218  | 0.206 | 0.204 | 0.202 | 0.200 |
| Amazon_Google (Cleaned No Space)      | 0.327 |  0.316  | 0.305 | 0.304 | 0.300 | 0.298 |
| Amazon_Google (Cleaned With Space)    | 0.302 |  0.288  | 0.277 | 0.275 | 0.276 | 0.278 |

|  Mean Reciprocal Rank      | K = 1 | K = 2 | K = 3  | K = 4 | K = 5 | K = 6 |
| ------------------------     | ---   | ---     |   --- | --- | --- | --- |
| Abt_buy (Raw)                         | 0.122 |  0.132  | 0.130 | 0.134 | 0.135 | 0.139 |
| Abt_buy (Cleaned No Space)            | 0.220 |  0.228  | 0.240 | 0.244 | 0.245 | 0.247 |
| Abt_buy (Cleaned With Space)          | 0.201 |  0.211  | 0.211 | 0.212 | 0.215 | 0.218 |
| Amazon_Google (Raw)                   | 0.181 |  0.171  | 0.165 | 0.162 | 0.165 | 0.163 |
| Amazon_Google (Cleaned No Space)      | 0.252 |  0.244  | 0.240 | 0.238 | 0.235 | 0.234 |
| Amazon_Google (Cleaned With Space)    | 0.232 |  0.225  | 0.219 | 0.218 | 0.216 | 0.216 |

|  NDCG Score at 10      | K = 1 | K = 2 | K = 3  | K = 4 | K = 5 | K = 6 |
| ------------------------     | ---   | ---     |   --- | --- | --- | --- |
| Abt_buy (Raw)                         | 0.144 |  0.155  | 0.155 | 0.158 | 0.160 | 0.163 |
| Abt_buy (Cleaned No Space)            | 0.250 |  0.262  | 0.271 | 0.275 | 0.277 | 0.280 |
| Abt_buy (Cleaned With Space)          | 0.227 |  0.238  | 0.242 | 0.243 | 0.247 | 0.252 |
| Amazon_Google (Raw)                   | 0.191 |  0.182  | 0.175 | 0.172 | 0.174 | 0.172 |
| Amazon_Google (Cleaned No Space)      | 0.270 |  0.261  | 0.256 | 0.254 | 0.251 | 0.250 |
| Amazon_Google (Cleaned With Space)    | 0.249 |  0.240  | 0.233 | 0.232 | 0.230 | 0.231 |