In [1]:
import re
import csv
import numpy as np
import pandas as pd 
import math
import fasttext
from sklearn import metrics
from nltk.tokenize import word_tokenize

In [2]:
# Load Facebook FastText Pre-trained Model
# The path is relative, so 'cc.en.300.bin' must be present in the working
# directory. The resulting `model` is read as a module-level global by
# vectorize() (via model.get_word_vector).
model = fasttext.load_model('cc.en.300.bin')




## Function Definitions
#### Preprocessing Functions

In [3]:
def read_file(filename):
    """Read a comma-separated file into a DataFrame of strings.

    The first row is used as the column names and every following row
    becomes one record. Values are kept exactly as the ``csv`` module yields
    them (strings); no type inference is performed.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        One column per header field. A file that contains only a header
        yields a frame with those columns and no rows; a completely empty
        file yields an empty frame with no columns.
    """
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain embedded line breaks are read back unchanged.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        # Header & Data
        col_name = next(reader, None)
        data_val = list(reader)
    if col_name is None:
        # Nothing to read at all: no header row, hence no columns.
        return pd.DataFrame()
    # Convert to DataFrame
    return pd.DataFrame(data_val, columns=col_name)

In [4]:
def tokenize(data,clean):
    """Clean and tokenize the 'name' and 'description' columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain string columns 'name' and 'description'. It is not
        modified.
    clean : str
        'Raw'    - text is tokenized as-is.
        'Nspace' - every character that is neither a word character nor
                   whitespace is deleted before tokenizing.
        'Wspace' - those characters are replaced by a single space instead.
        Any other value is treated like 'Raw'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` (same index, same other columns) in which each
        'name' / 'description' cell holds the list of tokens produced by
        ``word_tokenize``, with empty-string tokens dropped.
    """
    def _strip_symbols(texts, replacement):
        # Same pattern for both cleaning modes; only the replacement differs.
        return [re.sub(r'[^\w\s]', replacement, x) for x in texts]

    def _tokens(text):
        # Tokenize: split a single sentence into a list of words.
        return [tok for tok in word_tokenize(text) if tok != '']

    # Work on plain lists so rows are addressed by position; this keeps the
    # function independent of whatever index labels ``data`` carries.
    names = list(data['name'])
    descriptions = list(data['description'])

    if clean=='Nspace':
        # Remove symbols (no space)
        names = _strip_symbols(names, '')
        descriptions = _strip_symbols(descriptions, '')
    elif clean=='Wspace':
        # Remove symbols (replace with space)
        names = _strip_symbols(names, ' ')
        descriptions = _strip_symbols(descriptions, ' ')

    df = data.copy()
    # Assign whole columns in one step. Element-wise chained assignment
    # (df['name'][i] = ...) may write to a temporary and is a silent no-op
    # when pandas Copy-on-Write is active.
    df['name'] = pd.Series([_tokens(x) for x in names],
                           index=df.index, dtype=object)
    df['description'] = pd.Series([_tokens(x) for x in descriptions],
                                  index=df.index, dtype=object)
    return df

In [5]:
def vectorize(df):
    """Turn each tokenized name into one averaged word-embedding vector.

    Every token in ``df['name']`` is looked up with the module-level
    ``model.get_word_vector`` and the resulting vectors are averaged
    element-wise into a single vector per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'id' column and a 'name' column whose cells are
        lists of tokens. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'id' and 'name', on the same index as ``df``; each
        'name' cell is the averaged vector for that row. A row with no
        tokens gets an all-zero vector of length ``model.get_dimension()``
        (averaging an empty list would otherwise produce a NaN scalar).
    """
    def _sentence_vector(tokens):
        if len(tokens) == 0:
            return np.zeros(model.get_dimension())
        word_vectors = [model.get_word_vector(tok) for tok in tokens]
        return np.average(word_vectors, axis=0)

    sentence_vectors = [_sentence_vector(tokens) for tokens in df['name']]
    # Build the column in one step instead of assigning cell by cell through
    # chained indexing, which is unreliable and a no-op under Copy-on-Write.
    return pd.DataFrame({
        'id': df['id'],
        'name': pd.Series(sentence_vectors, index=df.index, dtype=object),
    })

In [6]:
def file_to_vector(file,clean):
    """Load a product CSV and return its ids with vectorized names.

    Steps: read the file with ``read_file``; if it has a 'title' column,
    copy it into 'name'; tokenize with the requested ``clean`` mode; then
    average the word vectors of each name with ``vectorize``.

    Parameters
    ----------
    file : path of the CSV file to load.
    clean : cleaning mode forwarded unchanged to ``tokenize``.

    Returns
    -------
    Whatever ``vectorize`` returns for the prepared frame.
    """
    frame = read_file(file)
    # Some files call the product name column 'title'; mirror it as 'name'.
    if 'title' in frame.columns:
        frame['name'] = frame['title'].copy()
    # Only the tokenized 'name' column is carried forward.
    tokenized = tokenize(frame, clean)
    frame['name'] = tokenized['name']
    return vectorize(frame)

#### Create Ranked List Functions

In [7]:
def vector_length(a):
    """Return the Euclidean length of the iterable ``a`` as a Python float."""
    squared_total = 0
    for component in a:
        squared_total += component*component
    return float(math.sqrt(squared_total))
def dot_product(a,b):
    """Return the dot product of two equal-length sequences.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` differ in length. Previously an error string was
        returned, which a caller expecting a number would then try to do
        arithmetic on.
    """
    if len(a) != len(b):
        raise ValueError("Vector Length Different")
    return sum(x*y for (x,y) in zip(a,b))
def cos_similarity(a,b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as dot_product(a, b) / (vector_length(a) * vector_length(b)).
    When either vector has zero length the ratio is undefined; 0.0 is
    returned in that case instead of dividing by zero.
    """
    # Compute the numerator first so any problem reported by dot_product
    # surfaces even when one of the vectors is all zeros.
    numerator = dot_product(a,b)
    denominator = vector_length(a)*vector_length(b)
    if denominator == 0:
        return 0.0
    return numerator/denominator

In [8]:
def top_10_ranking(df1,df2):
    """Rank the ten most similar df2 entries for every df1 entry.

    For each row of ``df1`` the cosine similarity between its 'name' vector
    and every 'name' vector in ``df2`` is computed with ``cos_similarity``;
    the ids of the ten highest-scoring df2 rows are kept, best first.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Each must have an 'id' column and a 'name' column of vectors.
        Rows are addressed by position, so any index is accepted.

    Returns
    -------
    pandas.DataFrame
        Columns 'df1_id', '1', ..., '10'; one row per df1 row. If df2 has
        fewer than ten rows the remaining rank columns are filled with None.
    """
    top_n = 10
    col_names = ['df1_id'] + [str(rank) for rank in range(1, top_n + 1)]

    candidate_ids = list(df2['id'])
    candidate_vectors = list(df2['name'])

    # Collect plain rows and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies it on every pass.
    ranking_rows = []
    for query_id, query_vector in zip(df1['id'], df1['name']):
        cos_sim = [cos_similarity(query_vector, candidate)
                   for candidate in candidate_vectors]
        # Sort descending and keep the positions of the best matches.
        best_positions = pd.Series(cos_sim).sort_values(ascending=False).index[:top_n]
        ranked_ids = [candidate_ids[pos] for pos in best_positions]
        # Pad short rankings so every row has exactly top_n rank columns.
        ranked_ids += [None] * (top_n - len(ranked_ids))
        ranking_rows.append([query_id] + ranked_ids)
    return pd.DataFrame(ranking_rows, columns=col_names)

In [9]:
def true_ranking_loc(df_ranking,mapping,col1,col2):
    """Locate the ground-truth match inside each ranked list.

    Parameters
    ----------
    df_ranking : pandas.DataFrame
        Has a 'df1_id' column plus the ranked candidate ids for that row.
    mapping : pandas.DataFrame
        Ground-truth pairs; ``mapping[col1]`` holds the query ids and
        ``mapping[col2]`` the id each one should be matched to.
    col1, col2 : column labels in ``mapping``.

    Returns
    -------
    pandas.DataFrame
        Columns 'df1_id' and 'Ranked_Location'. Queries that do not appear
        in ``mapping[col1]`` are skipped. 'Ranked_Location' is the position
        of the mapped id within the ranking row (the whole row, including
        the leading 'df1_id' cell, is searched) or 0 when it is absent.
        When a query id occurs several times in ``mapping`` only its first
        pair is used.
    """
    col_names = ['df1_id','Ranked_Location']

    # One pass over the mapping instead of a full boolean-mask scan per
    # query; setdefault keeps the first pair for ids that appear repeatedly.
    first_match = {}
    for source_id, target_id in zip(mapping[col1], mapping[col2]):
        first_match.setdefault(source_id, target_id)

    records = []
    row_values = df_ranking.itertuples(index=False, name=None)
    for look_up_id, row in zip(df_ranking['df1_id'], row_values):
        # Check if product is in Perfect Mapping
        if look_up_id not in first_match:
            continue
        mapped_id = first_match[look_up_id]
        cells = list(row)
        # Note where the ground truth sits in the row, 0 if it is missing.
        ranking = cells.index(mapped_id) if mapped_id in cells else 0
        records.append([look_up_id, ranking])
    # Build the result once rather than concatenating inside the loop.
    return pd.DataFrame(records, columns=col_names)

#### Scoring Metrics Functions

In [10]:
def hits_at_n(df_rank_loc,n):
    """Return the fraction of queries whose true match is ranked within n.

    Parameters
    ----------
    df_rank_loc : pandas.DataFrame
        Needs a 'Ranked_Location' column; 0 means the true match was not in
        the ranked list, any other value is its rank.
    n : int
        Cut-off rank (inclusive).

    Returns
    -------
    float
        (number of rows with 1 <= rank <= n counted as hits) divided by the
        total number of rows; 0.0 when there are no rows.
    """
    ranks = df_rank_loc['Ranked_Location']
    total = len(ranks)
    if total == 0:
        # No queries to score; avoid dividing by zero.
        return 0.0
    # Iterate over values so the result does not depend on index labels.
    hit_count = sum(1 for rank in ranks if rank != 0 and rank <= n)
    return hit_count/total

In [11]:
def mean_reciprocal_rank(df_rank_loc):
    """Return the mean reciprocal rank over all rows of ``df_rank_loc``.

    Each row contributes 1/rank, where rank is its 'Ranked_Location'; rows
    with rank 0 (true match not in the ranked list) contribute nothing. The
    sum is divided by the total number of rows, including the rank-0 ones.
    Returns 0.0 when there are no rows.
    """
    ranks = df_rank_loc['Ranked_Location']
    total = len(ranks)
    if total == 0:
        # No queries to score; avoid dividing by zero.
        return 0.0
    reciprocal_sum = 0
    # Iterate over values so the result does not depend on index labels.
    for rank in ranks:
        if rank != 0:
            reciprocal_sum += 1/rank
    return reciprocal_sum/total

In [12]:
# Source: https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Parameters
    ----------
    r : sequence of numbers
        Relevance scores in rank order (first element = rank 1).
    k : int
        Number of leading results to consider.

    Returns
    -------
    float
        sum over the first k scores of (2**score - 1) / log2(rank + 1),
        or 0. when there are no scores to consider.
    """
    # np.asfarray was removed in NumPy 2.0; np.asarray with a float dtype
    # performs the same conversion on both old and new NumPy versions.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    """Normalized DCG of relevance list ``r`` at cut-off ``k``.

    The DCG of ``r`` is divided by the DCG of the same scores sorted in
    descending order (the ideal ordering). Returns 0. when that ideal DCG
    is zero.
    """
    ideal = dcg_at_k(sorted(r, reverse=True), k)
    return dcg_at_k(r, k) / ideal if ideal else 0.

def ndcg(df_rank_loc,k):
    """Average NDCG@k over all rows of ``df_rank_loc``.

    Each row's 'Ranked_Location' is turned into a binary relevance list
    with a single 1 at that rank (all zeros when the rank is 0, i.e. the
    true match was not ranked). ``ndcg_at_k`` is evaluated on every list and
    the results are averaged over the number of rows.

    The relevance list is built on the fly, so ranks above 10 are accepted
    instead of failing a fixed 0..10 table lookup. Returns 0.0 when there
    are no rows.
    """
    min_length = 10  # length of the relevance lists for ranks 0..10
    ranks = df_rank_loc['Ranked_Location']
    total = len(ranks)
    if total == 0:
        # No queries to score; avoid dividing by zero.
        return 0.0
    sum_NDCG = 0
    for rank in ranks:
        rank = int(rank)
        relevance = [0] * max(min_length, rank)
        if rank > 0:
            # Rank r means the single relevant item sits at position r-1.
            relevance[rank - 1] = 1
        sum_NDCG += ndcg_at_k(relevance, k)
    return sum_NDCG/total

#### All in One Function

In [13]:
def all_in_one(file1,file2,mapping,clean):
    """Run the full matching pipeline and return the true-match ranks.

    Both product files are vectorized with ``file_to_vector`` using the
    same ``clean`` mode, the ground-truth file is read with ``read_file``,
    a top-10 list of file2 candidates is built for every file1 entry, and
    the rank of each ground-truth match is looked up. The first two columns
    of the ground-truth file are taken as the file1 id and file2 id.

    Parameters
    ----------
    file1, file2 : paths of the two product CSV files.
    mapping : path of the ground-truth mapping CSV.
    clean : cleaning mode forwarded to ``file_to_vector``.

    Returns
    -------
    The frame produced by ``true_ranking_loc``.
    """
    vectors_1 = file_to_vector(file1, clean)
    vectors_2 = file_to_vector(file2, clean)
    truth = read_file(mapping)
    ranking = top_10_ranking(vectors_1, vectors_2)
    source_col, target_col = truth.columns[0], truth.columns[1]
    return true_ranking_loc(ranking, truth, source_col, target_col)

## Code

### Abt vs Buy (Raw)

In [14]:
# Abt vs Buy with the 'Raw' mode (text is tokenized without symbol removal):
# rank Buy candidates for every Abt product, then report the three metrics
# computed from the rank of each ground-truth match.
df_rank_loc = all_in_one('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Raw')
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.5550416281221091
Mean Reciprocal Rank:  0.3741523868258368
NDCG at 10 Score:  0.4168313769100672


### Abt vs Buy (Cleaned No Space)

In [15]:
# Abt vs Buy with the 'Nspace' mode (symbols deleted before tokenizing):
# same pipeline and metrics as the Raw run, only the cleaning mode differs.
df_rank_loc = all_in_one('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Nspace')
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.6540240518038853
Mean Reciprocal Rank:  0.47013494266038197
NDCG at 10 Score:  0.5136935885442538


### Abt vs Buy (Cleaned With Space)

In [16]:
# Abt vs Buy with the 'Wspace' mode (symbols replaced by a space before
# tokenizing): same pipeline and metrics, only the cleaning mode differs.
df_rank_loc = all_in_one('Abt.csv','Buy.csv','abt_buy_perfectMapping.csv','Wspace')
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.5411655874190564
Mean Reciprocal Rank:  0.37096126455515854
NDCG at 10 Score:  0.41145888784242196


### Amazon vs Google (Raw)

In [17]:
# Amazon vs GoogleProducts with the 'Raw' mode (no symbol removal):
# rank GoogleProducts candidates for every Amazon product, then report the
# three metrics computed from the rank of each ground-truth match.
df_rank_loc = all_in_one('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Raw')
print("Raw")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

Raw
Hit Rate at 10:  0.43755615453728663
Mean Reciprocal Rank:  0.32753248049744
NDCG at 10 Score:  0.35343548576432254


### Amazon vs Google (Cleaned No Space)

In [18]:
# Amazon vs GoogleProducts with the 'Nspace' mode (symbols deleted before
# tokenizing): same pipeline and metrics, only the cleaning mode differs.
df_rank_loc = all_in_one('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Nspace')
print("No Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

No Space
Hit Rate at 10:  0.5696316262353999
Mean Reciprocal Rank:  0.42598689370926546
NDCG at 10 Score:  0.46064076567524664


### Amazon vs Google (Cleaned With Space)

In [19]:
# Amazon vs GoogleProducts with the 'Wspace' mode (symbols replaced by a
# space before tokenizing): same pipeline and metrics as the other runs.
df_rank_loc = all_in_one('Amazon.csv','GoogleProducts.csv','Amzon_GoogleProducts_perfectMapping.csv','Wspace')
print("With Space")
print("Hit Rate at 10: ", hits_at_n(df_rank_loc,10))
print("Mean Reciprocal Rank: ", mean_reciprocal_rank(df_rank_loc))
print('NDCG at 10 Score: ', ndcg(df_rank_loc,10))

With Space
Hit Rate at 10:  0.5426774483378257
Mean Reciprocal Rank:  0.40922360558479126
NDCG at 10 Score:  0.4413201217866638


## Conclusion
### Using the Pre-Trained FastText Model (`cc.en.300.bin`: Common Crawl + Wikipedia, CBOW)

|   Pre-Trained (Wiki CBOW)    | Hits Rate at 10 | Mean Reciprocal Rank | NDCG Score at 10 |
| ------------------------     | ---   | ---     |   --- |
| Abt_buy (Raw)                         | 0.555 |  0.374  | 0.417 |
| Abt_buy (Cleaned No Space)            | 0.654 |  0.470  | 0.514 |
| Abt_buy (Cleaned With Space)          | 0.541 |  0.371  | 0.411 |
| Amazon_Google (Raw)                   | 0.438 |  0.328  | 0.353 |
| Amazon_Google (Cleaned No Space)      | 0.570 |  0.426  | 0.461 |
| Amazon_Google (Cleaned With Space)    | 0.543 |  0.409  | 0.441 |

#### Raw + Semi-Cleaned (-,/ Removed)

|   *                  | Hits Rate at 5 | Hits Rate at 10 | Mean Reciprocal Rank | NDCG Score at 10 |
| --------------       | ---      | --- | --- | ---|
| Abt_buy (Raw)           | 0.494 | 0.578 |  0.384  | 0.430 |
| Abt_buy (Cleaned)       | 0.620 | 0.702 |  0.501  | 0.549 |
| Amazon_Google (Raw)     | 0.504 | 0.554 |  0.433  | 0.462 |
| Amazon_Google (Cleaned) | 0.541  | 0.590 |  0.455 | 0.487 |