In [78]:
import pandas as pd

# Name of the sub-folder under data/ that holds the train/validation CSV splits.
DATASET_NAME = "walmart-amazon"

# Each split is a pair of CSV files: a catalog and the queries issued against it.
train_catalog_df = pd.read_csv(f"data/{DATASET_NAME}/train_catalog.csv")
val_catalog_df = pd.read_csv(f"data/{DATASET_NAME}/val_catalog.csv")
train_queries_df = pd.read_csv(f"data/{DATASET_NAME}/train_queries.csv")
val_queries_df = pd.read_csv(f"data/{DATASET_NAME}/val_queries.csv")

# Only the training split's row counts are reported.
print(f"Loaded {train_catalog_df.shape[0]} Documents")
print(f"Loaded {train_queries_df.shape[0]} Judgments")

Loaded 2554 Documents
Loaded 10905 Judgments


In [79]:
import numpy.random
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

class RandomRanker:
    """Baseline ranker that gives every catalog entry a uniformly random score.

    The query content is ignored; the ranker exists only as a chance-level
    reference point for the other rankers.
    """

    def __init__(self, seed=None):
        """Create the ranker.

        Args:
            seed: Optional seed for reproducible scores. When given, scores
                are drawn from a private ``np.random.RandomState(seed)``.
                When ``None`` (the default) the global NumPy random state is
                used, which matches the previous behaviour.
        """
        self._rng = None if seed is None else np.random.RandomState(seed)

    def train(self, catalog_df, queries_df):
        """No-op: a random ranker has nothing to learn."""
        pass

    def get_score(self, query, catalog_df):
        """Return one random score in ``[0, 1)`` per catalog entry.

        Args:
            query: Ignored.
            catalog_df: Any sized collection; only ``len(catalog_df)`` is used.

        Returns:
            ``{"scores": ndarray}`` with ``len(catalog_df)`` entries.
        """
        # Fall back to the module-level generator so np.random.seed() still
        # controls unseeded instances.
        rng = np.random if self._rng is None else self._rng
        return {
            "scores": rng.uniform(0, 1, size=len(catalog_df))
        }
    
def levenshtein_distance(word1, word2):
    """Return the edit distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.
    """
    # The longer sequence drives the outer loop, the shorter one the rows.
    if len(word1) >= len(word2):
        longer, shorter = word1, word2
    else:
        longer, shorter = word2, word1

    if not len(shorter):
        # Everything in the longer sequence has to be inserted.
        return len(longer)

    prev = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        curr = [row_no]
        for col_no, ch_short in enumerate(shorter, start=1):
            via_insert = prev[col_no] + 1
            via_delete = curr[col_no - 1] + 1
            via_subst = prev[col_no - 1] + (ch_long != ch_short)
            curr.append(min(via_insert, via_delete, via_subst))
        prev = curr

    return prev[-1]
    

def normalized_levenshtein_distance(word1, word2):
    """Return the Levenshtein distance scaled by the longer input's length.

    Args:
        word1: First sequence (anything supporting ``len``).
        word2: Second sequence.

    Returns:
        ``levenshtein_distance(word1, word2) / max(len(word1), len(word2))``,
        or ``0.0`` when both inputs are empty.
    """
    max_length = max(len(word1), len(word2))
    if max_length == 0:
        # Two empty inputs are identical; returning early avoids 0 / 0.
        return 0.0
    return levenshtein_distance(word1, word2) / max_length

    
class LevensteinRanker:
    """Ranker that scores catalog titles by edit distance to the query text.

    Every call to ``get_score`` runs the pure-Python
    ``normalized_levenshtein_distance`` once per catalog row, which is why a
    notice is printed on construction.
    """

    def __init__(self):
        """Print a one-line notice that this ranker is slow."""
        print("Warning! This is a slow ranker")

    def train(self, catalog_df, queries_df):
        """No-op: the ranker has no trainable state."""
        pass

    def get_score(self, query, catalog_df):
        """Score each catalog title against ``query["input_text"]``.

        Args:
            query: Mapping with an ``"input_text"`` entry.
            catalog_df: DataFrame with a ``"title"`` column.

        Returns:
            ``{"scores": ndarray}`` with one entry per catalog row. The
            normalized distance is negated so that larger scores mean closer
            matches. The comparison is case-sensitive.
        """
        text = query["input_text"]

        return {
            "scores": catalog_df["title"].apply(lambda x: -normalized_levenshtein_distance(x, text)).values
        }
    
class TfidfRanker:
    """Ranker that scores catalog titles by TF-IDF similarity to the query text."""

    def __init__(self):
        """Create the vectorizer; it is fitted later by ``train``."""
        self.vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b', lowercase=True)

    def train(self, catalog_df, queries_df):
        """Fit the vectorizer on the lower-cased catalog titles.

        Args:
            catalog_df: DataFrame with a ``"title"`` column.
            queries_df: Unused.
        """
        self.vectorizer.fit(catalog_df['title'].str.lower())

    def get_score(self, query, catalog_df):
        """Score each catalog title against ``query["input_text"]``.

        Args:
            query: Mapping with an ``"input_text"`` string.
            catalog_df: DataFrame with a ``"title"`` column.

        Returns:
            ``{"scores": ndarray}`` with one entry per catalog row: the dot
            product between that title's TF-IDF vector and the query's.
        """
        text = query["input_text"].lower()
        # NOTE: the catalog is re-vectorized on every call.
        tfidf_matrix = self.vectorizer.transform(catalog_df['title'].str.lower())
        query_vector = self.vectorizer.transform([text])
        # `@` is a matrix product for every scipy.sparse container, whereas
        # `*` is a matrix product only for the legacy *_matrix classes and
        # element-wise for sparse arrays.
        scores = (tfidf_matrix @ query_vector.T).toarray()
        return {
            "scores": scores.flatten()
        }
    
# Smoke test: fit the TF-IDF ranker on the training catalog and score one ad-hoc query.
# The final bare expression is intentional so the notebook displays the returned dict.
tf_idf_ranker = TfidfRanker()
tf_idf_ranker.train(train_catalog_df, train_queries_df)
tf_idf_ranker.get_score({"input_text": "Remote"}, train_catalog_df)

{'scores': array([0.51749004, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ])}

In [80]:
from tqdm import tqdm

def evaluate(ranker, catalog_df, queries_df, progress=True):
    """Measure how highly ``ranker`` places the correct catalog entry per query.

    Only rows of ``queries_df`` whose ``"judgment"`` equals ``True`` are
    scored. For each one the catalog is ordered by descending score and the
    0-based position of the entry whose ``"subject_id"`` equals the row's
    ``"match_id"`` is recorded.

    Args:
        ranker: Object with ``get_score(query, catalog_df)`` returning a dict
            whose ``"scores"`` entry has one value per catalog row.
        catalog_df: DataFrame with a ``"subject_id"`` column.
        queries_df: DataFrame with ``"match_id"`` and ``"judgment"`` columns;
            every other column is passed to the ranker as the query.
        progress: Show a tqdm progress bar (default ``True``). Pass ``False``
            for quiet runs.

    Returns:
        Dict with ``"ranks"`` (int ndarray of 0-based ranks) and
        ``"top_1"``, ``"top_10"``, ``"top_100"``, ``"top_1000"``: the
        fraction of scored queries whose rank is below that cut-off. The
        fractions are NaN when no query could be scored.
    """
    # Invariant across queries, so read it once instead of per iteration.
    subject_ids = catalog_df["subject_id"].values

    rows = queries_df.iterrows()
    if progress:
        rows = tqdm(rows, total=len(queries_df.index))

    ranks = []
    for _, row in rows:
        # Explicit comparison rather than truthiness, so NaN or other
        # non-boolean judgments stay excluded as before.
        if not (row["judgment"] == True):  # noqa: E712
            continue
        target_id = row["match_id"]

        # The ranker must not see the label columns.
        input_query = {
            key: value
            for key, value in row.items()
            if key not in ("match_id", "judgment")
        }

        scores = np.asarray(ranker.get_score(input_query, catalog_df)["scores"])
        order = np.argsort(-scores)  # catalog positions, best score first
        hits = np.flatnonzero(subject_ids[order] == target_id)
        if hits.size == 0:
            # The expected match is not in this catalog; nothing to rank.
            continue
        ranks.append(hits[0])

    ranks = np.array(ranks, dtype=int)

    def _hit_rate(cutoff):
        # Guard the empty case instead of dividing by zero.
        if ranks.size == 0:
            return float("nan")
        return float(np.mean(ranks < cutoff))

    return {
        "ranks": ranks,
        "top_1": _hit_rate(1),
        "top_10": _hit_rate(10),
        "top_100": _hit_rate(100),
        "top_1000": _hit_rate(1000),
    }
            
# Compare the TF-IDF ranker with the random baseline on the validation split.
# Both lines report the "top_10" entry of evaluate()'s result; tf_idf_ranker was
# fitted on the training catalog and is only applied to the validation catalog here.
print(f'Top 10 TF-IDF: {evaluate(tf_idf_ranker, val_catalog_df, val_queries_df)["top_10"]}')
print(f'Top 10 Random: {evaluate(RandomRanker(), val_catalog_df, val_queries_df)["top_10"]}')

100%|██████████| 10905/10905 [00:15<00:00, 696.81it/s]


Top 100 TF-IDF: 0.9900990099009901


100%|██████████| 10905/10905 [00:01<00:00, 10573.42it/s]

Top 100 Random: 0.0024752475247524753





In [81]:
# Evaluate the TF-IDF ranker on the training split and keep the full metrics dict in `res`.
res = evaluate(tf_idf_ranker, train_catalog_df, train_queries_df)

100%|██████████| 10905/10905 [00:15<00:00, 692.79it/s]
