In [1]:
import os

# Run configuration, published through environment variables that the cells
# below read back.
#   DATASET_NAME: choose from ["news-aggregator", "covid-abstracts", "walmart-amazon"]
#   AUGMENTATION: choose from ["synonym", "none"]
os.environ.update({
    "DATASET_NAME": "covid-abstracts",
    "AUGMENTATION": "none",
})


In [2]:
import pandas as pd

# Resolve which dataset to use; "news-aggregator" is the fallback when the
# DATASET_NAME environment variable is not set.
DATASET_NAME = os.environ.get("DATASET_NAME", "news-aggregator")

print(f"Dataset name is: {DATASET_NAME}")

# Every dataset directory provides a catalog file and a queries file for each
# split; the generator yields them in (train, val) x (catalog, queries) order,
# which is exactly the order of the names being unpacked.
train_catalog_df, train_queries_df, val_catalog_df, val_queries_df = (
    pd.read_csv(f"data/{DATASET_NAME}/{split}_{kind}.csv")
    for split in ("train", "val")
    for kind in ("catalog", "queries")
)
print(f"Loaded {len(train_catalog_df.index)} Documents")
print(f"Loaded {len(train_queries_df.index)} Judgments")

Dataset name is: covid-abstracts
Loaded 6000 Documents
Loaded 12000 Judgments


In [3]:
import torch

# Pin the default floating-point dtype, then record whether a CUDA device is
# usable; `use_cuda` is consulted later when models and inputs are placed.
torch.set_default_dtype(torch.float32)
use_cuda = torch.cuda.is_available()
print(use_cuda)


True


In [4]:
# Override: uncomment the lines below to force CPU execution even when CUDA is available.
#use_cuda = False
#os.environ["CUDA_VISIBLE_DEVICES"]=""

In [5]:
# Data Augmentation
#
# Reads the AUGMENTATION environment variable (default: "synonym") and, for the
# "synonym" strategy, rewrites the `input_text` column of BOTH the training and
# the validation queries in place. Because the frames are overwritten, running
# this cell twice augments the already-augmented text again.

AUGMENTATION = os.environ.get("AUGMENTATION", "synonym")

print(f"Augmentation strategy is: {AUGMENTATION}")

if AUGMENTATION == "none":
    pass
elif AUGMENTATION == "synonym":
    # Imported here so nlpaug is only needed when this strategy is selected.
    import nlpaug.augmenter.word as naw

    aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.25)
    # augment() is indexed with [0], i.e. the first returned variant is kept.
    train_queries_df["input_text"] = train_queries_df["input_text"].apply(lambda x: aug.augment(x)[0])
    val_queries_df["input_text"] = val_queries_df["input_text"].apply(lambda x: aug.augment(x)[0])
else:
    # Fail loudly: a misspelled strategy would otherwise run un-augmented while
    # the saved report is still labelled with the bogus strategy name.
    raise ValueError(
        f"Unsupported AUGMENTATION {AUGMENTATION!r}; choose from ['synonym', 'none']"
    )

Augmentation strategy is: none


In [6]:
import numpy.random
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer, InputExample, losses
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data.dataloader import DataLoader

In [7]:
class RandomRanker:
    """Baseline ranker that gives every catalog entry an independent random score.

    Implements the same train / prerun / get_score interface as the other
    rankers in this notebook so it can be passed to the same evaluation code.
    """

    def __init__(self):
        pass

    def train(self, catalog_df, queries_df):
        """No-op: there is nothing to learn."""
        pass

    def prerun(self, catalog_df):
        """No-op: nothing is precomputed."""
        pass

    def get_score(self, query, catalog_df):
        """Return one uniform random score in [0, 1) per catalog entry.

        Args:
            query: Ignored; accepted only for interface compatibility.
            catalog_df: Any sized collection; only ``len(catalog_df)`` is used.

        Returns:
            dict with key ``"scores"`` holding a 1-D numpy array of length
            ``len(catalog_df)``.
        """
        # The query text is irrelevant here, so it is deliberately not read.
        return {
            "scores": np.random.uniform(0, 1, size=len(catalog_df))
        }
    
def levenshtein_distance(word1, word2):
    """Return the edit distance (insert / delete / substitute, unit cost) between two sequences.

    The computation keeps only two rows of the dynamic-programming table. The
    longer argument drives the outer loop so each row has ``len(shorter) + 1``
    entries.
    """
    if len(word1) >= len(word2):
        longer, shorter = word1, word2
    else:
        longer, shorter = word2, word1

    # Against an empty sequence the distance is simply the other one's length.
    if len(shorter) == 0:
        return len(longer)

    prev_row = list(range(len(shorter) + 1))

    for row_no, ch_long in enumerate(longer, start=1):
        row = [row_no]

        for col, ch_short in enumerate(shorter):
            cost_insert = prev_row[col + 1] + 1
            cost_delete = row[col] + 1
            cost_substitute = prev_row[col] + (ch_long != ch_short)
            row.append(min(cost_insert, cost_delete, cost_substitute))

        prev_row = row

    return prev_row[-1]
    

def normalized_levenshtein_distance(word1, word2):
    """Return the Levenshtein distance divided by the length of the longer input.

    The result lies in [0, 1]: 0 for identical inputs, 1 when every position
    has to be edited. Two empty inputs are treated as identical (0.0) instead
    of dividing by zero.
    """
    max_length = max(len(word1), len(word2))
    if max_length == 0:
        # Both inputs are empty: nothing to edit and nothing to divide by.
        return 0.0
    return levenshtein_distance(word1, word2) / max_length

    
class LevensteinRanker:
    """Ranks catalog entries by negated normalized Levenshtein distance to the query.

    Every call to get_score computes an edit distance against each catalog
    text, which is why construction prints a slowness warning.
    """

    def __init__(self):
        print("Warning! This is a slow ranker")

    def train(self, catalog_df, queries_df):
        """No-op: the ranker has no trainable state."""
        pass

    def prerun(self, catalog_df):
        """No-op: nothing is precomputed."""
        pass

    def get_score(self, query, catalog_df):
        """Score each catalog text against ``query["input_text"]``.

        Returns:
            dict with key ``"scores"``: an array with one value per row of
            ``catalog_df["text"]``. Distances are negated so that a higher
            score means a closer match.
        """
        text = str(query["input_text"])

        return {
            "scores": catalog_df["text"].apply(lambda x: -normalized_levenshtein_distance(x, text)).values
        }
    
class BoWRanker:
    """Bag-of-words ranker: scores are dot products of raw term-count vectors."""

    def __init__(self):
        self.vectorizer = CountVectorizer(lowercase=True, token_pattern=r'\b\w+\b')

    def train(self, catalog_df, queries_df):
        """Fit the vocabulary on the lower-cased catalog text; queries_df is not used."""
        lowered_catalog = catalog_df['text'].str.lower()
        self.vectorizer.fit(lowered_catalog)

    def prerun(self, catalog_df):
        """Vectorize the catalog once so get_score only has to transform the query."""
        lowered_catalog = catalog_df['text'].str.lower()
        self.bow_matrix = self.vectorizer.transform(lowered_catalog)

    def get_score(self, query, catalog_df):
        """Return {"scores": flat array} with one overlap score per row of the prerun matrix."""
        needle = str(query["input_text"]).lower()
        query_counts = self.vectorizer.transform([needle])
        # (documents x vocab) times (vocab x 1) -> one column of scores.
        overlap = self.bow_matrix * query_counts.T
        return {"scores": overlap.toarray().flatten()}
    
class TfidfRanker:
    """TF-IDF ranker: scores are dot products between TF-IDF vectors of catalog texts and query."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer(lowercase=True, token_pattern=r'\b\w+\b')

    def _lowered_text(self, catalog_df):
        # Lower-cased view of the catalog's text column, shared by train and prerun.
        return catalog_df['text'].str.lower()

    def train(self, catalog_df, queries_df):
        """Fit the vectorizer on the catalog text; queries_df is not used."""
        self.vectorizer.fit(self._lowered_text(catalog_df))

    def prerun(self, catalog_df):
        """Cache the TF-IDF matrix of the catalog that get_score ranks against."""
        self.tfidf_matrix = self.vectorizer.transform(self._lowered_text(catalog_df))

    def get_score(self, query, catalog_df):
        """Return {"scores": flat array} holding one score per row of the cached matrix."""
        lowered_query = str(query["input_text"]).lower()
        query_weights = self.vectorizer.transform([lowered_query])
        products = self.tfidf_matrix * query_weights.T
        flat_scores = products.toarray().flatten()
        return {"scores": flat_scores}
    
class BM25Ranker:
    """BM25Okapi ranker over lower-cased text tokenized by splitting on single spaces."""

    def __init__(self):
        pass

    def train(self, catalog_df, queries_df):
        """No-op: the index is built in prerun."""
        pass

    def prerun(self, catalog_df):
        """Build the BM25 index from the catalog's lower-cased, space-split text."""
        lowered_docs = catalog_df['text'].str.lower().tolist()
        self.bm25 = BM25Okapi([doc.split(" ") for doc in lowered_docs])

    def get_score(self, query, catalog_df):
        """Return {"scores": ...} with whatever the index reports for the tokenized query."""
        # Same tokenization as prerun: lower-case, then split on a single space.
        query_tokens = str(query["input_text"]).lower().split(" ")
        return {"scores": self.bm25.get_scores(query_tokens)}
    
class EmbeddingRanker:
    """Dense ranker: cosine similarity between sentence embeddings of query and catalog texts."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)

    def train(self, catalog_df, queries_df, epochs=0):
        """Optionally fine-tune the encoder with a triplet loss.

        For every query row a triplet (query text, matching catalog text, one
        randomly sampled non-matching catalog text) is built. Rows whose
        ``match_id`` is absent from the catalog, or for which no non-matching
        entry exists, are skipped and counted. Any other error (for example a
        missing column) now propagates instead of being silently swallowed.

        Args:
            catalog_df: Frame with ``catalog_id`` and ``text`` columns.
            queries_df: Frame with ``input_text`` and ``match_id`` columns.
            epochs: Number of fine-tuning epochs; 0 (default) leaves the
                pretrained model untouched.
        """
        if epochs == 0:
            return

        # NOTE(review): rows are not filtered on a `judgment` column here, while
        # evaluate() only scores judgment == True rows - confirm every match_id
        # found in the catalog is really a positive pair.
        examples = []
        skipped = 0
        for _, row in queries_df.iterrows():
            text = str(row['input_text'])
            is_match = catalog_df['catalog_id'] == row['match_id']
            positives = catalog_df.loc[is_match, 'text']
            negatives = catalog_df.loc[~is_match, 'text']
            if positives.empty or negatives.empty:
                # No usable triplet for this query: deliberate best-effort skip.
                skipped += 1
                continue
            positive_text = positives.values[0]
            negative_text = negatives.sample(n=1).values[0]
            examples.append(InputExample(texts=[text, positive_text, negative_text]))

        print(f"{len(examples)} Examples Found")
        if skipped:
            print(f"{skipped} queries skipped (match_id not in catalog or no negative candidate)")

        if not examples:
            # Nothing to train on; keep the pretrained weights.
            return

        train_dataloader = DataLoader(examples, shuffle=True, batch_size=16)
        train_loss = losses.TripletLoss(self.model)

        self.model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=epochs, optimizer_params={'lr': 2e-6})

    def prerun(self, catalog_df):
        """Embed the lower-cased catalog text once and cache texts and embeddings."""
        # NOTE(review): the catalog is lower-cased here but queries (and the
        # training texts) are not - harmless for an uncased model, confirm for others.
        self.corpus = catalog_df['text'].str.lower().tolist()
        self.corpus_embeddings = self.get_embeddings(self.corpus)

    def get_score(self, query, catalog_df):
        """Return {"scores": flat array} of cosine similarities, one per cached catalog embedding."""
        # str() mirrors train() and the other rankers, so non-string query
        # values are encoded as text instead of being handed to the encoder raw.
        query_embedding = self.get_embeddings([str(query["input_text"])])
        scores = cosine_similarity(query_embedding, self.corpus_embeddings)
        return {
            "scores": scores.flatten()
        }

    def get_embeddings(self, texts):
        """Encode a list of texts with the underlying sentence-transformer model."""
        return self.model.encode(texts)
    
# Instantiate every ranker and run one dummy query ("Remote") through it as a
# smoke test. Only the last get_score call is the cell's final expression, so
# only its result is displayed; the other results are discarded.

# Pretrained sentence-transformer baseline: epochs=0 makes train() a no-op.
embedding_ranker = EmbeddingRanker()
embedding_ranker.train(train_catalog_df, train_queries_df, epochs=0)
embedding_ranker.prerun(train_catalog_df)
embedding_ranker.get_score({"input_text": "Remote"}, train_catalog_df)

# Same model, fine-tuned for one epoch on the training queries.
trained_embedding_ranker_1 = EmbeddingRanker()
trained_embedding_ranker_1.train(train_catalog_df, train_queries_df, epochs=1)
trained_embedding_ranker_1.prerun(train_catalog_df)
trained_embedding_ranker_1.get_score({"input_text": "Remote"}, train_catalog_df)

# Lexical rankers; train() fits the vectorizers on the training catalog.
bow_ranker = BoWRanker()
bow_ranker.train(train_catalog_df, train_queries_df)
bow_ranker.prerun(train_catalog_df)
bow_ranker.get_score({"input_text": "Remote"}, train_catalog_df)

tf_idf_ranker = TfidfRanker()
tf_idf_ranker.train(train_catalog_df, train_queries_df)
tf_idf_ranker.prerun(train_catalog_df)
tf_idf_ranker.get_score({"input_text": "Remote"}, train_catalog_df)

# BM25 has no training step; its index is built in prerun().
bm_25_ranker = BM25Ranker()
bm_25_ranker.train(train_catalog_df, train_queries_df)
bm_25_ranker.prerun(train_catalog_df)
bm_25_ranker.get_score({"input_text": "Remote"}, train_catalog_df)

9586 Examples Found


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/600 [00:00<?, ?it/s]

{'scores': array([0., 0., 0., ..., 0., 0., 0.])}

In [8]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

class SpladeRanker:
    """Ranker that compares SPLADE-style term-weight vectors with cosine similarity.

    A vector is obtained from a masked-language-model head as
    ``max over tokens of log(1 + relu(logits))``, with padded positions zeroed
    through the attention mask. Vectors are held as dense numpy arrays.
    """

    def __init__(self, model_id='naver/splade-cocondenser-ensembledistil'):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForMaskedLM.from_pretrained(model_id)
        if use_cuda:
            self.model.to("cuda:0")
        self.model.eval()

        # Filled by prerun(): one vector per catalog row.
        self.document_vectors = None

    def train(self, catalog_df, queries_df):
        """No-op: the pretrained model is used as is."""
        pass

    def prerun(self, catalog_df):
        """Precompute and store the vectors of all documents in the catalog."""
        texts = catalog_df['text'].tolist()
        self.document_vectors = self.encode_batch(texts)

    def get_score(self, query, catalog_df):
        """Return {"scores": array} with the cosine similarity of the query to every stored document vector."""
        query_text = str(query["input_text"])
        query_vector = self.encode(query_text)

        scores = [self.cosine_similarity(query_vector, doc_vector) for doc_vector in self.document_vectors]

        return {
            "scores": np.array(scores)
        }

    def _splade_vectors(self, texts):
        # Shared by encode() and encode_batch(): returns a 2-D tensor with one
        # row per input text (a single string yields one row).
        tokens = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        if use_cuda:
            tokens = tokens.to("cuda:0")
        with torch.no_grad():
            output = self.model(**tokens)

        weights = torch.log(1 + torch.relu(output.logits)) * tokens.attention_mask.unsqueeze(-1)
        # Max over the token axis; the batch axis is intentionally kept.
        return torch.max(weights, dim=1)[0]

    def encode(self, text):
        """Encode one text into a 1-D numpy vector."""
        # squeeze(0) drops only the batch axis of the single-text result.
        return self._splade_vectors(text).squeeze(0).cpu().numpy()

    def encode_batch(self, texts, batch_size=128):
        """Encode many texts, ``batch_size`` at a time; returns a list with one 1-D numpy vector per text.

        The batch axis is never squeezed, so a batch that happens to contain a
        single text still contributes exactly one vector (previously it was
        flattened into one scalar per vector component).
        """
        all_vecs = []
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            vecs = self._splade_vectors(batch_texts)
            # Iterating the 2-D array yields one row (vector) per text.
            all_vecs.extend(vecs.cpu().numpy())
        return all_vecs

    @staticmethod
    def cosine_similarity(vec1, vec2):
        """Calculate the cosine similarity between two vectors.

        Returns 0.0 when either vector has zero norm instead of dividing by zero.
        """
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            return 0.0
        return np.dot(vec1, vec2) / denominator


In [9]:
from tqdm import tqdm

def evaluate(ranker, catalog_df, queries_df):
    """Compute top-k hit rates of ``ranker`` on the positively judged queries.

    ``ranker.prerun(catalog_df)`` is called once. Then, for every row of
    ``queries_df`` whose ``judgment`` equals True, the catalog is ordered by
    descending score and the 0-based position of ``match_id`` is recorded.
    The ``match_id`` and ``judgment`` fields are removed from the query before
    it is handed to the ranker.

    Queries whose ``match_id`` does not occur in ``catalog_df["catalog_id"]``
    cannot be ranked; they are skipped and counted in ``num_skipped``.

    Returns:
        dict with
        - ``"ranks"``: numpy array of 0-based ranks, one per evaluated query;
        - ``"top_1"`` ... ``"top_1000"``: fraction of evaluated queries whose
          rank is below the cutoff (NaN when no query was evaluated);
        - ``"num_skipped"``: number of positive queries skipped as described above.
    """
    cutoffs = (1, 3, 5, 10, 25, 50, 100, 1000)

    # Extracted once: only the id column is needed to locate the target, so the
    # whole frame does not have to be reordered for every query.
    catalog_ids = catalog_df["catalog_id"].values

    ranks = []
    num_skipped = 0
    ranker.prerun(catalog_df)
    for _, row in tqdm(queries_df.iterrows(), total=len(queries_df.index)):
        input_query = dict(row)
        # Strip the answer fields so the ranker never sees them.
        target_id = input_query.pop("match_id")
        judgment = input_query.pop("judgment")

        if not (judgment == True):  # noqa: E712 - keeps the original equality semantics
            continue

        scores = np.asarray(ranker.get_score(input_query, catalog_df)["scores"])
        order = np.argsort(-scores)
        hits = np.where(catalog_ids[order] == target_id)[0]
        if hits.size == 0:
            # Target is not part of this catalog: no rank exists for it.
            num_skipped += 1
            continue
        ranks.append(hits[0])

    ranks = np.array(ranks, dtype=np.intp)

    results = {"ranks": ranks}
    for k in cutoffs:
        results[f"top_{k}"] = np.mean(ranks < k) if ranks.size else np.nan
    results["num_skipped"] = num_skipped
    return results

In [14]:
# Cutoffs shown in the report, in column order.
REPORT_CUTOFFS = (1, 3, 5, 10, 25, 50, 100)


def format_report_line(label, metrics):
    """Format one report row: ``<label> | Top 1: ... | ... | Top 100: ...`` plus a newline.

    Args:
        label: Text placed in the first column.
        metrics: Mapping that contains a ``top_<k>`` entry for every k in REPORT_CUTOFFS.
    """
    cells = [f"Top {k}: {metrics['top_' + str(k)]}" for k in REPORT_CUTOFFS]
    return " | ".join([label] + cells) + "\n"


# Evaluate every ranker on the validation split and collect one line per ranker.
report = ""

metrics_random = evaluate(RandomRanker(), val_catalog_df, val_queries_df)
report += format_report_line("Random", metrics_random)

metrics_bow = evaluate(bow_ranker, val_catalog_df, val_queries_df)
report += format_report_line("Bag of Words", metrics_bow)

metrics_tfidf = evaluate(tf_idf_ranker, val_catalog_df, val_queries_df)
report += format_report_line("TF-IDF", metrics_tfidf)

metrics_bm25 = evaluate(bm_25_ranker, val_catalog_df, val_queries_df)
# The two trailing spaces in the label keep this row's first separator where it was.
report += format_report_line("BM25  ", metrics_bm25)

metrics_sentence = evaluate(embedding_ranker, val_catalog_df, val_queries_df)
report += format_report_line("Sentence Transformer", metrics_sentence)

metrics_fine_tuned_sentence = evaluate(trained_embedding_ranker_1, val_catalog_df, val_queries_df)
report += format_report_line("Fine Tuned Sentence Transformer", metrics_fine_tuned_sentence)

splade_ranker = SpladeRanker()
metrics_splade = evaluate(splade_ranker, val_catalog_df, val_queries_df)
report += format_report_line("SPLADE", metrics_splade)

# `metrics` stays bound to the most recent result for ad-hoc inspection.
metrics = metrics_splade
print(report)

100%|██████████| 4000/4000 [00:00<00:00, 4302.81it/s]
  8%|▊         | 328/4000 [00:00<00:04, 845.97it/s]


KeyboardInterrupt: 

In [12]:
# Display the comparison report accumulated by the evaluation cell.
print(report)

Random | Top 1: 0.001 | Top 3: 0.0015 | Top 5: 0.0025 | Top 10: 0.004 | Top 25: 0.0145 | Top 50: 0.0225 | Top 100: 0.0415
Bag of Words | Top 1: 0.0495 | Top 3: 0.1015 | Top 5: 0.137 | Top 10: 0.1965 | Top 25: 0.3025 | Top 50: 0.411 | Top 100: 0.522
TF-IDF | Top 1: 0.794 | Top 3: 0.905 | Top 5: 0.941 | Top 10: 0.9615 | Top 25: 0.978 | Top 50: 0.986 | Top 100: 0.989
BM25   | Top 1: 0.8995 | Top 3: 0.949 | Top 5: 0.957 | Top 10: 0.97 | Top 25: 0.9825 | Top 50: 0.988 | Top 100: 0.9905
Sentence Transformer | Top 1: 0.81 | Top 3: 0.9095 | Top 5: 0.933 | Top 10: 0.955 | Top 25: 0.977 | Top 50: 0.9905 | Top 100: 0.9965
Fine Tuned Sentence Transformer | Top 1: 0.8255 | Top 3: 0.912 | Top 5: 0.9365 | Top 10: 0.9595 | Top 25: 0.979 | Top 50: 0.9915 | Top 100: 0.9965
SPLADE | Top 1: 0.8715 | Top 3: 0.9415 | Top 5: 0.9545 | Top 10: 0.9735 | Top 25: 0.989 | Top 50: 0.992 | Top 100: 0.9965


In [13]:
import time

# Save run
# The file name combines the dataset name, the augmentation strategy and the
# current Unix timestamp, so repeated runs do not overwrite each other.
# Create the target directory first: open(..., 'w') does not create it.
os.makedirs("outputs", exist_ok=True)
with open(f'outputs/report_{DATASET_NAME}_{AUGMENTATION}_{int(time.time())}.txt', 'w') as f:
    f.write(report)

In [None]:
# def compare_rankers(ranker1, ranker2, catalog_df, queries_df, cutoff=10):
#     ranks = []
#     ranker1.prerun(catalog_df)
#     ranker2.prerun(catalog_df)
#     for i,row in tqdm(queries_df.iterrows(), total=len(queries_df.index)):
#         input_query = dict(row)
#         target_id = input_query["match_id"]
#         judgment = input_query["judgment"]
        
#         if judgment == True:
#             del input_query["match_id"]
#             del input_query["judgment"]
            
#             scores1 = ranker1.get_score(input_query, catalog_df)["scores"]
#             scores2 = ranker2.get_score(input_query, catalog_df)["scores"]
#             sorted_catalog1 = catalog_df.iloc[np.argsort(-scores1)]
#             sorted_catalog2 = catalog_df.iloc[np.argsort(-scores2)]
#             rank1 = np.where(sorted_catalog1["catalog_id"].values == target_id)
#             rank1 = rank1[0][0]
#             rank2 = np.where(sorted_catalog2["catalog_id"].values == target_id)
#             rank2 = rank2[0][0]
            
#             if rank1 < cutoff and rank2 > cutoff:
#                 print("Ranker 1 was better at matching \"" + input_query["input_text"] + "\" to \"" + catalog_df[catalog_df["catalog_id"] == target_id]["text"].values[0] + "\"")
#             if rank2 < cutoff and rank1 > cutoff:
#                 print("Ranker 2 was better at matching \"" + input_query["input_text"] + "\" to \"" + catalog_df[catalog_df["catalog_id"] == target_id]["text"].values[0] + "\"")
          
# compare_rankers(tf_idf_ranker, trained_embedding_ranker_1, val_catalog_df, val_queries_df)

In [None]:
# Peek at the first rows of the validation catalog instead of dumping the whole frame.
val_catalog_df.head()

In [None]:
# res = evaluate(bm_25_ranker, val_catalog_df, val_queries_df)

In [None]:
# !pip install nlpaug