In [1]:
import pandas as pd

# Name of the dataset sub-folder under data/ that every CSV below is read from.
DATASET_NAME = "covid-abstracts"

# Load the catalog and query tables for the train and validation splits.
train_catalog_df = pd.read_csv(f"data/{DATASET_NAME}/train_catalog.csv")
train_queries_df = pd.read_csv(f"data/{DATASET_NAME}/train_queries.csv")
val_catalog_df = pd.read_csv(f"data/{DATASET_NAME}/val_catalog.csv")
val_queries_df = pd.read_csv(f"data/{DATASET_NAME}/val_queries.csv")
# Only the training split's row counts are reported here:
# catalog rows as "Documents", query rows as "Judgments".
print(f"Loaded {len(train_catalog_df.index)} Documents")
print(f"Loaded {len(train_queries_df.index)} Judgments")

Loaded 6000 Documents
Loaded 12000 Judgments


In [2]:
# Display the training catalog as the cell's output.
train_catalog_df

Unnamed: 0,catalog_id,title,text,url
0,9761,SARS-CoV-2 spike protein displays sequence sim...,Recent emergence of SARS-CoV-2 and associated ...,https://pubmed.ncbi.nlm.nih.gov/34855795
1,3159,Effect of Different Disease-Modifying Therapie...,Objectives Vaccination against COVID-19 is hi...,https://pubmed.ncbi.nlm.nih.gov/34956211
2,1489,APOBEC-mediated Editing of SARS-CoV-2 Genomic ...,During COVID-19 pandemic mutations of SARS-Co...,https://pubmed.ncbi.nlm.nih.gov/34981048
3,2987,PEDIATRIC NEPHROLOGY IN THE TIME OF CORONA,INTRODUCTION Symptomatic coronavirus 2019 CO...,https://pubmed.ncbi.nlm.nih.gov/34957711
4,812,Postoperative Outcomes Analysis After Pancreat...,Background Surgical resection is the only pos...,https://pubmed.ncbi.nlm.nih.gov/34993230
...,...,...,...,...
5995,5734,Perception of the risk of COVID-19 and prevent...,Background The characteristics of the transmi...,https://pubmed.ncbi.nlm.nih.gov/34914344
5996,5191,Maximizing the Role of Emergency Medical Servi...,The Centers for Disease Control and Prevention...,https://pubmed.ncbi.nlm.nih.gov/34924090
5997,5390,Vaccine effectiveness against COVID-19 hospita...,BACKGROUND Measuring vaccine effectiveness V...,https://pubmed.ncbi.nlm.nih.gov/34920180
5998,860,Potential Molecular Mechanisms and Remdesivir ...,Introduction Severe acute respiratory syndrom...,https://pubmed.ncbi.nlm.nih.gov/34992355


In [12]:
import numpy as np
import numpy.random
from rank_bm25 import BM25Okapi
from sentence_transformers import InputExample, losses
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import DataLoader

class RandomRanker:
    """Baseline ranker that gives every catalog row a uniformly random score.

    It exposes the same ``train`` / ``prerun`` / ``get_score`` methods as the
    other rankers in this notebook, so it can be passed to the same code.
    """

    def __init__(self, seed=None):
        """Create the ranker.

        Args:
            seed: Optional seed. When given, scores are drawn from a private
                ``np.random.RandomState(seed)`` so repeated runs produce the
                same scores. When omitted (the default), NumPy's global random
                state is used.
        """
        self._rng = np.random if seed is None else np.random.RandomState(seed)

    def train(self, catalog_df, queries_df):
        """No-op: a random baseline has nothing to learn."""
        pass

    def prerun(self, catalog_df):
        """No-op: nothing needs to be precomputed for the catalog."""
        pass

    def get_score(self, query, catalog_df):
        """Return random scores for the catalog; the query is ignored.

        Args:
            query: Unused.
            catalog_df: Sized collection of catalog entries; only its length
                is used.

        Returns:
            dict with key ``"scores"``: an array of ``len(catalog_df)`` floats
            drawn uniformly from [0, 1).
        """
        return {
            "scores": self._rng.uniform(0, 1, size=len(catalog_df))
        }
    
def levenshtein_distance(word1, word2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.

    Args:
        word1: First sequence (typically a string).
        word2: Second sequence (typically a string).

    Returns:
        The edit distance as a non-negative integer.
    """
    # Iterate the longer sequence on the outer loop so each stored row has
    # the length of the shorter one.
    if len(word1) >= len(word2):
        outer, inner = word1, word2
    else:
        outer, inner = word2, word1

    # Against an empty sequence, every element of the other must be inserted.
    if len(inner) == 0:
        return len(outer)

    prev_costs = list(range(len(inner) + 1))
    for row_number, outer_char in enumerate(outer, start=1):
        costs = [row_number]
        for col, inner_char in enumerate(inner):
            costs.append(min(
                prev_costs[col + 1] + 1,                      # insertion
                costs[col] + 1,                               # deletion
                prev_costs[col] + (outer_char != inner_char),  # substitution
            ))
        prev_costs = costs

    return prev_costs[-1]
    

def normalized_levenshtein_distance(word1, word2):
    """Return the Levenshtein distance divided by the longer input's length.

    Args:
        word1: First sequence (typically a string).
        word2: Second sequence (typically a string).

    Returns:
        ``levenshtein_distance(word1, word2) / max(len(word1), len(word2))``,
        or ``0.0`` when both inputs are empty.
    """
    max_length = max(len(word1), len(word2))
    if max_length == 0:
        # Two empty sequences are identical; return 0.0 rather than dividing
        # by zero.
        return 0.0
    return levenshtein_distance(word1, word2) / max_length

    
class LevensteinRanker:
    """Ranker that scores catalog rows by edit distance to the query text.

    Every ``get_score`` call compares the query against the ``text`` column
    of each catalog row with ``normalized_levenshtein_distance``.
    """

    def __init__(self):
        """Print a one-line notice that this ranker is slow."""
        print("Warning! This is a slow ranker")

    def train(self, catalog_df, queries_df):
        """No-op: edit distance has no trainable parameters."""
        pass

    def prerun(self, catalog_df):
        """No-op: nothing is precomputed for the catalog."""
        pass

    def get_score(self, query, catalog_df):
        """Score each catalog row against the query.

        Args:
            query: Mapping with the query string under ``"input_text"``.
            catalog_df: DataFrame with a ``"text"`` column.

        Returns:
            dict with key ``"scores"``: an array with one value per catalog
            row, equal to the negated normalized edit distance.
        """
        text = query["input_text"]

        # Negate the distance so that closer texts receive higher scores.
        return {
            "scores": catalog_df["text"].apply(lambda x: -normalized_levenshtein_distance(x, text)).values
        }
    
class TfidfRanker:
    """Ranker that scores catalog rows by TF-IDF dot product with the query.

    The vectorized catalog is cached after the first ``get_score`` call for a
    given catalog DataFrame object, so later queries against the same object
    only vectorize the query instead of the whole catalog again.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b', lowercase=True)
        # Cache of the last vectorized catalog: the DataFrame object it was
        # built from, and the resulting document-term matrix.
        self._cached_catalog = None
        self._cached_matrix = None

    def train(self, catalog_df, queries_df):
        """Fit the vectorizer on the lowercased catalog text.

        Args:
            catalog_df: DataFrame with a ``"text"`` column.
            queries_df: Unused.
        """
        self.vectorizer.fit(catalog_df['text'].str.lower())
        # A refit vectorizer makes any previously cached matrix stale.
        self._cached_catalog = None
        self._cached_matrix = None

    def prerun(self, catalog_df):
        """Drop any cached catalog matrix.

        The matrix is rebuilt lazily by the next ``get_score`` call, so this
        method does not require the vectorizer to be fitted yet. Calling it
        before scoring also refreshes the cache if the catalog DataFrame was
        modified in place.
        """
        self._cached_catalog = None
        self._cached_matrix = None

    def get_score(self, query, catalog_df):
        """Score each catalog row against the query.

        Args:
            query: Mapping with the query string under ``"input_text"``.
            catalog_df: DataFrame with a ``"text"`` column.

        Returns:
            dict with key ``"scores"``: a flat array with one score per
            catalog row.
        """
        text = query["input_text"].lower()
        tfidf_matrix = self._catalog_matrix(catalog_df)
        query_vector = self.vectorizer.transform([text])
        # `@` is a matrix product for both sparse matrices and sparse arrays.
        scores = (tfidf_matrix @ query_vector.T).toarray()
        return {
            "scores": scores.flatten()
        }

    def _catalog_matrix(self, catalog_df):
        """Return the vectorized catalog, reusing the cache when it applies."""
        # getattr keeps this working for instances created without __init__.
        cached_matrix = getattr(self, "_cached_matrix", None)
        cached_catalog = getattr(self, "_cached_catalog", None)
        if (
            cached_matrix is not None
            and cached_catalog is catalog_df
            and cached_matrix.shape[0] == len(catalog_df)
        ):
            return cached_matrix

        matrix = self.vectorizer.transform(catalog_df['text'].str.lower())
        self._cached_catalog = catalog_df
        self._cached_matrix = matrix
        return matrix
    
class BM25Ranker:
    """Ranker backed by a ``BM25Okapi`` index over space-split, lowercased text."""

    def __init__(self):
        pass

    def train(self, catalog_df, queries_df):
        """No-op: the index is built in ``prerun``."""
        pass

    def prerun(self, catalog_df):
        """Build the BM25 index from the catalog's ``"text"`` column.

        Each document is lowercased and split on single spaces.
        """
        lowered_docs = catalog_df['text'].str.lower().tolist()
        self.bm25 = BM25Okapi(list(map(lambda doc: doc.split(" "), lowered_docs)))

    def get_score(self, query, catalog_df):
        """Score the indexed documents against the query.

        Args:
            query: Mapping with the query string under ``"input_text"``.
            catalog_df: Unused; scores refer to the catalog given to ``prerun``.

        Returns:
            dict with key ``"scores"`` holding whatever the index's
            ``get_scores`` returns for the lowercased, space-split query.
        """
        query_tokens = query["input_text"].lower().split(" ")
        return {"scores": self.bm25.get_scores(query_tokens)}
    
class EmbeddingRanker:
    """Ranker that scores catalog rows by cosine similarity of sentence embeddings."""

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the SentenceTransformer model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def train(self, catalog_df, queries_df, epochs=0):
        """Optionally fine-tune the model with a triplet loss.

        Args:
            catalog_df: DataFrame with ``"catalog_id"`` and ``"text"`` columns.
            queries_df: DataFrame with ``"input_text"`` and ``"match_id"``
                columns, and optionally a ``"judgment"`` column.
            epochs: Number of fine-tuning epochs; ``0`` (default) leaves the
                model untouched.
        """
        if epochs == 0:
            return

        triplets = self._build_triplets(catalog_df, queries_df)
        examples = [InputExample(texts=list(triplet)) for triplet in triplets]
        print(f"{len(examples)} Examples Found")

        if not examples:
            # Nothing to train on; skip building a DataLoader over no data.
            return

        train_dataloader = DataLoader(examples, shuffle=True, batch_size=16)
        train_loss = losses.TripletLoss(self.model)

        self.model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=epochs)

    @staticmethod
    def _build_triplets(catalog_df, queries_df):
        """Build ``(query, positive_text, negative_text)`` training triplets.

        Queries whose ``match_id`` is not in the catalog are skipped. The
        negative is the text of a uniformly sampled catalog row whose id
        differs from the positive id.
        """
        catalog_ids = catalog_df['catalog_id'].to_numpy()

        # One pass to map id -> text; on duplicate ids the first row wins.
        text_by_id = {}
        for catalog_id, catalog_text in zip(catalog_ids, catalog_df['text'].to_numpy()):
            text_by_id.setdefault(catalog_id, catalog_text)

        if len(text_by_id) < 2:
            # With fewer than two distinct ids no negative can be sampled.
            return []

        if 'judgment' in queries_df.columns:
            # NOTE(review): assumes `judgment` flags whether `match_id` is a
            # correct match, which is how the notebook's evaluate() uses it;
            # other rows are not used as positives.
            queries_df = queries_df[queries_df['judgment'].eq(True)]

        triplets = []
        for text, positive_id in zip(queries_df['input_text'], queries_df['match_id']):
            if positive_id not in text_by_id:
                continue
            # Rejection-sample a row with a different id; this terminates
            # because at least two distinct ids exist.
            while True:
                negative_id = catalog_ids[np.random.randint(len(catalog_ids))]
                if negative_id != positive_id:
                    break
            triplets.append((text, text_by_id[positive_id], text_by_id[negative_id]))
        return triplets

    def prerun(self, catalog_df):
        """Embed the lowercased catalog text and keep it for scoring."""
        self.corpus = catalog_df['text'].str.lower().tolist()
        self.corpus_embeddings = self.get_embeddings(self.corpus)

    def get_score(self, query, catalog_df):
        """Score the catalog embedded by ``prerun`` against the query.

        Args:
            query: Mapping with the query string under ``"input_text"``.
            catalog_df: Unused; scores refer to the catalog given to ``prerun``.

        Returns:
            dict with key ``"scores"``: a flat array of cosine similarities,
            one per embedded catalog row.
        """
        # NOTE(review): the query is embedded as-is while the corpus is
        # lowercased in prerun; confirm this asymmetry is intended.
        query_embedding = self.get_embeddings([query["input_text"]])
        scores = cosine_similarity(query_embedding, self.corpus_embeddings)
        return {
            "scores": scores.flatten()
        }

    def get_embeddings(self, texts):
        """Encode a list of texts with the underlying model."""
        return self.model.encode(texts)
    
# Build each ranker and smoke-test it on the training catalog with the
# one-word query "Remote". The get_score results are not stored.

# Embedding ranker used as-is: epochs=0 makes train() skip fine-tuning.
embedding_ranker = EmbeddingRanker()
embedding_ranker.train(train_catalog_df, train_queries_df, epochs=0)
embedding_ranker.prerun(train_catalog_df)
embedding_ranker.get_score({"input_text": "Remote"}, train_catalog_df)
    
# Second embedding ranker, fine-tuned for one epoch on the training queries.
trained_embedding_ranker = EmbeddingRanker()
trained_embedding_ranker.train(train_catalog_df, train_queries_df, epochs=1)
trained_embedding_ranker.prerun(train_catalog_df)
trained_embedding_ranker.get_score({"input_text": "Remote"}, train_catalog_df)
    
# TF-IDF ranker: train() fits the vectorizer on the training catalog text.
tf_idf_ranker = TfidfRanker()
tf_idf_ranker.train(train_catalog_df, train_queries_df)
tf_idf_ranker.prerun(train_catalog_df)
tf_idf_ranker.get_score({"input_text": "Remote"}, train_catalog_df)

# BM25 ranker: train() is a no-op; prerun() builds the index.
bm_25_ranker = BM25Ranker()
bm_25_ranker.train(train_catalog_df, train_queries_df)
bm_25_ranker.prerun(train_catalog_df)
# Last expression of the cell, so its value is what the notebook displays.
bm_25_ranker.get_score({"input_text": "Remote"}, train_catalog_df)

9586 Examples Found


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/600 [00:00<?, ?it/s]

{'scores': array([0., 0., 0., ..., 0., 0., 0.])}

In [13]:
from tqdm import tqdm

def evaluate(ranker, catalog_df, queries_df):
    """Compute the rank of each query's target and top-k hit rates for a ranker.

    For every query row whose ``judgment`` equals True, the ranker scores the
    whole catalog; the rank is the 0-based position of the row whose
    ``catalog_id`` equals the query's ``match_id`` when the catalog is ordered
    by descending score.

    Args:
        ranker: Object with ``prerun(catalog_df)`` and
            ``get_score(query, catalog_df)`` returning ``{"scores": array}``.
        catalog_df: DataFrame with a ``"catalog_id"`` column.
        queries_df: DataFrame with ``"match_id"`` and ``"judgment"`` columns;
            the remaining columns are passed to the ranker as the query dict.

    Returns:
        dict with:
            ``"ranks"``: int array of 0-based ranks, one per evaluated query.
            ``"top_1"``, ``"top_10"``, ``"top_100"``, ``"top_1000"``: fraction
            of evaluated queries with rank below k (NaN if none was evaluated).
            ``"num_skipped"``: number of True-judgment queries whose target
            id was not present in the catalog and were therefore skipped.
    """
    ranker.prerun(catalog_df)
    catalog_ids = catalog_df["catalog_id"].values

    ranks = []
    num_skipped = 0
    for _, row in tqdm(queries_df.iterrows(), total=len(queries_df.index)):
        input_query = dict(row)
        # The ranker must not see the answer or the label.
        target_id = input_query.pop("match_id")
        judgment = input_query.pop("judgment")

        if not (judgment == True):
            continue

        scores = ranker.get_score(input_query, catalog_df)["scores"]
        # Positions of the target within the catalog ordered by descending score.
        best_first = np.argsort(-scores)
        hits = np.flatnonzero(catalog_ids[best_first] == target_id)
        if hits.size == 0:
            # The target is not in this catalog, so it has no rank.
            num_skipped += 1
            continue
        ranks.append(hits[0])

    ranks = np.array(ranks, dtype=np.int64)

    def hit_rate(k):
        # Guard the empty case instead of dividing by zero.
        return np.mean(ranks < k) if ranks.size else float("nan")

    return {
        "ranks": ranks,
        "top_1": hit_rate(1),
        "top_10": hit_rate(10),
        "top_100": hit_rate(100),
        "top_1000": hit_rate(1000),
        "num_skipped": num_skipped,
    }

# Report the top-10 hit rate of every ranker on the validation split.
# Each evaluate() call runs a full pass over the validation queries.
print(f'Top 10 Random: {evaluate(RandomRanker(), val_catalog_df, val_queries_df)["top_10"]}')
print(f'Top 10 TF-IDF: {evaluate(tf_idf_ranker, val_catalog_df, val_queries_df)["top_10"]}')
print(f'Top 10 BM-25: {evaluate(bm_25_ranker, val_catalog_df, val_queries_df)["top_10"]}')
print(f'Top 10 Sentence Transformer: {evaluate(embedding_ranker, val_catalog_df, val_queries_df)["top_10"]}')
print(f'Top 10 Fine Tuned Sentence Transformer: {evaluate(trained_embedding_ranker, val_catalog_df, val_queries_df)["top_10"]}')

100%|██████████| 4000/4000 [00:01<00:00, 3845.52it/s]


Top 10 Random: 0.005


100%|██████████| 4000/4000 [07:42<00:00,  8.66it/s]


Top 10 TF-IDF: 0.9615


100%|██████████| 4000/4000 [00:13<00:00, 288.70it/s]


Top 10 BM-25: 0.97


100%|██████████| 4000/4000 [00:19<00:00, 200.94it/s]


Top 10 Sentence Transformer: 0.955


100%|██████████| 4000/4000 [00:19<00:00, 200.13it/s]

Top 10 Fine Tuned Sentence Transformer: 0.96





In [36]:
# Display the validation catalog as the cell's output.
val_catalog_df

Unnamed: 0,catalog_id,title,text,url
0,8113,Persistence of Symptoms After Discharge of Pat...,Many patients who had coronavirus disease 2019...,https://pubmed.ncbi.nlm.nih.gov/34881263
1,1562,Impact of the Salud Mesoamerica Initiative on ...,BACKGROUND The Salud Mesoamerica Initiative ...,https://pubmed.ncbi.nlm.nih.gov/34979990
2,5735,SARS-CoV-2 damages male fertility How and why,The male reproductive system has a structural ...,https://pubmed.ncbi.nlm.nih.gov/34914248
3,9834,Exploring Knowledge Prevention Methods and P...,BACKGROUND As the COVID-19 pandemic has unfol...,https://pubmed.ncbi.nlm.nih.gov/34854344
4,5379,Prognostic role of Interleukin-6 lymphocytes r...,INTRODUCTION AND AIM Interleukin-6 to lymphoc...,https://pubmed.ncbi.nlm.nih.gov/34920336
...,...,...,...,...
1995,2515,Pediatric Emergency Medicine Didactics and Sim...,Introduction Hyperleukocytosis an infrequent...,https://pubmed.ncbi.nlm.nih.gov/34963902
1996,852,How and When Telework Improves Job Performance...,Purpose Literature shows that it is a paradox...,https://pubmed.ncbi.nlm.nih.gov/34992479
1997,9115,The real economic costs of COVID-19 Insights ...,The COVID-19 pandemic has caused extreme econo...,https://pubmed.ncbi.nlm.nih.gov/34866706
1998,4359,The second mother How the baby food industry...,Most babies in France are fed with infant form...,https://pubmed.ncbi.nlm.nih.gov/34935291


In [38]:
# Re-run the BM25 evaluation on the validation split and keep the whole
# result dict (per-query ranks plus top-k rates) in `res`.
res = evaluate(bm_25_ranker, val_catalog_df, val_queries_df)

100%|██████████| 4000/4000 [00:13<00:00, 300.94it/s]
