In [1]:
import pandas as pd
import string
from rank_bm25 import BM25Okapi

import numpy as np




# Features we want in the search engine
1. Given technical terms, return the relevant papers sorted by relevance, showing for each paper the professor's name, the professor's contact, the citation count, the snippet, the abstract, and the arXiv link
    - show the matching professors and allow filtering by professor
2. Display the full list of professors (name, contact, area of interest, papers)

In [2]:
def replace_punct_with_space(text):
    """Replace every ASCII punctuation character in ``text`` with a space.

    Args:
        text: String to clean.

    Returns:
        A string of the same length as ``text`` in which each character that
        appears in ``string.punctuation`` has been swapped for one space.
    """
    # Size the replacement string from the punctuation set itself rather than
    # a hard-coded 32, so the two maketrans arguments can never drift apart.
    punct_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    return text.translate(punct_to_space)


def replace_multiple_spaces_with_single_space(text):
    """Collapse each run of whitespace in ``text`` into a single space.

    ``str.split()`` without a separator splits on any whitespace and discards
    empty pieces, so leading and trailing whitespace is removed as well.
    """
    pieces = text.split()
    return " ".join(pieces)


def preprocess_pipeline(text):
    """Normalize ``text`` for indexing and querying.

    Steps, in order: lowercase, replace punctuation with spaces, then collapse
    whitespace runs into single spaces.

    Args:
        text: String to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    without_punct = replace_punct_with_space(lowered)
    return replace_multiple_spaces_with_single_space(without_punct)


In [3]:
def prepare_prof_info():
    """Load ``prof_info.csv`` and add preprocessed text columns.

    Returns:
        The loaded DataFrame with two extra columns, ``preproc_name`` and
        ``preproc_area_of_interest``, holding the ``name`` and
        ``area_of_interest`` columns run through ``preprocess_pipeline``.
    """
    prof_info_df = pd.read_csv("prof_info.csv")
    for source_col in ("name", "area_of_interest"):
        prof_info_df["preproc_" + source_col] = prof_info_df[source_col].apply(
            preprocess_pipeline
        )
    return prof_info_df


def get_id_to_name(prof_info_df):
    """Build a ``{prof id: prof name}`` dictionary.

    The ``id`` and ``name`` columns are paired row by row; if an id appears
    more than once, the name from the last such row is kept.
    """
    ids = prof_info_df["id"].tolist()
    names = prof_info_df["name"].tolist()
    return dict(zip(ids, names))


def prepare_paper_data():
    """Load ``paper_data_w_abstract.csv`` and build the searchable corpus.

    Columns added to the loaded frame:
      - ``preproc_title``, ``preproc_snippet``, ``preproc_abstract``: the raw
        column run through ``preprocess_pipeline``; missing values are
        treated as empty strings.
      - ``corpus``: the preprocessed title, a space, and then the preprocessed
        abstract -- or the preprocessed snippet when the abstract is empty.

    Returns:
        The DataFrame with the extra columns.
    """
    paper_data_w_abstract_df = pd.read_csv("paper_data_w_abstract.csv")

    # Empty CSV cells are read as NaN (a float), which has no .lower() and
    # would crash preprocess_pipeline. Blank them for all three text columns,
    # not only for the abstract.
    for raw_col in ("title", "snippet", "abstract"):
        paper_data_w_abstract_df["preproc_" + raw_col] = (
            paper_data_w_abstract_df[raw_col].fillna("").apply(preprocess_pipeline)
        )

    ## construct doc corpus
    #   join title with the abstract; fall back to the snippet for rows whose
    #   abstract is empty after preprocessing
    has_abstract = paper_data_w_abstract_df["preproc_abstract"] != ""
    body_text = paper_data_w_abstract_df["preproc_abstract"].where(
        has_abstract, paper_data_w_abstract_df["preproc_snippet"]
    )
    paper_data_w_abstract_df["corpus"] = (
        paper_data_w_abstract_df["preproc_title"] + " " + body_text
    )

    return paper_data_w_abstract_df


def init_search_engine(corpus):
    """Build a ``BM25Okapi`` index over ``corpus``.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A ``BM25Okapi`` instance constructed from the whitespace-tokenized
        documents.
    """
    # split() with no separator drops the empty-string tokens that split(" ")
    # yields for leading, trailing or repeated spaces (for example a document
    # whose title part is empty starts with a space).
    tokenized_corpus = [doc.split() for doc in corpus]
    return BM25Okapi(tokenized_corpus)


In [4]:
def prepare_search_engine(prof_info_df, paper_data_w_abstract_df):
    """Create the search index and the professor id -> name lookup.

    Args:
        prof_info_df: Professor table, passed to ``get_id_to_name``.
        paper_data_w_abstract_df: Paper table; its ``corpus`` column supplies
            the documents handed to ``init_search_engine``.

    Returns:
        Tuple ``(search_engine, id_to_name)``.
    """
    name_by_id = get_id_to_name(prof_info_df)
    bm25_index = init_search_engine(paper_data_w_abstract_df["corpus"].tolist())
    return bm25_index, name_by_id


In [5]:
# Load both tables, then build the BM25 index and the id -> name lookup that
# the later cells use.
prof_info_df = prepare_prof_info()
paper_data_w_abstract_df = prepare_paper_data()
search_engine, id_to_name = prepare_search_engine(prof_info_df, paper_data_w_abstract_df)


In [6]:
def get_relevant_documents_indexes(search_engine, query, k):
    """Return the positions of the best-scoring documents for ``query``.

    Args:
        search_engine: Object with a ``get_scores(tokens)`` method returning a
            numpy array holding one score per indexed document.
        query: Raw query string; it is normalized with ``preprocess_pipeline``
            before tokenization.
        k: Maximum number of results wanted.

    Returns:
        A numpy array of document positions ordered by descending score. It
        holds at most ``k`` entries and never more than the number of
        documents with a non-zero score; it is empty when ``k`` is not
        positive.
    """
    # split() with no separator turns an empty query into [] instead of the
    # bogus single empty-string token that split(" ") produces.
    tokenized_query = preprocess_pipeline(query).split()

    doc_scores = search_engine.get_scores(tokenized_query)
    # Vectorized count; the builtin sum() would walk the array in Python.
    num_of_rel_docs = int((doc_scores != 0).sum())
    # Clamp at zero: a negative k would otherwise slice from the end below
    # and return results instead of none.
    k = max(0, min(num_of_rel_docs, k))

    top_k_indexs = np.argsort(doc_scores)[::-1][:k]
    return top_k_indexs


In [7]:
def get_all_relevant_info(top_k_indexes, prof_info_df, paper_data_w_abstract_df):
    """Collect the paper rows and matching professor rows for ranked hits.

    Args:
        top_k_indexes: Integer row positions into ``paper_data_w_abstract_df``
            in ranked order.
        prof_info_df: Professor table with ``title``, ``name``, ``email``,
            ``link`` and ``area_of_interest`` columns.
        paper_data_w_abstract_df: Paper table with ``prof_id``, ``title``,
            ``link``, ``snippet`` and ``abstract`` columns.

    Returns:
        Tuple ``(extracted_paper_data, extracted_prof_info)``: one paper row
        and one professor row per entry of ``top_k_indexes``, in that order.
    """
    # The indexes are positions, so select with .iloc. Label-based .loc only
    # happens to work while the frame keeps its default 0..n-1 index.
    hit_rows = paper_data_w_abstract_df.iloc[top_k_indexes]
    extracted_paper_data = hit_rows[["title", "link", "snippet", "abstract"]]

    # NOTE(review): this looks professors up by index label, i.e. it assumes
    # prof_info_df's index labels equal the prof_id values -- confirm, or
    # index by the "id" column if that is what prof_id refers to.
    extracted_prof_info = prof_info_df.loc[
        hit_rows["prof_id"], ["title", "name", "email", "link", "area_of_interest"]
    ]

    return extracted_paper_data, extracted_prof_info


In [8]:
def search_pipeline(prof_info_df, paper_data_w_abstract_df, search_engine, query, k):
    """Run a query end to end.

    Ranks documents with ``get_relevant_documents_indexes`` and then fetches
    the corresponding rows with ``get_all_relevant_info``.

    Returns:
        Tuple ``(extracted_paper_data, extracted_prof_info)``.
    """
    ranked_positions = get_relevant_documents_indexes(search_engine, query, k)
    paper_rows, prof_rows = get_all_relevant_info(
        ranked_positions, prof_info_df, paper_data_w_abstract_df
    )
    return paper_rows, prof_rows


In [10]:
# Example search: ask for at most ten hits for a sample query.
query = "variational inference"
extracted_paper_data, extracted_prof_info = search_pipeline(
    prof_info_df,
    paper_data_w_abstract_df,
    search_engine,
    query=query,
    k=10,
)


In [11]:
# Show the paper rows returned for the query above, best score first.
extracted_paper_data


Unnamed: 0,title,link,snippet,abstract
565,Ship detection in polarimetric SAR images via ...,https://ieeexplore.ieee.org/abstract/document/...,"In this paper, we propose a novel ship detecti...",
434,On the necessity and effectiveness of learning...,https://arxiv.org/abs/1905.13452,Using powerful posterior distributions is a po...,
435,Unsupervised clustering through gaussian mixtu...,https://ieeexplore.ieee.org/abstract/document/...,Clustering has long been an important research...,
734,Vflow: More expressive generative flows with v...,http://proceedings.mlr.press/v119/chen20p.html,Generative flows are promising tractable model...,\nAbstract: Generative flows are promising tr...
45,Vflow: More expressive generative flows with v...,http://proceedings.mlr.press/v119/chen20p.html,Generative flows are promising tractable model...,\nAbstract: Generative flows are promising tr...
514,Tr-bert: Dynamic token reduction for accelerat...,https://arxiv.org/abs/2105.11618,Existing pre-trained language models (PLMs) ar...,
544,Low latency RNN inference with cellular batching,https://dl.acm.org/doi/abs/10.1145/3190508.319...,Performing inference on pre-trained neural net...,
47,ZhuSuan: A library for Bayesian deep learning,https://arxiv.org/abs/1709.05870,"In this paper we introduce ZhuSuan, a python p...",\nAbstract: In this paper we introduce ZhuSua...
51,Big learning with Bayesian methods,https://academic.oup.com/nsr/article-abstract/...,The explosive growth in data volume and the av...,\nAbstract: Bayesian inference provides a met...
75,Svqn: Sequential variational soft q-learning n...,https://openreview.net/pdf?id=9_oTSv35eS3,Partially Observable Markov Decision Processes...,


In [12]:
# Show the professor row looked up for each returned paper, in the same order;
# a professor appears once per matching paper, so rows can repeat.
extracted_prof_info


Unnamed: 0,title,name,email,link,area_of_interest
28,Associate Professor,Bin Xu,xubin@tsinghua.edu.cn,http://keg.cs.tsinghua.edu.cn/persons/xubin/,"Computer Vision, computational neuroscience, d..."
21,Associate Professor,Dan Pei,peidan@tsinghua.edu.cn,https://netman.aiops.org/~peidan/,"Data Mining, Autonomous IT Operations, AIOps (..."
21,Associate Professor,Dan Pei,peidan@tsinghua.edu.cn,https://netman.aiops.org/~peidan/,"Data Mining, Autonomous IT Operations, AIOps (..."
36,Professor,Jun Zhu,dcszj@tsinghua.edu.cn,http://ml.cs.tsinghua.edu.cn/~jun/index.shtml,Machine Learning
2,Assistant Professor,Jianfei Chen,jianfeic@tsinghua.edu.cn,https://ml.cs.tsinghua.edu.cn/~jianfei/,Deep Learning
25,Professor,Maosong Sun,sms@tsinghua.edu.cn,http://www.cs.tsinghua.edu.cn/publish/csen/462...,"Artificial intelligence, deep learning, natura..."
27,Professor,Yongwei Wu,wuyw@tsinghua.edu.cn,http://madsys.cs.tsinghua.edu.cn/~yongweiwu/,Parallel and Distributed Computing
2,Assistant Professor,Jianfei Chen,jianfeic@tsinghua.edu.cn,https://ml.cs.tsinghua.edu.cn/~jianfei/,Deep Learning
2,Assistant Professor,Jianfei Chen,jianfeic@tsinghua.edu.cn,https://ml.cs.tsinghua.edu.cn/~jianfei/,Deep Learning
3,Professor,Ting Chen,tingchen@tsinghua.edu.cn,http://www.cssb.tsinghua.edu.cn/en/participant...,"Machine Learning, Computational Biology, Medic..."


$\Lambda$