In [11]:
import json
import string

import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi

# API 1: Given search terms
- prof_id
- doc_id (row index of the paper)

In [22]:
def replace_punct_with_space(text):
    """Return ``text`` with each ASCII punctuation character replaced by a space.

    Every character listed in ``string.punctuation`` is swapped one-for-one,
    so the length of the string does not change; all other characters are
    left untouched.
    """
    punct_to_space = dict.fromkeys(string.punctuation, " ")
    return text.translate(str.maketrans(punct_to_space))


def replace_multiple_spaces_with_single_space(text):
    """Collapse every whitespace run in ``text`` to one space and trim both ends."""
    # split() with no separator breaks on any run of whitespace and discards
    # empty pieces, so re-joining gives single-space-separated words.
    words = text.split()
    return " ".join(words)


def preprocess_pipeline(text):
    """Normalise raw text for indexing and querying.

    The text is lowercased, its punctuation is turned into spaces, and the
    resulting whitespace runs are collapsed to single spaces.
    """
    lowered = text.lower()
    without_punct = replace_punct_with_space(lowered)
    return replace_multiple_spaces_with_single_space(without_punct)


def prepare_prof_info(path="data/prof_info.csv"):
    """Load the professor table and add preprocessed text columns.

    Parameters
    ----------
    path : str or path-like, default "data/prof_info.csv"
        CSV file to read. It must provide ``name`` and ``area_of_interest``
        columns.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus ``preproc_name`` and
        ``preproc_area_of_interest``, each produced by ``preprocess_pipeline``.
    """
    prof_info_df = pd.read_csv(path)
    prof_info_df["preproc_name"] = prof_info_df["name"].apply(preprocess_pipeline)
    # A blank CSV cell is read back as NaN (a float), which has no .lower();
    # treat a missing area of interest as empty text, the same way missing
    # paper abstracts are handled.
    prof_info_df["preproc_area_of_interest"] = (
        prof_info_df["area_of_interest"].fillna("").apply(preprocess_pipeline)
    )
    return prof_info_df


def get_id_to_name(prof_info_df):
    """Build a ``{prof_id: prof_name}`` dict from the ``id`` and ``name`` columns.

    If an id occurs more than once, the name from its last row is kept.
    """
    ids = prof_info_df["id"].tolist()
    names = prof_info_df["name"].tolist()
    return dict(zip(ids, names))


def prepare_paper_data(path="data/paper_data_NEW.csv"):
    """Load the paper table and build the searchable text for every paper.

    Parameters
    ----------
    path : str or path-like, default "data/paper_data_NEW.csv"
        CSV file to read. It must provide ``paper_title`` and
        ``paper_abstract`` columns.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus ``preproc_title``, ``preproc_abstract`` and
        ``corpus`` (preprocessed title and abstract joined by one space).
    """
    papers = pd.read_csv(path)
    papers["preproc_title"] = papers["paper_title"].apply(preprocess_pipeline)
    # Abstracts may be missing; NaN is replaced by "" before preprocessing.
    papers["preproc_abstract"] = (
        papers["paper_abstract"].fillna("").apply(preprocess_pipeline)
    )

    # Document text = title + abstract. Strip so that a paper without an
    # abstract does not end in a dangling space, which would turn into an
    # empty-string token when the corpus is split on " ".
    papers["corpus"] = (
        papers["preproc_title"] + " " + papers["preproc_abstract"]
    ).str.strip()

    return papers


def init_search_engine(corpus):
    """Build a BM25Okapi index over ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        One already-preprocessed text per document.

    Returns
    -------
    BM25Okapi
        Index built from the whitespace-tokenized documents.
    """
    # split() without a separator never produces empty-string tokens, unlike
    # split(" "), which yields "" for a trailing space or doubled spaces.
    tokenized_corpus = [doc.split() for doc in corpus]
    return BM25Okapi(tokenized_corpus)


def prepare_search_engine(prof_info_df, paper_data_w_abstract_df):
    """Create the search index and the professor id-to-name lookup.

    Returns a ``(search_engine, id_to_name)`` tuple: the index is built from
    the ``corpus`` column of ``paper_data_w_abstract_df`` and the lookup from
    ``prof_info_df``.
    """
    name_lookup = get_id_to_name(prof_info_df)
    engine = init_search_engine(paper_data_w_abstract_df["corpus"].tolist())
    return engine, name_lookup


def get_relevant_documents_indexes(search_engine, query, k):
    """Return the positions of the (at most) ``k`` best-scoring documents.

    Parameters
    ----------
    search_engine : object
        Anything exposing ``get_scores(tokens)`` that returns one score per
        indexed document.
    query : str
        Raw search text; it is run through ``preprocess_pipeline`` first.
    k : int
        Maximum number of hits. Values below zero are treated as zero.

    Returns
    -------
    numpy.ndarray
        Integer document positions, highest score first. Documents whose
        score is exactly zero are never returned, so fewer than ``k``
        positions may come back; an empty or punctuation-only query yields
        an empty array.
    """
    # split() (no separator) gives [] for an empty query, whereas split(" ")
    # gives [""], a bogus token that can match documents containing "".
    tokenized_query = preprocess_pipeline(query).split()
    if not tokenized_query:
        return np.empty(0, dtype=np.intp)

    doc_scores = np.asarray(search_engine.get_scores(tokenized_query))
    num_of_rel_docs = int(np.count_nonzero(doc_scores))
    # Clamp at zero: a negative k would otherwise slice from the wrong end.
    k = max(0, min(num_of_rel_docs, k))

    top_k_indexes = np.argsort(doc_scores)[::-1][:k]
    return top_k_indexes


def get_all_relevant_info(top_k_indexes, prof_info_df, paper_data_w_abstract_df):
    """Fetch the paper rows and matching professor rows for a set of hits.

    Parameters
    ----------
    top_k_indexes : array-like of int
        Index labels of the selected rows in ``paper_data_w_abstract_df``.
    prof_info_df : pandas.DataFrame
        Professor table with ``title``, ``name``, ``email``, ``link`` and
        ``area_of_interest`` columns.
        NOTE(review): rows are looked up by index label, so this assumes the
        frame's index labels coincide with ``prof_id`` values - confirm.
    paper_data_w_abstract_df : pandas.DataFrame
        Paper table with ``prof_id``, ``paper_title``, ``paper_link`` and
        ``paper_abstract`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(extracted_paper_data, extracted_prof_info)``, both ordered like
        ``top_k_indexes``.
    """
    extracted_prof_ids = paper_data_w_abstract_df.loc[top_k_indexes, "prof_id"]

    # The paper table names its columns paper_title / paper_link /
    # paper_abstract; the unprefixed names (and "snippet") are not present in
    # it, so selecting them raised KeyError.
    extracted_paper_data = paper_data_w_abstract_df.loc[
        top_k_indexes, ["paper_title", "paper_link", "paper_abstract"]
    ]

    extracted_prof_info = prof_info_df.loc[
        extracted_prof_ids.tolist(),
        ["title", "name", "email", "link", "area_of_interest"],
    ]

    return extracted_paper_data, extracted_prof_info


def search_pipeline(prof_info_df, paper_data_w_abstract_df, search_engine, query, k):
    """Run ``query`` against the index and return the details of the top hits.

    Returns ``(extracted_paper_data, extracted_prof_info)`` exactly as
    produced by ``get_all_relevant_info`` for the best ``k`` matches.
    """
    hits = get_relevant_documents_indexes(search_engine, query, k)
    paper_rows, prof_rows = get_all_relevant_info(
        hits, prof_info_df, paper_data_w_abstract_df
    )
    return paper_rows, prof_rows


def search_pipeline_indexes(
    prof_info_df, paper_data_w_abstract_df, search_engine, query, k
):
    """Return the positions of the top-``k`` documents for ``query`` as a list.

    ``prof_info_df`` and ``paper_data_w_abstract_df`` are accepted but not
    used; only the search engine is consulted.
    """
    return get_relevant_documents_indexes(search_engine, query, k).tolist()

In [23]:
# Load both tables, then index the paper corpus and build the id-to-name map.
prof_info_df = prepare_prof_info()
paper_data_w_abstract_df = prepare_paper_data()
search_engine, id_to_name = prepare_search_engine(
    prof_info_df=prof_info_df,
    paper_data_w_abstract_df=paper_data_w_abstract_df,
)

In [24]:
# Example request: the 20 best-matching papers for a free-text query.
query = "variational inference"
doc_ids = search_pipeline_indexes(
    prof_info_df,
    paper_data_w_abstract_df,
    search_engine,
    query,
    k=20,
)

In [25]:
# Professor id of each hit, kept in the same (rank) order as doc_ids.
prof_ids = (
    paper_data_w_abstract_df
    .loc[doc_ids, "prof_id"]
    .tolist()
)

In [26]:
# One record per hit, pairing each document with the professor it belongs to.
N = len(doc_ids)
json_output = [
    {"doc_id": doc_id, "prof_id": prof_id}
    for doc_id, prof_id in zip(doc_ids, prof_ids)
]

In [27]:
# `json` is imported with the other modules at the top of the notebook.
# Serialize only once: re-running this cell on the already-encoded string
# would otherwise wrap it in a second layer of JSON quoting.
if not isinstance(json_output, str):
    json_output = json.dumps(json_output)

In [28]:
# Display the serialized list of {"doc_id", "prof_id"} records.
json_output

'[{"doc_id": 1171, "prof_id": 25}, {"doc_id": 1028, "prof_id": 21}, {"doc_id": 38, "prof_id": 2}, {"doc_id": 41, "prof_id": 2}, {"doc_id": 44, "prof_id": 2}, {"doc_id": 42, "prof_id": 2}, {"doc_id": 1693, "prof_id": 36}, {"doc_id": 24, "prof_id": 2}, {"doc_id": 1689, "prof_id": 36}, {"doc_id": 452, "prof_id": 10}, {"doc_id": 1527, "prof_id": 33}, {"doc_id": 1167, "prof_id": 25}, {"doc_id": 45, "prof_id": 2}, {"doc_id": 1518, "prof_id": 33}, {"doc_id": 912, "prof_id": 17}, {"doc_id": 532, "prof_id": 11}, {"doc_id": 1215, "prof_id": 25}, {"doc_id": 1112, "prof_id": 23}, {"doc_id": 1022, "prof_id": 21}, {"doc_id": 816, "prof_id": 15}]'

# API 2: Given prof_id
- name
- email
- area of interest

In [29]:
# Professor details for every hit, in rank order; a professor appears once
# per matching paper. Rows are selected by index label.
# NOTE(review): assumes prof_info_df's index labels equal prof_id - confirm.
prof_info_df.loc[prof_ids, ["name", "title", "email", "area_of_interest", "link"]]

Unnamed: 0,name,title,email,area_of_interest,link
25,Maosong Sun,Professor,sms@tsinghua.edu.cn,"Artificial intelligence, deep learning, natura...",http://www.cs.tsinghua.edu.cn/publish/csen/462...
21,Dan Pei,Associate Professor,peidan@tsinghua.edu.cn,"Data Mining, Autonomous IT Operations, AIOps (...",https://netman.aiops.org/~peidan/
2,Jianfei Chen,Assistant Professor,jianfeic@tsinghua.edu.cn,Deep Learning,https://ml.cs.tsinghua.edu.cn/~jianfei/
2,Jianfei Chen,Assistant Professor,jianfeic@tsinghua.edu.cn,Deep Learning,https://ml.cs.tsinghua.edu.cn/~jianfei/
2,Jianfei Chen,Assistant Professor,jianfeic@tsinghua.edu.cn,Deep Learning,https://ml.cs.tsinghua.edu.cn/~jianfei/
2,Jianfei Chen,Assistant Professor,jianfeic@tsinghua.edu.cn,Deep Learning,https://ml.cs.tsinghua.edu.cn/~jianfei/
36,Jun Zhu,Professor,dcszj@tsinghua.edu.cn,Machine Learning,http://ml.cs.tsinghua.edu.cn/~jun/index.shtml
2,Jianfei Chen,Assistant Professor,jianfeic@tsinghua.edu.cn,Deep Learning,https://ml.cs.tsinghua.edu.cn/~jianfei/
36,Jun Zhu,Professor,dcszj@tsinghua.edu.cn,Machine Learning,http://ml.cs.tsinghua.edu.cn/~jun/index.shtml
10,Minlie Huang,Associate Professor,aihuang@tsinghua.edu.cn,"Machine learning applications, deep learning, ...",http://coai.cs.tsinghua.edu.cn/hml


# API 3: Given doc_id
- title
- abstract
- link

In [31]:
# Inspect the prepared paper table: the CSV columns plus the preproc_title,
# preproc_abstract and corpus columns added by prepare_paper_data.
paper_data_w_abstract_df

Unnamed: 0,prof_id,paper_link,paper_title,paper_abstract,preproc_title,preproc_abstract,corpus
0,1,https://arxiv.org/abs/2304.02948,FengWu: Pushing the Skillful Global Medium-ran...,"We present FengWu, an advanced data-driven glo...",fengwu pushing the skillful global medium rang...,we present fengwu an advanced data driven glob...,fengwu pushing the skillful global medium rang...
1,1,https://arxiv.org/abs/2303.02635,VTQA: Visual Text Question Answering via Entit...,The ideal form of Visual Question Answering re...,vtqa visual text question answering via entity...,the ideal form of visual question answering re...,vtqa visual text question answering via entity...
2,1,https://arxiv.org/abs/2212.02122,CLIPVG: Text-Guided Image Manipulation Using D...,Considerable progress has recently been made i...,clipvg text guided image manipulation using di...,considerable progress has recently been made i...,clipvg text guided image manipulation using di...
3,1,https://arxiv.org/abs/2209.08455,TODE-Trans: Transparent Object Depth Estimatio...,Transparent objects are widely used in industr...,tode trans transparent object depth estimation...,transparent objects are widely used in industr...,tode trans transparent object depth estimation...
4,1,https://arxiv.org/abs/2209.06122,What You See is What You Grasp: User-Friendly ...,This work presents a next-generation human-rob...,what you see is what you grasp user friendly g...,this work presents a next generation human rob...,what you see is what you grasp user friendly g...
...,...,...,...,...,...,...,...
1888,38,https://arxiv.org/abs/2201.02658,Fair and efficient contribution valuation for ...,Federated learning is a popular technology for...,fair and efficient contribution valuation for ...,federated learning is a popular technology for...,fair and efficient contribution valuation for ...
1889,38,https://arxiv.org/abs/2112.12896,Dewetting Characteristics of Contact Lenses Co...,Hypothesis:\n Although wetting agents have be...,dewetting characteristics of contact lenses co...,hypothesis although wetting agents have been d...,dewetting characteristics of contact lenses co...
1890,38,https://arxiv.org/abs/2112.12251,ML4CO: Is GCNN All You Need? Graph Convolution...,The 2021 NeurIPS Machine Learning for Combinat...,ml4co is gcnn all you need graph convolutional...,the 2021 neurips machine learning for combinat...,ml4co is gcnn all you need graph convolutional...
1891,38,https://arxiv.org/abs/2112.07835,Mining Minority-class Examples With Uncertaint...,"In the real world, the frequency of occurrence...",mining minority class examples with uncertaint...,in the real world the frequency of occurrence ...,mining minority class examples with uncertaint...


In [32]:
# Title, abstract and link of every hit, in rank order.
paper_data_w_abstract_df.loc[doc_ids, ["paper_title", "paper_abstract", "paper_link"]]

Unnamed: 0,paper_title,paper_abstract,paper_link
1171,Fuse It More Deeply! A Variational Transformer...,The past several years have witnessed Variatio...,https://arxiv.org/abs/2207.06130
1028,On the Necessity and Effectiveness of Learning...,Using powerful posterior distributions is a po...,https://arxiv.org/abs/1905.13452
38,VFlow: More Expressive Generative Flows with V...,Generative flows are promising tractable model...,https://arxiv.org/abs/2002.09741
41,ZhuSuan: A Library for Bayesian Deep Learning,"In this paper we introduce ZhuSuan, a python p...",https://arxiv.org/abs/1709.05870
44,Scaling up Dynamic Topic Models,Dynamic topic models (DTMs) are very effective...,https://arxiv.org/abs/1602.06049
42,Scalable Inference for Nested Chinese Restaura...,Nested Chinese Restaurant Process (nCRP) topic...,https://arxiv.org/abs/1702.07083
1693,Improved Techniques for Maximum Likelihood Est...,Diffusion models have exhibited excellent perf...,https://arxiv.org/abs/2305.03935
24,Improved Techniques for Maximum Likelihood Est...,Diffusion models have exhibited excellent perf...,https://arxiv.org/abs/2305.03935
1689,ProlificDreamer: High-Fidelity and Diverse Tex...,Score distillation sampling (SDS) has shown gr...,https://arxiv.org/abs/2305.16213
452,DiscoDVT: Generating Long Text with Discourse-...,Despite the recent advances in applying pre-tr...,https://arxiv.org/abs/2110.05999


In [40]:
# Second example query; the following cell searches for it.
query = "diffusion model"

In [41]:
def search_pipeline_indexes(search_engine, query, k):
    """Return the positions of the top-``k`` documents for ``query`` as a list.

    NOTE: this three-argument definition replaces the earlier five-argument
    function of the same name once this cell has run.
    """
    return get_relevant_documents_indexes(search_engine, query, k).tolist()
k = 20
doc_ids = search_pipeline_indexes(search_engine, query, k=k)
prof_ids = paper_data_w_abstract_df.loc[doc_ids, "prof_id"].tolist()

# Professor fields, one entry per hit and in rank order.
prof_names, prof_links = (
    prof_info_df.loc[prof_ids, column].tolist() for column in ("name", "link")
)

# Paper fields, aligned with doc_ids.
paper_titles, paper_abstracts, paper_links = (
    paper_data_w_abstract_df.loc[doc_ids, column].tolist()
    for column in ("paper_title", "paper_abstract", "paper_link")
)

N = len(doc_ids)
# "rank" is zero-based: the best hit has rank 0.
json_output = [
    {
        "rank": rank,
        "prof_name": prof_name,
        "prof_link": prof_link,
        "paper_title": paper_title,
        "paper_abstract": paper_abstract,
        "paper_link": paper_link,
    }
    for rank, (
        prof_name,
        prof_link,
        paper_title,
        paper_abstract,
        paper_link,
    ) in enumerate(
        zip(prof_names, prof_links, paper_titles, paper_abstracts, paper_links)
    )
]

In [43]:
# Unique paper titles in first-seen (rank) order. dict keys keep insertion
# order, so this drops repeats without the quadratic `title not in list` scan.
unique_paper_titles = list(dict.fromkeys(paper_titles))

In [45]:
for unique_title in unique_paper_titles:
    for title in paper_titles:
        

['Robust Classification via a Single Diffusion Model',
 'Locally Attentional SDF Diffusion for Controllable 3D Shape Generation',
 'One Transformer Fits All Distributions in Multi-Modal Diffusion at Scale',
 'Improved Techniques for Maximum Likelihood Estimation for Diffusion ODEs',
 'Improved Techniques for Maximum Likelihood Estimation for Diffusion ODEs',
 'Pattern Formation in a Coupled Membrane-Bulk Reaction-Diffusion Model for Intracellular Polarization and Oscillations',
 'Analog Bits: Generating Discrete Data using Diffusion Models with Self-Conditioning',
 'Efficient Cross-Lingual Transfer for Chinese Stable Diffusion with Images as Pivots',
 'Efficient Cross-Lingual Transfer for Chinese Stable Diffusion with Images as Pivots',
 'All are Worth Words: A ViT Backbone for Diffusion Models',
 'DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps',
 'DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps',


In [46]:
# Display the de-duplicated titles, in the order they first appeared.
unique_paper_titles

['Robust Classification via a Single Diffusion Model',
 'Locally Attentional SDF Diffusion for Controllable 3D Shape Generation',
 'One Transformer Fits All Distributions in Multi-Modal Diffusion at Scale',
 'Improved Techniques for Maximum Likelihood Estimation for Diffusion ODEs',
 'Pattern Formation in a Coupled Membrane-Bulk Reaction-Diffusion Model for Intracellular Polarization and Oscillations',
 'Analog Bits: Generating Discrete Data using Diffusion Models with Self-Conditioning',
 'Efficient Cross-Lingual Transfer for Chinese Stable Diffusion with Images as Pivots',
 'All are Worth Words: A ViT Backbone for Diffusion Models',
 'DPM-Solver: A Fast ODE Solver for Diffusion Probabilistic Model Sampling in Around 10 Steps',
 'A Closer Look at Parameter-Efficient Tuning in Diffusion Models',
 'Latent Video Diffusion Models for High-Fidelity Long Video Generation',
 'Can Diffusion Model Achieve Better Performance in Text Generation? Bridging the Gap between Training and Inference!',