In [48]:
import pandas as pd
import string
from rank_bm25 import BM25Okapi

import numpy as np

In [49]:
def replace_punct_with_space(text):
    """Return ``text`` with each ASCII punctuation character replaced by a space.

    Every character listed in ``string.punctuation`` maps to exactly one
    space, so the length of the string is unchanged.
    """
    punct_to_space = {ord(symbol): " " for symbol in string.punctuation}
    return text.translate(punct_to_space)


def replace_multiple_spaces_with_single_space(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    ``str.split()`` without arguments splits on any whitespace (spaces, tabs,
    newlines) and discards empty pieces, so leading and trailing whitespace is
    removed as well.
    """
    words = text.split()
    return " ".join(words)


def preprocess_pipeline(text):
    """Normalise ``text`` for indexing and querying.

    Steps, in order: lowercase, replace punctuation with spaces, then collapse
    whitespace runs into single spaces.
    """
    return replace_multiple_spaces_with_single_space(
        replace_punct_with_space(text.lower())
    )

In [50]:
def prepare_paper_data(paper_df):
    """Return a copy of ``paper_df`` with an added ``doc`` search column.

    ``doc`` is the preprocessed title followed by a space and the preprocessed
    abstract. The input frame is left untouched: the intermediate preprocessed
    text lives in standalone Series instead of being written into (and later
    dropped from) the caller's DataFrame.

    Parameters
    ----------
    paper_df : pandas.DataFrame
        Must contain ``paper_title`` and ``paper_abstract`` columns holding
        strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns plus ``doc``.

    Raises
    ------
    AttributeError
        If a title or abstract cell is not a string, because
        ``preprocess_pipeline`` calls ``.lower()`` on each value.
    """
    preproc_title = paper_df["paper_title"].apply(preprocess_pipeline)
    preproc_abstract = paper_df["paper_abstract"].apply(preprocess_pipeline)

    # Construct the document corpus: title and abstract joined by one space.
    # ``assign`` builds a new frame, so the caller's frame is not mutated.
    return paper_df.assign(doc=preproc_title + " " + preproc_abstract)

In [51]:
def init_search_engine(corpus):
    """Build a ``BM25Okapi`` index from an iterable of document strings.

    Each document is tokenised by splitting on a single space character.
    """
    tokenized_corpus = []
    for doc in corpus:
        tokenized_corpus.append(doc.split(" "))
    return BM25Okapi(tokenized_corpus)

In [52]:
def get_relevant_documents_indexes(search_engine, query, k):
    """Return the indexes of the (at most) ``k`` best-scoring documents.

    The query goes through ``preprocess_pipeline`` and is split on single
    spaces before scoring.

    Parameters
    ----------
    search_engine : object
        Anything exposing ``get_scores(tokens)`` that returns one score per
        document.
    query : str
        Raw user query.
    k : int
        Maximum number of results. Values <= 0 yield an empty result.

    Returns
    -------
    numpy.ndarray
        Indexes into the score array, best score first. Only as many indexes
        as there are documents with a non-zero score are returned.
    """
    query = preprocess_pipeline(query)
    tokenized_query = query.split(" ")

    doc_scores = np.asarray(search_engine.get_scores(tokenized_query))
    # Count scored documents in NumPy rather than with the builtin ``sum``,
    # which would iterate over the array element by element in Python.
    num_of_rel_docs = int(np.count_nonzero(doc_scores))
    # Clamp at 0: a negative ``k`` would otherwise slice from the end
    # (``[:k]``) and silently return nearly every document.
    k = max(0, min(num_of_rel_docs, k))

    top_k_indexs = np.argsort(doc_scores)[::-1][:k]
    return top_k_indexs

In [53]:
def get_prof_ids_given_paper_id(prof_paper_df, paper_id):
    """Return the ``prof_id`` values linked to ``paper_id``.

    Parameters
    ----------
    prof_paper_df : pandas.DataFrame
        Link table with ``paper_id`` and ``prof_id`` columns.
    paper_id : scalar
        Paper identifier to look up.

    Returns
    -------
    list
        Matching professor ids in table order; empty when there is no match.
    """
    # Vectorised comparison instead of a per-row ``apply(lambda ...)``.
    is_match = prof_paper_df["paper_id"] == paper_id
    return prof_paper_df.loc[is_match, "prof_id"].tolist()

In [54]:
def get_search_display_ids(paper_df, prof_paper_df, top_k_indexs):
    """Map ranked corpus positions to paper ids and their professor ids.

    Parameters
    ----------
    paper_df : pandas.DataFrame
        Paper table with a ``paper_id`` column, in the same row order as the
        corpus the ranking was computed on.
    prof_paper_df : pandas.DataFrame
        Link table with ``paper_id`` and ``prof_id`` columns.
    top_k_indexs : array-like of int
        Row positions, best match first.

    Returns
    -------
    tuple[list, list[list]]
        ``(paper_ids, prof_ids_per_paper)``; the second list is parallel to
        the first and holds an empty list for papers with no linked professor.
    """
    # Positions must be resolved with ``iloc``. ``loc`` is label-based and
    # only matches positions when the frame has a default RangeIndex.
    retrieved_paper_ids = paper_df["paper_id"].iloc[top_k_indexs].tolist()

    # Collect professor ids for all retrieved papers in one pass over the
    # link table instead of rescanning it once per paper.
    links = prof_paper_df.loc[
        prof_paper_df["paper_id"].isin(retrieved_paper_ids), ["paper_id", "prof_id"]
    ]
    prof_ids_by_paper = {}
    for paper_id, prof_id in zip(links["paper_id"].tolist(), links["prof_id"].tolist()):
        prof_ids_by_paper.setdefault(paper_id, []).append(prof_id)

    # Copy each list so repeated paper ids never share one list object.
    retrieved_prof_ids = [
        list(prof_ids_by_paper.get(paper_id, [])) for paper_id in retrieved_paper_ids
    ]
    return retrieved_paper_ids, retrieved_prof_ids

In [55]:
def get_prof_name_from_id(prof_id, prof_df):
    """Return ``prof_name`` from the row of ``prof_df`` labelled ``prof_id``.

    The lookup is by index label, not by a ``prof_id`` column.
    NOTE(review): presumably the index labels equal the professor ids - confirm.
    """
    name_column = prof_df["prof_name"]
    return name_column.loc[prof_id]


def get_prof_title_from_id(prof_id, prof_df):
    """Return ``prof_title`` from the row of ``prof_df`` labelled ``prof_id``.

    The lookup is by index label, not by a ``prof_id`` column.
    """
    title_column = prof_df["prof_title"]
    return title_column.loc[prof_id]


def get_prof_email_from_id(prof_id, prof_df):
    """Return ``prof_email`` from the row of ``prof_df`` labelled ``prof_id``.

    The lookup is by index label, not by a ``prof_id`` column.
    """
    email_column = prof_df["prof_email"]
    return email_column.loc[prof_id]


def get_prof_aoi_from_id(prof_id, prof_df):
    """Return ``prof_area_of_interest`` from the row labelled ``prof_id``.

    The lookup is by index label, not by a ``prof_id`` column.
    """
    aoi_column = prof_df["prof_area_of_interest"]
    return aoi_column.loc[prof_id]


def get_prof_link_from_id(prof_id, prof_df):
    """Return ``prof_link`` from the row of ``prof_df`` labelled ``prof_id``.

    The lookup is by index label, not by a ``prof_id`` column.
    """
    link_column = prof_df["prof_link"]
    return link_column.loc[prof_id]

In [56]:
def get_paper_title_from_id(paper_id, paper_df):
    """Return ``paper_title`` from the row of ``paper_df`` labelled ``paper_id``.

    The lookup is by index label, not by the ``paper_id`` column.
    NOTE(review): presumably the index labels equal the paper ids - confirm.
    """
    title_column = paper_df["paper_title"]
    return title_column.loc[paper_id]


def get_paper_abstract_from_id(paper_id, paper_df):
    """Return ``paper_abstract`` from the row labelled ``paper_id``.

    The lookup is by index label, not by the ``paper_id`` column.
    """
    abstract_column = paper_df["paper_abstract"]
    return abstract_column.loc[paper_id]


def get_paper_link_from_id(paper_id, paper_df):
    """Return ``paper_link`` from the row of ``paper_df`` labelled ``paper_id``.

    The lookup is by index label, not by the ``paper_id`` column.
    """
    link_column = paper_df["paper_link"]
    return link_column.loc[paper_id]

In [57]:
def get_search_display_data(retrieved_paper_ids, retrieved_prof_ids, prof_df, paper_df):
    """Assemble one display record per retrieved paper.

    ``retrieved_prof_ids`` is expected to be parallel to
    ``retrieved_paper_ids``: entry ``i`` lists the professor ids for paper
    ``i``.

    Returns a list of dicts with the keys ``rank`` (zero-based position in the
    result list), ``prof_names``, ``paper_title``, ``paper_abstract`` and
    ``paper_link``.
    """
    # Resolve professor names first, one list of names per paper.
    names_per_paper = []
    for prof_ids in retrieved_prof_ids:
        names_per_paper.append(
            [get_prof_name_from_id(prof_id, prof_df) for prof_id in prof_ids]
        )

    # Then gather the paper-level fields.
    paper_fields = [
        (
            get_paper_title_from_id(paper_id, paper_df),
            get_paper_abstract_from_id(paper_id, paper_df),
            get_paper_link_from_id(paper_id, paper_df),
        )
        for paper_id in retrieved_paper_ids
    ]

    json_output = []
    for rank, (title, abstract, link) in enumerate(paper_fields):
        json_output.append(
            {
                "rank": rank,
                "prof_names": names_per_paper[rank],
                "paper_title": title,
                "paper_abstract": abstract,
                "paper_link": link,
            }
        )
    return json_output

In [58]:
def perform_search(search_engine, query, k, paper_df, prof_df, prof_paper_df):
    """Run a full search and return the display records.

    Chains the three stages: rank documents for ``query``, translate the
    ranked indexes into paper and professor ids, then build the display dicts.
    """
    ranked_indexes = get_relevant_documents_indexes(search_engine, query, k)
    paper_ids, prof_ids_per_paper = get_search_display_ids(
        paper_df, prof_paper_df, ranked_indexes
    )
    return get_search_display_data(paper_ids, prof_ids_per_paper, prof_df, paper_df)

In [59]:
# Load the three tables: papers, professors, and the professor<->paper links.
paper_df = pd.read_csv("data/paper_df.csv")
prof_df = pd.read_csv("data/prof_df.csv")
prof_paper_df = pd.read_csv("data/prof_paper_df.csv")

# Add the preprocessed "doc" column that the index is built from.
paper_df = prepare_paper_data(paper_df)

# Build the BM25 index over one document per paper row.
search_engine = init_search_engine(paper_df["doc"])

# Demo query: the steps below are the same three calls perform_search() makes,
# run one at a time so the intermediate values stay available for inspection.
query = "game"
k = 20
top_k_indexs = get_relevant_documents_indexes(search_engine, query, k)

retrieved_paper_ids, retrieved_prof_ids = get_search_display_ids(
    paper_df, prof_paper_df, top_k_indexs
)

json_output = get_search_display_data(
    retrieved_paper_ids, retrieved_prof_ids, prof_df, paper_df
)

In [60]:
# Display the search records built in the previous cell.
json_output

[{'rank': 0,
  'prof_names': ['Zhengfeng Ji'],
  'paper_title': 'A three-player coherent state embezzlement game',
  'paper_abstract': 'We introduce a three-player nonlocal game, with a finite number of classical questions and answers, such that the optimal success probability of $1$ in the game can only be achieved in the limit of strategies using arbitrarily high-dimensional entangled states. Precisely, there exists a constant $0 <c\\leq 1$ such that to succeed with probability $1-\\varepsilon$ in the game it is necessary to use an entangled state of at least $Ω(\\varepsilon^{-c})$ qubits, and it is sufficient to use a state of at most $O(\\varepsilon^{-1})$ qubits.\n  The game is based on the coherent state exchange game of Leung et al. (CJTCS 2013). In our game, the task of the quantum verifier is delegated to a third player by a classical referee. Our results complement those of Slofstra (arXiv:1703.08618) and Dykema et al. (arXiv:1709.05032), who obtained two-player games with si

In [61]:
def get_prof_display_data(prof_id, prof_df):
    """Return the display record for a single professor.

    The result is a dict with the keys ``prof_name``, ``prof_title``,
    ``prof_email``, ``prof_aoi`` and ``prof_link``, each filled by the
    matching ``get_prof_*_from_id`` helper.
    """
    return {
        "prof_name": get_prof_name_from_id(prof_id, prof_df),
        "prof_title": get_prof_title_from_id(prof_id, prof_df),
        "prof_email": get_prof_email_from_id(prof_id, prof_df),
        "prof_aoi": get_prof_aoi_from_id(prof_id, prof_df),
        "prof_link": get_prof_link_from_id(prof_id, prof_df),
    }

In [62]:
# Spot-check: display record for the professor at index label 0.
get_prof_display_data(0, prof_df)

{'prof_name': 'Carlo V. Cannistraci',
 'prof_title': 'Professor',
 'prof_email': 'kailong@mail.tsinghua.edu.cn',
 'prof_aoi': 'Information theory, Machine intelligence, Network science, Socio-economic systems, Complex big data in neuro and life sciences.',
 'prof_link': 'https://brain.tsinghua.edu.cn/en/info/1010/1003.htm'}

In [76]:
def get_all_profs_info(prof_df, prof_paper_df=None, paper_df=None):
    """Return one info record per professor, including their paper titles.

    Parameters
    ----------
    prof_df : pandas.DataFrame
        Professor table with the columns ``prof_id``, ``prof_name``,
        ``prof_title``, ``prof_email``, ``prof_area_of_interest`` and
        ``prof_link``. Any index is accepted; rows are visited in order.
    prof_paper_df : pandas.DataFrame, optional
        Link table with ``prof_id`` and ``paper_id`` columns. When omitted,
        the module-level ``prof_paper_df`` is used, which is what the
        function always read before this parameter existed.
    paper_df : pandas.DataFrame, optional
        Paper table with ``paper_id`` and ``paper_title`` columns. When
        omitted, the module-level ``paper_df`` is used.

    Returns
    -------
    list[dict]
        One dict per professor with the six professor fields plus
        ``paper_titles``, the titles of the linked papers in ``paper_df`` row
        order.
    """
    # Backward compatibility: fall back to the notebook-level frames when the
    # caller does not pass them explicitly.
    if prof_paper_df is None:
        prof_paper_df = globals()["prof_paper_df"]
    if paper_df is None:
        paper_df = globals()["paper_df"]

    # Group paper ids by professor in one pass over the link table rather
    # than rescanning it once per professor.
    paper_ids_by_prof = {}
    for prof_id, paper_id in zip(
        prof_paper_df["prof_id"].tolist(), prof_paper_df["paper_id"].tolist()
    ):
        paper_ids_by_prof.setdefault(prof_id, []).append(paper_id)

    json_output = []
    # Iterating over records avoids ``prof_df.loc[i]`` for ``i in range(N)``,
    # which is only valid when the index is the default 0..N-1 range.
    for row in prof_df.to_dict("records"):
        prof_id = row["prof_id"]
        paper_ids = paper_ids_by_prof.get(prof_id, [])
        paper_titles = paper_df.loc[
            paper_df["paper_id"].isin(paper_ids), "paper_title"
        ].tolist()

        json_output.append(
            {
                "prof_id": prof_id,
                "prof_name": row["prof_name"],
                "prof_title": row["prof_title"],
                "prof_email": row["prof_email"],
                "prof_area_of_interest": row["prof_area_of_interest"],
                "prof_link": row["prof_link"],
                "paper_titles": paper_titles,
            }
        )
    return json_output

In [77]:
# Scratch cell: walks every professor row without collecting anything.
# After the loop the prof_* variables hold the values of the LAST row, and
# json_output has been rebound to an empty list (nothing is appended here).
# A later cell reads the leftover prof_id.
N = len(prof_df)
json_output = []
for i in range(N):
    row = prof_df.loc[i]
    prof_id = row["prof_id"]
    prof_name = row["prof_name"]
    prof_title = row["prof_title"]
    prof_email = row["prof_email"]
    prof_area_of_interest = row["prof_area_of_interest"]
    prof_link = row["prof_link"]

In [81]:
# Boolean mask over the link table: True where the row belongs to prof_id
# (the value left over from the scratch loop cell).
prof_paper_df["prof_id"].apply(lambda x: x == prof_id)

0       False
1       False
2       False
3       False
4       False
        ...  
1888     True
1889     True
1890     True
1891     True
1892     True
Name: prof_id, Length: 1893, dtype: bool

In [86]:
# Paper ids linked to the current prof_id.
paper_ids = prof_paper_df.loc[
    prof_paper_df["prof_id"].apply(lambda x: x == prof_id), "paper_id"
].tolist()

# Titles of those papers. The variable name is misspelled ("tiitles") but is
# kept as is because the next cell displays it under this name.
paper_tiitles = paper_df.loc[
    paper_df["paper_id"].apply(lambda x: x in paper_ids), "paper_title"
].tolist()

In [87]:
# Display the titles collected in the previous cell.
paper_tiitles

['The First LHAASO Catalog of Gamma-Ray Sources',
 'FAST search for circumstellar atomic hydrogen. II. Is BD+303639 an interacting planetary nebula?',
 'Measurement of ultra-high-energy diffuse gamma-ray emission of the Galactic plane from 10 TeV to 1 PeV with LHAASO-KM2A',
 'UCF: Uncovering Common Features for Generalizable Deepfake Detection',
 'Multi-User Matching and Resource Allocation in Vision Aided Communications',
 'Generalized Color Orderings: CEGM Integrands and Decoupling Identities',
 'Sign-changing solution for an overdetermined elliptic problem on unbounded domain',
 'Bifurcation of sign-changing solutions for an overdetermined boundary problem in bounded domains',
 'Improved Test-Time Adaptation for Domain Generalization',
 'High-Fidelity Clothed Avatar Reconstruction from a Single Image',
 'Generation of rotational ground state HD$^+$ ions in an ion trap using a resonance-enhanced threshold photoionization process',
 'Improving Fast Adversarial Training with Prior-Guid