In [3]:
import requests
from bs4 import BeautifulSoup
from serpapi import GoogleSearch
import pandas as pd




# Faculties


In [2]:
URL = "https://ac.cs.tsinghua.edu.cn/faculty.html"


In [3]:
def parase_url(url):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup


soup = parase_url(URL)


In [24]:
def get_profs_infos(soup):
    extracted_profs = soup.find_all(class_="card-content")
    names = [prof.span.text for prof in extracted_profs]
    links = [prof.a["href"] for prof in extracted_profs]

    prof_infos_list = [prof.p.text for prof in extracted_profs]
    area_of_interests = [
        prof_infos.split("Area of Research Interests")[1][1:].strip()
        for prof_infos in prof_infos_list
    ]  # area of interest
    titles = [prof_infos.split("\n")[0] for prof_infos in prof_infos_list]  # title
    emails = [
        prof_infos.split("\n")[1].split(" ")[-1] for prof_infos in prof_infos_list
    ]  # emails

    return names, links, titles, emails, area_of_interests


In [26]:
names, links, titles, emails, area_of_interests = get_profs_infos(soup)




In [35]:
N = len(names)

prof_df = pd.DataFrame(
    {
        "id": range(N),
        "name": names,
        "title": titles,
        "email": emails,
        "area_of_interest": area_of_interests,
    }
)


In [38]:
prof_df.to_csv("prof_info.csv", index=False)


# Get research papers and abstracts
- for each prof, find the 20 latest papers, top 20 most cited paper

In [63]:
SERP_API_KEY = "3fb66225911929b1e816688b7da5c052c289e6b1f797b9ff0500916033e73e08"


In [64]:
# some profs haave to search with name, some by email


In [107]:
def get_google_scholar_search_params(query):
    params = {
        "engine": "google_scholar",
        "q": query,
        "api_key": SERP_API_KEY,
        "hl": "en",
        "as_ylo": 2017,
        "num": 20,
    }

    return params


def perform_google_search(params):
    search = GoogleSearch(params)
    results = search.get_dict()
    organic_results = results["organic_results"]
    return organic_results


def get_paper_details(organic_results):
    title_list = []
    link_list = []
    snippet_list = []
    num_cited_list = []

    for r in organic_results:
        title = r["title"]
        link = r["link"]
        snippet = r["snippet"]

        if "cited_by" in r["inline_links"]:
            num_cited = r["inline_links"]["cited_by"]["total"]
        else:
            num_cited = 0

        title_list.append(title)
        link_list.append(link)
        snippet_list.append(snippet)
        num_cited_list.append(num_cited)

    return title_list, link_list, snippet_list, num_cited_list


In [170]:
def obtain_full_paper_df(names):
    full_paper_df = pd.DataFrame(
        columns=["prof_id", "title", "link", "snippet", "num_cited"]
    )
    for i, n in enumerate(names):
        query = n + " tsinghua"
        params = get_google_scholar_search_params(query)
        organic_results = perform_google_search(params)
        title_list, link_list, snippet_list, num_cited_list = get_paper_details(
            organic_results
        )
        paper_result_df = pd.DataFrame(
            {
                "prof_id": i,
                "title": title_list,
                "link": link_list,
                "snippet": snippet_list,
                "num_cited": num_cited_list,
            }
        )
        full_paper_df = pd.concat([full_paper_df, paper_result_df])
    return full_paper_df


In [171]:
full_paper_df = obtain_full_paper_df(names)


In [172]:
full_paper_df.to_csv("paper_data.csv", index=False)


In [7]:
full_paper_df = pd.read_csv("paper_data.csv")


In [37]:
full_paper_df["base_url"] = full_paper_df["link"].apply(
    lambda url: "/".join(url.split("/")[:3])
)


In [94]:
full_paper_df


Unnamed: 0,prof_id,title,link,snippet,num_cited,base_url
0,0,"Prevalence, characteristics, and outcomes of C...",https://www.ahajournals.org/doi/abs/10.1161/CI...,Background: Acute myocarditis (AM) is thought ...,71,https://www.ahajournals.org
1,0,Machine learning of human plasma lipidomes for...,https://journals.plos.org/plosbiology/article?...,Obesity is associated with changes in the plas...,52,https://journals.plos.org
2,0,Use of steroid profiling combined with machine...,https://jamanetwork.com/journals/jamanetworkop...,"… Eisenhofer G, Durán C, Cannistraci CV, et al...",42,https://jamanetwork.com
3,0,"Geometrical congruence, greedy navigability an...",https://www.nature.com/articles/s41467-022-346...,… by our group in the studies of Muscoloni and...,2,https://www.nature.com
4,0,Functional annotation of human long noncoding ...,https://genome.cshlp.org/content/30/7/1060.short,Long noncoding RNAs (lncRNAs) constitute the m...,106,https://genome.cshlp.org
...,...,...,...,...,...,...
775,38,Carbothermal shock enabled facile and fast gro...,https://link.springer.com/article/10.1007/s122...,Carbon nanotubes (CNTs) hold great promise in ...,7,https://link.springer.com
776,38,Ionic liquid-wrapped MXene film with bowl-like...,https://link.springer.com/article/10.1007/s122...,Recent researches in the development of in-pla...,2,https://link.springer.com
777,38,Phase/size dual controlled 2D semiconductor In...,https://link.springer.com/article/10.1007/s122...,The production of two-dimensional nanosheets (...,2,https://link.springer.com
778,38,Physical sensors for skin‐inspired electronics,https://onlinelibrary.wiley.com/doi/abs/10.100...,"Skin, the largest organ in the human body, is ...",138,https://onlinelibrary.wiley.com


# ArXiv Scraper

In [305]:
import string


def replace_punct_with_space(text):
    return text.translate(str.maketrans("", "", string.punctuation))


def replace_multiple_spaces_with_single_space(text):
    return " ".join(text.split())


def preprocess_pipeline(text):
    preproc_text = text.lower()
    preproc_text = replace_punct_with_space(preproc_text)
    preproc_text = replace_multiple_spaces_with_single_space(preproc_text)
    return preproc_text


In [298]:
paper_data_df = pd.read_csv("paper_data.csv")


In [326]:
def get_init_arxiv_search_url(title):
    plain_title = preprocess_pipeline(title)
    query_insert = "+".join(plain_title.split())
    arxiv_search_url = (
        f"https://arxiv.org/search/?query={query_insert}+&searchtype=all&source=header"
    )
    return arxiv_search_url


def get_arxiv_parper_link(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    arxiv_link_block = soup.find(class_="list-title is-inline-block")
    arxiv_paper_link = None
    if arxiv_link_block:
        arxiv_paper_link = arxiv_link_block.a["href"]

    return arxiv_paper_link


def get_abstract(arxiv_paper_link):
    page = requests.get(arxiv_paper_link)
    soup = BeautifulSoup(page.content, "html.parser")
    abstract = soup.find(class_="abstract mathjax").text

    return abstract


def get_abstract_pipeline(title):
    arxiv_search_url = get_init_arxiv_search_url(title)
    arxiv_paper_link = get_arxiv_parper_link(arxiv_search_url)
    abstract = ""
    if arxiv_paper_link:
        abstract = get_abstract(arxiv_paper_link)

    return abstract


In [339]:
from tqdm import tqdm


In [341]:
paper_data_df["abstract"] = ""
for i, title in tqdm(enumerate(paper_data_df["title"])):
    abstract = get_abstract_pipeline(title)
    paper_data_df.loc[i, "abstract"] = abstract

    paper_data_df.to_csv("paper_data_w_abstract.csv", index=False)


780it [25:05,  1.93s/it]


In [328]:
paper_data_df["abstract"] = paper_data_df["title"].apply(
    lambda title: get_abstract_pipeline(title)
)


ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [None]:
paper_data_df.to_csv("paper_data.csv", index=False)


In [None]:
# features of ranking:
## 1. num_cited
## 2. keyword matches(bm25)
## 3. recency?
## 4. aread of interest?


In [95]:
## preprocess - prof info


## preprocess - paper_data


In [236]:
prof_info_df = pd.read_csv("prof_info.csv")


In [239]:
prof_info_df["preproc_name"] = prof_info_df["name"].apply(
    lambda x: preprocess_pipeline(x)
)
prof_info_df["preproc_area_of_interest"] = prof_info_df["area_of_interest"].apply(
    lambda x: preprocess_pipeline(x)
)


In [240]:
paper_data_df = pd.read_csv("paper_data.csv")


In [241]:
paper_data_df["preproc_title"] = paper_data_df["title"].apply(
    lambda x: preprocess_pipeline(x)
)
paper_data_df["preproc_snippet"] = paper_data_df["snippet"].apply(
    lambda x: preprocess_pipeline(x)
)


In [251]:
id_list = prof_info_df["id"].tolist()
name_list = prof_info_df["name"].tolist()
id_to_name = dict(map(lambda i, j: (i, j), id_list, name_list))


In [242]:
paper_data_df.head()


Unnamed: 0,prof_id,title,link,snippet,num_cited,preproc_title,preproc_snippet
0,0,"Prevalence, characteristics, and outcomes of C...",https://www.ahajournals.org/doi/abs/10.1161/CI...,Background: Acute myocarditis (AM) is thought ...,71,prevalence characteristics and outcomes of cov...,background acute myocarditis am is thought to ...
1,0,Machine learning of human plasma lipidomes for...,https://journals.plos.org/plosbiology/article?...,Obesity is associated with changes in the plas...,52,machine learning of human plasma lipidomes for...,obesity is associated with changes in the plas...
2,0,Use of steroid profiling combined with machine...,https://jamanetwork.com/journals/jamanetworkop...,"… Eisenhofer G, Durán C, Cannistraci CV, et al...",42,use of steroid profiling combined with machine...,… eisenhofer g durán c cannistraci cv et al us...
3,0,"Geometrical congruence, greedy navigability an...",https://www.nature.com/articles/s41467-022-346...,… by our group in the studies of Muscoloni and...,2,geometrical congruence greedy navigability and...,… by our group in the studies of muscoloni and...
4,0,Functional annotation of human long noncoding ...,https://genome.cshlp.org/content/30/7/1060.short,Long noncoding RNAs (lncRNAs) constitute the m...,106,functional annotation of human long noncoding ...,long noncoding rnas lncrnas constitute the maj...


In [243]:
from rank_bm25 import BM25Okapi


In [244]:
corpus = paper_data_df["preproc_snippet"].tolist()
tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)


In [246]:
import numpy as np


In [256]:
query = "diffusion"
tokenized_query = query.split(" ")

doc_scores = bm25.get_scores(tokenized_query)
k = 10
top_k_indexs = np.argsort(doc_scores)[::-1][:k]


In [257]:
tokenized_query


['diffusion']

In [267]:
 np.sort(doc_scores)[::-1]

array([5.97846928, 5.66184587, 5.3770728 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [262]:
top_k_prof_ids = bm25.get_top_n(tokenized_query, paper_data_df["prof_id"], n=20)
top_k_snippets = bm25.get_top_n(tokenized_query, paper_data_df["snippet"], n=20)


In [259]:
top_k_prof_ids


[2, 2, 34, 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, 38, 12, 13, 12, 12, 12, 12]

In [260]:
[id_to_name[i] for i in top_k_prof_ids]


['Jianfei Chen',
 'Jianfei Chen',
 'Fang Zheng',
 'Dan Li',
 'Guoliang Li',
 'Guoliang Li',
 'Guoliang Li',
 'Guoliang Li',
 'Guoliang Li',
 'Guoliang Li',
 'Dan Li',
 'Dan Li',
 'Dan Li',
 'Yong Zhang',
 'Dan Li',
 'Guoliang Li',
 'Dan Li',
 'Dan Li',
 'Dan Li',
 'Dan Li']

In [265]:
top_k_snippets[3]


'… Wu at Tsinghua University for their great support of this work during the COVID-19 pandemic. We thank all of the members of the Jiang laboratory for their technical assistance and/or …'

In [177]:
corpus = prof_info_df["preproc_area_of_interest"].tolist()
tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)


In [182]:
query = "diffusion models"
tokenized_query = query.split(" ")

doc_scores = bm25.get_scores(tokenized_query)
k = 10
top_k_indexs = np.argsort(doc_scores)[::-1][:k]


In [183]:
bm25.get_top_n(tokenized_query, prof_info_df["name"], n=10)


['Yuxiao Dong',
 'Yong Zhang',
 'Xiaolin Hu',
 'Yongjin Liu',
 'Yang Liu',
 'Juanzi Li',
 'Guoliang Li',
 'Dan Li',
 'Zhengfeng Ji',
 'Minlie Huang']

In [235]:
paper_data_df[paper_data_df["prof_id"] == 36]["snippet"]


720    Abstract Machine learning models, especially D...
721    In this paper, we present Tianshou, a highly m...
722    Spiking neural networks (SNNs) are promising i...
723    During conventional chemotherapy for cancer, n...
724    Learning the principal eigenfunctions of an in...
725    Neural networks are vulnerable to adversarial ...
726    Tree boosting, which combines weak learners (t...
727    … Jun Zhu is a professor of computer science a...
728    Nanowire devices have attracted considerable a...
729    For the first time, this paper develops a nove...
730    … Jun Zhu Bo Zhang Tsinghua National Lab for I...
731    In this paper, we present Tianshou, a highly m...
732    … of seven machine learning experts from the T...
733    … Tianyu Pang, Kun Xu, Chongxuan Li, Yang Song...
734    Generative flows are promising tractable model...
735    … Jun Zhu Fan Bao Tsinghua National Lab for In...
736    … Correspondence to: Jianfei Chen <jianfeic@ts...
737    Circuit routing has been