In [4]:
import os
import string
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup
from serpapi import GoogleSearch
from tqdm import tqdm




# Scraping is done in three stages
1. Scrape professor information from `https://ac.cs.tsinghua.edu.cn/faculty.html`
2. Scrape paper information from Google Scholar using SerpAPI
3. Scrape paper abstracts from arXiv

## 1. Scrape professor information from `https://ac.cs.tsinghua.edu.cn/faculty.html`

In [5]:
def parase_url(url, timeout=30):
    """Downloads a web page and parses it with BeautifulSoup's "html.parser"

    Args:
        url (str): address of the page to download
        timeout (float, optional): seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        BeautifulSoup: parsed html of the page

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx status
        requests.exceptions.Timeout: if the server does not answer within `timeout`
    """
    # Without a timeout requests can wait forever on an unresponsive server.
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of silently parsing an error page.
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")


def get_profs_infos(soup):
    """Extracts names, links, titles, emails, area_of_interests from  "https://ac.cs.tsinghua.edu.cn/faculty.html"

    Every element with class "card-content" is treated as one professor. The
    name comes from the card's first <span>, the link from its first <a>, and
    the remaining fields from the text of its first <p>: the first line is
    taken as the title, the last space-separated token of the second line as
    the email, and whatever follows "Area of Research Interests" (minus one
    separator character) as the research interests.
    NOTE(review): this layout is inferred from the parsing itself - confirm
    against the live page if results look wrong.

    A field that is missing from a card yields "" instead of raising, so the
    five returned lists always have the same length and stay aligned.

    Args:
        soup (html): html of "https://ac.cs.tsinghua.edu.cn/faculty.html"

    Returns:
        names, links, titles, emails, area_of_interests
    """
    names, links, titles, emails, area_of_interests = [], [], [], [], []

    for prof in soup.find_all(class_="card-content"):
        name_tag, link_tag, info_tag = prof.span, prof.a, prof.p

        names.append(name_tag.text if name_tag is not None else "")
        links.append(link_tag.get("href", "") if link_tag is not None else "")

        prof_infos = info_tag.text if info_tag is not None else ""

        info_lines = prof_infos.split("\n")
        titles.append(info_lines[0])  # title
        emails.append(
            info_lines[1].split(" ")[-1] if len(info_lines) > 1 else ""
        )  # emails

        interest_parts = prof_infos.split("Area of Research Interests")
        # [1:] drops the single separator character that follows the marker.
        area_of_interests.append(
            interest_parts[1][1:].strip() if len(interest_parts) > 1 else ""
        )  # area of interest

    return names, links, titles, emails, area_of_interests


In [24]:
URL = "https://ac.cs.tsinghua.edu.cn/faculty.html"

# Download and parse the faculty page, then pull one list per field out of it.
soup = parase_url(URL)
names, links, titles, emails, area_of_interests = get_profs_infos(soup)

N = len(names)

# One row per professor; "id" is the professor's position in the scraped lists.
prof_columns = {
    "id": range(N),
    "name": names,
    "title": titles,
    "email": emails,
    "area_of_interest": area_of_interests,
    "link": links,
}
prof_df = pd.DataFrame(prof_columns)

# Persist the table without the DataFrame index.
prof_df.to_csv("prof_info.csv", index=False)


## 2. Scrape paper information using SerpAPI

In [None]:
# SerpAPI credential, read from the environment so the secret never lives in
# the notebook. Set it before starting the kernel (e.g. `export SERP_API_KEY=...`).
# The key that used to be hard-coded here was exposed in source and should be
# revoked. When the variable is unset the key is "", which is not a valid key.
SERP_API_KEY = os.environ.get("SERP_API_KEY", "")


In [None]:
def get_google_scholar_search_params(query, as_ylo=2017, num=20, api_key=None):
    """Prepares params for SerpAPI

    Args:
        query (str): the query to scrape from google scholar. Use "*prof name* tsinghua"
        as_ylo (int, optional): value sent as "as_ylo" (earliest publication
            year to include). Defaults to 2017.
        num (int, optional): value sent as "num" (number of results requested).
            Defaults to 20.
        api_key (str, optional): SerpAPI key. Defaults to None, in which case
            the module-level SERP_API_KEY is used.

    Returns:
        dict: params for SerpAPI
    """
    return {
        "engine": "google_scholar",
        "q": query,
        # Resolved lazily: the global is only read when no key is passed in.
        "api_key": SERP_API_KEY if api_key is None else api_key,
        "hl": "en",
        "as_ylo": as_ylo,
        "num": num,
    }


def perform_google_search(params):
    """Scrapes google scholar using SerpAPI

    If the response has no "organic_results" entry, a warning carrying the
    response's "error" field (when there is one) is emitted and an empty list
    is returned, so one failed query does not abort a whole batch.
    NOTE(review): presumably SerpAPI omits "organic_results" both for empty
    result pages and for request errors such as a bad key - confirm.

    Args:
        params (dict): params for SerpAPI

    Returns:
        list: the "organic_results" entries of the response, [] if there are none
    """
    search = GoogleSearch(params)
    results = search.get_dict()

    organic_results = results.get("organic_results")
    if organic_results is None:
        warnings.warn(
            "SerpAPI returned no 'organic_results' for query "
            f"{params.get('q')!r}: {results.get('error', 'no error message given')}"
        )
        return []
    return organic_results


def get_paper_details(organic_results):
    """Extracts title, link, snippet, number cited from organic_results

    Results are not required to carry every field: a missing "title", "link"
    or "snippet" becomes "", and a result without
    inline_links -> cited_by -> total counts as cited 0 times. The four lists
    therefore always have one entry per result and stay aligned.

    Args:
        organic_results (list): scrape output from serpAPI, one dict per result

    Returns:
        lists of extracted results (titles, links, snippets, citation counts)
    """
    title_list = []
    link_list = []
    snippet_list = []
    num_cited_list = []

    for r in organic_results:
        title_list.append(r.get("title", ""))
        link_list.append(r.get("link", ""))
        snippet_list.append(r.get("snippet", ""))

        # `or {}` also covers keys that are present but hold None.
        cited_by = (r.get("inline_links") or {}).get("cited_by") or {}
        num_cited_list.append(cited_by.get("total", 0))

    return title_list, link_list, snippet_list, num_cited_list


def obtain_full_paper_df(names):
    """For each prof, perform scraping to obtain most relevant recent papers

    One Google Scholar query ("<name> tsinghua") is issued per professor. The
    per-professor tables are concatenated once at the end with a fresh
    0..n-1 index, so every row has a unique label and can be addressed
    unambiguously with `.loc`.

    Args:
        names (list): names of professors; the position in the list is used
            as "prof_id"

    Returns:
        pd.DataFrame: Final results of scraping using serpAPI, with columns
            prof_id, title, link, snippet, num_cited (empty if nothing was found)
    """
    columns = ["prof_id", "title", "link", "snippet", "num_cited"]
    per_prof_frames = []

    for prof_id, name in enumerate(names):
        params = get_google_scholar_search_params(name + " tsinghua")
        organic_results = perform_google_search(params)
        title_list, link_list, snippet_list, num_cited_list = get_paper_details(
            organic_results
        )
        if not title_list:
            # Nothing found for this professor: contributes no rows.
            continue
        per_prof_frames.append(
            pd.DataFrame(
                {
                    "prof_id": prof_id,
                    "title": title_list,
                    "link": link_list,
                    "snippet": snippet_list,
                    "num_cited": num_cited_list,
                }
            )
        )

    if not per_prof_frames:
        # pd.concat raises on an empty list, so build the empty table directly.
        return pd.DataFrame(columns=columns)

    # Single concat instead of growing the frame inside the loop; ignore_index
    # avoids repeating the labels 0..k for every professor.
    return pd.concat(per_prof_frames, ignore_index=True)


In [None]:
# Professor names, in the same order as the rows of prof_df.
names = prof_df.loc[:, "name"].tolist()

# Query Google Scholar once per professor and collect every hit in one table.
paper_data_df = obtain_full_paper_df(names)

# Persist the scraped papers without the DataFrame index.
paper_data_df.to_csv("paper_data.csv", index=False)


## 3. Scrape abstracts from arXiv

In [None]:
def replace_punct_with_space(text):
    """Replaces every ASCII punctuation character in text with a single space

    Each character of `string.punctuation` maps to " " rather than being
    deleted, so words joined by punctuation ("self-supervised") stay separate
    tokens ("self supervised") instead of being glued together.

    Args:
        text (str): text to clean

    Returns:
        str: text with the same length, punctuation replaced by spaces
    """
    punct_to_space = str.maketrans(dict.fromkeys(string.punctuation, " "))
    return text.translate(punct_to_space)


def replace_multiple_spaces_with_single_space(text):
    """Collapses every run of whitespace to one space and trims both ends

    Args:
        text (str): text to normalise

    Returns:
        str: the whitespace-separated words of text joined by single spaces
    """
    # split() without arguments splits on any whitespace run and drops empties.
    words = text.split()
    single_spaced = " ".join(words)
    return single_spaced


def preprocess_pipeline(text):
    """Normalises text: lower-case, punctuation step, then whitespace step

    Args:
        text (str): raw text, e.g. a paper title

    Returns:
        str: text after lower(), replace_punct_with_space() and
            replace_multiple_spaces_with_single_space(), applied in that order
    """
    lowered = text.lower()
    return replace_multiple_spaces_with_single_space(
        replace_punct_with_space(lowered)
    )


def get_init_arxiv_search_url(title):
    """Builds the arXiv search-page url for a paper title

    Args:
        title (str): paper title

    Returns:
        str: arXiv search url whose query is the preprocessed title with its
            words joined by "+"
    """
    search_terms = preprocess_pipeline(title).split()
    query_insert = "+".join(search_terms)
    return f"https://arxiv.org/search/?query={query_insert}+&searchtype=all&source=header"


def get_arxiv_parper_link(url, timeout=30):
    """Returns the link of the first hit on an arXiv search-results page

    The page is fetched best-effort: the HTTP status is not checked, so an
    error page simply yields no link.

    Args:
        url (str): arXiv search url, e.g. from get_init_arxiv_search_url
        timeout (float, optional): seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        str or None: href of the anchor inside the first element with class
            "list-title is-inline-block", or None if the page has no such
            element, anchor or href

    Raises:
        requests.exceptions.Timeout: if the server does not answer within `timeout`
    """
    # Without a timeout requests can wait forever on an unresponsive server.
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    arxiv_link_block = soup.find(class_="list-title is-inline-block")
    if arxiv_link_block is None or arxiv_link_block.a is None:
        return None

    # .get() gives None instead of raising when the anchor has no href.
    return arxiv_link_block.a.get("href")


def get_abstract(arxiv_paper_link, timeout=30):
    """Downloads an arXiv paper page and returns the text of its abstract block

    Args:
        arxiv_paper_link (str): url of the paper's arXiv page
        timeout (float, optional): seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        str: text of the first element with class "abstract mathjax", or ""
            when the page has no such element (the same "no abstract" value
            get_abstract_pipeline uses)

    Raises:
        requests.exceptions.Timeout: if the server does not answer within `timeout`
    """
    # Without a timeout requests can wait forever on an unresponsive server.
    page = requests.get(arxiv_paper_link, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    abstract_block = soup.find(class_="abstract mathjax")
    if abstract_block is None:
        # Not a regular paper page (error page, changed layout, ...).
        return ""

    return abstract_block.text


def get_abstract_pipeline(title):
    """Looks a paper title up on arXiv and returns the abstract of the first hit

    Args:
        title (str): paper title

    Returns:
        str: abstract text of the first search result, "" if the search
            produced no paper link
    """
    search_url = get_init_arxiv_search_url(title)
    paper_link = get_arxiv_parper_link(search_url)

    # No hit on the search page: nothing to download.
    if not paper_link:
        return ""

    return get_abstract(paper_link)


In [None]:
paper_data_df["abstract"] = ""
abstract_col = paper_data_df.columns.get_loc("abstract")

# tqdm wraps the Series (which has a length) so the bar shows a total;
# enumerate goes outside to provide the row position.
for i, title in enumerate(tqdm(paper_data_df["title"])):
    abstract = get_abstract_pipeline(title)
    # Write by position, not by label: the index labels of paper_data_df are
    # not guaranteed to be unique or to equal the row position, in which case
    # `.loc[i, ...]` would overwrite several rows or append new ones.
    paper_data_df.iloc[i, abstract_col] = abstract

    # Checkpoint after every paper so an interrupted run keeps its progress.
    paper_data_df.to_csv("paper_data_w_abstract.csv", index=False)
