In [3]:
##
# @file scrape_research.com.ipynb
#
# @brief Scrape people from www.research.com for world & national ranking, university, citations, photo, etc.
##

# Setup
from bs4 import BeautifulSoup
import requests, pickle


In [77]:
# list of URLs to scrape: the first ranking page has no "/page/N" suffix,
# the remaining nine pages (2 through 10) do
_ranking_root = "https://research.com/scientists-rankings/computer-science/2021"
base_urls = [_ranking_root] + [
    _ranking_root + "/page/" + str(page_number) for page_number in range(2, 11)
]


In [94]:
# Scrape every ranking page listed in `base_urls` and collect one record per
# person in `data`, keyed by the person's name as it appears on the page.
# Each record holds the raw (string) values shown in the ranking table:
# world_ranking, national_ranking, university, img, h_index, citations,
# dblp_articles and, when the name is a link, the profile `url`.
data = {}

# Browser-like User-Agent sent with every request.
ranking_headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
}
# Seconds to wait for the server; without a timeout a stalled connection
# would hang the cell forever.
ranking_timeout = 30

# One session for all pages so the underlying connection is reused.
with requests.Session() as session:
    session.headers.update(ranking_headers)

    # fetch each URL
    for url in base_urls:
        response = session.get(url, timeout=ranking_timeout)
        # Fail loudly on HTTP errors instead of parsing an error page, which
        # would yield zero rows and silently drop a whole page of people.
        response.raise_for_status()
        page_data = response.text
        soup = BeautifulSoup(page_data, features="html.parser")

        rows = soup.select(".rankings-content__item")

        # for each person on that page
        for row in rows:
            # there is some hidden unwanted html, so remove it
            for hidden_element in row.select(".show-tablet"):
                hidden_element.decompose()

            positions_container = row.select_one("span.positions")
            info_container = row.select_one("span.info")
            ranking_container = row.select_one("span.rankings-info")

            name = info_container.select_one("h4").text

            # for some reason, 3 persons have duplicate entries, so skip if already done
            if name in data:
                print(name + " already there")
                continue

            # The name is only a link when the person has a dedicated page.
            url_element = info_container.select_one("h4 a")

            record = {
                "world_ranking": positions_container.select_one(
                    "span:nth-child(1)"
                ).text.strip(),
                "national_ranking": positions_container.select_one(
                    "span:nth-child(2)"
                ).text.strip(),
                "university": info_container.select_one("span.sh").text,
                "img": info_container.select_one("img")["src"],
                "h_index": ranking_container.select_one(
                    "span:nth-child(1)"
                ).text.strip(),
                "citations": ranking_container.select_one(
                    "span:nth-child(2)"
                ).text.strip(),
                "dblp_articles": ranking_container.select_one(
                    "span:nth-child(3)"
                ).text.strip(),
            }

            if url_element is not None:
                record["url"] = url_element["href"]

            data[name] = record

print(len(data))


Fei-Yue Wang already there
Chin-Hui Lee already there
Gerhard Fischer already there
997


In [112]:
# now go to the dedicated page for each person, if it exists
#
# For every record in `data` that has a "url", fetch that page and add:
#   - "publications": text of the 4th span in the first metrics table row
#   - "awards": list of the award paragraph texts
#   - one key per sidebar link, named after the link text, holding its href
# Keys are only added when the corresponding element is found on the page.

# Browser-like User-Agent sent with every request.
profile_headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
}
# Seconds to wait for the server; without a timeout a stalled connection
# would hang the cell forever.
profile_timeout = 30

# One session for all profile pages so the underlying connection is reused.
with requests.Session() as session:
    session.headers.update(profile_headers)

    for name, person in data.items():
        if "url" not in person:
            continue

        try:
            response = session.get(person["url"], timeout=profile_timeout)
            response.raise_for_status()
        except requests.RequestException as error:
            # One unreachable profile should not abort the whole run; report
            # it and leave this person's record without the extra fields.
            print("skipping " + name + ": " + str(error))
            continue

        page_data = response.text
        soup = BeautifulSoup(page_data, features="html.parser")

        publications_element = soup.select_one(".metrics-table__row span:nth-child(4)")
        awards_elements = soup.select(".profile-achievements .tab-slide > p")
        external_link_elements = soup.select(".sidebar > ul > li > a")

        if publications_element is not None:
            person["publications"] = publications_element.text

        if awards_elements:
            person["awards"] = [paragraph.text for paragraph in awards_elements]

        for link in external_link_elements:
            href = link.get("href")
            # Anchors without an href carry no link to store.
            if href is not None:
                person[link.text.strip()] = href


In [113]:
# Purpose of this cell: rename keys like 'Personal Website for <name>' to 'Personal Website'

# Show the first record before the rename, for a before/after comparison.
for record in data.values():
    print(record)
    break

for record in data.values():
    # First key that starts with "Personal Website" but carries a suffix;
    # None when the record has no such key.
    suffixed_key = next(
        (
            key
            for key in record
            if key != "Personal Website" and key.startswith("Personal Website")
        ),
        None,
    )

    if suffixed_key:
        # Move the value over to the short key and drop the suffixed one.
        record["Personal Website"] = record.pop(suffixed_key)

# Show the same first record again after the rename.
for record in data.values():
    print(record)
    break


{'world_ranking': '1', 'national_ranking': '1', 'university': 'Michigan State University, United States', 'img': 'https://s.research.com/images/f99a49fa716c01363bf32dbfac706dc775c771af-65x65.jpeg', 'h_index': '196', 'citations': '221,989', 'dblp_articles': '740', 'url': 'https://research.com/u/anil-k-jain', 'publications': '740', 'awards': ['2019 - Fellow, The World Academy of Sciences ', '2019 - Foreign Member, Chinese Academy of Sciences ', '2016 - Foreign Member, Indian National Academy of Engineering ', '2016 - Member, National Academy of Engineering (US) ', '2015 - Fellow, National Academy of Inventors ', '2007 - W. Wallace McDowell Award For pioneering contributions to theory, technique, and practice of pattern recognition, computer vision, and biometric recognition systems.', '2006 - SPIE Fellow ', '2005 - Fellow, American Association for the Advancement of Science (AAAS) ', '2003 - ACM Fellow ', '2003 - IEEE CS Technical Achievement Award For contributions to pattern recognitio

In [7]:
# Reload the scraped records from disk and export the Google Scholar profile
# links of everyone who has one, so they can be scraped separately.

# The pickle below was produced from `data` with:
# print(len(data))
# pickle.dump(data, open('pickle_research.com', 'wb'))

# NOTE: pickle.load can execute arbitrary code, so only load a file that
# this notebook wrote itself.
with open("pickle_research.com", "rb") as records_file:
    temp = pickle.load(records_file)
print(len(temp))

google_scholar = [
    person["Google Scholar Profile"]
    for person in temp.values()
    if "Google Scholar Profile" in person
]

# dump the google scholar profile links to further scrape that
# (the `with` block guarantees the file is flushed and closed)
with open("pickle_research.com_gs_profiles", "wb") as profiles_file:
    pickle.dump(google_scholar, profiles_file)
print(google_scholar)

# file = open('temp', 'w')
# for k,v in data.items():
#     file.write(k + ' -> ' + str(v) + '\n\n')


997
['https://scholar.google.com/citations?hl=pl&user=g-_ZXGsAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=kukA0LcAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=tk2qT34AAAAJ', 'https://scholar.google.com/citations?hl=pl&user=Kv9AbjMAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=yxUduqMAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=S2OjOvYAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=D0lL1r0AAAAJ', 'https://scholar.google.com/citations?hl=pl&user=LQ87h3sAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=UZ5wscMAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=m1qAiOUAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=rGF6-WkAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=JicYPdAAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=GUAoEcAAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=7K34d7cAAAAJ', 'https://scholar.google.com/citations?hl=pl&user=DZ-fHPgAAAAJ', 'https://scholar.google.com/citatio

In [1]:
import pickle, json

# Convert the pickled records to a human-readable JSON file.

# NOTE(review): the commented-out dump below targets 'pickle_mgp', not the
# 'pickle_research.com' file loaded here; it looks like a leftover, confirm.
# pickle.dump(data, open('pickle_mgp', 'wb'))

# NOTE: pickle.load can execute arbitrary code, so only load a file that
# this notebook wrote itself.
with open("pickle_research.com", "rb") as records_file:
    temp = pickle.load(records_file)

# `default=str` stringifies any value json cannot serialise natively; the
# `with` block guarantees the JSON is flushed and the file closed.
with open("research.json", "w", encoding="utf-8") as json_file:
    json.dump(temp, json_file, default=str, indent=4)
print(len(temp))


997
