In [2]:
##
# @file scrape_mgp.ipynb
#
# @brief Scrape data (name, student details, education, etc.) from the Mathematics Genealogy Project (https://genealogy.math.ndsu.nodak.edu)
##

# Setup
from bs4 import BeautifulSoup
import requests, json


In [4]:
# URL prefix of an MGP profile page; a person's MGP id is appended to it to build the page URL
base_url = "https://genealogy.math.ndsu.nodak.edu/id.php?id="


In [5]:
# Load data to get MGP IDs.
# The file is opened in a `with` block so the handle is closed as soon as parsing is done.
with open("math_genea_mapping.json") as mapping_file:
    data = json.load(mapping_file)
print(len(data))


3469


In [6]:
# get dictionary containing all attributes of a student of that professor
def get_student(student_row):
    """! Convert one row of the students table into a plain dictionary

    @param student_row   row element; its "td" cells are read by position
                         (0: name, 1: school, 2: year, 3: descendants)

    @return     Dictionary with the keys name, id, school, year and descendants.
                Every value is the unmodified text of the matching cell, except id,
                which is the href attribute of the link inside the name cell
    """

    cells = student_row.select("td")

    # Entries are evaluated top to bottom, so a malformed row fails at the first bad cell
    return {
        "name": cells[0].text,
        "id": cells[0].select_one("a")["href"],
        "school": cells[1].text,
        "year": cells[2].text,
        "descendants": cells[3].text,
    }


# Records whose page could not be fetched or parsed (about 3 out of 1000 in practice).
# MGP pages have almost no class names or ids, so the selectors below are purely
# positional and a few atypical pages still fail to match them.
error_objs = []

# Browser-like User-Agent sent with every request
request_headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
}

itr = 0

for obj in data:
    print("itr", itr)
    itr += 1
    # Checkpoint everything gathered so far every 20 records; the `with` block
    # makes sure the file is flushed and closed before the loop continues.
    if itr % 20 == 0:
        with open("data.json", "w") as checkpoint_file:
            json.dump(data, checkpoint_file, indent=4, default=str)

    try:
        # Named mgp_id (not `id`) so the builtin id() is not shadowed
        mgp_id = obj["Mathematics Genealogy Project ID (P549)"]

        # id is null for some of the people
        if not mgp_id:
            continue

        print("id", mgp_id)

        # Some records hold more than one id (a sequence instead of a string);
        # take the first one and continue
        if not isinstance(mgp_id, str):
            mgp_id = mgp_id[0]

        # The timeout keeps one stalled connection from hanging the whole run;
        # an HTTP error status is raised here instead of parsing an error page.
        response = requests.get(
            base_url + mgp_id,
            headers=request_headers,
            timeout=30,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, features="html.parser")

        cur_data = {}  # just for storing MGP data for current obj
        cur_data["name"] = soup.select_one("h2").text.strip()

        advisor_element = soup.select_one("h2 + p + div + div + div ~ p > a")
        cur_data["advisor"] = {
            "name": advisor_element.text,
            # Literal key "id" (same key get_student uses); holds the link's href
            "id": advisor_element["href"],
        }

        cur_data["education"] = soup.select_one("h2 + p + div + div").text.strip()
        cur_data["dissertation"] = soup.select_one("#thesisTitle").text.strip()

        student_table = soup.select_one("h2 + p + div + div + div ~ table")
        if student_table:
            # Every row except the first one (the first row is skipped by the selector)
            student_rows = student_table.select("tr:not(:nth-child(1))")
            cur_data["students"] = list(map(get_student, student_rows))

        # Attach the scraped data only when every step above succeeded
        obj["mgp_data"] = cur_data
    except Exception as exc:
        # Best effort: remember the failing record and move on. Catching Exception
        # (not a bare except) keeps KeyboardInterrupt able to stop the loop.
        print("error", repr(exc))
        error_objs.append(obj)


itr 0
itr 1
itr 2
itr 3
itr 4
itr 5
itr 6
itr 7
itr 8
itr 9
itr 10
itr 11
itr 12
itr 13
itr 14
itr 15
itr 16
itr 17
itr 18
itr 19
itr 20
itr 21
itr 22
itr 23
itr 24
itr 25
itr 26
itr 27
itr 28
itr 29
itr 30
itr 31
itr 32
itr 33
itr 34
itr 35
itr 36
itr 37
itr 38
itr 39
itr 40
itr 41
itr 42
itr 43
itr 44
itr 45
itr 46
itr 47
itr 48
itr 49
itr 50
itr 51
itr 52
itr 53
itr 54
itr 55
itr 56
itr 57
itr 58
itr 59
itr 60
itr 61
itr 62
itr 63
itr 64
itr 65
itr 66
itr 67
itr 68
itr 69
itr 70
itr 71
itr 72
itr 73
itr 74
itr 75
itr 76
itr 77
itr 78
itr 79
itr 80
itr 81
itr 82
itr 83
itr 84
itr 85
itr 86
itr 87
itr 88
itr 89
itr 90
itr 91
itr 92
itr 93
itr 94
itr 95
itr 96
itr 97
itr 98
itr 99
itr 100
itr 101
itr 102
itr 103
itr 104
itr 105
itr 106
itr 107
itr 108
itr 109
itr 110
itr 111
itr 112
itr 113
itr 114
itr 115
itr 116
itr 117
itr 118
itr 119
itr 120
itr 121
itr 122
itr 123
itr 124
itr 125
itr 126
itr 127
itr 128
itr 129
itr 130
itr 131
itr 132
itr 133
itr 134
itr 135
itr 136
itr 137
itr 13

In [35]:
# Number of records that failed during scraping, followed by the total number of records.
# Uses error_objs, the list filled by the scraping loop.
print(len(error_objs))
print(len(data))


3
1362


In [20]:
# Write the data to data.json; the `with` block guarantees the file is flushed and closed
with open("data.json", "w") as output_file:
    json.dump(data, output_file, indent=4, default=str)


In [12]:
# Load mgp.json (presumably the result of an earlier scrape; confirm) and show its size.
# The `with` block closes the file handle once parsing is done.
with open("mgp.json") as old_file:
    old_data = json.load(old_file)
len(old_data)


1362

In [13]:
# Reload the data written to data.json and show its size.
# The `with` block closes the file handle once parsing is done.
with open("data.json") as data_file:
    data = json.load(data_file)
len(data)


3469

In [16]:
# Spot check: is "94554" (presumably an MGP id; confirm) present in old_data?
"94554" in old_data


True