# Extracting the Graph of Mathematicians
Amirabbas Asadi

In [None]:
import requests as rq
from bs4 import BeautifulSoup

In [None]:
# Mount Google Drive at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Extracting doctoral advisors and doctoral students

In [None]:
# Site prefix prepended to the relative article paths (e.g. '/wiki/Felix_Klein').
url_pref = 'https://en.wikipedia.org'


def _extract_people(rows, keyword):
  """Return (name, href) pairs linked from the first row whose text contains keyword.

  Only the first matching row is inspected. A link is kept when its text
  starts with an uppercase letter (this drops footnote markers such as
  '[1]') and does not mention a university.
  """
  for row in rows:
    if keyword in row.get_text().lower():
      break
  else:
    # No row mentions the keyword at all.
    return []

  cell = row.find('td')
  if cell is None:
    # The matching row has no data cell, so there is nothing to read.
    return []

  people = []
  for link in cell.find_all('a', href=True):
    name = link.get_text()
    # Test the whole name for 'university', not just its first character,
    # and skip links with empty text instead of failing on name[0].
    if name and name[0].isupper() and 'university' not in name.lower():
      people.append((name, link['href']))
  return people


def get_info(url):
  """Fetch an article and return its advisors and students from the infobox.

  Args:
    url: Path relative to url_pref, e.g. '/wiki/Rudolf_Lipschitz'.

  Returns:
    A tuple (advisors, students); each is a list of (name, href) pairs.
    Both lists are empty when the page has no infobox table.

  Raises:
    Whatever the HTTP request raises (connection errors, timeouts).
  """
  # A timeout keeps one stalled request from blocking a long crawl.
  page = rq.get(url_pref + url, timeout=30)
  soup = BeautifulSoup(page.content, 'html.parser')
  info = soup.find('table', class_='infobox')
  if info is None:
    return [], []
  rows = info.find_all('tr')
  return _extract_people(rows, 'advisor'), _extract_people(rows, 'student')

In [None]:
# Try get_info on one article path; it returns an (advisors, students) pair.
get_info('/wiki/Rudolf_Lipschitz')

([('Gustav Dirichlet', '/wiki/Peter_Gustav_Lejeune_Dirichlet'),
  ('Martin Ohm', '/wiki/Martin_Ohm')],
 [('Felix Klein', '/wiki/Felix_Klein')])

## Performing BFS

In [None]:
from collections import deque

import networkx as nx

# Breadth-first crawl of advisor/student links, starting from one seed page.
# S is the FIFO frontier of (name, href) pairs. A deque gives O(1) removal
# from the front, whereas list.pop(0) shifts every remaining element.
S = deque([('William Feller', '/wiki/William_Feller')])
# Directed graph keyed by href, with edges pointing advisor -> student.
graph = nx.DiGraph()
# Maps href -> display name for every page already dequeued or enqueued.
visited = {}
max_visit = 1000000
while S and len(visited) < max_visit:
  v = S.popleft()
  visited[v[1]] = v[0]
  try:
    advisors, students = get_info(v[1])
  except Exception:
    # Best effort: a page that cannot be fetched or parsed contributes no
    # edges. Catching Exception rather than using a bare except keeps
    # KeyboardInterrupt working, so the crawl can still be stopped by hand.
    advisors, students = [], []

  for adv_n, adv_u in advisors:
    graph.add_edge(adv_u, v[1])
    if adv_u not in visited:
      S.append((adv_n, adv_u))
      # Mark on enqueue so the same page is never queued twice.
      visited[adv_u] = adv_n
      print(adv_n)

  for std_n, std_u in students:
    graph.add_edge(v[1], std_u)
    if std_u not in visited:
      S.append((std_n, std_u))
      visited[std_u] = std_n
      print(std_n)

Streaming output truncated to the last 5000 lines.
Geoffrey Riley
Thomas Ward
Narendra Ahuja
Larry Davis
Matti Pietikäinen
Angela Y. Wu
David S. Doermann
Fritz Sauter
Rudolf Haag
Klaus Samelson
Friedrich Ludwig Bauer
Bingsheng He
Hjalmar Mellin
Ernst Lindelöf
Rolf Nevanlinna
Paul Garabedian
Dale Husemoller
Albert Marden
Robert Osserman
Henry Pollak
Halsey Royden
George Springer
Burton Rodin
Abraham Silvers
Dennis Hada
Pentti Suomela
András Sebő
Éva Tardos
Luigi Puccianti
Paul Ehrenfest
Harold Agnew
Edoardo Amaldi
Owen Chamberlain
Geoffrey Chew
Mildred Dresselhaus
Jerome Friedman
Richard Garwin
Marvin Goldberger
Tsung-Dao Lee
Ettore Majorana
Arthur Rosenfeld
Emilio Segrè
Sam Treiman
Peter D. Jarvis
George M. Church
Helen Donis-Keller
Jack Greenblatt
Gerald Guralnik
Benno Müller-Hill
David Fairlie
Peter Goddard
Peter Landshoff
Sir Tom Kibble
James Stirling
Richard J. Eden
Paul Frampton
John Harnad
David Olive
Douglas Ross
Raymond Streater
P. T. Matthews
John Wheater
Ian Hin

## Saving the graph

In [None]:
import pickle

# Persist the crawl results: the edge graph and the href -> name mapping.
for path, obj in (
    ('gdrive/My Drive/graph.data', graph),
    ('gdrive/My Drive/visited.data', visited),
):
  with open(path, 'wb') as fp:
    pickle.dump(obj, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Searching for a mathematician

In [None]:
# Print every node (article path) that contains the substring 'Henri'.
for node in graph.nodes:
  if 'Henri' in node:
    print(node)

/wiki/Peter_Henrici_(mathematician)
/wiki/Olaus_Henrici
/wiki/Peter_K._Henrici
/wiki/Henri_Villat
/wiki/Henri_Lebesgue
/wiki/Henri_Pad%C3%A9
/wiki/Henri_Poincar%C3%A9
/wiki/Henri_Bortoft
/wiki/%C3%89mile_Henriot_(chemist)
/wiki/Henri_Cartan
/wiki/Henri_Hogbe_Nlend
/wiki/Henri_Darmon
/wiki/Henrik_Steffens
/wiki/Henri_Gouraud_(computer_scientist)
/wiki/Henri_Gillet
/wiki/Jacobus_Henricus_van_%27t_Hoff
/wiki/Henri_Berestycki
/wiki/Henri_Victor_Regnault
/wiki/Henri_Moscovici
/wiki/Henricus_Regius


## Saving the graph as CSV for visualization

In [None]:
# Translate each (source, target) edge from article paths to display names.
edges = [[visited[src], visited[dst]] for src, dst in graph.edges]

In [None]:
import pandas as pd

# One row per edge; the two columns hold the source and target names.
graph_df = pd.DataFrame(data=edges)
# A bare expression at the end of a cell is displayed by the notebook.
graph_df

Unnamed: 0,0,1
0,Richard Courant,William Feller
1,Richard Courant,Leifur Ásgeirsson
2,Richard Courant,Herbert Busemann
3,Richard Courant,Kurt Friedrichs
4,Richard Courant,Harold Grad
...,...,...
8862,Frederick Gowland Hopkins,Antoinette Pirie
8863,Frederick Gowland Hopkins,Judah Hirsch Quastel
8864,Frederick Gowland Hopkins,Malcolm Dixon
8865,Manuela Veloso,Peter Stone


In [None]:
# Write the edge list to disk with neither the index column nor a header row.
graph_df.to_csv('math-graph.csv', index=False, header=False)