In [None]:
# In this notebook we scrape the relevant information from the Price School faculty table and from each
# faculty member's individual page on the Price School website,
# then write it all to a CSV with email / name as the primary keys.

In [None]:
# Important Links -
# https://priceschool.usc.edu/faculty/
# https://scholar.google.com/
# https://colab.research.google.com/github/aeturrell/coding-for-economists/blob/main/data-databases.ipynb#scrollTo=0074d4d3


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive/.
# NOTE(review): presumably required because the final CSV export in this notebook
# writes under /content/drive/MyDrive — confirm before removing.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# import necessary packages
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import re
import csv
import pandas as pd

In [None]:
# Download the faculty listing page and locate the faculty table inside it.
url = "https://priceschool.usc.edu/faculty/"
# NOTE(review): a browser-like User-Agent is sent explicitly; presumably the site
# rejects urllib's default one — confirm.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

# read the body inside a `with` block so the HTTP response is always closed
with urlopen(req) as response:
    webpage = response.read()

# parse all the contents of the webpage
page_soup = soup(webpage, "html.parser")

# select only the faculty table we are interested in
details_list = page_soup.find(id="full-faculty-list-items")
if details_list is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of letting a later cell die with an opaque AttributeError.
    raise RuntimeError(
        "Could not find an element with id 'full-faculty-list-items' on "
        + url
        + "; the page layout may have changed."
    )

In [None]:
# Go through the faculty table on the listing page and build one record per row.
# Produces the lists `links`, `names`, `positions`, `expertises` and the DataFrame `df`.

if details_list is None:
    # The table lookup returns None when the element is absent; stop with a
    # readable error rather than "'NoneType' object has no attribute 'find_all'".
    raise RuntimeError(
        "details_list is None: the faculty table was not found on the listing page."
    )


def _cell_text(row, css_class):
    """Return the stripped text of the row's <td> with the given class, or "NA" if absent."""
    cell = row.find("td", class_=css_class)
    if cell is None:
        return "NA"
    return cell.text.strip()


profPrice_list = details_list.find_all("tr", class_="item clickable")
links = []
names = []
positions = []
expertises = []

# loop through each row and get relevant information
for item in profPrice_list:
    # the profile URL is stored on the row itself, in its data-url attribute
    links.append(item.get('data-url'))
    names.append(_cell_text(item, "name"))
    positions.append(_cell_text(item, "position"))
    expertises.append(_cell_text(item, "expertise"))

# put info into a table
data = {
    "Name": names,
    "Link": links,
    "Position": positions,
    "Expertise": expertises,
}
df = pd.DataFrame(data)

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
# preview the faculty table built from the listing page (Name, Link, Position, Expertise)
print(df)

                      Name                                               Link  \
0              Emma Aguila    https://priceschool.usc.edu/people/emma-aguila/   
1              Grace Bahng    https://priceschool.usc.edu/people/grace-bahng/   
2     Christine M. Beckman  https://priceschool.usc.edu/people/christine-m...   
3         Antonio M. Bento  https://priceschool.usc.edu/people/antonio-m-b...   
4               Tara Blanc     https://priceschool.usc.edu/people/tara-blanc/   
..                     ...                                                ...   
74            Michael Thom   https://priceschool.usc.edu/people/michael-thom/   
75          Bryan Tysinger  https://priceschool.usc.edu/people/bryan-tysin...   
76  Detlof von Winterfeldt  https://priceschool.usc.edu/people/detlof-von-...   
77       Frank V. Zerunyan  https://priceschool.usc.edu/people/frank-zerun...   
78  Julie M. Zissimopoulos  https://priceschool.usc.edu/people/julie-m-zis...   

                           

In [None]:
# Now go through each faculty member's profile link and collect, per page:
# the decoded email, degree, degree institution, and the first Google Scholar
# link found in the bio. Missing values are recorded as "NA" (and "no" for
# Has_Scholar), so every list gets exactly one entry per link.

SCHOLAR_PREFIX = "https://scholar.google.com/"


def cfDecodeEmail(s):
    """Decode a hex-encoded, XOR-obfuscated email string.

    The first two hex digits of `s` are the XOR key; every following pair of
    hex digits is one character XOR-ed with that key.
    NOTE(review): presumably this is Cloudflare's email-protection encoding — confirm.

    Raises ValueError if `s` is not valid hexadecimal.
    """
    key = int(s[:2], 16)
    return ''.join(chr(int(s[pos:pos + 2], 16) ^ key) for pos in range(2, len(s), 2))


def _extract_email(page):
    """Return the decoded email from the element with class "email", or "NA" if unavailable."""
    holder = page.find(class_="email")
    anchor = holder.a if holder is not None else None
    href = anchor.get("href") if anchor is not None else None
    # the encoded email is whatever follows the first "#" in the href
    if not href or "#" not in href:
        return "NA"
    try:
        return cfDecodeEmail(href.split("#")[1])
    except ValueError:  # payload was empty or not valid hex
        return "NA"


def _text_or_na(page, css_class):
    """Return the stripped text of the first element with the given class, or "NA"."""
    elem = page.find(class_=css_class)
    if elem is None:
        return "NA"
    return elem.text.strip()


def _find_scholar_link(page):
    """Return the first Google Scholar URL linked from the element with id "bio", or "NA"."""
    bio = page.find(id="bio")
    if bio is None:
        return "NA"
    for anchor in bio.find_all('a'):
        # .get() so an <a> without an href is skipped instead of aborting the search
        href = anchor.get('href', '')
        if href.startswith(SCHOLAR_PREFIX):
            # some teachers have 2 google scholar links, just grab the first one
            return href
    return "NA"


emails = []
degrees = []
degrees_inst = []
scholar_links = []
scholar_on_webpage = []
# publications = []

# loop through each faculty page
for URL in links:
    print(".", end="")  # progress indicator: one dot per page
    req = Request(URL, headers={'User-Agent': 'Mozilla/5.0'})
    # read inside a `with` block so each HTTP response is closed
    with urlopen(req) as response:
        webpage = response.read()
    page_soup = soup(webpage, "html.parser")

    emails.append(_extract_email(page_soup))
    degrees.append(_text_or_na(page_soup, "degree"))
    degrees_inst.append(_text_or_na(page_soup, "deg_inst"))

    scholar_link = _find_scholar_link(page_soup)
    scholar_links.append(scholar_link)
    scholar_on_webpage.append("no" if scholar_link == "NA" else "yes")

df["Email"] = emails
df["Has_Scholar"] = scholar_on_webpage
df["Scholar_Link"] = scholar_links
df["Degree"] = degrees
df["Degree_Institution"] = degrees_inst


...............................................................................

In [None]:
# Reorder the columns so that Email and Name come first, then print the table.
column_order = [
    'Email',
    'Name',
    'Link',
    'Position',
    'Expertise',
    'Has_Scholar',
    'Scholar_Link',
    'Degree',
    'Degree_Institution',
]
df = df.reindex(columns=column_order)
print(df)

                     Email                    Name  \
0         eaguilav@usc.edu             Emma Aguila   
1           gbahng@usc.edu             Grace Bahng   
2         cbeckman@usc.edu    Christine M. Beckman   
3           abento@usc.edu        Antonio M. Bento   
4            blanc@usc.edu              Tara Blanc   
..                     ...                     ...   
74          mdthom@usc.edu            Michael Thom   
75        btysinge@usc.edu          Bryan Tysinger   
76        winterfe@usc.edu  Detlof von Winterfeldt   
77  frank.zerunyan@usc.edu       Frank V. Zerunyan   
78        zissimop@usc.edu  Julie M. Zissimopoulos   

                                                 Link  \
0     https://priceschool.usc.edu/people/emma-aguila/   
1     https://priceschool.usc.edu/people/grace-bahng/   
2   https://priceschool.usc.edu/people/christine-m...   
3   https://priceschool.usc.edu/people/antonio-m-b...   
4      https://priceschool.usc.edu/people/tara-blanc/   
..       

In [None]:
# Write the table as a CSV without the index column.
# WARNING: this will overwrite whatever is already in the folder as faculty.csv!
df.to_csv(r'/content/drive/MyDrive/urban_futures/data/faculty.csv', index=False)