In [1]:
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# The scraping functions below call requests.get(..., verify=False) (see the
# TODO in get_jobs), and urllib3 emits an InsecureRequestWarning for each such
# request. Silence that one warning category so the scan output stays readable.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

class UMichJob:
    """One job posting on careers.umich.edu, identified by its job id.

    Only ``job_id`` and the derived ``url`` are known at construction time.
    Every other attribute starts as an empty string and is meant to be filled
    in afterwards (``get_job_info`` does this by scraping the detail page).

    Identity is defined by ``url``: two instances with the same URL compare
    equal and hash alike, so duplicates collapse in sets and dict keys.

    Note: attribute assignment order in ``__init__`` is significant, because
    ``vars(job)`` is used to build DataFrame rows and column order follows it.
    """

    def __init__(self, job_id):
        self.url = f'https://careers.umich.edu/job_detail/{job_id}/'
        self.job_id = job_id
        self.title = ''
        self.location = ''
        self.reg_temp = ''
        self.dept = ''
        self.start_dt = ''
        self.end_dt = ''
        self.salary_low = ''
        self.salary_high = ''
        self.career_interest = ''

    def __str__(self):
        """Return the posting's URL."""
        return self.url

    def __repr__(self):
        """Use the same text as ``__str__`` so lists of jobs print as URLs."""
        return self.__str__()

    def __eq__(self, other):
        """Compare by URL; defer to Python's default for unrelated objects."""
        # Objects without a ``url`` (None, strings, ...) are simply "not equal".
        # Returning NotImplemented lets Python fall back to its default
        # comparison instead of raising AttributeError on ``other.url``.
        if not hasattr(other, 'url'):
            return NotImplemented
        return self.url == other.url

    def __hash__(self):
        """Hash by URL, consistent with ``__eq__``."""
        return hash(self.url)


def reached_end(soup):
    """Tell whether a parsed search-results page is the "no results" page.

    Args:
        soup: Parsed page exposing ``find_all('p')`` whose items have ``.text``.

    Returns:
        True if any ``<p>`` element's text contains the site's "no posted jobs"
        notice, otherwise False.
    """
    notice = 'There are currently no posted jobs fitting the criteria you selected'
    return any(
        paragraph is not None and notice in paragraph.text
        for paragraph in soup.find_all('p')
    )

def _job_id_from_href(href):
    """Return the path segment that follows 'job_detail' in ``href``, or None."""
    segments = href.split('/')
    if 'job_detail' not in segments:
        return None
    position = segments.index('job_detail') + 1
    if position < len(segments) and segments[position]:
        return segments[position]
    return None


def get_jobs(career_interest = 'All', page_limit = 50, job_limit = None, title = '', keyword = '', timeout = 30):
    """Collect job postings from the careers.umich.edu search results.

    Pages through the search results until the "no posted jobs" page is hit,
    ``page_limit`` pages have been scanned, or ``job_limit`` jobs were found.

    Args:
        career_interest: Value for the ``career_interest`` query parameter
            ('All' by default; the site also accepts numeric ids).
        page_limit: Maximum number of result pages to request.
        job_limit: Maximum number of jobs to return, or None for no limit.
        title: Value for the ``title`` query parameter.
        keyword: Value for the ``keyword`` query parameter.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of ``UMichJob`` objects (only ``url``/``job_id`` populated).
        The list may contain duplicates if a posting is linked more than once.

    Raises:
        requests.HTTPError: If a search page answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    search_url = 'https://careers.umich.edu/search-jobs'
    jobs = []

    for pageNum in range(page_limit):
        print(f'Scanning page {pageNum+1}...')

        # Let requests build the query string so titles/keywords containing
        # spaces, '&', '#', etc. are percent-encoded instead of corrupting the URL.
        params = {
            'career_interest': career_interest,
            'page': pageNum,
            'title': title,
            'keyword': keyword,
        }
        # TODO: remove verify=False once their site starts working again
        # SECURITY: verify=False disables TLS certificate checking.
        response = requests.get(search_url, params=params, verify=False, timeout=timeout)
        # An error page would otherwise be parsed as "a page with no jobs" and
        # the loop would keep requesting pages until page_limit.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        if reached_end(soup):
            print(f'\nReached the end on page number {pageNum+1}')
            break

        for a in soup.find_all('a'):
            href = a.get('href')
            if href is None:
                continue
            # Locate the id relative to 'job_detail' rather than at a fixed
            # index, so absolute URLs and short hrefs are handled correctly.
            job_id = _job_id_from_href(href)
            if job_id is not None:
                jobs.append(UMichJob(job_id))

        if job_limit is not None and len(jobs) >= job_limit:
            break

    if job_limit is None:
        return jobs
    # A whole page is appended before the limit is checked, so trim the overshoot.
    return jobs[:max(job_limit, 0)]

def get_job_info(job, timeout = 30):
    """Scrape a posting's detail page and fill in the fields of ``job`` in place.

    The page is scanned for ``<div>`` elements containing both an ``<h3>``
    heading and a ``<p>`` value; the heading text decides which attribute of
    ``job`` receives the value. Problems are reported with a printed message
    rather than an exception so one bad posting does not stop a batch run.

    Args:
        job: Object with a ``url`` attribute and the ``UMichJob`` fields.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        None. ``job`` is mutated.
    """
    # SECURITY: verify=False disables TLS certificate checking (kept to match
    # get_jobs while the site's certificate is broken).
    response = requests.get(job.url, verify=False, timeout=timeout)
    if not response.ok:
        # Don't parse an error page as if it were a job posting.
        print(f'\tError fetching job page (HTTP {response.status_code})')
        return
    soup = BeautifulSoup(response.text, 'html.parser')

    for div in soup.find_all('div'):
        h3 = div.find('h3')
        if h3 is None:
            continue

        p = div.find('p')
        if p is None:
            continue

        h3_text = h3.text.lower()
        p_text = p.text

        # Order matters: the first matching keyword wins.
        if 'working title' in h3_text:
            job.title = p_text
        elif 'work location' in h3_text:
            job.location = p_text
        elif 'regular/temporary' in h3_text:
            job.reg_temp = p_text
        elif 'department' in h3_text:
            job.dept = p_text
        elif 'date' in h3_text:
            date_range = p_text.split(' - ')
            try:
                job.start_dt = date_range[0]
                job.end_dt = date_range[1]
            except IndexError:
                # No ' - ' separator: start_dt keeps the raw text, end_dt is untouched.
                print('\tError scraping dates')
        elif 'salary' in h3_text:
            salary_range = p_text.split(' - ')
            try:
                job.salary_low = salary_range[0]
                job.salary_high = salary_range[1]
            except IndexError:
                # No ' - ' separator: salary_low keeps the raw text.
                print('\tError scraping salary')
        elif 'interest' in h3_text:
            # Assign rather than append, so scraping the same job twice does not
            # duplicate entries or chop the first character off existing data.
            job.career_interest = ';'.join(i.text for i in div.find_all('p'))
                       

In [3]:
# Run several overlapping searches and pool the results.
# career_interest=210 is the site's id for IT.
jobs = get_jobs(career_interest=210)
jobs.extend(get_jobs(title='analyst'))
jobs.extend(get_jobs(title='data'))
jobs.extend(get_jobs(keyword='python'))
# De-duplicate (UMichJob hashes/compares by URL) while keeping first-seen order.
# A set would reorder the jobs differently on every kernel restart because
# string hashes are randomised per process, making the table/CSV unstable.
jobs = list(dict.fromkeys(jobs))

print(f'{len(jobs)} jobs found')
print('Scraping job info...')
for count, job in enumerate(jobs, start=1):
    print(f'{count}: {job}')
    get_job_info(job)

# One row per job; column order follows the attribute order set in UMichJob.__init__.
job_dicts = [vars(job) for job in jobs]

df = pd.DataFrame(job_dicts)
df

Scanning page 1...
Scanning page 2...
Scanning page 3...

Reached the end on page number 3
Scanning page 1...
Scanning page 2...

Reached the end on page number 2
Scanning page 1...
Scanning page 2...

Reached the end on page number 2
Scanning page 1...
Scanning page 2...
Scanning page 3...

Reached the end on page number 3
65 jobs found
Scraping job info...
1: https://careers.umich.edu/job_detail/239645/
2: https://careers.umich.edu/job_detail/238453/
3: https://careers.umich.edu/job_detail/231143/
4: https://careers.umich.edu/job_detail/239849/
5: https://careers.umich.edu/job_detail/239765/
6: https://careers.umich.edu/job_detail/229396/
7: https://careers.umich.edu/job_detail/238762/
8: https://careers.umich.edu/job_detail/233405/
9: https://careers.umich.edu/job_detail/239192/
10: https://careers.umich.edu/job_detail/239652/
11: https://careers.umich.edu/job_detail/239492/
12: https://careers.umich.edu/job_detail/229901/
	Error scraping dates
13: https://careers.umich.edu/job_deta

Unnamed: 0,url,job_id,title,location,reg_temp,dept,start_dt,end_dt,salary_low,salary_high,career_interest
0,https://careers.umich.edu/job_detail/239645/,239645,Claims Data Analyst Sr/Inter,Ann Arbor Campus,Regular,MM Family Medicine,9/19/2023,9/26/2023,,,Research
1,https://careers.umich.edu/job_detail/238453/,238453,RESEARCH FELLOW,Ann Arbor Campus,Regular,MM Int Med-Hematology/Oncology,8/31/2023,9/30/2023,,,Research Fellows
2,https://careers.umich.edu/job_detail/231143/,231143,Research App Programmer/Analyst,Ann Arbor Campus,Regular,Biostatistics Department,9/11/2023,9/30/2023,"$87,324.00","$107,871.00",Information Technology
3,https://careers.umich.edu/job_detail/239849/,239849,Business Intelligence Analyst Senior / Interme...,Michigan Medicine - Ann Arbor,Regular,MM HITS EDIS Dpt Info Delivery,9/22/2023,10/06/2023,,,Administration;Information Technology
4,https://careers.umich.edu/job_detail/239765/,239765,RESEARCH ASST I (TEMP) - Data Science,Flint Campus,Temporary,Flint IDEAS,9/21/2023,1/13/2024,$15.00,$15.00,Temporary Job Opening
...,...,...,...,...,...,...,...,...,...,...,...
60,https://careers.umich.edu/job_detail/239791/,239791,Clinical Info Analyst Sr / Inter,Michigan Medicine - Ann Arbor,Regular,MM Quality - Quality Reporting,9/20/2023,10/04/2023,"$58,540.00","$102,610.00",Healthcare Admin & Support
61,https://careers.umich.edu/job_detail/239751/,239751,Business Systems Analyst Senior / Intermediate,Michigan Medicine - Ann Arbor,Regular,HITS AcadIT Application Servic,9/21/2023,10/05/2023,,,Information Technology
62,https://careers.umich.edu/job_detail/239686/,239686,Business Systems Analyst Intermediate,Michigan Medicine - Ann Arbor,Regular,HITS AcadIT Application Servic,9/18/2023,10/02/2023,,,Information Technology
63,https://careers.umich.edu/job_detail/239799/,239799,Training Specialist Intermediate,Michigan Medicine - Ann Arbor,Regular,MM HITS BusIT LDD Wolverine,9/21/2023,10/05/2023,,,Human Resources;Information Technology


In [4]:
# Write the scraped postings next to the notebook; index=False leaves the
# DataFrame's row index out of the file.
df.to_csv(path_or_buf='umich_jobs.csv', index=False)