In [2]:
import re
import time
from pathlib import Path
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Define functions to extract different attributes from html

In [3]:
def get_job_title(_tag: BeautifulSoup) -> str:
    """Read the job title from a single search-result element.

    Looks inside ``_tag`` for ``<a>`` elements whose ``data-tn-element``
    attribute is ``jobTitle`` and returns the ``title`` attribute of the
    first one.

    Returns:
        The value of ``.get('title')`` on the first match, or ``''`` when
        no matching anchor exists.
    """
    title_links = _tag.findAll(name='a', attrs={'data-tn-element': 'jobTitle'})
    if not title_links:
        return ''
    first_link = title_links[0]
    return first_link.get('title')

def get_job_location(_tag: BeautifulSoup) -> str:
    """Read the job location from a single search-result element.

    Looks inside ``_tag`` for ``<div>`` elements with class ``recJobLoc``
    and returns the ``data-rc-loc`` attribute of the first one.

    Returns:
        The value of ``.get('data-rc-loc')`` on the first match, or ``''``
        when no matching div exists.
    """
    location_divs = _tag.findAll(name='div', attrs={'class': 'recJobLoc'})
    if not location_divs:
        return ''
    first_div = location_divs[0]
    return first_div.get('data-rc-loc')

def get_company_name(_tag: BeautifulSoup) -> str:
    """Read the company name from a single search-result element.

    Looks inside ``_tag`` for ``<a>`` elements whose ``data-tn-element``
    attribute is ``companyName`` and uses the first child of the first
    match as the company name.

    Returns:
        The name with every newline character removed, or ``''`` when there
        is no matching anchor or the anchor has no children.
    """
    _anchors = _tag.findAll(name='a', attrs={'data-tn-element': 'companyName'})
    # An absent anchor and an empty anchor both mean "no company listed";
    # indexing contents[0] on an empty anchor would raise IndexError.
    if len(_anchors) == 0 or len(_anchors[0].contents) == 0:
        return ''
    _first_child = _anchors[0].contents[0]
    # The first child is normally a text node (a str subclass). If it is a
    # nested element instead, take that element's text so .replace() is
    # always called on a string.
    if isinstance(_first_child, str):
        _company = _first_child
    else:
        _company = _first_child.get_text()
    return _company.replace('\n', '')

def get_job_salary(_tag: BeautifulSoup) -> str:
    """Read the raw salary text from a single search-result element.

    Looks inside ``_tag`` for ``<span>`` elements with class ``salaryText``
    and uses the first child of the first match as the salary text.

    Returns:
        The text with every newline character removed, or ``''`` when there
        is no matching span or the span has no children.
    """
    _spans = _tag.findAll(name='span', attrs={'class': 'salaryText'})
    # An absent span and an empty span both mean "no salary listed";
    # indexing contents[0] on an empty span would raise IndexError.
    if len(_spans) == 0 or len(_spans[0].contents) == 0:
        return ''
    _first_child = _spans[0].contents[0]
    # The first child is normally a text node (a str subclass). If it is a
    # nested element instead, take that element's text so .replace() is
    # always called on a string.
    if isinstance(_first_child, str):
        _salary = _first_child
    else:
        _salary = _first_child.get_text()
    return _salary.replace('\n', '')

#### Function to calculate minimum & maximum salaries from the text
- Splits the salary range into minimum & maximum
- Converts the salary to a number
- Converts it to an annual salary (monthly amounts × 12, hourly amounts × 2000)

In [4]:
def calc_annual_salary(_salary: str) -> list:
    """Convert a salary description into annualised dollar amounts.

    Every ``$``-prefixed number in ``_salary`` is extracted (thousands
    separators removed) and scaled according to the first period word found
    in the text, checked in this order: ``year`` (x1), ``month`` (x12),
    ``hour`` (x2000).

    Args:
        _salary: Raw salary text, e.g. ``'$50,000 - $60,000 a year'``.

    Returns:
        A list of floats, one per dollar amount found, in the order they
        appear. ``[0, 0]`` is returned when the text is empty, contains no
        parsable dollar amount, or names none of the known periods, so the
        result always has at least one element.
    """
    if len(_salary) == 0:
        return [0, 0]

    _amounts = []
    for _token in re.findall(r'\$[0-9.,]+', _salary):
        try:
            _amounts.append(float(re.sub('[$,]', '', _token)))
        except ValueError:
            # The pattern also matches non-numbers such as "$." or "$1.2.3";
            # skip those rather than abort the whole parse.
            continue

    # Callers index element 0 of the result; an empty list here (period word
    # present but no dollar figure) would make them fail.
    if not _amounts:
        return [0, 0]

    for _period, _factor in (('year', 1), ('month', 12), ('hour', 2000)):
        if _period in _salary:
            return [_amount * _factor for _amount in _amounts]
    return [0, 0]

#### Functions to build dynamic URLs, invoke them & build job data by extracting HTML tags

In [5]:
def extract_jobs_for_page(_soup: BeautifulSoup, _topic: str) -> list:
    """Build one job record per result row found on a parsed search page.

    Each ``<div class="unifiedRow">`` under the document body becomes a dict
    with the keys ``topic``, ``title``, ``company``, ``location``, ``salary``,
    ``min_salary`` and ``max_salary`` (in that order).

    Args:
        _soup: Parsed search-results page.
        _topic: Search topic the page was fetched for; copied into every record.

    Returns:
        A list of job dicts, empty when the page has no body or no result rows.
    """
    # .body is None when the parsed document contains no <body> element
    # (e.g. an empty response); there is nothing to extract in that case.
    if _soup.body is None:
        return []

    _jobs_for_page = []
    for div in _soup.body.findAll(name='div', attrs={'class': 'unifiedRow'}):
        _job = {
            'topic': _topic,
            'title': get_job_title(div),
            'company': get_company_name(div),
            'location': get_job_location(div),
            'salary': get_job_salary(div),
        }
        # Fall back to zeros if the salary parser yields no amounts, so the
        # indexing below cannot fail.
        _salaries = calc_annual_salary(_job['salary']) or [0, 0]
        _job['min_salary'] = _salaries[0]
        # A single amount (no range) serves as both minimum and maximum.
        _job['max_salary'] = _salaries[1] if len(_salaries) > 1 else _salaries[0]
        _jobs_for_page.append(_job)
    return _jobs_for_page

def build_url(_topic: str, _city: str, _start: int) -> str:
    """Build the Indeed search URL for one topic, city and result offset.

    Args:
        _topic: Plain-text search topic (e.g. ``'Machine Learning'``). It is
            lower-cased and URL-encoded here, so spaces become ``+`` and
            characters such as ``&``, ``#`` or ``+`` are percent-escaped
            instead of corrupting the query string. Do not pass an
            already-encoded topic; it would be encoded a second time.
        _city: Location value, inserted verbatim; it must already be
            URL-encoded (e.g. ``'Charlotte%2C+NC'``).
        _start: Value for the ``start`` query parameter.

    Returns:
        The complete search URL.
    """
    _query = quote_plus(_topic.lower())
    return f'https://www.indeed.com/jobs?q={_query}&l={_city}&start={_start}'

def invoke_url(_url: str) -> BeautifulSoup:
    """Fetch ``_url`` and return the response text parsed with ``html.parser``.

    Sleeps for one second before every request, which spaces out
    consecutive calls.

    Args:
        _url: Address to request with an HTTP GET.

    Returns:
        A ``BeautifulSoup`` document built from the response text.
    """
    time.sleep(1)
    # Request the argument itself. The previous code read a module-level
    # ``url`` variable, so the parameter was ignored and the function only
    # worked when the caller happened to define a global of that name.
    _response = requests.get(_url)
    return BeautifulSoup(_response.text, 'html.parser')

#### Start processing the web scraping task
- Fetch for the 3 search topics specified - Machine Learning, Data Scientist, Software Developer
- Fetch for the 15 cities specified below
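- Fetch a maximum of 100 jobs (10 per page x 10 pages) for each city + search topic combination — this requires `max_jobs = 100`; the cell below currently sets `max_jobs = 10`, which fetches only the first page of each combination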

In [6]:
# Exclusive upper bound for the `start` offset requested per (topic, city) pair.
# Offsets advance in steps of 10 below, so a value of 10 requests only start=0.
# NOTE(review): the notebook text describes 100 jobs / 10 pages — confirm which value is intended.
max_jobs = 10
# Search topics in plain text; build_url turns them into the query value.
topics = ['Machine Learning', 'Data Scientist', 'Software Developer']
# Locations are written already URL-encoded because build_url inserts them as-is.
cities = ['Charlotte%2C+NC', 'New+York%2C+NY', 'San+Francisco%2C+CA', 'Atlanta%2C+GA', 'Dallas%2C+TX',
          'Houston%2C+TX', 'Raleigh%2C+NC', 'Phoenix%2C+AZ', 'Los+Angeles%2C+CA', 'Seattle%2C+WA', 
          'Columbus%2C+OH', 'St.+Louis%2C+MO', 'Cedar+Rapids%2C+IA', 'Denver%2C+CO', 'Minneapolis%2C+MN']

# Accumulates the job dicts from every page of every (topic, city) pair.
all_jobs = []
for topic in topics:
    for city in cities:
        # One progress line per (topic, city) pair.
        print('Processing -', topic, '-', city)
        # One request per offset 0, 10, 20, ... below max_jobs.
        for start in range(0, max_jobs, 10):
            url = build_url(topic, city, start)
            soup = invoke_url(url)
            jobs_for_page = extract_jobs_for_page(soup, topic)
            all_jobs.extend(jobs_for_page)

Processing - Machine Learning - Charlotte%2C+NC
Processing - Machine Learning - New+York%2C+NY
Processing - Machine Learning - San+Francisco%2C+CA
Processing - Machine Learning - Atlanta%2C+GA
Processing - Machine Learning - Dallas%2C+TX
Processing - Machine Learning - Houston%2C+TX
Processing - Machine Learning - Raleigh%2C+NC
Processing - Machine Learning - Phoenix%2C+AZ
Processing - Machine Learning - Los+Angeles%2C+CA
Processing - Machine Learning - Seattle%2C+WA
Processing - Machine Learning - Columbus%2C+OH
Processing - Machine Learning - St.+Louis%2C+MO
Processing - Machine Learning - Cedar+Rapids%2C+IA
Processing - Machine Learning - Denver%2C+CO
Processing - Machine Learning - Minneapolis%2C+MN
Processing - Data Scientist - Charlotte%2C+NC
Processing - Data Scientist - New+York%2C+NY
Processing - Data Scientist - San+Francisco%2C+CA
Processing - Data Scientist - Atlanta%2C+GA
Processing - Data Scientist - Dallas%2C+TX
Processing - Data Scientist - Houston%2C+TX
Processing - Da

#### Build a DataFrame & store as CSV

In [7]:
# One row per scraped job dict; the columns come from the dict keys.
df = pd.DataFrame(all_jobs)
# Show (rows, columns) as a quick check on how much was collected.
print(df.shape)

(779, 7)


In [112]:
# Persist the scraped jobs as CSV (the DataFrame index is written as the
# first column, as before).
out_path = Path('./out_data/all_jobs.csv')
# Create the output folder if needed; to_csv does not create missing
# directories, so a fresh checkout would otherwise fail here.
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path)
print('File write completed.')


File write completed.
