# Indeed Job Scraper

This notebook builds a general-purpose job scraper for www.indeed.com.

In [14]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

In [15]:
# Search-results url pattern: the first {} fills the query (q) parameter,
# the second fills the location (l) parameter.
template = 'https://www.indeed.com/jobs?q={}&l={}'

In [16]:
def get_url(position, location):
    """Generate a search-results url from position and location.

    Both values are percent-encoded with ``quote_plus`` so that spaces
    become ``+`` and reserved characters such as ``&``, ``#`` or ``+``
    cannot corrupt the query string.

    Args:
        position: Job title or keywords to search for.
        location: City, state, or other location text.

    Returns:
        The search url as a string.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from urllib.parse import quote_plus

    template = 'https://www.indeed.com/jobs?q={}&l={}'
    return template.format(quote_plus(str(position)), quote_plus(str(location)))

In [17]:
# Build the search url for a sample query to prototype against.
url = get_url('data analyst', 'chicago il')

In [18]:
# Fetch the search-results page. The timeout keeps a stalled connection
# from blocking the notebook indefinitely (requests has no default timeout).
response = requests.get(url, timeout=30)

In [19]:
response

<Response [200]>

In [20]:
response.reason

'OK'

In [21]:
# Parse the response body into a BeautifulSoup tree using the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

In [22]:
# Collect every <div> carrying the 'jobsearch-SerpJobCard' class; each one
# is treated below as a single job record.
cards = soup.find_all('div', 'jobsearch-SerpJobCard')

In [23]:
len(cards)

15

In [24]:
cards

[<div class="jobsearch-SerpJobCard unifiedRow row result" data-ci="303289378" data-empn="2765878446998520" data-jk="7b1922bbba748faf" data-tu="https://click.appcast.io/jview-te8/a31.png?ent=8&amp;e=4140&amp;jid=4140_228366C2_338480743&amp;org=false&amp;src=192&amp;bid=KHgwUWSxZ4GFz7JIIHMg4A==&amp;jg=12rw&amp;indeed=sp" id="pj_7b1922bbba748faf">
 <style>
 .jobcard_logo{margin:6px 0}.jobcard_logo img{width:auto;max-width:80px;max-height:30px}.jasxrefreshcombotst .jobcard_logo img{max-height:2rem;max-width:100%}
 </style>
 <h2 class="title">
 <a class="jobtitle turnstileLink" data-tn-element="jobTitle" href="/pagead/clk?mo=r&amp;ad=-6NYlbfkN0DE3Uz-7uTnSJNZlBBgxWZakqjdzsm9IcLRq2ubJUfUV1hRgQLwqvIKdyaOD2havv9T1kZAwXCbXk4WknHdMUER3FwQGnP_FyJbZTsd4c2NBzF-EolLURjB1ZsYZ3Ro-p_OI--l4uTC7Z9Tj_FTOL1C6uDXNEykRDnRZKQ-xPWwlo7g1mOIOFXWtAu-OZpOVw26pCQFR-1JasoUxEQiX09mA_hxgC-rUISX5Ho7rUzCtBewSl3F8uELW-ZyC115igKiZItJIb5tupOtLaDaai3PBFvEZgp02XECptvKKReU1bYju58FZ6iIKLxH9eXWpGGxrH5T1IJzmjbw9xf9Cj8-Fh340QMSVct

# Prototype the model with a single record

In [None]:
# Pick one card to prototype the field extraction on.
card = cards[5]

In [None]:
cards[5]

In [None]:
# The link inside the card's <h2> heading; its attributes are read below
# for the job title and the job url.
atag = card.h2.a

In [None]:
# Job title, taken from the link's 'title' attribute.
job_title = atag.get('title')

In [None]:
# The href on the card is site-relative (it starts with '/'), so prefix the
# site root to get an absolute url.
job_url = 'https://www.indeed.com' + atag.get('href')

In [None]:
# Company name: text of the 'company' span with surrounding whitespace removed.
company = card.find('span', 'company').text.strip()

In [None]:
# Location is read from the 'data-rc-loc' attribute of the 'recJobLoc' div.
job_location = card.find('div', 'recJobLoc').get('data-rc-loc')

In [None]:
# Summary text with surrounding whitespace removed and newlines flattened
# to spaces so the value stays on one line.
job_summary = card.find('div', 'summary').text.strip().replace('\n', ' ')

In [None]:
# Posting date exactly as shown on the card (raw text, not parsed into a date).
post_date = card.find('span', 'date').text

In [None]:
# Extraction date in YYYY-MM-DD form, from the local clock.
today = datetime.today().strftime('%Y-%m-%d')

In [None]:
# Salary is optional: when the card has no 'salaryText' span, find() returns
# None and the attribute access raises AttributeError, so fall back to ''.
# The extracted text must be assigned, otherwise job_salary is only ever
# defined on the missing-salary path.
try:
    job_salary = card.find('span', 'salaryText').text.strip()
except AttributeError:
    job_salary = ''

# Generalize the model with a function

In [None]:
def get_record(card):
    """Extract job data from a single record.

    Args:
        card: The parsed element for one search-result card. It must
            expose ``card.h2.a`` (the title link) and ``card.find``.

    Returns:
        A tuple ``(job_title, company, job_location, job_summary,
        job_salary, post_date, today, job_url)``. Optional fields that
        are absent from the card are returned as ``''``.

    Raises:
        AttributeError: If the card has no ``<h2>`` title link.
    """
    atag = card.h2.a

    job_title = atag.get('title')

    # The company element is optional like the fields below; without this
    # guard a single card lacking it would abort the whole scrape.
    try:
        company = card.find('span', 'company').text.strip()
    except AttributeError:
        company = ''

    try:
        job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    except AttributeError:
        job_location = ''

    try:
        # Flatten newlines so the summary stays on one line.
        job_summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    except AttributeError:
        job_summary = ''

    try:
        job_salary = card.find('span', 'salaryText').text.strip()
    except AttributeError:
        job_salary = ''

    try:
        post_date = card.find('span', 'date').text
    except AttributeError:
        post_date = ''

    # Extraction date, so post_date text can be interpreted later.
    today = datetime.today().strftime('%Y-%m-%d')

    # The href is site-relative; prefix the site root.
    job_url = 'https://www.indeed.com' + atag.get('href')

    record = (job_title, company, job_location, job_summary, job_salary, post_date, today, job_url)

    return record

In [None]:
# Run the extractor over every card on the current page.
records = [get_record(card) for card in cards]

In [None]:
records[5]

# Get the next page of results

In [None]:
# Keep following the "Next" pagination link until a page has none,
# appending each page's records to the running list.
while True:
    next_link = soup.find('a', {'aria-label': 'Next'})
    if next_link is None:
        # No "Next" link: this was the last page.
        break
    url = 'https://www.indeed.com' + next_link.get('href')

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')

    records.extend(get_record(card) for card in cards)

In [None]:
len(records)

# Put it all together

In [25]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

def get_url(position, location):
    """Generate url from position and location.

    The values are encoded with ``urllib.parse.quote_plus``: spaces become
    ``+`` (as before), and reserved characters such as ``+``, ``&``, ``#``
    and ``,`` are percent-encoded so they reach the server intact instead
    of being misread as separators or spaces.

    Args:
        position: Job title or keywords to search for.
        location: City, state, or other location text.

    Returns:
        The search-results url as a string.
    """
    # Local import keeps this definition self-contained.
    from urllib.parse import quote_plus

    template = 'https://www.indeed.com/jobs?q={}&l={}'
    query = quote_plus(str(position))
    place = quote_plus(str(location))
    return template.format(query, place)

def _find_text(card, name, css_class):
    """Return the text of the first matching child of card, or '' if none."""
    node = card.find(name, css_class)
    return '' if node is None else node.text


def get_record(card):
    """Extract job data from a single record.

    Args:
        card: The parsed element for one search-result card. It must
            expose ``card.h2.a`` (the title link) and ``card.find``.

    Returns:
        A tuple ``(job_title, company, job_location, job_summary,
        job_salary, post_date, today, job_url)``. Optional fields that
        are absent from the card are returned as ``''``.

    Raises:
        AttributeError: If the card has no ``<h2>`` title link.
    """
    atag = card.h2.a
    job_title = atag.get('title')

    # Company is treated as optional like the other fields; previously a
    # card without it raised AttributeError and aborted the scrape.
    company = _find_text(card, 'span', 'company').strip()

    # Location lives in an attribute rather than in the element text.
    loc_node = card.find('div', 'recJobLoc')
    job_location = '' if loc_node is None else loc_node.get('data-rc-loc')

    # Flatten newlines so the summary stays on one line.
    job_summary = _find_text(card, 'div', 'summary').strip().replace('\n', ' ')
    job_salary = _find_text(card, 'span', 'salaryText').strip()
    # Posting date is kept as the raw displayed text.
    post_date = _find_text(card, 'span', 'date')

    today = datetime.today().strftime('%Y-%m-%d')
    # The href is site-relative; prefix the site root.
    job_url = 'https://www.indeed.com' + atag.get('href')

    record = (job_title, company, job_location, job_summary, job_salary, post_date, today, job_url)
    return record

def main(position, location, output_path='scrape.csv'):
    """Run the main program routine.

    Scrapes every results page for the given search, following the
    "Next" pagination link, and writes the collected records to a CSV file.

    Args:
        position: Job title or keywords to search for.
        location: Location text for the search.
        output_path: Destination CSV file. Defaults to ``'scrape.csv'``,
            the name that was previously hard-coded.
    """
    records = []
    url = get_url(position, location)

    # extract the job data
    while True:
        try:
            # requests has no default timeout; without one a stalled
            # connection would hang the scrape forever.
            response = requests.get(url, timeout=30)
        except requests.RequestException as err:
            # Stop paging but still save what has been collected so far.
            print('Stopping early: request for {} failed ({})'.format(url, err))
            break
        if not response.ok:
            print('Stopping early: {} returned HTTP {}'.format(url, response.status_code))
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        records.extend(get_record(card) for card in cards)

        next_link = soup.find('a', {'aria-label': 'Next'})
        href = None if next_link is None else next_link.get('href')
        if not href:
            # No usable "Next" link: this was the last page.
            break
        url = 'https://www.indeed.com' + href

    # save the job data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'Summary', 'Salary', 'PostDate', 'ExtractDate', 'JobUrl'])
        writer.writerows(records)

In [26]:
# Run the full scrape for a sample search; the results are written to 'scrape.csv'.
main('analyst', 'chicago il')