In [1]:
# # Scraper Version 3.0 -- Job Descriptions and locations from multiple search pages, duplications removed

from bs4 import BeautifulSoup
import urllib.request
import time
import pandas as pd
import random  # for generating random delay times, to confuse Indeed
from urllib.error import URLError, HTTPError


def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()

In [14]:
# Number of search result pages to scrape (~18 posts per page)
num_pages = 1

# Job title to search for
job = "Data Engineer"

# URL's we need
base_url = "https://www.indeed.com/"
base_search_url = base_url + "jobs?q=" + job.replace(" ","_") + "&start="

# HTML class id used to grab the html element containing the job description
class_id = "jobsearch-JobComponent-description"


# These lists will store each job description and location as individual list elements
job_descriptions = list()
job_locations = list()
job_titles = list()


print("Looking for {0} pages of '{1}' job postings".format(num_pages, job))
# Start Scraping here

num_done = 0


for page_num in range(0, num_pages):
    # Get links from each page of the search results, up to a specified number of pages
    time.sleep(float(random.randrange(5, 50)/100))
    # Retrieve the search results one page at a time (starting at 0, 10, 20, ....)
    while True:
        try:
            soup = BeautifulSoup(urllib.request.urlopen(base_search_url + str(page_num*10)), 'html.parser')
            break
        except (URLError, HTTPError) as e:
            time.sleep(10)
            continue
        break

    
    search_results = soup.find_all("div", attrs={"data-tu":""})
    
    # Will hold dictionaries, each being one job posting
    jobs = list()
    
    for result in search_results:
        title = result.find("a", attrs={"data-tn-element":"jobTitle"})
        loc = result.find("div", attrs={"class":"sjcl"})
        if title is not None and loc is not None:
            # We got a match for a sponsored job - 
            
            # Grab the Title of the job, the job description, and the location
            job_location = loc.get_text()
            job_title = title.get_text()
            
            job_link = base_url + str(title.get('href'))
            print(job_link)
            while True:
                # Force Indeed to give us the job description
                try:
                    soup_job = BeautifulSoup(urllib.request.urlopen(job_link), 'html.parser')
                    #job_desc = soup_job.find("div", attrs={"class":"jobsearch-JobComponent-description icl-u-xs-mt--md"})
                    break
                except (URLError, HTTPError) as e:
                    time.sleep(5)
                    continue
                break
                
            job_desc = soup_job.find(class_=class_id).get_text()
            print(job_desc)

            if job_location is not None and job_title is not None and job_desc is not None:
                # This job has a location, title, and description. Add it to our data
                print("here 2")
                jobs.append({"job_location": job_location,
                            "job_title": job_title,
                             "job_description": job_desc.get_text(),  # Extract job desc text here
                            "sponsored": True})
                
                # Update progress bar
                printProgressBar(num_done, num_pages*11, prefix='Progress:', suffix='Complete', length=50)
                num_done += 1
        
        else:
            # Check if this element is an organic job (not sponsored)
            title = result.find("h2", attrs={"class":"jobtitle"})
            loc = result.find("span", attrs={"class":"location"})
            
            if title is not None and loc is not None:
                # We have a match for an organic job, extract info
                
                # Grab the Title of the job
                title_element = title.find("a", attrs={"data-tn-element":"jobTitle"})
                job_title = title_element.get_text()

                
                # Extract Location Text
                job_location = loc.get_text()
            
                while True:
                    # Force Indeed to give us the job description
                    try:
                        soup_job = BeautifulSoup(urllib.request.urlopen(base_url + str(title_element.get('href'))), 'html.parser')
                        break
                    except (URLError, HTTPError) as e:
                        time.sleep(5)
                        continue
                    break
                
                # Extract Job Description
                job_desc = soup_job.find(class_=class_id).get_text()
            
                print("here 3")
                if job_location is not None and job_title is not None and job_desc is not None:
                    # This job has a location, title, and description. Add it to our data
                
                    jobs.append({"job_location": job_location,
                                "job_title": job_title,
                                 "job_description": job_desc,
                                "sponsored": False})
                    print("here 4")
                    # Update progress bar
                    printProgressBar(num_done, num_pages*11, prefix='Progress:', suffix='Complete', length=50)
                    num_done += 1

print(pd.DataFrame(jobs))

Looking for 1 pages of 'Data Engineer' job postings


AttributeError: 'NoneType' object has no attribute 'get_text'

In [4]:
jobs_df = pd.DataFrame(jobs)

In [6]:
jobs_df

Unnamed: 0,job_description,job_location,job_title,sponsored
0,InternshipPosition Summary:\nThe New York Coun...,"New York, NY",Data Engineer Internship,False
1,Book of the Month is looking for a detail-orie...,"New York, NY 10001 (Chelsea area)",Data Engineer,False
2,"$50,000 a yearJob Description\n\nNo Computer S...","Tampa, FL 33612",Entry Level Data Engineer,False
3,A Junior Data Engineer will leverage a dynamic...,"Poulsbo, WA",Junior Data Engineer (2 yrs experience required),False
4,Company DescriptionNovantas Solutions is a div...,"New York, NY 10017 (Midtown area)",Data Engineer,False
5,You will have the opportunity to work as part ...,"New York, NY",DATA ENGINEER,False
6,Job Description\nWe are a startup within one o...,"New York, NY",Data Engineer,False
7,Job Description\n-Thinking Big-\nDo you want t...,"Santa Monica, CA",Data Engineer - Amazon Studios Research,False
8,ContractNo OPT's.GC/USC preferable.Long term c...,"New York, NY",Data Engineer,False
9,We are Farmers!\n\nJob Summary:\n\nThe Data Ro...,"Woodland Hills, CA",Data Engineer I (seeking the December 2018 gra...,False
