## Scrape job postings data from the website Seek.com

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Define search_terms
search_terms = [["data","engineer"],["data","scientist"],["analyst","data"], ["business","intelligence"],
                ["machine learning","engineer"],["quantitative","analyst"], ["data","warehouse","architect"]]

# Calculate the number of pages for each search_term
num_list = []
for search_term in search_terms:
    search_url = "https://www.seek.com.au/" + "-".join(search_term) + "-jobs"
    response_search = requests.get(search_url)
    html_search = response_search.text
    soup_search = BeautifulSoup(html_search, 'lxml')
    # find the content that indicates the total jobs count for the search result
    count_str = soup_search.select_one('strong[data-automation="totalJobsCount"]').text.replace(',', '')
    # calculate the possible number of pages for the search result(average 20 job postings each page)
    num_of_pages = int(int(count_str) / 20)
    num_list.append(num_of_pages)
    
num_list

[188, 30, 132, 143, 19, 11, 8]

In [3]:
# Find out the url for each job posting, store them in a list.
posting_urls = []
for i in range(len(num_list)):
    for n in range(1, num_list[i]+1):
        page_url = "https://www.seek.com.au/" + "-".join(search_terms[i]) + "-jobs?page=" + str(n)
        response_page = requests.get(page_url)
        html = response_page.text
        soup = BeautifulSoup(html, 'lxml')
        
        anchors = soup.find_all('a',{'class':'_2iNL7wI'})
        url_base='https://www.seek.com.au'
        for anchor in anchors:
            posting_urls.append(url_base + anchor.get('href'))

In [35]:
# Check how many job postings I have got.
len(posting_urls)

10695

In [8]:
# store the posting_urls into a dataframe so that I can export it for further use.
import pandas as pd
df_urls = pd.DataFrame(data=posting_urls, columns=['url'])
df_urls.head()
df_urls.to_csv('posting_urls')

In [54]:
from multiprocessing import Pool
import sys
import time
import traceback

sys.setrecursionlimit(30000)

allJobs = {}
failedUrls = []

def extractJobId(url):
    prefix_len = len('https://www.seek.com.au/job/')
    q_index = url.index('?')
    return url[prefix_len:q_index]

    
def fetchJob(url):
    job = {}


    try:
        job['job_id'] = extractJobId(url)
        posting_resp = requests.get(url)
        posting_html = posting_resp.text
        postingSoup = BeautifulSoup(posting_html, 'lxml')
        
        job['job_title'] = postingSoup.select_one('span[data-automation="job-detail-title"] h1').text    
        job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
        section = postingSoup.select_one('article[aria-labelledby="jobDescription"] section[aria-labelledby="jobInfoHeader"]')
        job['posted_date'] = section.select_one('dd[data-automation="job-detail-date"] span.lwHBT6d').text
        
        locationDD = section.select_one('dd[data-automation="job-detail-date"] + dt + dd')   
        job['city'] = locationDD.select_one('strong').text
        job['job_type'] = section.select_one('dd[data-automation="job-detail-work-type"]> span > span').text
        
        classificationDD = section.select_one('dd[data-automation="job-detail-work-type"] + div dd')
        job['classification'] = classificationDD.select_one('strong').text
    except Exception as e:
        traceback.print_exc()
        failedUrls.append(url)
        return {}

    try: 
        job['suburb'] = locationDD.select_one('span > span > span').contents[1]
    except:
        job['suburb'] = ''

    try:
        job['salary'] = section.select_one('dl > div span[class="lwHBT6d"]').text
    except:
        job['salary'] = ''
          
    try:
        section = postingSoup.select_one('article[aria-labelledby="jobDescription"] section[aria-labelledby="jobInfoHeader"]')
        classificationDD = section.select_one('dd[data-automation="job-detail-work-type"] + div dd')
        job['subclassification'] = classificationDD.select_one('span > span > span').contents[1]
    except:
        job['subclassification'] = ''
    try:
        job['recruiter_rating'] = postingSoup.select_one('span[data-automation="job-header-company-review-title"] + span span._1erK2ob').contents[0]
    except:
        job['recruiter_rating'] = ''

    try:
        job['description'] = postingSoup.select_one('div[data-automation="jobDescription"] div[data-automation="mobileTemplate"]').text
    except:
        job['description'] = ''
    return job


In [58]:
def fetchJobs(start, end):
    for url in posting_urls[start:end]:
        job = fetchJob(url)
        if job:
            allJobs[job['job_id']] = job

fetchJobs(0, 10695)

print("AllJobs:", len(allJobs))
print("FailedURLs:", failedUrls)


Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has 

Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has 

Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has 

Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has 

Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has no attribute 'text'
Traceback (most recent call last):
  File "<ipython-input-54-32b914f58bfa>", line 28, in fetchJob
    job['recruiter'] = postingSoup.select_one('span[class="_3FrNV7v _2QG7TNq E6m4BZb"] span').text
AttributeError: 'NoneType' object has 

AllJobs: 7818
FailedURLs: ['https://www.seek.com.au/job/40918623?type=standard#searchRequestToken=98e46458-12df-4de7-a85e-bd38e1074817', 'https://www.seek.com.au/job/40842834?type=standard#searchRequestToken=aba23ccc-8ca4-4d1b-a314-e525e3d363dd', 'https://www.seek.com.au/job/40918623?type=standard#searchRequestToken=98e46458-12df-4de7-a85e-bd38e1074817', 'https://www.seek.com.au/job/40842834?type=standard#searchRequestToken=aba23ccc-8ca4-4d1b-a314-e525e3d363dd', 'https://www.seek.com.au/job/40878511?type=standard#searchRequestToken=ca51fa32-91d9-4e0c-a4ba-3f5e2933a6d6', 'https://www.seek.com.au/job/40916466?type=standard#searchRequestToken=60da5cc3-56e1-4135-b939-44fa9248433f', 'https://www.seek.com.au/job/40849022?type=standout#searchRequestToken=8caed537-0ec3-46b9-abb5-b8a86f6bf897', 'https://www.seek.com.au/job/40806877?type=standard#searchRequestToken=342203cb-3ef3-4e15-9a1a-e662c94d51be', 'https://www.seek.com.au/job/40882223?type=standard#searchRequestToken=f96fda3c-44b2-4ec6-bc8

In [74]:
# Convert the dictionary allJobs to a dataframe

df_all_jobs = pd.DataFrame.from_dict(allJobs, orient='index')
df_all_jobs.head()


Unnamed: 0,job_id,job_title,recruiter,posted_date,city,job_type,classification,suburb,salary,subclassification,recruiter_rating,description
40771115,40771115,Python Expert,Atlastix,19 Jan 2020,Melbourne,Contract/Temp,Science & Technology,CBD & Inner Suburbs,,"Mathematics, Statistics & Information Sciences",,Are you a gun coder with a deep appreciation o...
40855053,40855053,Operations Command Center Engineer T2 - Data,The Vocus Group,31 Jan 2020,Melbourne,Full Time,Information & Communication Technology,CBD & Inner Suburbs,,Telecommunications,2.8,"Do you love technology, network operations an..."
40925581,40925581,Data Engineer,The Onset,10 Feb 2020,Sydney,Full Time,Information & Communication Technology,"CBD, Inner West & Eastern Suburbs","$170,000 + Super + Bonus",Engineering - Software,,It's your call whether to garner support from ...
40925535,40925535,Data Engineer,Allura Partners,10 Feb 2020,Sydney,Full Time,Information & Communication Technology,"CBD, Inner West & Eastern Suburbs",,Database Development & Administration,,"As a Junior Data Scientist, you will focus on ..."
40924597,40924597,Data Engineer,Precision Sourcing,10 Feb 2020,Sydney,Full Time,Information & Communication Technology,,$110k - $120k p.a.,Developers/Programmers,,About the business This is an amazing time to...


In [76]:
# Save the dataframe as a csv file so that I can use it for analysis later (When next time I open the notebook, 
# I don't have to re-run the code to scrap the data again)
df_all_jobs.to_csv('Data_jobs_full_version.csv')