In [1]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
import re

import time
from tqdm import tqdm_notebook

#Function definitions

def extract_jobtitle_from_result(result):
    try:
        return result.find('a',attrs={'class':'jobtitle'})['title']
    except:
        return np.nan

def extract_location_from_result(result):    
    try:
        return result.find('span',attrs={'class':'location accessible-contrast-color-location'}).text
    except:
        return np.nan
        
def extract_salary_from_result(result):
    try:
        return result.find('span',attrs={'class':"salaryText"}).text.strip() 
    except: 
        return np.nan

def extract_company_from_result(result):
    try:
        return result.find('span',attrs={'class':"company"}).text.strip()  
    except:
        return np.nan
    
def extract_summary_from_result(result):
    try:
        return result.find('div',attrs={'class':re.compile(r'^jobsearch-jobDescriptionText?')})   
    except:
        return np.nan
    
def extract_href_from_result(result):
    try:
        return result.find('a')['href']   
    except:
        return np.nan

In [2]:
states_to_use = ['Queensland','New South Wales','Victoria','South Australia','Western Australia','Tasmania','Northern Territory']


list_of_searchwords = ['data+scientist','data+analyst','research+scientist','business+intelligence','machine+learning']

In [37]:
#Determining number of jobs per state,region, area etc

jobs_in_state = {'state':[],'role':[],'jobs':[]}

for state in tqdm_notebook(states_to_use):
    
    for role in list_of_searchwords:
    
        count_url = f'https://au.indeed.com/jobs?as_ttl={role}&jt=all&radius=5000&l={state}&fromage=any&limit=50&filter=0'

        time.sleep(np.random.randint(0,2))

        counter = requests.get(count_url)
        soup_count = BeautifulSoup(counter.text,'html.parser')
        
        if len(soup_count.find_all('div',class_='bad_query'))==1:
            
            pass
        
        else:

            try:
                record_string = soup_count.find_all('div',attrs={'id':"searchCountPages"})[0].text
                record_string = record_string.replace(",","")
                max_results_per_city = int(re.search(r"(\w+)\sjobs",record_string).group(1))

            except:
                max_results_per_city = np.nan

            #try:
                #true_pages = int(soup_count.find_all("span",class_="pn")[-2].text)
                #print(true_pages,max_results_per_city)

            #except:
                #true_pages = 0
                #print(true_pages)

            jobs_in_state['state'].append(state)
            jobs_in_state['role'].append(role)
            jobs_in_state['jobs'].append(max_results_per_city)   

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




In [38]:
job_nums = pd.DataFrame(jobs_in_state).sort_values(by='jobs',ascending=False)

state_searchlist= job_nums.groupby(["state"]).mean().sort_values(by="jobs",ascending=True).index.to_list()
role_searchlist = job_nums.groupby(["role"]).mean().sort_values(by="jobs",ascending=True).index.to_list()
job_nums

Unnamed: 0,state,role,jobs
6,New South Wales,data+analyst,205
11,Victoria,data+analyst,105
5,New South Wales,data+scientist,77
1,Queensland,data+analyst,41
10,Victoria,data+scientist,37
8,New South Wales,business+intelligence,36
13,Victoria,business+intelligence,26
3,Queensland,business+intelligence,19
16,South Australia,data+analyst,16
9,New South Wales,machine+learning,13


In [39]:
df_dict = {'job_title':[],'job_category':[],
           'company':[],'location':[],
           'region':[],'salary':[],
           'link':[],'summary':[]}

#url_template = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l={}&start={}"
max_results_per_city = 294 # Set this to a high-value (5000) to generate more results. 
# Crawling more results, will also take much longer. First test your code on a small number of results and then expand.

all_files = []

for state in tqdm_notebook(state_searchlist):

    for role in tqdm_notebook(role_searchlist):
        
        count_url = f'https://au.indeed.com/jobs?as_ttl={role}&jt=all&radius=5000&l={state}&fromage=any&limit=50&start=0&filter=0'

        time.sleep(np.random.randint(0,1))

        counter = requests.get(count_url)
        soup_count = BeautifulSoup(counter.text,'html.parser')
        
        if len(soup_count.find_all('div',class_='bad_query'))==1:
            
            pass
        
        else:     

            try:
                record_string = soup_count.find_all('div',attrs={'id':"searchCountPages"})[0].text
                record_string = record_string.replace(",","")
                max_results_per_city = int(re.search(r"(\w+)\sjobs",record_string).group(1))
            except:
                max_results_per_city = 1

            for start in tqdm_notebook(range(0, max_results_per_city,50)):
                time.sleep(np.random.randint(0,1))

                URL = f'https://au.indeed.com/jobs?as_ttl={role}&jt=all&radius=5000&l={state}&fromage=any&limit=50&start={start}&filter=0'

                results = requests.get(URL)
                soup = BeautifulSoup(results.text,'html.parser')

                #print(city,int(max_results_per_city))

                #print(state,start,soup.find_all('div',attrs={'id':"searchCountPages"})[0].text)

                if len(soup.find_all('span',attrs={'class':'salary no-wrap'})) == 0:

                    pass

                else:
                    for jobtile in soup.find_all('div',attrs={'class':re.compile(r'^jobsearch-Serp.*unified.*')}):

                        if len(jobtile.find_all('span',attrs={'class':'salary no-wrap'})) == 0:
                            pass
                        else:
                            all_files.append(jobtile)
                            df_dict['job_category'].append(" ".join(role.split("+")).title())
                            df_dict['region'].append(" ".join(state.split("+")).title())

                            detail_link = "https://au.indeed.com/"+extract_href_from_result(jobtile)
                            df_dict['link'].append(detail_link)
                            time.sleep(np.random.randint(0,1))
                            detailgrab = requests.get(detail_link)
                            soup_detail = BeautifulSoup(detailgrab.text,'html.parser')

                            try:
                                df_dict['summary'].append(extract_summary_from_result(soup_detail).text)
                            except:
                                df_dict['summary'].append(np.nan)

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [40]:
for x in all_files:
    try:
        df_dict['location'].append(extract_location_from_result(x))
    except:
        df_dict['location'].append(np.nan)
        
    try:
        df_dict['company'].append(extract_company_from_result(x))
    except:
        df_dict['company'].append(np.nan)
        
    try:
        df_dict['salary'].append(extract_salary_from_result(x))
        
    except:
        df_dict['salary'].append(np.nan)

    try:
        df_dict['job_title'].append(extract_jobtitle_from_result(x))
        
    except:
        df_dict['job_title'].append(np.nan)

In [41]:
df = pd.DataFrame(df_dict)

np.sum(df[df.salary.notna()].duplicated() ==False)*100/df.shape[0]

#df = df[df.salary.notna()]

#df = df[df.duplicated()==False]

df.to_csv("aus_jobs.csv",index=False)

In [42]:
df[df.duplicated()==False]

Unnamed: 0,job_title,job_category,company,location,region,salary,link,summary
0,Behavioural Research Scientist - Brain Perform...,Research Scientist,CSIRO,Adelaide SA,South Australia,"$98,000 - $106,000 a year",https://au.indeed.com//rc/clk?jk=bb7ad50949972...,The Opportunity\nHelp improve health outcomes ...
1,PERMANENT SSO4 - OPEN ACCESS COLLEGE - STUDENT...,Business Intelligence,Department for Education and Child Development...,Marden SA,South Australia,"$78,790 - $82,429 a year",https://au.indeed.com//rc/clk?jk=c7f13ed242366...,Position Descriptor:\nPlease note the salary a...
2,Data Analyst,Data Analyst,Data Driven Analytics,Adelaide SA,South Australia,$60 - $80 an hour,https://au.indeed.com//company/Data-Driven-Ana...,We are in the search for a Data Analyst who ha...
3,Data Analyst,Data Analyst,SoftLabs Pty Ltd,Adelaide SA,South Australia,$60 - $75 an hour,https://au.indeed.com//company/SoftLabs-Pty-Lt...,SoftLabs is looking for a Data Analyst to join...
4,Data Analyst,Data Analyst,Finite IT,Adelaide SA,South Australia,$75 - $85 an hour,https://au.indeed.com//rc/clk?jk=84c2205e8ec55...,Finite is Australia’s leading IT ‘recruiter of...
5,Senior Data Analyst,Data Analyst,Finite IT,Adelaide SA,South Australia,"$140,000 - $160,000 a year",https://au.indeed.com//rc/clk?jk=4da90e24621e8...,Finite is Australia’s leading IT ‘recruiter of...
6,Research Scientist in Global Change Ecology,Research Scientist,CSIRO,Perth WA,Western Australia,"$113,000 - $132,000 a year",https://au.indeed.com//rc/clk?jk=31d409a5af24e...,The Opportunity\nUndertake dynamic research fo...
7,ASTRO 3D Research Scientist in HI Surveys,Research Scientist,University Of Western Australia,Crawley WA,Western Australia,"$93,653 a year",https://au.indeed.com//rc/clk?jk=0ddb97c3ae555...,Faculty of Engineering and Mathematical Scienc...
8,Data Scientist,Data Scientist,Launch Recruitment,Perth WA,Western Australia,$591 a day,https://au.indeed.com//rc/clk?jk=7390bf433cadc...,Data Scientist - Perth - 6 Month Contract - Da...
9,Data Analyst,Data Analyst,Michael Page,Perth WA,Western Australia,"$85,000 - $100,000 a year",https://au.indeed.com//rc/clk?jk=7610946681a83...,Opportunity to work with emerging technology\n...
