In [197]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
import re

import time
from tqdm import tqdm_notebook

#Function definitions

def extract_jobtitle_from_result(result):
    try:
        return result.find('a',attrs={'class':'jobtitle'})['title']
    except:
        return np.nan

def extract_location_from_result(result):    
    try:
        return result.find('span',attrs={'class':'location accessible-contrast-color-location'}).text
    except:
        return np.nan
        
def extract_salary_from_result(result):
    try:
        return result.find('span',attrs={'class':"salaryText"}).text.strip() 
    except: 
        return np.nan

def extract_company_from_result(result):
    try:
        return result.find('span',attrs={'class':"company"}).text.strip()  
    except:
        return np.nan
    
def extract_summary_from_result(result):
    try:
        return result.find('div',attrs={'class':re.compile(r'^jobsearch-jobDescriptionText?')})   
    except:
        return np.nan
    
def extract_href_from_result(result):
    try:
        return result.find('a')['href']   
    except:
        return np.nan

In [198]:
states_to_use = ['England','Wales','Scotland','Northern Ireland']
list_of_searchwords = ['data+scientist','data+analyst','research+scientist','business+intelligence','machine+learning']

In [199]:
#Determining number of jobs per state,region, area etc

jobs_in_state = {'state':[],'role':[],'jobs':[]}

for state in tqdm_notebook(states_to_use):
    
    for role in list_of_searchwords:
    

        count_url = f'https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=&as_not=&as_ttl={role}&as_cmp=&jt=all&st=&as_src=&salary=&radius=5000&l={state}&fromage=any&limit=50&sort=&psf=advsrch&filter=0'

        time.sleep(np.random.randint(0,2))

        counter = requests.get(count_url)
        soup_count = BeautifulSoup(counter.text,'html.parser')

        try:
            record_string = soup_count.find_all('div',attrs={'id':"searchCountPages"})[0].text
            record_string = record_string.replace(",","")
            max_results_per_city = int(re.search(r"(\w+)\sjobs",record_string).group(1))
            
        except:
            max_results_per_city = np.nan
            
        #try:
            #true_pages = int(soup_count.find_all("span",class_="pn")[-2].text)
            #print(true_pages,max_results_per_city)
            
        #except:
            #true_pages = 0
            #print(true_pages)

        jobs_in_state['state'].append(state)
        jobs_in_state['role'].append(role)
        jobs_in_state['jobs'].append(max_results_per_city)   

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [217]:
job_nums = pd.DataFrame(jobs_in_state).sort_values(by='jobs',ascending=False)

state_searchlist= job_nums.groupby(["state"]).mean().sort_values(by="jobs",ascending=True).index.to_list()
role_searchlist = job_nums.groupby(["role"]).mean().sort_values(by="jobs",ascending=True).index.to_list()
job_nums[job_nums.jobs.isna()]

Unnamed: 0,state,role,jobs


In [220]:
df_dict = {'job_title':[],'job_category':[],
           'company':[],'location':[],
           'region':[],'salary':[],
           'link':[],'summary':[]}

#url_template = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l={}&start={}"
max_results_per_city = 294 # Set this to a high-value (5000) to generate more results. 
# Crawling more results, will also take much longer. First test your code on a small number of results and then expand.

all_files = []

for state in tqdm_notebook(state_searchlist):

    for role in tqdm_notebook(role_searchlist):
        
        count_url = f'https://www.indeed.co.uk/jobs?as_ttl={role}&jt=all&radius=5000&l={state}&fromage=any&limit=50&start=0&filter=0'

        time.sleep(np.random.randint(0,1))

        counter = requests.get(count_url)
        soup_count = BeautifulSoup(counter.text,'html.parser')

        try:
            record_string = soup_count.find_all('div',attrs={'id':"searchCountPages"})[0].text
            record_string = record_string.replace(",","")
            max_results_per_city = int(re.search(r"(\w+)\sjobs",record_string).group(1))
        except:
            max_results_per_city = 1
         
        for start in tqdm_notebook(range(0, max_results_per_city,50)):
            time.sleep(np.random.randint(0,1))
            
            URL = f'https://www.indeed.co.uk/jobs?as_ttl={role}&jt=all&radius=5000&l={state}&fromage=any&limit=50&start={start}&filter=0'
            
            results = requests.get(URL)
            soup = BeautifulSoup(results.text,'html.parser')
                        
            #print(city,int(max_results_per_city))
            
            #print(state,start,soup.find_all('div',attrs={'id':"searchCountPages"})[0].text)
            
            if len(soup.find_all('span',attrs={'class':'salary no-wrap'})) == 0:
                
                pass
                
            else:
                for jobtile in soup.find_all('div',attrs={'class':re.compile(r'^jobsearch-Serp.*unified.*')}):

                    if len(jobtile.find_all('span',attrs={'class':'salary no-wrap'})) == 0:
                        pass
                    else:
                        all_files.append(jobtile)
                        df_dict['job_category'].append(" ".join(role.split("+")).title())
                        df_dict['region'].append(" ".join(state.split("+")).title())
                        
                        detail_link = "https://www.indeed.co.uk/"+extract_href_from_result(jobtile)
                        df_dict['link'].append(detail_link)
                        time.sleep(np.random.randint(0,1))
                        detailgrab = requests.get(detail_link)
                        soup_detail = BeautifulSoup(detailgrab.text,'html.parser')
                        
                        try:
                            df_dict['summary'].append(extract_summary_from_result(soup_detail).text)
                        except:
                            df_dict['summary'].append(np.nan)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))

HBox(children=(IntProgress(value=0, max=33), HTML(value='')))




In [221]:
for x in all_files:
    try:
        df_dict['location'].append(extract_location_from_result(x))
    except:
        df_dict['location'].append(np.nan)
        
    try:
        df_dict['company'].append(extract_company_from_result(x))
    except:
        df_dict['company'].append(np.nan)
        
    try:
        df_dict['salary'].append(extract_salary_from_result(x))
        
    except:
        df_dict['salary'].append(np.nan)

    try:
        df_dict['job_title'].append(extract_jobtitle_from_result(x))
        
    except:
        df_dict['job_title'].append(np.nan)

In [228]:
df = pd.DataFrame(df_dict)

np.sum(df[df.salary.notna()].duplicated() ==False)*100/df.shape[0]

#df = df[df.salary.notna()]

#df = df[df.duplicated()==False]

#df.to_csv("uk_jobs.csv",index=False)

78.6697247706422

In [232]:
df[df.duplicated()==False]

Unnamed: 0,job_title,job_category,company,location,region,salary,link,summary
0,Sport and Exercise Scientist (Teaching & Resea...,Research Scientist,Bangor University,Bangor,Northern Ireland,"£50,132 - £58,089 a year",https://www.indeed.co.uk//rc/clk?jk=3222e19600...,Applications are invited for the above permane...
1,Software Test Engineer – AI Machine Learning,Machine Learning,VANRATH,Belfast,Northern Ireland,"£50,000 - £70,000 a year",https://www.indeed.co.uk//rc/clk?jk=94ecff3e9a...,Introduction\nSoftware Test Engineer – AI Mach...
2,Java Consultant ( Machine Learning ),Machine Learning,VANRATH,Belfast,Northern Ireland,"£55,000 - £67,500 a year",https://www.indeed.co.uk//rc/clk?jk=c427cdddf4...,My client are literally one of the most succes...
3,Business Intelligence Analyst (Omagh),Business Intelligence,realTime,County Tyrone,Northern Ireland,"£30,000 - £45,000 a year",https://www.indeed.co.uk//rc/clk?jk=96d01bffcf...,Business Intelligence Analyst/Engineer (Omagh ...
4,Business Intelligence Analyst,Business Intelligence,Gi Group,Omagh,Northern Ireland,"£30,000 a year",https://www.indeed.co.uk//rc/clk?jk=bfe04ead4e...,Full Time\nPermanent\nDuties and responsibilit...
5,Business Intelligence Analyst,Business Intelligence,Gi Group,Omagh,Northern Ireland,"£30,000 a year",https://www.indeed.co.uk//rc/clk?jk=8cc3745452...,Full Time\nPermanent\nBusiness Intelligence An...
6,Business Intelligence Analyst,Business Intelligence,Gi Group UK,,Northern Ireland,"£25,000 - £30,000 a year",https://www.indeed.co.uk//pagead/clk?mo=r&ad=-...,Duties and responsibilities\nGather report/pro...
7,Data Scientist ( Java ),Data Scientist,VANRATH,Belfast,Northern Ireland,"£65,000 - £70,000 a year",https://www.indeed.co.uk//rc/clk?jk=b9ff91ca57...,Data Science Manager -Java / Python / R / C++/...
8,"Data Scientist ( 12 month Contract, Belfast)",Data Scientist,realTime,Belfast,Northern Ireland,"£75,000 - £85,000 a year",https://www.indeed.co.uk//rc/clk?jk=186557480e...,"Data Scientist (Contract, Belfast)\n\nMy clien..."
9,Data Scientist ( Java ),Data Scientist,VANRATH,Belfast,Northern Ireland,"£65,000 - £70,000 a year",https://www.indeed.co.uk//rc/clk?jk=4f87e38910...,Data Science Manager -Java / Python / R / C++/...
