In [1]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
import re

import time
from tqdm import tqdm_notebook

#Function definitions

def extract_jobtitle_from_result(result):
    try:
        return result.find('a',attrs={'class':'jobtitle'})['title']
    except:
        return np.nan

def extract_location_from_result(result):    
    try:
        return result.find('span',attrs={'class':'location accessible-contrast-color-location'}).text
    except:
        return np.nan
        
def extract_salary_from_result(result):
    try:
        return result.find('span',attrs={'class':"salaryText"}).text.strip() 
    except: 
        return np.nan

def extract_company_from_result(result):
    try:
        return result.find('span',attrs={'class':"company"}).text.strip()  
    except:
        return np.nan
    
def extract_summary_from_result(result):
    try:
        return result.find('div',attrs={'class':re.compile(r'^jobsearch-jobDescriptionText?')})   
    except:
        return np.nan
    
def extract_href_from_result(result):
    try:
        return result.find('a')['href']   
    except:
        return np.nan

In [2]:
states_to_use = ['British Columbia',
 'Alberta',
 'Saskatchewan',
 'Manitoba',
 'Ontario',
 'Quebec',
 'New Brunswick',
 'Prince Edward Island',
 'Nova Scotia',
 'Newfoundland and Labrador',
 'Yukon',
 'Northwest Territories',
 'Nunavut']


list_of_searchwords = ['data+scientist','data+analyst','research+scientist','business+intelligence','machine+learning']

In [3]:
#Determining number of jobs per state,region, area etc

jobs_in_state = {'state':[],'role':[],'jobs':[]}

for state in tqdm_notebook(states_to_use):
    
    for role in list_of_searchwords:
    
        count_url = f'https://ca.indeed.com/jobs?as_ttl={role}&jt=all&radius=5000&l={state}&fromage=any&limit=50&filter=0'

        time.sleep(np.random.randint(0,2))

        counter = requests.get(count_url)
        soup_count = BeautifulSoup(counter.text,'html.parser')
        
        if len(soup_count.find_all('div',class_='bad_query'))==1:
            
            pass
        
        else:

            try:
                record_string = soup_count.find_all('div',attrs={'id':"searchCountPages"})[0].text
                record_string = record_string.replace(",","")
                max_results_per_city = int(re.search(r"(\w+)\sjobs",record_string).group(1))

            except:
                max_results_per_city = np.nan

            #try:
                #true_pages = int(soup_count.find_all("span",class_="pn")[-2].text)
                #print(true_pages,max_results_per_city)

            #except:
                #true_pages = 0
                #print(true_pages)

            jobs_in_state['state'].append(state)
            jobs_in_state['role'].append(role)
            jobs_in_state['jobs'].append(max_results_per_city)   

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




In [4]:
job_nums = pd.DataFrame(jobs_in_state).sort_values(by='jobs',ascending=False)

state_searchlist= job_nums.groupby(["state"]).mean().sort_values(by="jobs",ascending=True).index.to_list()
role_searchlist = job_nums.groupby(["role"]).mean().sort_values(by="jobs",ascending=True).index.to_list()
job_nums

Unnamed: 0,state,role,jobs
17,Ontario,data+analyst,164
16,Ontario,data+scientist,121
20,Ontario,machine+learning,64
19,Ontario,business+intelligence,62
24,Quebec,business+intelligence,49
22,Quebec,data+analyst,45
21,Quebec,data+scientist,44
0,British Columbia,data+scientist,38
1,British Columbia,data+analyst,35
23,Quebec,research+scientist,24


In [5]:
df_dict = {'job_title':[],'job_category':[],
           'company':[],'location':[],
           'region':[],'salary':[],
           'link':[],'summary':[]}

#url_template = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l={}&start={}"
max_results_per_city = 294 # Set this to a high-value (5000) to generate more results. 
# Crawling more results, will also take much longer. First test your code on a small number of results and then expand.

all_files = []

for state in tqdm_notebook(state_searchlist):

    for role in tqdm_notebook(role_searchlist):
        
        count_url = f'https://ca.indeed.com/jobs?as_ttl={role}&jt=all&radius=5000&l={state}&fromage=any&limit=50&start=0&filter=0'

        time.sleep(np.random.randint(0,1))

        counter = requests.get(count_url)
        soup_count = BeautifulSoup(counter.text,'html.parser')
        
        if len(soup_count.find_all('div',class_='bad_query'))==1:
            
            pass
        
        else:     

            try:
                record_string = soup_count.find_all('div',attrs={'id':"searchCountPages"})[0].text
                record_string = record_string.replace(",","")
                max_results_per_city = int(re.search(r"(\w+)\sjobs",record_string).group(1))
            except:
                max_results_per_city = 1

            for start in tqdm_notebook(range(0, max_results_per_city,50)):
                time.sleep(np.random.randint(0,1))

                URL = f'https://ca.indeed.com/jobs?as_ttl={role}&jt=all&radius=5000&l={state}&fromage=any&limit=50&start={start}&filter=0'

                results = requests.get(URL)
                soup = BeautifulSoup(results.text,'html.parser')

                #print(city,int(max_results_per_city))

                #print(state,start,soup.find_all('div',attrs={'id':"searchCountPages"})[0].text)

                if len(soup.find_all('span',attrs={'class':'salary no-wrap'})) == 0:

                    pass

                else:
                    for jobtile in soup.find_all('div',attrs={'class':re.compile(r'^jobsearch-Serp.*unified.*')}):

                        if len(jobtile.find_all('span',attrs={'class':'salary no-wrap'})) == 0:
                            pass
                        else:
                            all_files.append(jobtile)
                            df_dict['job_category'].append(" ".join(role.split("+")).title())
                            df_dict['region'].append(" ".join(state.split("+")).title())

                            detail_link = "https://ca.indeed.com/"+extract_href_from_result(jobtile)
                            df_dict['link'].append(detail_link)
                            time.sleep(np.random.randint(0,1))
                            detailgrab = requests.get(detail_link)
                            soup_detail = BeautifulSoup(detailgrab.text,'html.parser')

                            try:
                                df_dict['summary'].append(extract_summary_from_result(soup_detail).text)
                            except:
                                df_dict['summary'].append(np.nan)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [6]:
for x in all_files:
    try:
        df_dict['location'].append(extract_location_from_result(x))
    except:
        df_dict['location'].append(np.nan)
        
    try:
        df_dict['company'].append(extract_company_from_result(x))
    except:
        df_dict['company'].append(np.nan)
        
    try:
        df_dict['salary'].append(extract_salary_from_result(x))
        
    except:
        df_dict['salary'].append(np.nan)

    try:
        df_dict['job_title'].append(extract_jobtitle_from_result(x))
        
    except:
        df_dict['job_title'].append(np.nan)

In [7]:
df = pd.DataFrame(df_dict)

np.sum(df[df.salary.notna()].duplicated() ==False)*100/df.shape[0]

#df = df[df.salary.notna()]

#df = df[df.duplicated()==False]

df.to_csv("can_jobs.csv",index=False)

In [8]:
df[df.duplicated()==False]

Unnamed: 0,job_title,job_category,company,location,region,salary,link,summary
0,Solutions Analyst – ERP Data,Data Analyst,City of Saskatoon,"Saskatoon, SK",Saskatchewan,"$90,556 - $104,530 a year",https://ca.indeed.com//rc/clk?jk=1ac51de68c153...,Duties:\nConsults with the ERP Portfolio Leads...
1,Research Scientist - Crustal Seismologist AMEN...,Research Scientist,Natural Resources Canada,"Dartmouth, NS",Nova Scotia,"$55,870 - $144,413 a year",https://ca.indeed.com//rc/clk?jk=8cc4bb5bf127c...,Natural Resources Canada - Lands and Minerals ...
2,Veterinary machine learning data collection re...,Machine Learning,Talkatoo,"Halifax, NS",Nova Scotia,$20 an hour,https://ca.indeed.com//rc/clk?jk=b13f3e443a175...,This 2day 16-hour gig that involves working wi...
3,AU-03 Income Tax Business Intelligence and Qua...,Business Intelligence,Canada Revenue Agency,"Dartmouth, NS",Nova Scotia,"$81,525 - $99,065 a year",https://ca.indeed.com//rc/clk?jk=3c788ea53e61a...,Pour obtenir une description des termes clés u...
4,AU-03 Income Tax Business Intelligence and Qua...,Business Intelligence,Canada Revenue Agency,"Dartmouth, NS",Nova Scotia,"$81,525 - $99,065 a year",https://ca.indeed.com//rc/clk?jk=05d44e915017d...,For a description of key terms used on CRA job...
5,AU-01 Excise Tax /Income Tax Business Intellig...,Business Intelligence,Canada Revenue Agency,"Dartmouth, NS",Nova Scotia,"$59,674 - $78,134 a year",https://ca.indeed.com//rc/clk?jk=813af33e2263c...,About the job\nDon’t miss out on your opportun...
6,"Data Analyst, Early Childhood Collaborative Re...",Data Analyst,Mount Saint Vincent University,"Halifax, NS",Nova Scotia,$24 - $26 an hour,https://ca.indeed.com//rc/clk?jk=f7b160fa6b790...,"Dr. Jessie-Lee McIsaac, Assistant Professor wi..."
7,Business Intelligence & Business Development,Business Intelligence,Catchfire,"Winnipeg, MB",Manitoba,"$30,280 - $129,381 a year",https://ca.indeed.com//company/Catchfire/jobs/...,"If you are highly motivated, self-directed, un..."
8,Trauma Data Analyst - Pediatrics & Child Health,Data Analyst,Health Sciences Centre,"Winnipeg, MB",Manitoba,$22.88 an hour,https://ca.indeed.com//rc/clk?jk=840e9f2043798...,Requisition ID: 34677\nPosition Number: 200172...
9,"AU-03 Tax Auditor, Tax Avoidance Auditor, Inte...",Business Intelligence,Canada Revenue Agency,"Edmonton, AB",Alberta,"$81,525 - $99,065 a year",https://ca.indeed.com//rc/clk?jk=ea8616f9b7e24...,For a description of key terms used on CRA job...


In [15]:
url = 'https://api.exchangeratesapi.io/latest'

requests.get(url,params={"base":"GBP"}).json()

{'rates': {'CAD': 1.6756738031,
  'HKD': 10.0519642486,
  'ISK': 159.7034573547,
  'PHP': 65.7647982632,
  'DKK': 8.6265271715,
  'HUF': 379.7662763574,
  'CZK': 29.5249312917,
  'GBP': 1.0,
  'RON': 5.4912353634,
  'SEK': 12.4073304233,
  'IDR': 18001.1894039123,
  'INR': 90.9391671863,
  'BRL': 5.1654772628,
  'RUB': 81.7847987251,
  'HRK': 8.6064343287,
  'JPY': 139.2526386291,
  'THB': 38.6891152221,
  'CHF': 1.272431234,
  'EUR': 1.1547610799,
  'MYR': 5.368946165,
  'BGN': 2.2584817201,
  'TRY': 7.3918566249,
  'CNY': 9.0676459041,
  'NOK': 11.7629737407,
  'NZD': 2.0157509411,
  'ZAR': 18.7518187487,
  'USD': 1.2825931315,
  'MXN': 24.4832444167,
  'SGD': 1.748308275,
  'AUD': 1.8787962771,
  'ILS': 4.5315134299,
  'KRW': 1506.8130903716,
  'PLN': 4.9397214716},
 'base': 'GBP',
 'date': '2019-10-25'}

In [None]:
params = dict(
  client_id= client_id,
  client_secret=client_secret,
  v='20180323',
  ll='51.515086,-0.072265',
  query='steak',
  limit=10