# Part 1: Collecting data
## Webscraping Stage

In [1]:
import itertools
import time
from urllib.parse import quote_plus

import bs4
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.stats as stats
import seaborn as sns
from bs4 import BeautifulSoup

plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
# Indeed UK results page (data-scientist jobs in London) fetched with requests.get in the next cell.
URL = 'https://www.indeed.co.uk/data-scientist-jobs-in-London'

In [3]:
# Download the page at URL and parse its HTML.
# The timeout stops a stalled connection from hanging the kernel indefinitely.
r = requests.get(URL, timeout=30)
soup = BeautifulSoup(r.text, 'html.parser')

**Defining scraping functions**

In [5]:
#extract Jobtitle function
def extract_jobtitle_from_result(soup):
    """Return the job title of every result card on one results page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single search-results page.

    Returns
    -------
    list of str
        One entry per result card, in page order. A card without a title
        link contributes the placeholder string ``'None'``.
    """
    jobtitle_list = []
    for card in soup.find_all('div', {'class': 'jobsearch-SerpJobCard unifiedRow row result'}):
        title_link = card.find('a', {'class': 'jobtitle turnstileLink'})
        # A missing tag is the only expected failure, so test for it
        # explicitly instead of hiding every exception behind a bare except.
        if title_link is None:
            jobtitle_list.append('None')
        else:
            jobtitle_list.append(title_link.text.strip('\n'))
    return jobtitle_list

#extract Company function
def extract_company_from_result(soup):
    """Return the company name of every result card on one results page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single search-results page.

    Returns
    -------
    list of str
        One entry per result card, in page order. The name is read from the
        card's ``span.company``; if that is absent, from the first
        ``a.turnstileLink``; if both are absent the entry is ``'None'``.
    """
    company_list = []
    for card in soup.find_all('div', {'class': 'jobsearch-SerpJobCard unifiedRow row result'}):
        company_tag = card.find('span', {'class': 'company'})
        if company_tag is None:
            # Fallback selector used when the card has no span.company.
            # NOTE(review): a single class name matches any tag carrying that
            # class, so this may pick up the 'jobtitle turnstileLink' anchor
            # (i.e. the job title) - confirm that is intended.
            company_tag = card.find('a', {'class': 'turnstileLink'})
        # Explicit None check instead of a bare except, so that genuine errors
        # are no longer silently turned into 'None'.
        if company_tag is None:
            company_list.append('None')
        else:
            company_list.append(company_tag.text.strip('\n'))
    return company_list

#extract salary function
def extract_salary_from_result(soup):
    """Return the salary text of every result card on one results page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single search-results page.

    Returns
    -------
    list of str
        One entry per result card, in page order. Cards that show no salary
        contribute the placeholder string ``'None'``, so the list stays
        aligned with the other per-card lists.
    """
    salary_list = []
    for card in soup.find_all('div', {'class': 'jobsearch-SerpJobCard unifiedRow row result'}):
        salary_tag = card.find('span', {'class': 'salaryText'})
        # Explicit None check instead of a bare except: only a missing salary
        # is filled with 'None'; any other error now surfaces.
        if salary_tag is None:
            salary_list.append('None')
        else:
            salary_list.append(salary_tag.text.strip('\n'))
    return salary_list

#extract Location function
def extract_location_from_result(soup):
    """Return the location text of every result card on one results page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single search-results page.

    Returns
    -------
    list of str
        One entry per result card, in page order. The location is read from
        a ``span`` with class ``location accessible-contrast-color-location``;
        if that is absent, from a ``div`` with the same class; if both are
        absent the entry is ``'None'``.
    """
    location_class = {'class': 'location accessible-contrast-color-location'}
    location_list = []
    for card in soup.find_all('div', {'class': 'jobsearch-SerpJobCard unifiedRow row result'}):
        location_tag = card.find('span', location_class)
        if location_tag is None:
            # Same class, different tag: fall back to the <div> variant.
            location_tag = card.find('div', location_class)
        # Explicit None check instead of a bare except, so that genuine errors
        # are no longer silently turned into 'None'.
        if location_tag is None:
            location_list.append('None')
        else:
            location_list.append(location_tag.text.strip('\n'))
    return location_list
#Max number of results calculator 
def max_iter_calc(city):
    """Work out how far to page through the search results for ``city``.

    Requests the first results page, reads the result total from the
    search-count element and converts it into an upper bound for the
    ``start`` offset used when paging.

    Parameters
    ----------
    city : str
        Location to search in. It is URL-encoded before being placed in the
        query string, so names containing spaces are handled.

    Returns
    -------
    int
        ``round(total / 15) * 10``, capped at 800.

    Raises
    ------
    ValueError
        If the page has no search-count element, or its text does not hold
        the two numbers expected on its third line (for example when the
        site served a block/captcha page or changed its layout).
    requests.exceptions.RequestException
        If the request fails or exceeds the timeout.
    """
    URL = 'https://www.indeed.co.uk/jobs?q=data+scientist,+data+analyst,+machine+learning,+Data+Architect,+data+engineer&l={}&start=0'.format(quote_plus(city))
    # The timeout keeps a stalled connection from blocking the whole scrape.
    r = requests.get(URL, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')

    count_tag = soup.find('div', {'class': 'searchCount-a11y-contrast-color'})
    if count_tag is None:
        raise ValueError('No search-count element found for city {!r}; '
                         'the page may be blocked or its layout changed.'.format(city))

    try:
        # The counts are on the third line of the element's text; presumably
        # it reads like "Page 1 of 1,234 jobs" - TODO confirm.
        raw_page_range = count_tag.text.split('\n')[2].split()
        page_range = [int(word.replace(',', ''))
                      for word in raw_page_range
                      if word.replace(',', '').isdigit()]
        max_result = page_range[1]  # second number on that line is used as the total
    except IndexError as err:
        raise ValueError('Could not read the result total for city {!r} from {!r}.'
                         .format(city, count_tag.text)) from err

    # NOTE(review): 15 is presumably the number of results shown per page and
    # 10 matches the step of the `start` offset used by the scraping loops.
    max_n_pages = round(max_result / 15)
    max_results_per_city = max_n_pages * 10

    return min(max_results_per_city, 800)

## For one city

In [35]:
# Scrape the results pages for a single city and keep the parsed HTML of each page.
city = 'london'  # defined here so the cell does not rely on a leftover kernel variable
max_results_per_city = 850  # exclusive upper bound for the `start` offset
# NOTE(review): the earlier comment said London's final page is at roughly ~500 - confirm 850 is intended.
raw_data = []

for start in range(0, max_results_per_city, 10):  # one request per `start` offset
    # search terms: data scientist, data analyst, machine learning, Data Architect, data engineer
    URL = 'https://www.indeed.co.uk/jobs?q=data+scientist,+data+analyst,+machine+learning,+Data+Architect,+data+engineer&l={}&start={}'.format(quote_plus(city), start)
    r = requests.get(URL, timeout=30)  # timeout: do not hang forever on a stalled connection
    soup = BeautifulSoup(r.text, 'html.parser')  # parse the page's HTML
    raw_data.append(soup)  # one parsed page per request
    print(start, end=' ')  # progress indicator, printed on one line
    time.sleep(np.abs(np.random.normal(loc=5, scale=0.5)))  # random delay to avoid a captcha block

0 10 20 

Results for London

In [42]:
# Parse every scraped page, then flatten the per-page results into one list per field.
import itertools

# one inner list per scraped page
jobtitle_listoflist = [extract_jobtitle_from_result(page) for page in raw_data]
company_listoflist = [extract_company_from_result(page) for page in raw_data]
salary_listoflist = [extract_salary_from_result(page) for page in raw_data]
location_listoflist = [extract_location_from_result(page) for page in raw_data]

# flatten the lists of lists into single lists
jobtitle_list = list(itertools.chain.from_iterable(jobtitle_listoflist))
company_list = list(itertools.chain.from_iterable(company_listoflist))
salary_list = list(itertools.chain.from_iterable(salary_listoflist))
location_list = list(itertools.chain.from_iterable(location_listoflist))

# preview the first five entries of each field
for preview in (jobtitle_list, company_list, salary_list, location_list):
    print(preview[:5])

['Data Analyst Apprentice', 'Data Analyst (FinOps)', 'Data Analyst Financial Services (Graduate Role)', 'Data Analyst', 'Machine Learning Research Scientist']
['TalentCloud Solutions', 'Funding Circle UK', 'Tardis Group', 'Exchange Data International', 'nPlan']
['£20,000 a year', 'None', '£25,000 - £32,500 a year', '£25,000 a year', 'None']
['London N5 1XL', 'London', 'London EC2V', 'London NW5 1JY', 'London']


In [44]:
# Combine the four per-card lists into one DataFrame for the London scrape.
# (The frame is freshly built, so no extra .copy() is needed.)
raw_df_london = pd.DataFrame({'jobtitle': jobtitle_list, 'company': company_list,
                              'salary': salary_list, 'location': location_list})
raw_df_london.head()

Unnamed: 0,jobtitle,company,salary,location
0,Data Analyst Apprentice,TalentCloud Solutions,"£20,000 a year",London N5 1XL
1,Data Analyst (FinOps),Funding Circle UK,,London
2,Data Analyst Financial Services (Graduate Role),Tardis Group,"£25,000 - £32,500 a year",London EC2V
3,Data Analyst,Exchange Data International,"£25,000 a year",London NW5 1JY
4,Machine Learning Research Scientist,nPlan,,London


## For multiple cities

In [189]:
# Scrape every city in city_list and build one DataFrame of results per city.
city_list = ['london','manchester','birmingham', 'leeds','glasgow','southampton','liverpool','newcastle']

dict_of_city_dfs = {}  # city name -> DataFrame of that city's results, filled in as each city completes

for city in city_list:
    print(city)  # city status

    max_results_per_city = max_iter_calc(city)  # how far to page for this city
    print("Max n. of results per city: ", max_results_per_city)
    time.sleep(np.abs(np.random.normal(loc=6,scale=0.5)))  # random delay to avoid a captcha block

    raw_data = []  # parsed HTML of each results page for this city

    for start in range(0, max_results_per_city, 10):  # one request per `start` offset
        # search terms: data scientist/analyst/engineer/architect, machine learning
        URL = 'https://www.indeed.co.uk/jobs?q=data+scientist,+data+analyst,+machine+learning,+Data+Architect,+data+engineer&l={}&start={}'.format(quote_plus(city), start)
        r = requests.get(URL, timeout=30)  # timeout: do not hang forever on a stalled connection
        soup = BeautifulSoup(r.text, 'html.parser')
        raw_data.append(soup)
        time.sleep(np.abs(np.random.normal(loc=6,scale=0.5)))  # random delay to avoid a captcha block
        print(start, end=' ')  # progress indicator, printed on one line
    print()  # end the progress line so the next city's name starts on its own line

    # Extract each field from every page and flatten into one list per field.
    jobtitle_list = list(itertools.chain.from_iterable(
        extract_jobtitle_from_result(page) for page in raw_data))
    company_list = list(itertools.chain.from_iterable(
        extract_company_from_result(page) for page in raw_data))
    salary_list = list(itertools.chain.from_iterable(
        extract_salary_from_result(page) for page in raw_data))
    location_list = list(itertools.chain.from_iterable(
        extract_location_from_result(page) for page in raw_data))

    # Store this city's DataFrame under the city name.
    dict_of_city_dfs[city] = pd.DataFrame({'jobtitle': jobtitle_list, 'company': company_list,
                                           'salary': salary_list, 'location': location_list})

london
Max n. of results per city:  800
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360 370 380 390 400 410 420 430 440 450 460 470 480 490 500 510 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670 680 690 700 710 720 730 740 750 760 770 780 790 manchester
Max n. of results per city:  800
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360 370 380 390 400 410 420 430 440 450 460 470 480 490 500 510 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670 680 690 700 710 720 730 740 750 760 770 780 790 birmingham
Max n. of results per city:  500
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 310 320 330 340 350 360 370 380 390 400 410 420 430 440 450 460 470 480 490 leeds
Max n. of results per city:  430
0 10 20 30 40 50 60 70 80 

**Saving to CSVs**

In [488]:
# Write one CSV per scraped city, named after the city (e.g. 'london.csv').
# Looping over the dictionary saves exactly the cities that were scraped,
# instead of raising KeyError on the first city that is missing.
for city_name, city_df in dict_of_city_dfs.items():
    city_df.to_csv('{}.csv'.format(city_name))