In [1]:
import pandas as pd
import requests as req
import bs4

# Web scraper for Indeed Web page

## Creating headers

In [2]:
# Site root; the relative hrefs scraped from result pages are appended to it.
base_link = 'https://cr.indeed.com'
# Search results page for "Developer" jobs located in Costa Rica.
url = 'https://cr.indeed.com/jobs?q=Developer&l=Costa+Rica'
# Browser-like User-Agent string sent with the request headers.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
headers = {'User-Agent': user_agent}

## Using requests library

In [3]:
# Download the search results page, sending the browser-like headers.
# requests waits indefinitely by default, so an explicit timeout (seconds)
# keeps this cell from hanging if the server never answers.
response = req.get(url, headers=headers, timeout=30)
response.ok  # True for any status code below 400, not only 200

True

## Creating a BeautifulSoup object based on the response object

In [4]:
# Parse the downloaded HTML text into a navigable tree using the 'html.parser' parser.
jobs_soup = bs4.BeautifulSoup(response.text, 'html.parser')

In [5]:
# Peek at the first 400 characters of the raw HTML to confirm a page came back.
response.text[:400]

'<!DOCTYPE html>\n<html lang="es" dir="ltr">\n<head>\n<meta http-equiv="content-type" content="text/html;charset=UTF-8">\n<script type="text/javascript">\n\n(function ( tk ) { if ( tk && document.images ) { var s="/", q="?", a="&", e="="; (new Image()).src = s+"rpc"+s+"log"+q+"a"+e+"jslat"+a+"tk"+e+tk; } })(\'1dhkgdpl926qb003\');\n</script>\n<script type="text/javascript" src="/s/872b482/es_CR.js"></script>\n'

## Job selector

In [6]:
# Walk down from the first job card to its title element, one step at a time,
# and show the cleaned-up title text.
first_card = jobs_soup.select('.jobsearch-SerpJobCard')[0]
first_title_box = first_card.select('.title')[0]
first_title_anchor = first_title_box.select('.jobtitle')[0]
first_title_anchor.text.strip()

'Wordpress Developer - Senior - Full Stack'

## Date selector

In [7]:
# Date elements sit inside each job card's footer; show only the first five matches.
date_selector = '.jobsearch-SerpJobCard .jobsearch-SerpJobCard-footer .date'
jobs_soup.select(date_selector)[:5]

[<span class="date">hace 7 horas</span>,
 <span class="date">hace 21 horas</span>,
 <span class="date">hace 1 día</span>,
 <span class="date">hace 8 meses</span>,
 <span class="date">hace 13 días</span>]

## Iterating over every job and building an object

In [8]:
def parse_job_card(card):
    """Convert one '.jobsearch-SerpJobCard' element into a job dict.

    Parameters
    ----------
    card : bs4.element.Tag
        A single job card taken from the search results page.

    Returns
    -------
    dict or None
        A dict with the keys 'title', 'link' and 'date', or None when the
        card has no '.title' > '.jobtitle' anchor carrying an href (there is
        then nothing to visit). 'date' is None when the card footer or its
        '.date' element is missing.
    """
    # Look the title anchor up once; select_one returns None instead of
    # raising IndexError when nothing matches.
    title_box = card.select_one('.title')
    anchor = title_box.select_one('.jobtitle') if title_box is not None else None
    if anchor is None or not anchor.get('href'):
        return None

    footer = card.select_one('.jobsearch-SerpJobCard-footer')
    date_tag = footer.select_one('.date') if footer is not None else None

    return {
        'title': anchor.text.strip(),
        # hrefs on the results page are site-relative, so prefix the site root.
        'link': '{}{}'.format(base_link, anchor['href']),
        'date': date_tag.text if date_tag is not None else None,
    }


# One malformed card no longer aborts the whole scrape: cards that cannot be
# parsed are skipped instead of raising.
job_list = [
    parsed
    for parsed in map(parse_job_card, jobs_soup.select('.jobsearch-SerpJobCard'))
    if parsed is not None
]
job_list[:3]

[{'title': 'Wordpress Developer - Senior - Full Stack',
  'link': 'https://cr.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0C42uWlTcvQuSvYOtAav8PzwEmyqfcTD3ltHWCdPLhdUYorS3opGZQvGm0d7zjEri2TF-bveZjPY36Yp6PvrnjIVoYoo6PeuoRWkw70Jaj2LlZUkADIviLskz246y2Lp7tuLFfqtnNFxT3VrI_uXwlNPySTXI287MlpzwEJN-Xn-QlS3-x4AqDepzNsF55gRYA8nNCUgPVDu8bdmG2AHQO1eU20TTQaexiyoz6RaONBEHETjAOg8__YaCDnmHtQdhWyNkZ0_4WWqp0XfaPmadLeco0ZsB91-51CMiUZnVlan3OLfPdainQRCAmOjqM3Hfb8OriO4qfU-QRoozjkgrcor4UorRkZ6Xi66rTjTM_kNrAKG5J1s8adYuAsFdbVIoMbOuDpuhMKmt_gtsRZgMTkUQGJVqPIOR-7o03w8A_vlTcJuNRYTMLT&p=0&fvj=1&vjs=3',
  'date': 'hace 7 horas'},
 {'title': 'Data Developer',
  'link': 'https://cr.indeed.com/rc/clk?jk=db6ad71bdea13e5f&fccid=30c6be3855bc8a7c&vjs=3',
  'date': 'hace 21 horas'},
 {'title': 'WordPress Developer (Junior)',
  'link': 'https://cr.indeed.com/rc/clk?jk=3cb3503d625c9a94&fccid=1dd6fadbcd0d89b7&vjs=3',
  'date': 'hace 1 día'}]

## Visit every job 

In [9]:
# Site root (same value as assigned earlier in the notebook).
base_link = 'https://cr.indeed.com'
# Collect the detail-page URL of every scraped job, in listing order.
job_links = [entry['link'] for entry in job_list]

## Visiting the job at position 0

In [10]:
# Fetch the detail page of the first job. The links are already plain strings,
# so no string conversion is needed. Send the same browser-like headers as the
# search request for consistency, and set a timeout (seconds) because requests
# would otherwise wait indefinitely.
response2 = req.get(job_links[0], headers=headers, timeout=30)
response2.ok  # True for any status code below 400

True

In [11]:
# Parse the job detail page's HTML text using the 'html.parser' parser.
jobs_soup2 = bs4.BeautifulSoup(response2.text, 'html.parser')

## Company & location selector

In [12]:
# Text of the first element with class 'jobsearch-InlineCompanyRating' on the detail page.
# Raises IndexError if the page contains no such element.
jobs_soup2.select('.jobsearch-InlineCompanyRating')[0].text

'Confidential-'

## Job description

In [13]:
# First 300 characters of the text inside the element with id 'jobDescriptionText'.
jobs_soup2.select('#jobDescriptionText')[0].text[:300]

"About YouWe're looking for a forward thinking, engaging, Full Stack Senior Wordpress Developer with the ability to work closely with our design & marketing teams and manage multiple clients WP sites.You will live and breathe development and will be able to demonstrate your ability solve complex prob"

# Dealing with pagination

## Get next page URL

In [14]:
# Collect every anchor inside the '.pagination' element and take the href of
# the last one. The href is a site-relative path (no scheme or host).
pagination_anchors = jobs_soup.select('.pagination a')
_next = pagination_anchors[-1]['href']
display(_next)

'/jobs?q=Developer&l=Costa+Rica&start=10'

In [15]:
# A scraped pagination href has no scheme or host, so prefix it with the site root.
# NOTE(review): the path below is hard-coded to start=230 rather than taken
# from `_next` — confirm this is intentional.
next_link = base_link + '/jobs?q=Developer&l=Costa+Rica&start=230'
display(next_link)

'https://cr.indeed.com/jobs?q=Developer&l=Costa+Rica&start=230'

# Read CSV Generated file

In [16]:
# Load the CSV produced by the scraper. Forward slashes are accepted on
# Windows as well as POSIX systems, whereas the previous backslash-separated
# path could only be resolved on Windows.
df = pd.read_csv('../data/jobs.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,company,date,description,link,title
0,0,World Vision International-,30+ days ago,\r\nPreferred position location: Costa Rica. O...,https://cr.indeed.com/rc/clk?jk=767ab2e9bf350f...,MuleSoft Developer
1,1,"MBC Developers-Montes de Oca, Provincia de San...",30+ days ago,Perfil del puesto: Se requiere Ingeniero(a) de...,https://cr.indeed.com/rc/clk?jk=4591ca5da45148...,Junior Developer
2,2,Lionbridge Technologies-,hace 1 hora,"Core responsibilities will be\r\nDesign, archi...",https://cr.indeed.com/rc/clk?jk=604e61e3558efb...,Senior Web Developer
3,3,Intertec International11 evaluaciones-,hace 3 días,Job Description:\r\n\r\nMajor Functions/Respon...,https://cr.indeed.com/rc/clk?jk=40d09cbbc095fd...,SQL Developer
4,4,"Cargill3,766 evaluaciones-San Antonio, Provinc...",30+ days ago,Position Purpose\r\n\r\nThe Application develo...,https://cr.indeed.com/rc/clk?jk=555f779d180a19...,IT Developer
5,5,Debugger Technology Solutions-,30+ days ago,Opciones de Contrato: (medio tiempo - tiempo c...,https://cr.indeed.com/rc/clk?jk=2b20952a80daa7...,WordPress Developer (Junior)
6,6,"VMware718 evaluaciones-Heredia, Provincia de H...",hace 12 días,"As an Application developer at VMware, you’ll ...",https://cr.indeed.com/rc/clk?jk=f0f89311353c40...,IT Dev. Junior Application Developer
7,7,Manatí-,hace 19 días,Buscamos una persona con experiencia en desarr...,https://cr.indeed.com/rc/clk?jk=e782e231b93c32...,Backend Developer for Drupal
8,8,Manatí-,hace 19 días,Buscamos una persona con experiencia en desarr...,https://cr.indeed.com/rc/clk?jk=40006a43cd2197...,Frontend Developer for Drupal
9,9,Empleos.net-,hace 8 días,Crear o modificar programas usando diferentes ...,https://cr.indeed.com/rc/clk?jk=349e7d32f0b85f...,JUNIOR SOFTWARE DEVELOPER
