In [1]:
import pandas as pd
import requests as req
import bs4

# Web scraper for Indeed Web page

## Creating headers

In [2]:
url = 'https://cr.indeed.com/jobs?q=Developer&l=Costa+Rica'
base_link = 'https://cr.indeed.com'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
headers = {
    'User-Agent': user_agent,
}

## Using requests library

In [3]:
response = req.get(url, headers=headers)
response.ok #200

True

## Creating and BeautifulSoup object base on response obj

In [4]:
jobs_soup = bs4.BeautifulSoup(response.text, 'html.parser')

In [5]:
response.text[:400]

'<!DOCTYPE html>\n<html lang="es" dir="ltr">\n<head>\n<meta http-equiv="content-type" content="text/html;charset=UTF-8">\n<script type="text/javascript" src="/s/872b482/es_CR.js"></script>\n<link href="/s/b219863/jobsearch_all.css" rel="stylesheet" type="text/css">\n<link rel="alternate" type="application/rss+xml" title="Trabajo - Developer en Costa Rica - Ofertas Empleo" href="http://cr.indeed.com/rss?q'

## Job selector

In [6]:
jobs_soup.select('.jobsearch-SerpJobCard')[0].select('.title')[0].select('.jobtitle')[0].text.strip()

'Wordpress Developer - Senior - Full Stack'

## Date selector

In [7]:
jobs_soup.select('.jobsearch-SerpJobCard .jobsearch-SerpJobCard-footer .date')[:5]

[<span class="date">hace 6 días</span>,
 <span class="date">hace 5 días</span>,
 <span class="date">30+ days ago</span>,
 <span class="date">hace 4 días</span>,
 <span class="date">30+ days ago</span>]

## Iterating for every job and buid an object

In [8]:
jobs_soup.select('.jobsearch-SerpJobCard')[0].select('.title')[0].select('.jobtitle')[0].text.strip()

'Wordpress Developer - Senior - Full Stack'

In [9]:
jobs_soup.select('.jobsearch-SerpJobCard')[0];

In [10]:
job_list = [{
        'title': job.select('.title')[0].select('.jobtitle')[0].text.strip(), 
        'company': job.select('.company')[0].text.strip(),
        'location': job.select('.location')[0].text.strip(),
        'link':'{}{}'.format(base_link, job.select('.title')[0].select('.jobtitle')[0]['href']) ,
        #'date': job.select('.jobsearch-SerpJobCard-footer .date')[0].text,
        } for job in jobs_soup.select('.jobsearch-SerpJobCard')]
job_list[:3]

[{'title': 'Wordpress Developer - Senior - Full Stack',
  'company': 'Confidential',
  'location': 'San José, Provincia de San José',
  'link': 'https://cr.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0C42uWlTcvQuSvYOtAav8PzwEmyqfcTD3ltHWCdPLhdUYorS3opGZQvGm0d7zjEri2TF-bveZjPY36Yp6PvrnjIVoYoo6PeuoRWkw70Jaj2LlZUkADIviLskz246y2Lp7tuLFfqtnNFxT3VrI_uXwlNB7ndpPJoS1XivDqv_TinQinoUNUYAhZrbgZMJR61cdJS_-C0pFpa5B2aCRbCJYa3ZqNCAxhCbTkcT90kOXW1NZbVUFHcmakjplBSKMDB9lkgpSn-mfgSI2DSAslXvlCsQl1R19ShvorgE8j_Qn__m9HUG5RNOWoWzT-VwoEzMyx6krhH3SH6Rsa1LnlPTheaYo9KULmMbjrGB_f-EqQH-YKs0DY-AOJW0_0yRMtSqG7PYA7mHeXTyDiNRAYL3zV3XKilFqkP09IYEEnn1se3J18wTAFDvb4_&p=0&fvj=1&vjs=3'},
 {'title': 'WordPress Developer (Junior)',
  'company': 'Debugger Technology Solutions',
  'location': 'Curridabat, Provincia de San José',
  'link': 'https://cr.indeed.com/rc/clk?jk=3cb3503d625c9a94&fccid=1dd6fadbcd0d89b7&vjs=3'},
 {'title': 'Data Developer',
  'company': 'Wind River',
  'location': 'San José, Provincia de San José',
  'link':

## Visit every job 

In [11]:
base_link = 'https://cr.indeed.com'
job_links = []
for job in job_list:
    job_links.append(job['link'])

## Visition job at 0 pos

In [12]:
response2 = req.get(job_links[0].__str__())
response2.ok

True

In [13]:
jobs_soup2 = bs4.BeautifulSoup(response2.text, 'html.parser')

## Company & location selector

In [14]:
jobs_soup2.select('.jobsearch-InlineCompanyRating')[0].text

'Confidential-San José, Provincia de San José'

## Job description

In [15]:
jobs_soup2.select('#jobDescriptionText')[0].text[:300]

"About YouWe're looking for a forward thinking, engaging, Full Stack Senior Wordpress Developer with the ability to work closely with our design & marketing teams and manage multiple clients WP sites.You will live and breathe development and will be able to demonstrate your ability solve complex prob"

# Dealing with pagination

## Get next pag url

In [16]:
# Get last number page in the pagination field and then get the url from that particular number anchor object
_next = jobs_soup.select('.pagination a')[-1]['href']
display(_next)

'/jobs?q=Developer&l=Costa+Rica&start=10'

In [17]:
# URL is incomplete, concat with base url
next_link = f'{base_link}/jobs?q=Developer&l=Costa+Rica&start=230'
display(next_link)

'https://cr.indeed.com/jobs?q=Developer&l=Costa+Rica&start=230'

# Read CSV Generated file

In [25]:
df = pd.read_csv(r'..\data\crjobs.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,company,description,link,location,title
0,0,Confidential,"About YouWe're looking for a forward thinking,...",https://cr.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"San José, Provincia de San José",Wordpress Developer - Senior - Full Stack
1,1,Wind River,Position Title: Data Developer\nLocation: Cost...,https://cr.indeed.com/rc/clk?jk=db6ad71bdea13e...,"San José, Provincia de San José",Data Developer
2,2,Debugger Technology Solutions,Opciones de Contrato: (medio tiempo - tiempo c...,https://cr.indeed.com/rc/clk?jk=3cb3503d625c9a...,"Curridabat, Provincia de San José",WordPress Developer (Junior)
3,3,Accenture,Accenture solves our clients' toughest challen...,https://cr.indeed.com/rc/clk?jk=9ee85b4d7614c1...,"San José, Provincia de San José",Front-End Developer Analyst
4,4,MBC Developers,Perfil del puesto: Se requiere Ingeniero(a) de...,https://cr.indeed.com/rc/clk?jk=4591ca5da45148...,"Montes de Oca, Provincia de San José",Junior Developer
5,5,Perfiles Tecnológicos,"The ideal candidate is a self- motivated, mult...",https://cr.indeed.com/rc/clk?jk=05fd6bc019bb0e...,"San José, Provincia de San José",Azure Backend Developer
6,6,4Geeks Technologies,Your role would be to provide support to a sen...,https://cr.indeed.com/rc/clk?jk=f50e11f2dc0065...,"San José, Provincia de San José",Jr. Drupal Developer
7,7,Accenture,Accenture solves our clients' toughest challen...,https://cr.indeed.com/rc/clk?jk=3e216ade39e58a...,"San José, Provincia de San José",Web Developer Associate
8,8,ServicePro,We are rapidly growing software company lookin...,https://cr.indeed.com/rc/clk?jk=c8bf502c8c987e...,"San José, Provincia de San José",Developer
9,9,NTT DATA Services,"Req ID: 59667\n\nAt NTT DATA Services, we know...",https://cr.indeed.com/rc/clk?jk=cc1f81cac44d1c...,"San José, Provincia de San José",Cognos Developer


# Questions

In [26]:
df.shape

(261, 6)