### Import libraries 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

Let's define base url for the scraper <br>

In [2]:
base_url = "https://www.indeed.com/"
job_search_url = base_url+"jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start={}"
# a list to avoid scraping non relevant data
job_key_words = ["ar/vr","augmented reality","virtual reality"]
# list to store job titles
job_title = []
# list to store job posting urls
jobs_main_url = []
# set to store job posting urls to avoid inserting duplicate data
temp_urls = set()
# list to store company name
company_names = []
# list to store location, just incase if we are scraping in any other cities/country
location = []
# list to store job description
job_description = []

### Lets get the job listing urls and job title
scrape first few pages of Indeed and get the job lising relevant to AR/VR.

In [3]:
# a function to scrape the main search page of indeed
# get the job posting urls from the listing, i.e from the UL from html tag
def extract_job_titles(soup): 
    for div in soup.find_all(name="h2", attrs={"class":"title"}):
        for a in div.find_all(name="a", attrs={"data-tn-element":"jobTitle"}):
            # if title is diffent than the one specified in the list of keywords
            # move to next listing
            if any(word in a["title"].lower() for word in job_key_words):
                post_url = base_url+a["href"]
                # just to ensure we do not have the job listing already in our list
                if post_url not in temp_urls:
#                     job_title.append(a["title"])
                    jobs_main_url.append(post_url)
                    post_url = ""

In [4]:
# lets scrape first 8 pages of Indeed.com with the 
# specified keyword
for i in range(0, 80, 10):
    url_listing = job_search_url.format(i)
    print(url_listing)
    page = requests.get(url_listing)
    soup = BeautifulSoup(page.text, "html.parser")
    extract_job_titles(soup)
# free up the memory by clearning the temp sets
temp_urls.clear()

https://www.indeed.com/jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start=0
https://www.indeed.com/jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start=10
https://www.indeed.com/jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start=20
https://www.indeed.com/jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start=30
https://www.indeed.com/jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start=40
https://www.indeed.com/jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start=50
https://www.indeed.com/jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start=60
https://www.indeed.com/jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start=70


In [5]:
len(jobs_main_url)

jobs_main_url[0]

'https://www.indeed.com//pagead/clk?mo=r&ad=-6NYlbfkN0DI_pqscLjs9LkB0jlO39g2s8RE9SCHTdataN4HV1TulGfWI13h5d2IIBkrMCBpdyoKFuk6LoRw93nv9VIf-nwycaiTnbsKlxaYmP2oUVCrEos5MOIWzflKyJqtBexCiKWVxNVq0D9Lo3YRyX8n959AonagBpNTwfxL-PurTe6VIuRucl5-GiO94yVWwnXAvVgISoGItfPrNseqMDsb4xC2ojEL91N8yKvVh4bd8Mc5fuIniD6hfFc5b-doaIZvhIa1uES5XVSvT3ZwSWYWbM26dFenN677y3wT3mAbQfeCx-eBrO3fZ60Mf6N8yo7O27bu3i_ACjkTtdABfxKxDVdGbAAiAKp5LueZVYo8A1yX4dEKjypmZ7SMejud5ZmCUl48R-WM6iV0GLzjxvtGle7gu1rXKaYhH5wem4w7C6VjphxN6S4ivBiFFK_odSQNZog5yGlaD_9buNb_h0eIbTwM&p=0&fvj=1&vjs=3'

Let's clear the duplicates entries if we still got any
***

In [6]:
from collections import OrderedDict
jobs_main_url = list(OrderedDict.fromkeys(jobs_main_url))
len(jobs_main_url)

80

### Lets get the job description for further processing


In [7]:
# a function to scrape the inner job listing to
# get the job description of that post
def extract_job_description(soup): 
    jd_str = "skills: "
    # job title
    for div in soup.find_all(name="h3", attrs={"class":"icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"}):
        job_title.append(div.text)
    # company title
    for div in soup.find_all(name="div", attrs={"class":"icl-u-lg-mr--sm icl-u-xs-mr--xs"}):
        if len(div.text.split('-')[0]) > 0:
            company_names.append(div.text.split('-')[0])
    # location
    for div in soup.find_all(name="div", attrs={"class":"jobsearch-DesktopStickyContainer-companyrating"}):
        if len(div.text.split('-')[1]) > 2:
            location.append(div.text.split('-')[1])
    # Job Description
#     for div in soup.find_all(name="div", attrs={"class":"jobsearch-jobDescriptionText"}):
#         job_description.append(div.text)
    for div in soup.findAll('li'):
        jd_str += " "+ div.text
    job_description.append(jd_str)
    jd_str = ""

In [8]:
# lets scrape all the job urls we have in our list,
# generate title,JD,location and comapny name for now
for i in range(len(jobs_main_url)):
    url_individual_listing = jobs_main_url[i]
    page = requests.get(url_individual_listing)
    soup = BeautifulSoup(page.text, "html.parser")
    extract_job_description(soup)

In [9]:
# for url in jobs_main_url:
#     page = requests.get(url)
#     soup = BeautifulSoup(page.text, "html.parser")
#     extract_job_description(soup)

In [10]:
# varify that all lists are of the same size 
# otherwise it will cause problem while creating a data frame
len(job_title),len(jobs_main_url),len(job_description), len(location), len(company_names)

(80, 80, 80, 80, 80)

### Lets save the scraped data as a CSV file

In [11]:
# create a data frame to save the raw data into a csv file 
#jb_url = list(jobs_main_url)
raw_data = pd.DataFrame({
    'Title' : job_title,
    'Company': company_names,
    'Location': location,
    'JD': job_description,
    'URL': jobs_main_url
})

In [12]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 5 columns):
Title       80 non-null object
Company     80 non-null object
Location    80 non-null object
JD          80 non-null object
URL         80 non-null object
dtypes: object(5)
memory usage: 3.2+ KB


In [13]:
raw_data.head(2)

Unnamed: 0,Title,Company,Location,JD,URL
0,AR/VR Business Analyst,Tailored Management,"Menlo Park, CA",skills: Consolidate third-party retailer sale...,https://www.indeed.com//pagead/clk?mo=r&ad=-6N...
1,"Producer, Augmented Reality (Menlo Park)",The Mom Project,"Menlo Park, CA",skills: Work closely with creative leads to s...,https://www.indeed.com//pagead/clk?mo=r&ad=-6N...


In [15]:
# save the data into local disk
raw_data.to_csv('raw_data.csv',header=True,index=False)