In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

Let's define base url for the scraper <br>

In [2]:
base_url = "https://www.indeed.com/"
job_search_url = base_url+"jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start={}"
job_key_words = ["ar/vr","augmented reality","virtual reality"]
job_title = []
jobs_main_url = []
company_names = []
location = []
job_description = []

### Lets get the job listing urls and job title
scrape first few pages of Indeed and get the job lising relevant to AR/VR.

In [3]:
def extract_job_titles(soup): 
    for div in soup.find_all(name="h2", attrs={"class":"title"}):
        for a in div.find_all(name="a", attrs={"data-tn-element":"jobTitle"}):
            if any(word in a["title"].lower() for word in job_key_words):
                job_title.append(a["title"])
                jobs_main_url.append(base_url+a["href"])

In [4]:
for i in range(15):
    base_url.format(i)
    page = requests.get(job_search_url)
    soup = BeautifulSoup(page.text, "html.parser")
    extract_job_titles(soup)

In [5]:
len(jobs_main_url)

57

In [6]:
jobs_main_url[0]

'https://www.indeed.com//pagead/clk?mo=r&ad=-6NYlbfkN0DI_pqscLjs9LkB0jlO39g2s8RE9SCHTdataN4HV1TulGfWI13h5d2IIBkrMCBpdyoKFuk6LoRw93nv9VIf-nwycaiTnbsKlxaYmP2oUVCrEos5MOIWzflKyJqtBexCiKWVxNVq0D9Lo3YRyX8n959AonagBpNTwfxL-PurTe6VIps6UhXSzCelZ2i6M80jnMdII1b6sHyJCLymqcUTpn1x6YNzrQomP40NjVEvoaKBN1-JuBw-fFhQ0IIOAgxHqXYt8Vri_lNyTY75aw7SXF_6d2OSMWxSjL3LVtSVR0zFNaGlM2RGHK0dJ_PcAT0ol6b_LaL11ASSoT7gxY_gCP6k31Jg9WD9on827TP6qOx62Wh6H5AX3eB8SgkkvxGMpHfbJfiac0m4Vw8cTqRNvPSnD3yZPdm6ELIBcAhR0sr2bNe1tAhYiEZRNI-jo4R8Btz-IsgLf-JxJOKC72PudnjZjGyb0XAAUU8=&p=0&fvj=1&vjs=3'

### Lets get the job description for further processing


In [7]:
def extract_job_description(soup): 
    # company title
    for div in soup.find_all(name="div", attrs={"class":"icl-u-lg-mr--sm icl-u-xs-mr--xs"}):
        if len(div.text.split('-')[0]) > 0:
            company_names.append(div.text.split('-')[0])
    # location
    for div in soup.find_all(name="div", attrs={"class":"jobsearch-DesktopStickyContainer-companyrating"}):
        if len(div.text.split('-')[1]) > 2:
            location.append(div.text.split('-')[1])
    # Job Description
    for div in soup.find_all(name="div", attrs={"class":"jobsearch-jobDescriptionText"}):
        job_description.append(div.text)

In [8]:
for i in range(len(jobs_main_url)):
    url_individual_listing = jobs_main_url[i]
    page = requests.get(url_individual_listing)
    soup = BeautifulSoup(page.text, "html.parser")
    extract_job_description(soup)

In [9]:
len(job_title),len(jobs_main_url),len(job_description), len(location), len(company_names)

(57, 57, 57, 57, 57)

### Lets save the scraped data as a CSV file

In [10]:
# create a data frame to save the raw data into a csv file 
raw_data = pd.DataFrame({
    'Title' : job_title,
    'Company': company_names,
    'Location': location,
    'JD': job_description,
    'URL': jobs_main_url
})

In [11]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 5 columns):
Title       57 non-null object
Company     57 non-null object
Location    57 non-null object
JD          57 non-null object
URL         57 non-null object
dtypes: object(5)
memory usage: 2.4+ KB


In [12]:
raw_data.head()

Unnamed: 0,Title,Company,Location,JD,URL
0,AR/VR Business Analyst,Tailored Management,"Menlo Park, CA",Job Title: Business Analyst ILocation: Menlo P...,https://www.indeed.com//pagead/clk?mo=r&ad=-6N...
1,AR/VR Apps Review Specialist,Facebook,"Menlo Park, CA",Our team ensures that applications shipped in ...,https://www.indeed.com//rc/clk?jk=11bf7d641f87...
2,Strategic Planning & Operations Lead for Faceb...,Facebook,"Menlo Park, CA",The Partnerships Strategic Planning & Operatio...,https://www.indeed.com//rc/clk?jk=4edd3fa153d3...
3,"Technology Communications Director, AR/VR",Facebook,"Menlo Park, CA",We are searching for an experienced leader to ...,https://www.indeed.com//rc/clk?jk=e828972d8901...
4,"Security Partner, AR/VR",Facebook,"Menlo Park, CA",Facebook is rapidly expanding our core product...,https://www.indeed.com//rc/clk?jk=e669d6a7dc0b...


In [13]:
raw_data.to_csv('raw_data.csv',header=True)

### Lets see how many unique job titles we have in our data set.

lets find out the key skills for each of them.

In [14]:
print("We have {} unique job types in our data set".format(len(raw_data.Title.unique())))

We have 14 unique job types in our data set


In [15]:
raw_data.Title.unique()

array(['AR/VR Business Analyst', 'AR/VR Apps Review Specialist',
       'Strategic Planning & Operations Lead for Facebook AR/VR Partnerships',
       'Technology Communications Director, AR/VR',
       'Security Partner, AR/VR', 'Accountant, AR/VR',
       'Contact Center Program Manager, AR/VR',
       'Customer Experience Analyst, AR/VR', 'Product Designer, AR/VR',
       'Supply Planner, AR/VR',
       'Producer, Augmented Reality (Menlo Park)',
       'Content Manager, AR/VR (Japanese Market)',
       'Director, Global Logistics & Trade AR/VR',
       'Project Manager, AR/VR Social Experiences'], dtype=object)

In [16]:
raw_data['Title'].value_counts()

Strategic Planning & Operations Lead for Facebook AR/VR Partnerships    5
Contact Center Program Manager, AR/VR                                   5
Producer, Augmented Reality (Menlo Park)                                5
Technology Communications Director, AR/VR                               5
Product Designer, AR/VR                                                 5
AR/VR Apps Review Specialist                                            5
Supply Planner, AR/VR                                                   5
Accountant, AR/VR                                                       5
AR/VR Business Analyst                                                  5
Security Partner, AR/VR                                                 5
Customer Experience Analyst, AR/VR                                      4
Project Manager, AR/VR Social Experiences                               1
Director, Global Logistics & Trade AR/VR                                1
Content Manager, AR/VR (Japanese Marke