In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

Let's define base url for the scraper <br>

In [2]:
base_url = "https://www.indeed.com/"
job_search_url = base_url+"jobs?q=AR%2FVR&l=Menlo+Park%2C+CA&start={}"
job_key_words = ["ar/vr","augmented reality","virtual reality"]
job_title = []
jobs_main_url = []
temp_urls = set()
company_names = []
location = []
job_description = []

### Lets get the job listing urls and job title
scrape first few pages of Indeed and get the job lising relevant to AR/VR.

In [3]:
def extract_job_titles(soup): 
    for div in soup.find_all(name="h2", attrs={"class":"title"}):
        for a in div.find_all(name="a", attrs={"data-tn-element":"jobTitle"}):
            if any(word in a["title"].lower() for word in job_key_words):
                post_url = base_url+a["href"]
                if post_url not in temp_urls:
                    job_title.append(a["title"])
                    jobs_main_url.append(post_url)
                    post_url = ""

In [4]:
for i in range(35):
    url_listing = job_search_url.format(i)
    page = requests.get(url_listing)
    soup = BeautifulSoup(page.text, "html.parser")
    extract_job_titles(soup)
# free up the memory by clearning the temp sets
temp_urls.clear()

In [5]:
len(jobs_main_url)

392

### Lets get the job description for further processing


In [6]:
def extract_job_description(soup): 
    # company title
    for div in soup.find_all(name="div", attrs={"class":"icl-u-lg-mr--sm icl-u-xs-mr--xs"}):
        if len(div.text.split('-')[0]) > 0:
            company_names.append(div.text.split('-')[0])
    # location
    for div in soup.find_all(name="div", attrs={"class":"jobsearch-DesktopStickyContainer-companyrating"}):
        if len(div.text.split('-')[1]) > 2:
            location.append(div.text.split('-')[1])
    # Job Description
    for div in soup.find_all(name="div", attrs={"class":"jobsearch-jobDescriptionText"}):
        job_description.append(div.text)

In [7]:
for i in range(len(jobs_main_url)):
    url_individual_listing = jobs_main_url[i]
    page = requests.get(url_individual_listing)
    soup = BeautifulSoup(page.text, "html.parser")
    extract_job_description(soup)

In [8]:
# for url in jobs_main_url:
#     page = requests.get(url)
#     soup = BeautifulSoup(page.text, "html.parser")
#     extract_job_description(soup)

In [9]:
len(job_title),len(jobs_main_url),len(job_description), len(location), len(company_names)

(392, 392, 392, 392, 392)

### Lets save the scraped data as a CSV file

In [10]:
# create a data frame to save the raw data into a csv file 
#jb_url = list(jobs_main_url)
raw_data = pd.DataFrame({
    'Title' : job_title,
    'Company': company_names,
    'Location': location,
    'JD': job_description,
    'URL': jobs_main_url
})

In [11]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 5 columns):
Title       392 non-null object
Company     392 non-null object
Location    392 non-null object
JD          392 non-null object
URL         392 non-null object
dtypes: object(5)
memory usage: 15.4+ KB


In [12]:
raw_data.head()

Unnamed: 0,Title,Company,Location,JD,URL
0,AR/VR Business Analyst,Tailored Management,"Menlo Park, CA",Job Title: Business Analyst ILocation: Menlo P...,https://www.indeed.com//pagead/clk?mo=r&ad=-6N...
1,"Producer, Augmented Reality (Menlo Park)",The Mom Project,"Menlo Park, CA",Company Industry: TechnologyOpportunity: The M...,https://www.indeed.com//pagead/clk?mo=r&ad=-6N...
2,"Director, Demand Planning AR/VR",Facebook,"Menlo Park, CA",As a leader within the Business Management tea...,https://www.indeed.com//rc/clk?jk=4cfdb4526611...
3,"Accountant, AR/VR",Facebook,"Menlo Park, CA","Facebook is seeking an Accountant for AR/VR, F...",https://www.indeed.com//rc/clk?jk=04a2950fd53a...
4,AR/VR Apps Review Specialist,Facebook,"Menlo Park, CA",Our team ensures that applications shipped in ...,https://www.indeed.com//rc/clk?jk=11bf7d641f87...


In [13]:
raw_data.to_csv('raw_data.csv',header=True)

### Lets see how many unique job titles we have in our data set.

lets find out the key skills for each of them.

In [14]:
print("We have {} unique job types in our data set".format(len(raw_data.Title.unique())))

We have 51 unique job types in our data set


In [20]:
raw_data.Title.nunique()

51

In [16]:
raw_data['Title'].value_counts()

Producer, Augmented Reality (Menlo Park)                                35
AR/VR Business Analyst                                                  35
Content Manager, AR/VR (Japanese Market)                                14
AR/VR Strategic Account Manager, Online Retail                          12
QA Lead, AR/VR                                                          10
AR/VR Electrical Engineer                                               10
Wireless Software Engineer, AR/VR                                       10
Product Experience Quality Engineer, AR/VR Accessories                  10
Manager, Display Strategic Sourcing - AR/VR                             10
Director, Business Enablement AR/VR                                     10
Accounting Manager, AR/VR                                               10
Director, Product, AR/VR Growth                                         10
Horizon TPM Lead - Facebook AR/VR Experiences                           10
Manager, Channel Operatio

In [19]:
raw_data.URL.nunique()

132

Looks like we are having lot of duplicates entries in our dataset. Let's clear them out