In [1]:
# import dependencies
import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
import re
import time
from tqdm import tqdm
import pymongo
from pymongo import MongoClient

# PyMongo Setup

In [2]:
# Open a client connection to the MongoDB server at localhost:27017
conn = 'mongodb://localhost:27017'
client = MongoClient(conn)

In [3]:
# Select the database and, inside it, the collection that save_results() inserts into
db = client["glassdoor_db"]
collection = db["glassdoor_listings"]

# Chromedriver

In [4]:
# Start a visible (headless=False) Chrome session through the chromedriver binary at this path
executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
browser = Browser("chrome", headless=False, **executable_path)

# City URLs

In [5]:
# Every city except Austin uses the same "Data Scientist" search URL and differs only
# in its locId query parameter, so those URLs are generated from a single template.
_SEARCH_URL_TEMPLATE = (
    "https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false"
    "&clickSource=searchBtn&typedKeyword=Data+Scientist&sc.keyword=Data+Scientist"
    "&locT=C&locId={loc_id}&jobType="
)


def _search_url(loc_id):
    """Return the Data Scientist search URL for the given Glassdoor locId value."""
    return _SEARCH_URL_TEMPLATE.format(loc_id=loc_id)


phoenix_url = _search_url(1133904)
portland_url = _search_url(1151614)
houston_url = _search_url(1140171)
seattle_url = _search_url(1150505)
sanfrancisco_url = _search_url(1147401)
# Austin's URL has a different query string, so it is kept verbatim.
# NOTE(review): it contains a literal space before "%20TX" -- confirm this is intended.
austin_url = "https://www.glassdoor.com/Job/jobs.htm?sc.keyword=Data%20Scientist&locT=C&locId=1139761&locKeyword=Austin, %20TX&srs=RECENT_SEARCHES"
sanjose_url = _search_url(1147436)
boston_url = _search_url(1154532)
washington_url = _search_url(1138213)
philadelphia_url = _search_url(1152672)

In [6]:
# Search URLs for all ten cities defined above
city_urls = [
    phoenix_url, portland_url, houston_url, seattle_url, sanfrancisco_url,
    austin_url, sanjose_url, boston_url, washington_url, philadelphia_url,
]

In [7]:
# URLs that the scraping loop iterates over (nine entries).
# NOTE(review): seattle_url appears in city_urls but not here -- confirm the omission is intentional.
cities = [
    phoenix_url, portland_url, houston_url, sanfrancisco_url, austin_url,
    sanjose_url, boston_url, washington_url, philadelphia_url,
]

# Web-Scraping Functions

In [8]:
def close_modal():
    """Click the first ".xBtn" element on the current page, if one exists.

    Going by the function name, ".xBtn" is the close button of a modal
    overlay; when no such element is present the function does nothing.
    """
    # Guard clause: no matching element, nothing to dismiss.
    if not len(browser.find_by_css(".xBtn")):
        return
    browser.find_by_css(".xBtn")[0].click()

def is_last_page():
    """Return True if the page has an element matching "li.page.current.last", else False."""
    last_page_markers = browser.find_by_css("li.page.current.last")
    return len(last_page_markers) > 0

def next_page():
    """Click the first "li.next" element to move to the next results page.

    Returns:
        bool: True if the click was issued without error; False if the
        element could not be found or the click raised an exception.
    """
    try:
        browser.find_by_css("li.next")[0].click()
    except Exception:
        # Best-effort navigation: any lookup/click failure means "no next page".
        # `Exception` (not a bare `except`) is caught so that KeyboardInterrupt
        # and SystemExit still propagate and a long scrape can be interrupted.
        return False
    return True

In [9]:
# save results in MongoDB
def save_results(results_xpath=None, wait_seconds=15):
    """Scrape every job listing on the current results page into MongoDB.

    Each element matched by the listing XPath is clicked, close_modal() is
    called, and after a pause the salary range, company name, city, state,
    rating and job description are read and inserted into `collection` as
    one document.

    Args:
        results_xpath: XPath selecting the listing elements. When None
            (the default), the module-level `xpath` variable is used, which
            is what existing callers set before calling this function.
        wait_seconds: Seconds to sleep after clicking a listing before its
            fields are read. Defaults to 15, the previously hard-coded value.

    Listings for which any step raises (for example a missing salary or
    rating, or a failed click) are skipped. The number of skipped listings is
    printed once per call instead of being silently discarded.
    """
    if results_xpath is None:
        # Fall back to the global the scraping loop defines, for backward compatibility.
        results_xpath = xpath

    skipped = 0
    last_error = None
    for res in browser.find_by_xpath(results_xpath):
        try:
            # Clicking a listing loads its description into the detail pane.
            res.click()
            close_modal()
            time.sleep(wait_seconds)

            # Salary text is parsed as "$<low>k-$<high>k..." and converted to integers.
            salary_info = res.find_by_css("div:nth-child(3) > div:nth-child(1) > span").text
            low_text, high_text = salary_info.split('-')[:2]
            salary_low = int(low_text.split('k')[0].split('$')[1]) * 1000
            salary_high = int(high_text.split('k')[0].split('$')[1]) * 1000

            # Company text is parsed as "<company> – <city>, <state>"; strip the
            # whitespace the splits leave around each part so the stored values
            # are clean (e.g. "WA" rather than " WA").
            company_info = res.find_by_css("div.flexbox.empLoc > div").text
            name_part, location_part = company_info.split('–')[:2]
            city_part, state_part = location_part.split(',')[:2]
            company_name = name_part.strip()
            city = city_part.strip()
            state = state_part.strip()

            # Rating is stored as a float.
            rating = float(res.find_by_css("div.logoWrap > span").text)

            # Description comes from the page-level detail pane, not the listing element.
            job_description = browser.find_by_id("JobDescriptionContainer").text

            posting = {
                'company_name': company_name,
                'city': city,
                'state': state,
                'salary_low': salary_low,
                'salary_high': salary_high,
                'rating': rating,
                'job_description': job_description
            }

            collection.insert_one(posting)

        except Exception as exc:
            # Best-effort scraping: skip this listing and continue with the next.
            # Catching `Exception` (not a bare `except`) lets KeyboardInterrupt
            # and SystemExit propagate so the run can still be stopped.
            skipped += 1
            last_error = exc

    if skipped:
        print(f"Skipped {skipped} listing(s); last error type: {type(last_error).__name__}")

# Scrape the Data

In [10]:
# XPath of the listing elements on a results page. save_results() reads this
# module-level name by default, so it is assigned once, before the loop starts.
xpath = "//*[@id='MainCol']/div/ul/li"

for city_url in tqdm(cities):
    browser.visit(city_url)

    counter = 0
    while True:
        # Record whether this is the last page before scraping it, so that no
        # "next" click is attempted once the final page has been processed.
        on_last_page = is_last_page()
        close_modal()
        print(f"Processing Page {counter+1}")
        save_results()
        counter += 1
        # Stop after the last page, or when the "next" control cannot be clicked.
        if on_last_page or not next_page():
            break

  0%|          | 0/9 [00:00<?, ?it/s]

Processing Page 1
Processing Page 2
Processing Page 3
Processing Page 4
Processing Page 5
Processing Page 6
Processing Page 7
Processing Page 8
Processing Page 9
Processing Page 10
Processing Page 11
Processing Page 12


 11%|█         | 1/9 [44:12<5:53:38, 2652.36s/it]

Processing Page 1
Processing Page 2
Processing Page 3
Processing Page 4
Processing Page 5
Processing Page 6
Processing Page 7
Processing Page 8
Processing Page 9
Processing Page 10


 22%|██▏       | 2/9 [1:22:53<4:50:08, 2486.87s/it]

Processing Page 1
Processing Page 2
Processing Page 3
Processing Page 4
Processing Page 5
Processing Page 6
Processing Page 7
Processing Page 8
Processing Page 9
Processing Page 10
Processing Page 11
Processing Page 12
Processing Page 13


 33%|███▎      | 3/9 [2:13:04<4:26:08, 2661.45s/it]

Processing Page 1


 44%|████▍     | 4/9 [2:14:53<2:48:37, 2023.43s/it]

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=56005): Max retries exceeded with url: /session/f08bc6228c017547d8bf6a6c407b9920/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11cbf8828>: Failed to establish a new connection: [Errno 61] Connection refused',))

In [15]:
# Spot-check: show the salary_high field of one stored document
first_posting = collection.find_one()
first_posting['salary_high']

110000