# Job Webscraper

## Setup

In [10]:
# Python ≥ 3.8.3 used.
import sys
assert sys.version_info >= (3, 8, 3)

In [11]:
# Selenium ≥ 3.141.0 used.
import selenium
assert selenium.__version__ >= "3.141.0"

In [12]:
# General imports.
import re
import time

# Specific imports.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC

In [13]:
PATH = "C:\Program Files (x86)\chromedriver.exe"

## Locators

In [14]:
# %%writefile locators.py

from selenium.webdriver.common.by import By


class ConfigLocators:
    """ConfigLocators contains the locators for configuration i.e. filters.
    
    Locators are in uppercase because they are constants and are
    grouped by category. Categories are denoted by a comment and separated
    by two lines of whitespaces. A single line of whitespace denotes
    the subcategory within a category. 
    
    The categories are as follows:
    
        - Search bars.
        - Primary dropdown.
        - Main Filters.
        - "More" dropdown.
        - "More" filters.
        - "Most Relevant" filter.
        - Clear filters.
        
    Note: There are some redundant constant variables,
    however they are kept to provide better clarity.
    Previous long variable names have been changed
    to a shorter form:
    
        Examples:
        
            Before:
                FILTER_MINSALARY_INCLUDE_SALARY_CHECKBOX
            After:
                INCLUDE_SALARY_CHECKBOX
            
            Before:
                PRIMARY_DROPDOWN,
                PRIMARY_DROPDOWN_UL,
                PRIMARY_DROPDOWN_UL_LI
            After:
                PRIMARY_DROPDOWN,
                DROPDOWN_UL,
                UL_LI
          
    """
    
    # Search bars.
    KEYWORD_SEARCH = (By.ID, "sc.keyword")
    LOCATION_SEARCH = (By.ID, "sc.location")
    SEARCH_BUTTON = (By.CLASS_NAME, "SearchStyles__newSearchButton")
    
    
    # Primary dropdown (the dropdown currently open).
    PRIMARY_DROPDOWN = (By.ID, "PrimaryDropdown")
    DROPDOWN_UL = (By.CLASS_NAME, "css-wpidup")
    UL_LI = (By.TAG_NAME, "li")
    
    
    # Main filters (all filters except "More" filters and "Most Relevant").
    FILTER_JOBTYPE = (By.ID, "filter_jobType")
    FILTER_FROMAGE = (By.ID, "filter_fromAge")
    
    FILTER_MINSALARY = (By.ID, "filter_minSalary")
    INCLUDE_SALARY_CHECKBOX = (By.CLASS_NAME, "gd-ui-checkbox")
    CHECKBOX_LABEL = (By.TAG_NAME, "label")
    APPLY_BUTTON = (By.CLASS_NAME, "applybutton")
    HISTOGRAM = (By.CLASS_NAME, "histogramContainer")
    HISTOGRAM_DIVS = (By.TAG_NAME, "div")
    LEFT_SLIDER = (By.XPATH, "//div[@class='leftHandle']")
    HIST_LABEL = (By.ID, "salary-range-hist-label")
    HIST_LABEL_HEADER = (By.TAG_NAME, "h4")
    RIGHT_SLIDER = (By.XPATH, "//div[@class='rightHandle']")
    
    FILTER_RADIUS = (By.ID, "filter_radius")
    
    
    # The "More" dropdown.
    DKFILTERS = (By.ID, "DKFilters")
    FILTER_MORE = (By.CLASS_NAME, "ewzpq9a0")
    
    
    # Filters under the "More" dropdown.
    FILTER_CITYID = (By.ID, "filter_cityId")
    FILTER_INDUSTRYID = (By.ID, "filter_industryId")
    # EAO: Early Apply Only; WFHO: Work From Home Only
    FILTER_EAOWFHO = (By.CLASS_NAME, "justified")
    EAOWFHO_LABEL = (By.TAG_NAME, "label")
    
    FILTER_COMPANYRATING = (By.CLASS_NAME, "noHover")
    COMPANYRATING_STARS = (By.CLASS_NAME, "e1wcngjj1")
    STARS_DIVS = (By.TAG_NAME, "div")
    
    FILTER_JOBFUNCTIONS = (By.ID, "filter_sgocId")
    FILTER_SENIORITYLABELS = (By.ID, "filter_seniorityType")
    FILTER_COMPANIES = (By.ID, "filter_companyId")
    FILTER_COMPANYSIZES = (By.ID, "filter_employerSizes")
    
    
    # "Most Relevant" filter (a part of the mainCol but not DKFilters).
    MAIN_COL = (By.ID, "MainCol")
    BODY = (By.CLASS_NAME, "main")
    FILTER_MOSTRELEVANT = (By.CLASS_NAME, "css-150lexj")
    MOSTRELEVANT_DROPDOWN = (By.CLASS_NAME, "e1gtdke61")
    MOSTRELEVANT_DROPDOWN_UL = (By.TAG_NAME, "ul")
    UL_LI = (By.TAG_NAME, "li")
    
    
    # The Clear filters button.
    CLEAR_FILTERS = (By.CLASS_NAME, "clearFilters")
    CLEAR_FILTERS_SPAN = (By.TAG_NAME, "span")
    

class WebScrapingLocators:
    """WebScrapingLocators contains the locators for webscraping data.
    
    Refer to ConfigLocators for notation in this class.
    
    The categories are as follows:
    
        - Pop-up close button.
        - Joblistings list.
        - Page navigator.
        - Job Description column.
        - Job Info I.
        - Job Info II.
        - Job Info III.
        - Job Info IV.
        - Total page numbers.
    
    Note: The features for the dataset that will be returned
    from webscraping is split into 4 groups of features:
    job info 1, 2, 3, and 4. 
    
    """
    
    # Button to close pop-up.
    POPUP_CLOSE_BTN = (By.CLASS_NAME, "modal_closeIcon")
    
    
    # The joblistings list.
    MAIN_COL = (By.ID, "MainCol")
    JOBLISTING_CONTAINER = (By.TAG_NAME, "ul")
    JOBLISTINGS = (By.TAG_NAME, "li")
    
    
    # Page navigators at the bottom of the page.
    FOOTER_PAGE_NAV = (By.ID, "FooterPageNav")
    PAGES_CONTAINER = (By.XPATH, "//div[@class='middle']")
    PAGE_NAVS = (By.TAG_NAME, "li")
    
    
    # Job description column. 
    JD_COL = (By.ID, "JDCol")
    
    # Job Info I (company, rating, headquarters, salary est).
    HEADER = (By.CLASS_NAME, "e14vl8nk0")
    HEADER_JOB_INFO = (By.CLASS_NAME, "e1tk4kwz6")
    JOB_INFO_1 = (By.TAG_NAME, "div")
    
    # Job Info II (ratings, job type).
    JOB_INFO_2_CONTAINER = (By.CLASS_NAME, "epgue5a3")  # 2 classes correspond to this.
    JOB_INFO_2 = (By.TAG_NAME, "div")
    
    
    # Job Info III (size, founded date, type, industry, sector, revenue).
    EMP_BASIC_INFO = (By.ID, "EmpBasicInfo")
    COMP_OVERVIEW_CONTAINER = (By.CLASS_NAME, "flex-wrap")
    
    
    # Job Info IV (job description).
    JOB_DESC_CONTAINER = (By.ID, "JobDescriptionContainer")
    JOB_INFO_4 = (By.CLASS_NAME, "jobDescriptionContent")
    
    
    # Total page numbers.
    MAIN_COL = (By.ID, "MainCol")
    FOOTER = (By.CLASS_NAME, "tbl")
    PAGE_COUNT = (By.CLASS_NAME, "middle")
    

## Elements

In [15]:
# %%writefile elements.py

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# from locators import ConfigLocators as CL
# from locators import WebScrapingLocators as WSL


CL = ConfigLocators
WSL = WebScrapingLocators


class ConfigElements:
    """ConfigElements is a class for getting all the elements related to 
    configuring the filters. 
    
    Syntax for Table of Contents:
    
    The easiest way to demonstrate this is through an example.
    
        Example:
        
            - Category

                - get element * 
                    > _
                    > obj1
                    > obj2
        
        The above example is equivalent to:
        
            - Category
            
                - get element
                - get element -> obj1
                - get element -> obj2
            
    Additionally, arrows are used to denote that a certain webdriver
    element is being found from the preceding webdriver element.
    
    
    Table of Contents:
    
    - Get Elements
    
        - WebDriverWait base function
        
        - Search-Related
        
            - get *
                > keyword search
                > location search
                > search button

        - Filter-Related
        
            - get primary dropdown *
                > _
                > ul
                > ul -> all li
                
            - get filter *
                > jobtype
                > fromage/postdate
                > radius
                
            - Filter Minsalary (salary range)
            
                - get filter minsalary *
                    > include no salary data
                    > apply button
                    
                - get primary dropdown *
                    > salary range histogram container -> all bins
                    > left slider
                    > right slider
                    
                - get histogram labels -> header
                
            - More Filter
                - get dkfilters *
                    > _
                    > more
                    > clear filter -> span
                
                - get filter *
                    > cityid
                    > industry
                    > eaowfho (Easy Apply Only, Work From Home Only)
                    > eaowfho -> label
                    > company rating -> stars -> all divs
                    > job functions
                    > seniority labels
                    > companies
                    > company sizes
                                
            - get main col *
                > _
                > sortby filter
                
            - get body *
                > _
                > sortby dropdown
                > sortby dropdown -> ul
                > sortby dropdown -> ul -> all li
            
            
    - Reusable Filter-Related Functions
        - clear search and return keyword
        - get GlassdoorWebScraper obj filters by filter type 
        - getting and parsing filters 
        
        - Join Filters Functions
            - join filters *
                > jobtype
                > fromage/postdate
                > radius
                > cityid
                > industry
                > job function
                > seniority label
                > company
                > company size
                > sortby
            
        - initialize salary bins
        - move slider function
        - click more dropdown n_clicks times
        - initialize a filter
        - change filter to function
        - initialize sortby filter
        - regex parse salary
        - reset salary base function
    
    """
    
    # Tunable parameter.
    seconds_before_timeout = 10
    
    
    
    # ==================================================
    # Get Element(s)
    # ==================================================
    
    
    
    def wait_until_element(self, locator, seconds_before_timeout=seconds_before_timeout):
        return WebDriverWait(self.driver, seconds_before_timeout).until(
                   EC.presence_of_element_located(locator)
               )
    
    
    
    # ===============================
    # Search-Related
    # ===============================
    
    
    
    def get_keyword_search(self):
        return self.wait_until_element(CL.KEYWORD_SEARCH)
    
    
    def get_location_search(self):
        return self.wait_until_element(CL.LOCATION_SEARCH)
    
    
    def get_search_button(self):
        return self.wait_until_element(CL.SEARCH_BUTTON)
    
    
    
    # _______________________________
    
    
    # ===============================
    # Filter-Related
    # ===============================
    
    
    
    def get_primary_dropdown(self):
        return self.wait_until_element(CL.PRIMARY_DROPDOWN)
    
    
    # Gets the primary dropdown -> ul.
    def get_dropdown_ul(self):
        primary_dropdown = self.get_primary_dropdown()
        return primary_dropdown.find_element(*CL.DROPDOWN_UL)
    
    
    # Gets the primary dropdown -> ul -> all li.
    def get_ul_all_li(self):
        dropdown_ul = self.get_dropdown_ul()
        return dropdown_ul.find_elements(*CL.UL_LI)
    

    def get_filters_jobtypes(self):
        return self.wait_until_element(CL.FILTER_JOBTYPE)
    
    
    def get_filters_postdates(self):
        return self.wait_until_element(CL.FILTER_FROMAGE)
    
    
    def get_filters_radii(self):
        return self.wait_until_element(CL.FILTER_RADIUS)
    
    
    
    # ==================
    # Filter Minsalary
    # ==================
    
    
    
    def get_filters_minsalaries(self):
        return self.wait_until_element(CL.FILTER_MINSALARY)
    
    
    # Gets the minsalary -> checkbox.
    def get_filters_minsalaries_checkbox(self):
        primary_dropdown = self.get_primary_dropdown()
        return primary_dropdown.find_element(*CL.INCLUDE_SALARY_CHECKBOX)
    
    
    # Gets the minsalary -> checkbox -> include no salary data label.
    def get_filters_minsalaries_checkbox_label(self):
        minsalaries_checkbox = self.get_filters_minsalaries_checkbox()
        return minsalaries_checkbox.find_element(*CL.CHECKBOX_LABEL)
        
    
    
    # Gets the minsalary -> apply button.
    def get_filters_minsalaries_applybutton(self):
        primary_dropdown = self.get_primary_dropdown()
        return primary_dropdown.find_element(*CL.APPLY_BUTTON)
    
    
    # Gets the minsalary -> histogram container -> all divs.
    def get_primary_dropdown_histogram_container_all_div(self):
        primary_dropdown = self.get_primary_dropdown()
        primary_dropdown_histogram = primary_dropdown.find_element(*CL.HISTOGRAM)
        return primary_dropdown_histogram.find_elements(*CL.HISTOGRAM_DIVS)
    
    
    # Gets the minsalary -> left slider.
    def get_left_slider(self):
        primary_dropdown = self.get_primary_dropdown()
        return primary_dropdown.find_element(*CL.LEFT_SLIDER)
    
    
    # Gets the minsalary -> right slider.
    def get_right_slider(self):
        primary_dropdown = self.get_primary_dropdown()
        return primary_dropdown.find_element(*CL.RIGHT_SLIDER)
        
    
    # Gets histogram labels header.
    def get_histogram_labels_header(self):
        histogram_labels = self.wait_until_element(CL.HIST_LABEL)
        return histogram_labels.find_element(*CL.HIST_LABEL_HEADER)
        
        
        
    # __________________
    
    
    
    # ==================
    # More Filter
    # ==================
    
    
    
    def get_entire_filter(self):
        return self.wait_until_element(CL.DKFILTERS)
    
    
    def get_more_dropdown(self):
        DKFilters = self.get_entire_filter()
        return DKFilters.find_element(*CL.FILTER_MORE)
    
    
    # Gets DKFilters -> clear filter -> span.
    def get_clear_filter_span(self):
        DKFilters = self.get_entire_filter()
        clear_filter = DKFilters.find_element(*CL.CLEAR_FILTERS)
        return clear_filter.find_element(*CL.CLEAR_FILTERS_SPAN)
    
    
    def get_filters_cityids(self):
        return self.wait_until_element(CL.FILTER_CITYID)
    
    
    def get_filters_industries(self):
        return self.wait_until_element(CL.FILTER_INDUSTRYID)
    
    
    def get_filters_eaowfho(self, is_eao):
        eaowfho = self.driver.find_elements(*CL.FILTER_EAOWFHO)
        if is_eao:
            return eaowfho[0]
        return eaowfho[1]
    
    
    # Gets the EAO or WFHO filter's -> label.
    def get_filters_eaowfho_label(self, is_eao, eaowfho=None):
        if not eaowfho:
            eaowfho = self.get_filters_eaowfho(is_eao)
        return eaowfho.find_element(*CL.EAOWFHO_LABEL)
    
    
    # Gets the company rating filter -> stars -> all divs. 
    def get_filters_companyratings_stars_divs(self):
        companyratings = self.wait_until_element(CL.FILTER_COMPANYRATING)
        companyratings_stars = companyratings.find_element(*CL.COMPANYRATING_STARS)
        return companyratings_stars.find_elements(*CL.STARS_DIVS)


    def get_filters_jobfunctions(self):
        return self.wait_until_element(CL.FILTER_JOBFUNCTIONS)
    
    
    def get_filters_senioritylabels(self):
        return self.wait_until_element(CL.FILTER_SENIORITYLABELS)
    
    
    def get_filters_companies(self):
        return self.wait_until_element(CL.FILTER_COMPANIES)
    
    
    def get_filters_companysizes(self):
        return self.wait_until_element(CL.FILTER_COMPANYSIZES)
    

    
    # __________________

    
    
    def get_main_col(self):
        return self.wait_until_element(CL.MAIN_COL)
    
    
    # Gets the main col -> sortby filter.
    def get_filters_sortby(self):
        main_col = self.get_main_col()
        return main_col.find_element(*CL.FILTER_MOSTRELEVANT)
    
    
    def get_main_body(self):
        return self.wait_until_element(CL.BODY)
    
    
    # Gets the main body -> sortby dropdown.
    def get_main_body_sortby_dropdown(self):
        main_body = self.get_main_body()
        return main_body.find_element(*CL.MOSTRELEVANT_DROPDOWN)
    
    
    # Gets the main body -> sortby dropdown -> ul.
    def get_main_body_sortby_dropdown_ul(self):
        sortby_dropdown = self.get_main_body_sortby_dropdown()
        return sortby_dropdown.find_element(*CL.MOSTRELEVANT_DROPDOWN_UL)
    
    
    # Gets the main body -> sortby dropdown -> ul -> all li.
    def get_main_body_sortby_dropdown_ul_li(self):
        sortby_dropdown_ul = self.get_main_body_sortby_dropdown_ul()
        return sortby_dropdown_ul.find_elements(*CL.UL_LI)
    
    
    
    # _______________________________
    
    
    
    # __________________________________________________
    
    
    
    # ==================================================
    # Reusable Filter-Related Functions
    # ==================================================
    
    
    
    # Clear a search bar and return a keyword.
    def clear_and_search(self, search, keyword):
        search.clear()
        search.send_keys(keyword)
        search.send_keys(Keys.RETURN)    
    
    
    # Accesses the self.filters dictionary by key "filter_type"
    # and returns a list of keys (if the corresponding value to "filter_type"
    # is a dict), else it returns a list.
    def get_filters_by_type(self, filter_type):
        filters_by_type = self.filters[filter_type]
        if isinstance(filters_by_type, dict):
            return list(filters_by_type.keys())
        return filters_by_type
    
    
    # Note: get_and_parse_filters applies only to filters in the 
    # self.get_join_filters attribute.
    def get_and_parse_filters(self, filter_type_list, join_filters):
        """Gets filter text and parses it; then, it calls a join_filters function.
        
        Parameters
        ----------
        filter_type_list : str
            A string that contains all the possible filter options for a given filter
            in an unparsed manner. A filter option is defined as: jobtype/full_time where
            "full_time" is a filter option of the filter "jobtype". 
        join_filters : fn
            A function for a filter that will perform the concatenation
            of strings at the end of this get_and_parse_filters() method. This function
            varies depending on what special characters exist in the unparsed string
            "filter_type_list" and also on whether or not the filter options for a 
            filter will have counts.
            
        Returns
        -------
        type
            Returns a dict or a list.
        describe : dict or list
            If the filter options for a filter
            contains counts a dict is returned (where the parsed 
            filter options are keys and the values are the counts) 
            else it will return a list of the parsed filter options.
        
        Examples
        --------
        input : "Full-time (4722)\nPart-time (482)"
        output : {"full_time": 4722, "part_time": 482}
        
        input : "5 Miles\n10 Miles"
        output : ["5_miles", "10_miles"]

        """
        # Checks if there is at least one occurrence of the format:
        # a-zA-Z0-9 (0-9).
        # This search returns an re obj if it finds a match.
        check_for_count = re.search(r"\w+ \((\d+)\)", filter_type_list)
        
        # If there is no match for the aforementioned format,
        # then simply call the join_filters() function on the 
        # filter_type_list.
        if not check_for_count:
            return join_filters(filter_type_list)
        
        # At this point, there exists an re,
        # and group(1) is checked to see
        # if there is a count in the string "filter_type_list".
        elif check_for_count.group(1):
            filters, filters_counts = [], []
            
            for idx, filter_ in enumerate(filter_type_list.split("\n")):
                filter_split = filter_.split()
                
                # If 0.
                if not idx:
                    
                    # Since filter options come in a specified order,
                    # the first filter option is always the default
                    # and contains no count. Here, it is simply appended
                    # to list "filters". 
                    filters.append(filter_split)
                    
                # Idx != 0.
                else:
                    
                    # Append just the text portion.
                    filters.append(filter_split[:-1])
                    
                    # Strips parentheses from the number and appends
                    # to the list "filters_counts".
                    filters_counts.append(int(re.sub("[()]", "", filter_split[-1]).strip()))
                    
            # Since the default filter option is always the total,
            # the sum of all filter option counts are summed 
            # and inserted at index 0.
            filters_counts.insert(0, sum(filters_counts))
            return join_filters(filters, filters_counts)
    
    
    
    # ===============================
    # Join Filters Functions
    # ===============================
    
    
    
    def join_filters_jobtypes(self, jobtypes, jobtypes_counts):
        for idx, jobtype in enumerate(jobtypes):
            jobtypes[idx] = re.sub("-", "_", "_".join(jobtype).lower())
        return dict(zip(jobtypes, jobtypes_counts))
    
    
    def join_filters_postdates(self, postdates, postdates_counts):
        for idx, postdate in enumerate(postdates):
            postdates[idx] = "_".join(postdate).lower()
        return dict(zip(postdates, postdates_counts))
    
    
    def join_filters_radii(self, radius):
        return radius.lower().replace(" ", "_").split("\n")
    
    
    def join_filters_cityids(self, cityids, cityids_counts):
        for idx, cityid in enumerate(cityids):
            cityids[idx] = "_".join(cityid).lower().replace(",", "")
        return dict(zip(cityids, cityids_counts))
    
    
    def join_filters_industries(self, industries, industries_counts):
        for idx, industry in enumerate(industries):
            industries[idx] = "_".join(industry).lower()
        return dict(zip(industries, industries_counts))
    
    
    def join_filters_jobfunctions(self, jobfunctions, jobfunctions_counts):
        for idx, jobfunction in enumerate(jobfunctions):
            jobfunctions[idx] = "_".join(jobfunction).lower()
        return dict(zip(jobfunctions, jobfunctions_counts))
    
    
    def join_filters_senioritylabels(self, senioritylabels, senioritylabels_counts):
        for idx, senioritylabel in enumerate(senioritylabels):
            senioritylabels[idx] = "_".join(senioritylabel).lower()
        return dict(zip(senioritylabels, senioritylabels_counts))
    
    
    def join_filters_companies(self, companies, companies_counts):
        for idx, company in enumerate(companies):
            companies[idx] = "_".join(company).lower()
        return dict(zip(companies, companies_counts))
    
    
    def join_filters_companysizes(self, companysizes, companysizes_counts):
        for idx, companysize in enumerate(companysizes):
            companysizes[idx] = ("_"
                                 .join(companysize)
                                 .lower()
                                 .replace("-", "_")
                                 .replace("+", ""))
        return dict(zip(companysizes, companysizes_counts))
    
    
    # Join filters function for sortby (not used in the regular filter pipeline).
    def join_filters_sortby(self, sortbys):
        return sortbys.lower().replace(" ", "_").split("\n")
    
    
    
    # _______________________________
    
    
    
    # Initializes the possible salary bins for filter minsalary.
    def initialize_salary_bins(self):
        # Glassdoor.com seems to have different salary ranges 
        # each time the job page is loaded in. To compensate,
        # the apply button under the salary filter is clicked
        # so that the correct salary ranges are displayed.
        salary_filter_applybutton = self.get_filters_minsalaries_applybutton()
        salary_filter_applybutton.click()
        
        print("BEFORE RESET")
        
        time.sleep(1)
        self.reset_salary_slider()
        time.sleep(1)
        
        print("AFTER RESET")
        
        salary_filter = self.get_filters_minsalaries()
        salary_filter.click()
        
#         left_slider = self.get_left_slider()
#         left_slider.click()
#         left_slider.click()
#         right_slider = self.get_right_slider()
#         right_slider.click()
#         right_slider.click()
        
        # Get all histogram bins into a list.
        histogram_bins = self.get_primary_dropdown_histogram_container_all_div()
        
        # Click the left slider first.
        left_slider = self.get_left_slider()
        left_slider.click()
        
        time.sleep(1)
        
        print("A")
        
        all_bins = []
        for idx, _ in enumerate(range(len(histogram_bins) - 1)):
            if not idx:
                
                # If 0, move the slider left first before
                # doing anything else because 
                # moving the left slider right immediately 
                # after clicking the left slider would move 
                # it to the far right.
                left_slider.send_keys(Keys.ARROW_LEFT)
                
            # Get the current salary range and regex parse it.
            a_bin = self.get_histogram_labels_header()
            a_bin = a_bin.text.replace("$", "").split("-")
            
            # Append the lower endpoint of the salary range.
            all_bins.append(a_bin[0])
            
            # Move once to the right to update the lower
            # endpoint of the salary range.
            left_slider.send_keys(Keys.ARROW_RIGHT)
            
        print("B")
            
        # Finally, add the endpointt of the last salary range
        # and close the filter and return all bins for 
        # the left and right slider bins.
        all_bins.append(a_bin[1])
        salary_filter.click()
        
        print("C")
        
        # If [a, b] is the largest possible salary range,
        # then the left slider can access values from
        # index(a) to index(b - 1) and the right slider
        # can access values from index(a + 1) to
        # index(b) inclusive of the endpoint.
        left_slider_bins = all_bins[:-1]
        right_slider_bins = all_bins[1:]
        return left_slider_bins, right_slider_bins
    
    
    # Moves left and right sliders for filter minsalary.
    def move_slider(self, slider, idx, current_idx, difference):
        if idx < current_idx:
            for _ in range(difference):
                slider.send_keys(Keys.ARROW_LEFT)
        elif idx > current_idx:
            for _ in range(difference):
                slider.send_keys(Keys.ARROW_RIGHT)
    
    
    # Because the more dropdown filters don't close properly, this small
    # function is aimed at simply closing that dropdown.
    # It also doubles as a more dropdown clicker, as it takes an
    # n_clicks argument.
    def click_more_dropdown(self, n_clicks=1):
        more_filter = self.get_more_dropdown()
        for _ in range(n_clicks):
            more_filter.click()
        
    
    # Note: init_filter works for only filters in the 
    # self.get_join_filters attribute.
    def init_filter(self, 
                    name, 
                    get_filters_, 
                    join_filters_=None,
                    is_salary=False,
                    is_more=False):
        """This function initializes a single filter. 
        
        Parameters
        ----------
        name : str
            Name of the filter.
        get_filters_ : fn
            The function that gets a certain filter.
        join_filters_ : fn, optional
            The join function for a certain filter. 
            This is optional as some filters don't use a join filters function
            like salary range.
        is_salary : bool
            True if the filter passed in is the salary range filter (it
            is a special filter that requires special initialization).
        is_more : bool
            True if the filter is under the "More" dropdown.
            
        Returns
        -------
        type
            NoneType
        describe
            This function simply initializes all filter options and doesn't
            return anything.
        """
        try:
            
            # First checks if it is_salary.
            if not is_salary:
                
                # If it is_more, then the more dropdown is clicked
                # and a JS script is called.
                if is_more:
                    self.click_more_dropdown()

                _filter = get_filters_()
                if not is_more:
                    _filter.click()
                elif is_more:
                    self.driver.execute_script("arguments[0].click();", _filter) 

                _dropdown_ul = self.get_dropdown_ul()
                self.filters[name] = self.get_and_parse_filters(_dropdown_ul.text,
                                                            join_filters_)
                
                # If is_more, then double click the more dropdown 
                # to ensure whatever dropdown might be left open
                # to be closed.
                if not is_more:
                    _filter.click()
                elif is_more:
                    self.click_more_dropdown(n_clicks=2)
                    
            # Special initialization for is_salary.
            elif is_salary:
                print("ENTER ELIF")
                self.filters[name] = {}
                salary_filter = get_filters_()
                salary_filter.click()
                print("CLICKED FILTER")
                left_slider_bins, right_slider_bins = self.initialize_salary_bins()
                self.filters[name]["left_slider"] = left_slider_bins
                self.filters[name]["right_slider"] = right_slider_bins
                print("INITIALIZED SALARY")
                
        # Glassdoor.com occassionally might exclude a filter or two 
        # from the "More" dropdown. This except block catches it and 
        # simply prints the exception.
        except Exception as e:
            print("init_filter is not working.")
            print(e)
        
    
    # Changes a filter option to.
    # Note: change_filter_to only applies to filters
    # in the self.get_join_filters attribute.
    def change_filter_to(self, name, choice, is_more=False):
        try:
            if is_more:
                self.click_more_dropdown()
            
            _filter = self.get_join_filters[name]["get"]
            if not is_more:
                _filter().click()
            elif is_more:
                self.driver.execute_script("arguments[0].click();", _filter()) 
                
            dropdown_ul_li = self.get_ul_all_li()
            ul_li_element = dropdown_ul_li[self.get_filters_by_type(name).index(choice)]
            ul_li_element.click()
            
            if is_more:
                self.click_more_dropdown(n_clicks=2)
        except Exception as e:
            print(e)
    
    
    # Initialize the "Most Relevant" dropdown "filter".
    # This one is under the main body of the page rather than the 
    # main group of filters and thus it is initialized separately
    # as it is not necessarily a filter.
    def init_sortby(self):
        sortby_filter = self.get_filters_sortby()
        sortby_filter.click()
        sortby_dropdown_ul = self.get_main_body_sortby_dropdown_ul()
        self.filters["sortbys"] = self.join_filters_sortby(sortby_dropdown_ul.text)
        self.click_more_dropdown(n_clicks=2)
    
    
    def regex_parse_salary(self, header):
                return header.text.replace("$", "").split("-")
        
        
    def reset_salary_base_fn(self, slider, idx, key_fn, is_both=False):
        # Given the salary dropdown is open.
        slider.click()
        a_bin = self.get_histogram_labels_header()
        a_bin = self.regex_parse_salary(a_bin)
        
        # Basically, keep a moving variable
        # one trails the other and the loop
        # breaks if the 2 variables are equal to each other 
        # meaning the end of the salary range has been hit by that
        # specific slider.
        before = a_bin[idx]
        slider.click()
        slider.send_keys(key_fn)
        current = self.regex_parse_salary(self.get_histogram_labels_header())[idx]
        while current != before:
            before = current
            slider.send_keys(key_fn)
            current = self.regex_parse_salary(self.get_histogram_labels_header())[idx]
        if not is_both:
            applybutton = self.get_filters_minsalaries_applybutton()
            applybutton.click()
    
    # __________________________________________________

    
    
class WebScrapingElements:
    """WebScrapingElements is a class for getting all the elements related to 
    webscraping the job listing data.
    
    Refer to ConfigElements for syntax in the table of contents.
    
    Table of Contents:
    
    - Wait Base Function
    
    - Pop-up
        
        - pop-up close button
        
    - Get Joblistings
    
        - get joblistings list
        - page navigator
        - get page count
        
    - Get Job Info
    
        - get jdcol
        - get job info *
            > 1
            > 2
            > 3
            > 4
    
    """
    
    seconds_before_timeout = 5
    
    def wait_until_element(self, locator, seconds_before_timeout=seconds_before_timeout):
        return WebDriverWait(self.driver, seconds_before_timeout).until(
                EC.presence_of_element_located(locator)
            )
    
    # ==================================================
    # Pop-up
    # ==================================================
    
    
    
    def close_popup(self):
        return self.wait_until_element(WSL.POPUP_CLOSE_BTN)
        
        
    # __________________________________________________
    
    
    
    # ==================================================
    # Get Joblistings
    # ==================================================
    
    
    
    def get_joblistings(self):
        main_col = self.wait_until_element(WSL.MAIN_COL)
        joblisting_container = main_col.find_element(*WSL.JOBLISTING_CONTAINER)
        return joblisting_container.find_elements(*WSL.JOBLISTINGS)
    
    
    def get_page_nav(self):
        footer_page_nav = self.wait_until_element(WSL.FOOTER_PAGE_NAV)
        pages_container = footer_page_nav.find_element(*WSL.PAGES_CONTAINER)
        return pages_container.find_elements(*WSL.PAGE_NAVS)

    
    def get_page_count(self):
        main_col = self.wait_until_element(WSL.MAIN_COL)
        footer = main_col.find_element(*WSL.FOOTER)
        return footer.find_element(*WSL.PAGE_COUNT)

    
    
    # __________________________________________________
    
    
    
    # ==================================================
    # Get Job Info
    # ==================================================
    
    
    
    def get_jdcol(self):
        return self.wait_until_element(WSL.JD_COL)
    
    
    def get_jobinfo1(self):
        jd_col = self.get_jdcol()
        header = jd_col.find_element(*WSL.HEADER)
        header_job_info = header.find_element(*WSL.HEADER_JOB_INFO)
        return header_job_info.find_elements(*WSL.JOB_INFO_1)
    
    
    def get_jobinfo2(self, is_insights=True):  # Get company insights or ratings.
        jd_col = self.get_jdcol()
        job_info_2_containers = jd_col.find_elements(*WSL.JOB_INFO_2_CONTAINER)
        if is_insights:
            return job_info_2_containers[1].find_elements(*WSL.JOB_INFO_2)
        return job_info_2_containers[0].find_elements(*WSL.JOB_INFO_2)
    
    
    def get_jobinfo3(self):
        emp_basic_info = self.wait_until_element(WSL.EMP_BASIC_INFO)
        return emp_basic_info.find_element(*WSL.COMP_OVERVIEW_CONTAINER)
    
    
    def get_jobinfo4(self):
        job_desc_container = self.wait_until_element(WSL.JOB_DESC_CONTAINER)
        return job_desc_container.find_element(*WSL.JOB_INFO_4)
    
    
        
    # __________________________________________________
    

## Webscraper

In [16]:
# %%writefile webscraper.py

import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# from elements import ConfigElements, WebScrapingElements


class GlassdoorWebScraper(ConfigElements, WebScrapingElements):
    """ A GlassdoorWebScraper obj will be able to configure filters and webscrape.

    Ensure that your chromedriver corresponds correctly to your current
    Google chrome version here: 

    https://sites.google.com/a/chromium.org/chromedriver/downloads
    
    
    
    Here is the same URL with keyword="data scientist":
    
    https://www.glassdoor.com/Job/jobs.htm?sc.keyword="data scientist"
    &locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&
    fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=
    -1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=
    -1&employerSizes=0&applicationType=0&remoteWorkType=0
    
    
    
    Design:
        
        - locators.py and elements.py are split into 2 components: 
          filter configuration and webscraping.
        - The GlassdoorWebScraper class uses the low level functions
          of elements.py to create higher level functions.
        - GlassdoorWebScraper also inherits from elements.py to give 
          access to the lower level functions.
    
    
    
    Table of Contents:
    
    - Basic Utility Functions
        - update keyword and URL
        - set implicit wait
        - get
        - close
    - Filter Configuration Functions
        - clear filters
        - init filters
        - init change filters
        - include no salary data
        - change *
            > keyword
            > location
            > jobtype
            > postdate
            > salary
            > city
            > industry
            > job function
            > seniority label
            > company
            > company size
            > Easy Apply Only/Work From Home Only
            > rating
            > sortby
    - Webscrape Function

    
    
    Functions:
    
    update_keyword_and_URL(keyword)
        Updates the keyword and URL simultaneously.
        
    set_implicitly_wait(implicitly_wait_time)
        Set the global implicit wait time.
        
    get(implicitly_wait_time=5, set_implicitly_wait=True)
        Creates a webdriver, maximizes window, sets the implicit wait time
        (which defaults to 5) if set_implicitly_wait is true, then
        finally opens the URL.
        
    close()
        Closes the current tab. This function is a wrapper just for 
        convenience.
    
    change_keyword_to(keyword)
        Enter keyword into the keyword search bar and return.
        
    change_location_to(location)
        Enter location into the location search bar and return.
        
    init_filters(_filter=None)
        Initialize all filters if _filter is None else initialize _filter.
        
    reset_salary_slider(is_both=True, is_left=True)
        Reset salary slider. is_both decides if both sliders should be reset
        and is_left decides if the left or the right one should be reset 
        (in the case that is_both is False).
        
     init_change_filters(filter_type)
         Initialize a filter, then print out possible filter options,
         then change the filter to specified input. 
        
    change_jobtype_to(jobtype)
        Change to a specified jobtype filter option.
        
    change_postdate_to(postdate)
        Change to a specified postdate filter option.
        
    include_no_salary_data(include)
        The boolean include dictates whether or not the checkbox is checked. 
    
    change_salary_to(begin_salary, end_salary)
        The salary range is in the form [a, b]. a is the begin_salary and is a string
        (e.g. "125K" where the K represents thousands). b is the end_salary and
        is also a string. include_no_salary_data defaults to True meaning
        it will by default include data with no salary.
        
    change_radius_to(radius)
        Change to a specified radius filter option.
        
    change_cityid_to(cityid)
        Change to a specified cityid filter option.
        
    change_industry_to(industry)
        Change to a specified industry filter option.
        
    change_jobfunction_to(job_function)
        Change to a specified job function filter option.
        
    change_senioritylabel_to(seniority_label)
        Change to a specified seniority label filter option.
        
    change_company_to(company)
        Change to a specified company filter option.
        
    change_companysize_to(company_size)
        Change to a specified company size filter option.
        
    easy_apply_work_home(is_eao, will_apply)
        If is_eao is true, then select the Easy Apply Only label button else
        select the Work From Home Only label button. Then, if will_apply is true,
        then apply. 
        
    change_rating_to(rating)
        Change the rating.
        
    clear_filters()
        Clears all filters.
        
    sort_by(sort_type)
        Changes the "Most Relevant" dropdown (sortby) filter to a specified filter option.
        
    scrape_jobs(n_jobs)
        Webscrape jobs. n_jobs determines the size of the dataset.
        
        
    This project was created with inspiration from:
    
    https://github.com/arapfaik/scraping-glassdoor-selenium
    
    """     
     
    def __init__(self, keyword, PATH="C:\Program Files (x86)\chromedriver.exe"):
        """The following attributes can be accessed and changed but it is advised not to do so directly.
        
            All attributes of a GlassdoorWebScraper obj include:
        
            self.PATH: 
                The path to your chromedriver.exe.
            
            
            self.keyword: 
                The keyword initialized by the user.
            
            
            self.URL_part_1: 
                The first part of the Glassdoor URL. 
            
            
            self.URL_part_2: 
                The second part of the Glassdoor URL.
        
        
            self.URL:
                The concatenation of self.URL_part_1, self.keyword, and self.URL_part_2 
                in that exact order.
                
                
            self.driver:
                The Selenium webdriver. Only created when the user creates a GlassdoorWebScraper obj and
                calls the get() method.
                
                
            self.filters:
                A dictionary of dictionaries and lists. It contains all the configurable filters 
                of the current opened webpage. Only created when the init_configs() method is called.
                
                
            self.get_join_filters:
                A dictionary of dictionaries. The outer dict has keys for each filter. These keys correspond
                to dictionary values that hold: name, get fn, join fn, is_salary, and is_more. Name is the name
                of the filter. Get fn is the get locator function for that filter. Join fn is the function used to
                regex simplify and concatenate the preprocessed filter options corresponding to a filter. is_salary 
                checks if the filter is the salary filter. is_more checks if the filter is under the more dropdown.
                
        """
        self.PATH = PATH
        self.keyword = keyword
        
        self.URL_part_1 = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword='
        self.URL_part_2 = '&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType= \
                             all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId= \
                             -1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId= \
                             -1&employerSizes=0&applicationType=0&remoteWorkType=0'
        self.URL = self.URL_part_1 + self.keyword + self.URL_part_2

        self.filters = {}
        
        # Excludes company rating, easy apply only, work from home only, and the
        # most relevant (sortby) filters. 
        self.get_join_filters = {
            "jobtypes": {
                "get": self.get_filters_jobtypes,
                "join": self.join_filters_jobtypes,
                "is_salary": False,
                "is_more": False,
                "change": self.change_jobtype_to
            },
            "postdates": {
                "get": self.get_filters_postdates,
                "join": self.join_filters_postdates, 
                "is_salary": False,
                "is_more": False,
                "change": self.change_postdate_to
            },
            "salaries": {
                "get": self.get_filters_minsalaries,
                "join": None,
                "is_salary": True,
                "is_more": False,
                "change": self.change_salary_to
            },
            "radii": {
                "get": self.get_filters_radii,
                "join": self.join_filters_radii,
                "is_salary": False,
                "is_more": False,
                "change": self.change_radius_to
            },
            "cityids": {
                "get": self.get_filters_cityids,
                "join": self.join_filters_cityids,
                "is_salary": False,
                "is_more": True,
                "change": self.change_cityid_to
            },
            "industries": {
                "get": self.get_filters_industries,
                "join": self.join_filters_industries,
                "is_salary": False,
                "is_more": True,
                "change": self.change_industry_to
            },
            "job_functions": {
                "get": self.get_filters_jobfunctions,
                "join": self.join_filters_jobfunctions,
                "is_salary": False,
                "is_more": True,
                "change": self.change_jobfunction_to
            },
            "seniority_labels": {
                "get": self.get_filters_senioritylabels,
                "join": self.join_filters_senioritylabels,
                "is_salary": False,
                "is_more": True,
                "change": self.change_senioritylabel_to
            },
            "companies": {
                "get": self.get_filters_companies,
                "join": self.join_filters_companies,
                "is_salary": False,
                "is_more": True,
                "change": self.change_company_to
            },
            "company_sizes": {
                "get": self.get_filters_companysizes,
                "join": self.join_filters_companysizes,
                "is_salary": False,
                "is_more": True,
                "change": self.change_companysize_to
            },
        }
        
        
        
    # ==================================================
    # Basic Utility Functions.
    # ==================================================
    
    
    
    def update_keyword_and_URL(self, keyword):
        self.keyword = keyword
        self.URL = self.URL_part_1 + self.keyword + self.URL_part_2
    
    
    # Sets the implicit wait time.
    # Note: Implicit wait time is set for life of the Webdriver obj once declared;
    # this means the get() function must be called again before setting a new 
    # implicit wait time.
    def set_implicitly_wait(self, implicitly_wait_time):
        self.driver.implicitly_wait(implicitly_wait_time)
    
    
    def get(self, implicitly_wait_time=5, set_implicitly_wait=True): 
        self.driver = webdriver.Chrome(self.PATH)
        if set_implicitly_wait: self.set_implicitly_wait(implicitly_wait_time)
        self.driver.maximize_window()
        self.driver.get(self.URL)
    
    
    def close(self):
        self.driver.close()
    
    
    
    # ==================================================
    # Filter Configuration Functions.
    # ==================================================
    
    
    
    def clear_filters(self):
        try:
            clear_filter_button = self.get_clear_filter_span()
            clear_filter_button.click()
        except Exception as e:
            print(e)
    
    
    # Initialize all filter configurations and allows for initializing specific filters.
    # Note: filters change when other filters are updated.
    # Note: Make sure to glance the self.filters attribute and call init_filters() on 
    # the filter that will be next changed.
    def init_filters(self, _filter=None):
        if not _filter:
            for filters, attribs in self.get_join_filters.items():
                self.init_filter(name=filters, 
                                 get_filters_=attribs["get"], 
                                 join_filters_=attribs["join"],
                                 is_salary=attribs["is_salary"],
                                 is_more=attribs["is_more"])
            
            self.init_sortby()
            
            self.click_more_dropdown(n_clicks=2)
        elif _filter:
            if _filter == "sortbys":
                self.init_sortby()
            else:
                attribs = self.get_join_filters[_filter]
                self.init_filter(name=_filter,
                                 get_filters_=attribs["get"],
                                 join_filters_=attribs["join"],
                                 is_salary=attribs["is_salary"],
                                 is_more=attribs["is_more"])
        
        
    def reset_salary_slider(self, is_both=True, is_left=True):
        slider_info = {
            "left_slider": {
                "slider": self.get_left_slider,
                "idx": 0,
                "key_fn": Keys.ARROW_LEFT
            },
            "right_slider": {
                "slider": self.get_right_slider,
                "idx": 1,
                "key_fn": Keys.ARROW_RIGHT
            }
        }
        
        salary_filter = self.get_join_filters["salaries"]["get"]()
        salary_filter.click()
        
        if not is_both:
            if is_left:
                left_slider_info = slider_info["left_slider"]

                slider = left_slider_info["slider"]()
                idx = left_slider_info["idx"]
                key_fn = left_slider_info["key_fn"]
            elif not is_left:
                right_slider_info = slider_info["right_slider"]

                slider = right_slider_info["slider"]()
                idx = right_slider_info["idx"]
                key_fn = right_slider_info["key_fn"]
                
            self.reset_salary_base_fn(slider, idx, key_fn)
        elif is_both:
            for _, values in slider_info.items():
                slider = values["slider"]()
                idx = values["idx"]
                key_fn = values["key_fn"]
                self.reset_salary_base_fn(slider, idx, key_fn, is_both=True)
            applybutton = self.get_filters_minsalaries_applybutton()
            applybutton.click()
        
        
        
        
    
    # A streamlined wrapper function to chain together common methods:
    # initialize a filter, print the viable filter options,
    # and change to the specified filter option(s).
    # Note: If you use this function to change "salaries",
    # then it will reset all filters and then initialize the salary filter.
    # This design choice was because there are certain
    # page loadups that have a weird looking salary histogram.
    # Note: Default to init_filters("salaries") and change_salary_to()
    # if the histogram is chaotic (this function doesn't work well with the
    # weird features of the chaotic histogram).
    def init_change_filter(self, filter_type):
        if filter_type != "salaries":
            self.init_filters(filter_type)
            print(f"Your options for filter {filter_type} are :")
            print("----------------------------")
            print(f"{filter_type}: ", self.filters[filter_type])
            _filter = input("Enter a filter option: ")
            self.get_join_filters[filter_type]["change"](_filter)
        elif filter_type == "salaries":
            try:
                # Check if there is a preexisting config dict.
                _ = self.filters["salaries"]["left_slider"]
                # Since Glassdoor.com's salary filter changes
                # randomly upon page load-up, this try block checks
                # what setup the salary filter is and 
                # acts accordingly.
                try:
                    # Try to get the checkbox.
                    salary_filter = self.get_filters_minsalaries()
                    salary_filter.click()
                    no_salary_label = self.get_filters_minsalaries_checkbox_label()
                    salary_filter.click()
                    
                    clear_filter = False
                except:
                    clear_filter = True
                if clear_filter:
                    salary_filter = self.get_filters_minsalaries()
                    salary_filter.click()
                    self.clear_filters()
                    
                    
                time.sleep(1)
                self.init_filters(filter_type)
            except Exception as e:
                print(e)
                print("ENTER EXCEPTION")
                self.init_filters(filter_type)
                time.sleep(1)
            print(f"Your options for filter {filter_type} are :")
            print("----------------------------")
            print("left_slider: ", self.filters[filter_type]["left_slider"])
            print("")
            print("right_slider: ", self.filters[filter_type]["right_slider"])
            begin_salary = input("Enter a lower bound salary from the left slider list: ")
            end_salary = input("Enter an upper bound salary from the left slider list: ")
            
            # The include no salary data feature is deleted from this function
            # because it seems to not work well together. So the user must call
            # include_no_salary_data() separately.
            self.get_join_filters[filter_type]["change"](begin_salary, 
                                                         end_salary)
            
    
    def include_no_salary_data(self, include):
        try:
            salary_filter = self.get_filters_minsalaries()
            salary_filter.click()

            no_salary_label = self.get_filters_minsalaries_checkbox_label()
            checkbox = self.get_filters_minsalaries_checkbox()
            is_checked = checkbox.get_attribute("aria-checked")
            if is_checked == "true":
                is_checked = True
            else:
                is_checked = False
            if is_checked:
                if include: pass
                elif not include: no_salary_label.click()
            elif not is_checked:
                if include: no_salary_label.click()
                elif not include: pass
            applybutton = self.get_filters_minsalaries_applybutton()
            applybutton.click()
        except Exception as e:
            print(e)
    
    
    # Change keyword (occupation).
    def change_keyword_to(self, keyword):
        keyword_search = self.get_keyword_search()
        self.clear_and_search(keyword_search, keyword)
        
    
    def change_location_to(self, location):
        location_search = self.get_location_search()
        
        # Clears the input field for the location search bar.
        # Note: Selenium's clear() function does not work here
        # so a custom delete was implemented.
        initial_location = location_search.get_attribute("value")
        for _ in range(len(initial_location)):
            location_search.send_keys(Keys.BACKSPACE)
            
        # Enters user's inputted location and presses the search icon.
        # Note: clear_and_search() was not used because clear() does 
        # not work.
        location_search.send_keys(location)
        search_button = self.get_search_button()
        search_button.click()
        
        
    def change_jobtype_to(self, jobtype):
        jobtype_attribs = self.get_join_filters["jobtypes"]
        self.change_filter_to(name="jobtypes", 
                              choice=jobtype,
                              is_more=jobtype_attribs["is_more"])
    
    
    def change_postdate_to(self, postdate):
        postdate_attribs = self.get_join_filters["postdates"]
        self.change_filter_to(name="postdates", 
                              choice=postdate,
                              is_more=postdate_attribs["is_more"])
            
    
    # Note: Glassdoor.com has a weird inconsistency with how the
    # salary filter is initialized and used (the salary ranges themselves
    # change as well depending on clicking the apply, or moving the sliders
    # left or right when they are already at the edges!).
    # Note: change_salary_to() works fine for the bell curve
    # histogram, but might take a few clear_filters() and init_filters("salaries")
    # to work somewhat consistently for the chaotic histogram.
    def change_salary_to(self, begin_salary, end_salary):
        
        
        salary_filter = self.get_filters_minsalaries()
        salary_filter.click()
            
        # For some reason, the salary range header is bugged.
        # These statements here is to ensure that the lower endpoint
        # of the salary range is actually correctly displayed.
        left_slider = self.get_left_slider()
        left_slider.click()
        left_slider.send_keys(Keys.ARROW_LEFT)
        left_slider.send_keys(Keys.ARROW_RIGHT)
        right_slider = self.get_right_slider()
        right_slider.send_keys(Keys.ARROW_RIGHT)
        right_slider.send_keys(Keys.ARROW_LEFT)
        
        # Regex parse the current salary range.
        a_bin = self.get_histogram_labels_header()
        a_bin = a_bin.text.replace("$", "").split("-")
        
        # The next 4 blocks of code dictate how far the current left and right 
        # sliders are from the desired begin_salary (left slider) and 
        # end_salary (right slider).
        left_slider_salaries = self.filters["salaries"]["left_slider"]
        right_slider_salaries = self.filters["salaries"]["right_slider"]
        
        begin_salary = begin_salary.upper()
        end_salary = end_salary.upper()
        
        begin_salary_idx = left_slider_salaries.index(begin_salary)
        end_salary_idx = right_slider_salaries.index(end_salary)
        
        current_begin_salary, current_end_salary = a_bin[0], a_bin[1]
        current_begin_salary_idx = left_slider_salaries.index(current_begin_salary)
        current_end_salary_idx = right_slider_salaries.index(current_end_salary)
        
        begin_salary_difference = abs(begin_salary_idx - current_begin_salary_idx)
        end_salary_difference = abs(end_salary_idx - current_end_salary_idx)
        
        # Click and move left slider.
        left_slider = self.get_left_slider()                                                        
        left_slider.click()
                    
        self.move_slider(left_slider, 
                         begin_salary_idx, 
                         current_begin_salary_idx,
                         begin_salary_difference)
        
        # Click and move right slider.
        right_slider = self.get_right_slider()
        right_slider.click()
        
        self.move_slider(right_slider,
                         end_salary_idx,
                         current_end_salary_idx,
                         end_salary_difference)
        
        # Finally, apply the changes.
        applybutton = self.get_filters_minsalaries_applybutton()
        applybutton.click()
        
        
    def change_radius_to(self, radius):
        radius_attribs = self.get_join_filters["radii"]
        self.change_filter_to(name="radii", 
                              choice=radius,
                              is_more=radius_attribs["is_more"])
    
    
    def change_cityid_to(self, cityid):
        cityid_attribs = self.get_join_filters["cityids"]
        self.change_filter_to(name="cityids", 
                              choice=cityid,
                              is_more=cityid_attribs["is_more"])
        
        
    def change_industry_to(self, industry):
        industry_attribs = self.get_join_filters["industries"]
        self.change_filter_to(name="industries", 
                              choice=industry,
                              is_more=industry_attribs["is_more"])
        
        
    def change_jobfunction_to(self, job_function):
        jobfunction_attribs = self.get_join_filters["job_functions"]
        self.change_filter_to(name="job_functions", 
                              choice=job_function,
                              is_more=jobfunction_attribs["is_more"])
        
        
    def change_senioritylabel_to(self, seniority_label):
        senioritylabel_attribs = self.get_join_filters["seniority_labels"]
        self.change_filter_to(name="seniority_labels", 
                              choice=seniority_label,
                              is_more=senioritylabel_attribs["is_more"])
        
        
    def change_company_to(self, company):
        company_attribs = self.get_join_filters["companies"]
        self.change_filter_to(name="companies", 
                              choice=company,
                              is_more=company_attribs["is_more"])
        
        
    def change_companysize_to(self, company_size):
        companysize_attribs = self.get_join_filters["company_sizes"]
        self.change_filter_to(name="company_sizes", 
                              choice=company_size,
                              is_more=companysize_attribs["is_more"])
        
        
    def easy_apply_work_home(self, is_eao, will_apply):
        self.click_more_dropdown()
        
        apply = self.get_filters_eaowfho(is_eao)
        btn = self.get_filters_eaowfho_label(is_eao, eaowfho=apply)
        if "applied" in apply.get_attribute("class"):
            if will_apply:
                pass
            elif not will_apply:
                btn.click()
        elif "applied" not in apply.get_attribute("class"):
            if will_apply:
                btn.click()
            elif not will_apply:
                pass
        
        self.click_more_dropdown()
        
        
    def change_rating_to(self, rating):  # Rating goes from 1-4.
        self.click_more_dropdown()
        
        ratings = self.get_filters_companyratings_stars_divs()
        
        ratings[rating - 1].click()
        
        self.click_more_dropdown()
        
        
    # The sort by dropdown seems to be a little bugged.
    # It changes the order of the job listings which implies that it works
    # yet the checkmark for the dropdown (there is a checkmark next to the user's selected dropdown
    # choice) stays on "Most Relevant" regardless of what option ("Most Relevant" or "Most Recent")
    # the user chooses. Additionally, it is unsure whether or not the actual dropdown button
    # should change when one selects "Most Relevant" or "Most Recent". That could possibly
    # be bugged too.
    # Note: since this "filter" isn't a part of the DKFilters tag, it will not follow
    # the general pipeline for DKFilters for flexibility.
    def sort_by(self, sort_type):
        sortby_filter = self.get_filters_sortby()
        sortby_filter.click()
        
        dropdown_ul_li = self.get_main_body_sortby_dropdown_ul_li()  
        ul_li_element = dropdown_ul_li[self.get_filters_by_type("sortbys").index(sort_type)]
        ul_li_element.click()
    
    
    
    # ==================================================
    # Webscraping Function.
    # ==================================================
    
    
    
    def scrape_jobs(self, n_jobs):
        # Gets the total number of pages.
        total_pages = int(self.get_page_count().text.split()[-1])
        page_counter = 1
        jobs = []
        
        while len(jobs) < n_jobs and page_counter <= total_pages:
            time.sleep(2)
            
            joblistings = self.get_joblistings()

            for joblisting in joblistings:
                if len(jobs) == n_jobs:
                    break
                
                joblisting.click()

                time.sleep(2)

                # Check if there is a pop-up.
                try:
                    close_popup_btn = self.close_popup()
                    close_popup_btn.click()
                except:
                    pass

                time.sleep(2)

                jobinfo = {}

                # Job Info I.
                jobinfo1 = self.get_jobinfo1()

                try:
                    company = jobinfo1[0].text
                except:
                    company = -1
                try:
                    job_title = jobinfo1[1].text
                except:
                    job_title = -1
                try:
                    headquarters = jobinfo1[2].text
                except:
                    headquarters = -1
                try:
                    salary_estimate = jobinfo1[3].text
                except:
                    salary_estimate = -1

                jobinfo1_features = {
                    "company": company,
                    "job title": job_title,
                    "headquarters": headquarters,
                    "salary estimate": salary_estimate
                }

                # Job Info II.
                jobinfo2 = self.get_jobinfo2()

                try:
                    job_type = jobinfo2[0].text
                except:
                    job_type = -1

                jobinfo2_features = {
                    "job type": job_type
                }

                # Job Info III.
                try:
                    jobinfo3 = self.get_jobinfo3()
                except:
                    pass

                jobinfo3_features = {
                    "size": -1,
                    "founded": -1,
                    "type": -1,
                    "industry": -1,
                    "sector": -1,
                    "revenue": -1
                }

                try:
                    # The text is a string, split by "\n", it will 
                    # be a list with every value an attribute and every other
                    # value corresponding to a value.
                    jobinfo3_features_updated = {}
                    features_list = jobinfo3.text.lower().split("\n")
                    for i in range(0, len(features_list)-1, 2):
                        jobinfo3_features_updated[features_list[i]] = features_list[i + 1]
                except:
                    pass

                jobinfo3_features.update(jobinfo3_features_updated)

                # Job Info IV.
                jobinfo4_features = {"job description": self.get_jobinfo4().text}

                for jobinfo_features in [jobinfo1_features, 
                                jobinfo2_features, 
                                jobinfo3_features, 
                                jobinfo4_features]:
                    jobinfo.update(jobinfo_features)

                jobs.append(jobinfo)
            
            if len(jobs) == n_jobs:
                break
            
            # Clicks the right arrow in the page navigator footer.
            try:
                page_nav_right_arrow = self.get_page_nav()[6]
                page_nav_right_arrow.click()
            except:
                pass
            
            page_counter += 1
            
        return pd.DataFrame(jobs)
    

In [17]:
gd_scraper = GlassdoorWebScraper("data scientist")

In [18]:
gd_scraper.get()

In [19]:
gd_scraper.init_filters()

ENTER ELIF
CLICKED FILTER
BEFORE RESET
AFTER RESET
A
B
C
INITIALIZED SALARY


In [21]:
print(gd_scraper.filters.keys())

dict_keys(['jobtypes', 'postdates', 'salaries', 'radii', 'cityids', 'industries', 'job_functions', 'seniority_labels', 'companies', 'company_sizes', 'sortbys'])


In [22]:
gd_scraper.filters

{'jobtypes': {'all_job_types': 4959,
  'full_time': 4750,
  'part_time': 96,
  'contract': 20,
  'internship': 45,
  'temporary': 12,
  'entry_level': 36},
 'postdates': {'posted_any_time': 10026,
  'last_day': 47,
  'last_3_days': 493,
  'last_week': 1631,
  'last_2_weeks': 2961,
  'last_month': 4894},
 'salaries': {'left_slider': ['18K',
   '74K',
   '84K',
   '94K',
   '104K',
   '114K',
   '124K',
   '134K',
   '144K',
   '154K',
   '164K',
   '174K',
   '184K',
   '194K',
   '204K',
   '214K',
   '224K',
   '234K',
   '244K',
   '254K',
   '264K',
   '274K',
   '284K',
   '294K',
   '304K',
   '314K',
   '324K',
   '334K',
   '344K',
   '354K',
   '364K',
   '374K',
   '384K',
   '394K'],
  'right_slider': ['74K',
   '84K',
   '94K',
   '104K',
   '114K',
   '124K',
   '134K',
   '144K',
   '154K',
   '164K',
   '174K',
   '184K',
   '194K',
   '204K',
   '214K',
   '224K',
   '234K',
   '244K',
   '254K',
   '264K',
   '274K',
   '284K',
   '294K',
   '304K',
   '314K',
   '324K'

In [110]:
gd_scraper.init_change_filter("salaries")

ENTER ELIF
CLICKED FILTER
BEFORE RESET
AFTER RESET
A
B
C
INITIALIZED SALARY
Your options for filter salaries are :
----------------------------
left_slider:  ['92K', '99K', '102K', '105K', '107K', '110K', '114K', '117K', '119K', '121K', '123K', '125K', '127K', '128K', '130K', '132K', '133K', '135K', '137K', '139K', '140K', '141K', '143K', '144K', '146K', '148K', '149K', '150K', '152K', '155K', '156K', '157K', '159K', '161K', '163K', '165K', '167K']

right_slider:  ['99K', '102K', '105K', '107K', '110K', '114K', '117K', '119K', '121K', '123K', '125K', '127K', '128K', '130K', '132K', '133K', '135K', '137K', '139K', '140K', '141K', '143K', '144K', '146K', '148K', '149K', '150K', '152K', '155K', '156K', '157K', '159K', '161K', '163K', '165K', '167K', '173K']
Enter a lower bound salary from the left slider list: 128k
Enter an upper bound salary from the left slider list: 167k


In [396]:
gd_scraper.filters["salaries"]

{'left_slider': ['92K',
  '99K',
  '102K',
  '105K',
  '108K',
  '111K',
  '114K',
  '117K',
  '119K',
  '121K',
  '123K',
  '125K',
  '127K',
  '128K',
  '130K',
  '132K',
  '133K',
  '135K',
  '137K',
  '139K',
  '140K',
  '141K',
  '143K',
  '144K',
  '146K',
  '148K',
  '149K',
  '150K',
  '152K',
  '155K',
  '156K',
  '157K',
  '159K',
  '161K',
  '163K',
  '165K',
  '167K'],
 'right_slider': ['99K',
  '102K',
  '105K',
  '108K',
  '111K',
  '114K',
  '117K',
  '119K',
  '121K',
  '123K',
  '125K',
  '127K',
  '128K',
  '130K',
  '132K',
  '133K',
  '135K',
  '137K',
  '139K',
  '140K',
  '141K',
  '143K',
  '144K',
  '146K',
  '148K',
  '149K',
  '150K',
  '152K',
  '155K',
  '156K',
  '157K',
  '159K',
  '161K',
  '163K',
  '165K',
  '167K',
  '173K']}

In [397]:
gd_scraper.change_salary_to("132k", "161k")

In [8]:
gd_scraper.close()