In [45]:
import time
import pandas as pd
from argparse import ArgumentParser
import argparse
import logging
import logging.config
from selenium import webdriver
from selenium.webdriver import ActionChains
import selenium
import numpy as np

import json
import urllib
import datetime as dt
from selenium.webdriver.chrome.options import Options
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException


start = time.time()

DEFAULT_URL = ('https://www.glassdoor.com/Overview/Working-at-'
               'Premise-Data-Corporation-EI_IE952471.11,35.htm')

parser = ArgumentParser()
parser.add_argument('-u', '--url',
                    help='URL of the company\'s Glassdoor landing page.',
                    default=DEFAULT_URL)
parser.add_argument('-f', '--file', default='glassdoor_ratings.csv',
                    help='Output file.')
parser.add_argument('--headless', action='store_true',
                    help='Run Chrome in headless mode.')
parser.add_argument('--username', help='Email address used to sign in to GD.')
parser.add_argument('-p', '--password', help='Password to sign in to GD.')
parser.add_argument('-c', '--credentials', help='Credentials file')
parser.add_argument('-l', '--limit', default=25,
                    action='store', type=int, help='Max reviews to scrape')
parser.add_argument('--start_from_url', action='store_true',
                    help='Start scraping from the passed URL.')
parser.add_argument(
    '--max_date', help='Latest review date to scrape.\
    Only use this option with --start_from_url.\
    You also must have sorted Glassdoor reviews ASCENDING by date.',
    type=lambda s: dt.datetime.strptime(s, "%Y-%m-%d"))
parser.add_argument(
    '--min_date', help='Earliest review date to scrape.\
    Only use this option with --start_from_url.\
    You also must have sorted Glassdoor reviews DESCENDING by date.',
    type=lambda s: dt.datetime.strptime(s, "%Y-%m-%d"))
args = parser.parse_args()




# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


with open('secret.json') as f:
    d = json.loads(f.read())
    args.username = d['username']
    args.password = d['password']


mainurl = 'https://www.glassdoor.com/Reviews/index.htm'  

# Set up Chrome options
options = Options()
#options.add_argument("--headless")  
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialize the WebDriver
browser = webdriver.Chrome(options=options)

def take_screenshot(filename):
    browser.save_screenshot(filename)
    logger.info(f"Screenshot saved as {filename}")

def sign_in():
    logger.info(f'Signing in to {args.username}')
    
    url = 'https://www.glassdoor.com/profile/login_input.htm'
    browser.get(url)
    
    take_screenshot('login_page.png')  # Take a screenshot of the login page
    
    try:
        # Enter email
        email_field = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((By.XPATH, '//input[@type="email"]'))
        )
        email_field.send_keys(args.username)
        email_btn = WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '//button[@data-test="email-form-button"]'))
        )
        email_btn.click()
        print('email sent and clicked')
        time.sleep(3)
        
        # Enter password
        password_field = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((By.XPATH, '//input[@type="password"]'))
        )
        password_field.send_keys(args.password)
        print('password sent')
        time.sleep(3)
        submit_btn = browser.find_element((By.XPATH, '//button[@type="submit"]'))
        time.sleep(3)
        submit_btn.click()
        print('submit clicked')

        # Wait for login to complete and check for CAPTCHA or successful login
        try:
            WebDriverWait(browser, 30).until(
                EC.url_contains('glassdoor.com')
            )
            # Check if the user is redirected to a logged-in state
            logged_in_elements = WebDriverWait(browser, 30).until(
                EC.presence_of_all_elements_located((By.XPATH, '//body[@class="main loggedIn lang-en en-US gdGrid _initOk"]'))
            )
            if logged_in_elements:
                logger.info("Successfully logged in. Navigating to main URL.")
                browser.get(mainurl)
                take_screenshot('post_login.png')
            else:
                logger.error("Login failed or CAPTCHA required.")
                take_screenshot('login_failed.png')
                # Optionally, you can handle CAPTCHA here or prompt for manual intervention
        except TimeoutException:
            logger.error("Timeout while waiting for successful login or CAPTCHA prompt.")
            take_screenshot('login_timeout.png')
    
    except TimeoutException:
        logger.error("Timed out waiting for the login elements to be available")
        take_screenshot('timeout_error.png')
    except NoSuchElementException:
        logger.error("Could not find one or more of the login elements")
        take_screenshot('element_error.png')
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}")
        take_screenshot('unexpected_error.png')



# Function to scrape company names and ratings
def scrape_companies_and_ratings():

    df = pd.DataFrame(columns=['Company Name', 'Rating'])

    # Scrape company names
    company_elements = WebDriverWait(browser, 30).until(
        EC.presence_of_all_elements_located((By.XPATH, "//h2[@data-test='employer-short-name']"))
    )
    company_names = [company.text for company in company_elements]

    rating_elements = browser.find_elements(By.XPATH, '//span[@data-test="rating"]')
    company_ratings = [rating.text for rating in rating_elements]

    data_tuples = list(zip(company_names, company_ratings))
    temp_df = pd.DataFrame(data_tuples, columns=['Company Name', 'Rating'])
    df = df.append(temp_df, ignore_index=True)

    print(df)

try:
    sign_in()
    scrape_companies_and_ratings()
finally:
    browser.quit()


INFO:__main__:Signing in to adhanani@trinity.edu
INFO:__main__:Screenshot saved as login_page.png


email sent and clicked
password sent


ERROR:__main__:An unexpected error occurred: Message: invalid argument: 'using' must be a string
  (Session info: chrome=127.0.6533.120)
Stacktrace:
0   chromedriver                        0x0000000102cf9024 cxxbridge1$str$ptr + 1887276
1   chromedriver                        0x0000000102cf1700 cxxbridge1$str$ptr + 1856264
2   chromedriver                        0x000000010290082c cxxbridge1$string$len + 88524
3   chromedriver                        0x0000000102944210 cxxbridge1$string$len + 365488
4   chromedriver                        0x000000010297c48c cxxbridge1$string$len + 595500
5   chromedriver                        0x0000000102939474 cxxbridge1$string$len + 321044
6   chromedriver                        0x000000010293a0e4 cxxbridge1$string$len + 324228
7   chromedriver                        0x0000000102cc0a08 cxxbridge1$str$ptr + 1656336
8   chromedriver                        0x0000000102cc5464 cxxbridge1$str$ptr + 1675372
9   chromedriver                        0x00000001

TimeoutException: Message: 
