In [None]:
import re
import sys
import time
import math
import hashlib
import requests

In [None]:
import pandas as pd
from loguru import logger
from bs4 import BeautifulSoup

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options

In [None]:
import time
import threading
from concurrent.futures import ThreadPoolExecutor

In [None]:
from tqdm import tqdm

tqdm.pandas()

In [None]:
# Route all loguru output through a single stdout handler so log lines show up
# inline with the cell output, using a compact "time | level | message" layout.
LOG_FORMAT = "<green>{time}</green> | <level>{level}</level> | <level>{message}</level>"

logger.remove()  # drop whatever handlers are currently registered
logger.add(sys.stdout, format=LOG_FORMAT)

## StillHiring.today Table Scraping

In [None]:
# Start a Chrome session with default options for the Airtable scrape; the
# scraping cell further down navigates with this driver and quits it at the end.
driver = webdriver.Chrome()

In [None]:
url = "https://airtable.com/appPGrJqA2zH65k5I/shrI8dno1rMGKZM8y/tblKU0jQiyIX182uU?backgroundColor=cyan&viewControls=on"

In [None]:
# Scrape company names and application links from the embedded Airtable view.
# The grid is virtualised, so the cell repeatedly parses the rendered rows and
# drags the vertical scrollbar down a little until the sentinel company appears.
table_left = None
table_right = None

job_loop = True
job_finished = False

company_dictionary = {
    "Company-Name": [],
    "Application-Links": []
}

companyNameSet = set()
companyLinkSet = set()

# Defined before the wait so that a failed lookup leaves a falsy value instead
# of an undefined name when the `if right_scrollbar` check runs.
right_scrollbar = None

# Safety valve: give up after this many consecutive scroll steps that reveal no
# new company name, so a missing sentinel row cannot loop forever.
MAX_IDLE_SCROLLS = 100
idle_scrolls = 0

try:
    driver.get(url)

    try:
        right_scrollbar = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".antiscroll-scrollbar-vertical"))
        )
    except Exception as e:
        print(e)

    if right_scrollbar:
        actions = ActionChains(driver)

        while job_loop:
            soup = BeautifulSoup(driver.page_source, "html.parser")
            names_before = len(companyNameSet)

            for company in soup.find_all('div', class_='cell primary read'):
                name = company.text
                if name not in companyNameSet:
                    company_dictionary["Company-Name"].append(name)
                    companyNameSet.add(name)
                    # "Axuall" is the sentinel row that marks the end of the table.
                    if name == "Axuall":
                        job_finished = True

            for link in soup.find_all('a', class_='link-quiet'):
                href = link.get('href')
                # De-duplicate on the link itself (the original compared the last
                # company name against the link set, so every link was re-added
                # on every pass).
                if href and href not in companyLinkSet:
                    company_dictionary["Application-Links"].append(href)
                    companyLinkSet.add(href)

            if job_finished:
                job_loop = False
                break

            if len(companyNameSet) == names_before:
                idle_scrolls += 1
            else:
                idle_scrolls = 0
            if idle_scrolls >= MAX_IDLE_SCROLLS:
                print(f"No new companies after {MAX_IDLE_SCROLLS} scroll steps; stopping before the sentinel row was seen.")
                job_loop = False
                break

            actions.click_and_hold(right_scrollbar).perform()
            scroll_amount = 10
            actions.move_by_offset(0, scroll_amount).perform()
            time.sleep(3)  # Adjust sleep time to wait for content to load
            actions.release().perform()
finally:
    # Always release the browser, including when the wait or the loop fails.
    driver.quit()

In [None]:
# Build the company table from the two scraped columns.
# NOTE(review): names and links are collected independently by the scrape cell,
# so the two lists may differ in length or not line up row-for-row — confirm
# before relying on the pairing.
df = pd.DataFrame.from_dict(company_dictionary)

In [None]:
df

## Getting Job List of Workday

In [None]:
#df_workday = df[df['Application-Links'].str.contains("workday", case=False, na=False)]

In [None]:
# Chrome options shared by the Workday scraping functions below, which pass
# this object to every webdriver.Chrome(...) they create.
chrome_options = Options()
chrome_options.add_argument("--headless=new") # for Chrome >= 109

In [None]:
# Browser preferences applied to the shared Chrome options: the value 2 for the
# image content setting disables image loading.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)

In [None]:
# Load the space-delimited list of Workday board URLs and keep only the first
# 100 rows for this run.
df_workday = pd.read_csv("./company_urls.txt", delimiter=' ').iloc[:100]

In [None]:
df_workday

In [None]:
def divide_pages_into_three_parts(total_pages):
    """
    Split pages 1..total_pages into three consecutive, inclusive (start, end) ranges.

    The ranges are as even as possible; when total_pages is not a multiple of
    three, the leftover pages go to the earliest ranges, one each.

    Returns:
    - list of three (start_page, end_page) tuples, 1-based and inclusive.
    """
    base_size, leftover = divmod(total_pages, 3)

    ranges = []
    first = 1
    for idx in range(3):
        # The first `leftover` ranges are one page longer than the rest.
        last = first + base_size - 1
        if idx < leftover:
            last += 1
        ranges.append((first, last))
        first = last + 1

    return ranges

In [None]:
def estimate_current_job_time(previous_predictions, completion_times, alpha = 0.5):
    """
    Update the Exponential Moving Average (EMA) based on the latest completion time
    and the previous EMA prediction.

    Parameters:
    - previous_predictions: list of float, the history of previous EMA predictions,
                            with the last entry being the most recent EMA.
    - completion_times: list of float or int, the history of completion times,
                        with the last entry being the most recent completion time.
    - alpha: float, the smoothing factor, where 0 < alpha <= 1. Defaults to 0.5.

    Returns:
    - float: the updated EMA value,
             alpha * latest_completion + (1 - alpha) * latest_prediction.

    Raises:
    - IndexError: if either history list is empty.
    """
    latest_completion = completion_times[-1]
    latest_prediction = previous_predictions[-1]
    return alpha * latest_completion + (1 - alpha) * latest_prediction

In [None]:
def time_monitoring_task(duration, stop_event):
    """
    Watchdog for a scraping job: poll until `duration` seconds have elapsed or
    `stop_event` is set by the job itself.

    If the time budget runs out first, `stop_event` is set to tell the job to
    stop; if the job finished first, only a debug line is logged.
    """
    began_at = time.time()

    def within_budget():
        return time.time() - began_at < duration

    # Poll in short steps so an early finish is noticed quickly.
    while within_budget():
        if stop_event.is_set():
            break
        time.sleep(0.1)

    if within_budget():
        logger.debug("Terminating Timing Thread as host process finished early!")
        return

    logger.warning("Time limit reached for current job. Signaling processing task to stop.")
    stop_event.set()

In [None]:
# One lock shared by every scraper thread. A Lock created at the point of use
# is private to that call, is never contended, and so cannot keep the
# multi-column append into `workday_jobs` atomic.
_WORKDAY_JOBS_LOCK = threading.Lock()

# Postings per results page; used to estimate how many jobs are lost when a
# page range cannot be scraped.
_JOBS_PER_PAGE = 20

# Search terms, stored lower-case because titles are lower-cased before
# matching (mixed-case entries such as ".Net" or "DevOps" could never match).
_TITLE_TERMS = (
    "software", "developer", "data", ".net", "c#", "c++",
    "full stack", "backend", "front end", "frontend",
    "back-end", "back end", "systems", "devops", "site reliability",
)

# The language name "C" is only accepted as a standalone token; a plain
# substring test for "c" would accept almost every title.
_STANDALONE_C = re.compile(r"(?<![a-z0-9])c(?![a-z0-9])")


def _title_matches_terms(job_title):
    """Return True when the title mentions any search term, ignoring case."""
    lowered = job_title.lower()
    if any(term in lowered for term in _TITLE_TERMS):
        return True
    return _STANDALONE_C.search(lowered) is not None


def _last_dd_text(container):
    """Return the text of the last <dd> inside `container`, or None if there is none."""
    dd_elements = container.find_all('dd')
    return dd_elements[-1].get_text() if dd_elements else None


def _page_button_xpath(page_number):
    """Return the XPath of the pagination button labelled `page_number`."""
    return f"//button[@data-uxi-widget-type='paginationPageButton' and text()='{page_number}']"


def workday_job_scraper(url, start, finish, duration):
    """
    Scrape result pages `start`..`finish` (1-based, inclusive) of one job board
    and append matching postings to the module-level `workday_jobs` dict.

    A watchdog (`time_monitoring_task`) sets a stop event after `duration`
    seconds; the scraper checks it between pages and gives up when it is set.

    Parameters:
    - url: board URL to open.
    - start, finish: first and last page of the range handled by this call.
    - duration: time budget in seconds for the whole call.

    Returns:
    - int: estimated number of postings lost. This is the count of skipped and
      duplicate rows on success, or (finish - start + 1) * 20 when the range is
      abandoned (timeout, retry limit, or any exception).
    """
    # Imported locally: the notebook's import cells do not bring this name in.
    from selenium.common.exceptions import TimeoutException

    page = start
    stop_event = threading.Event()
    current_thread_name = threading.current_thread().name
    all_jobs_lost = ((finish - start) + 1) * _JOBS_PER_PAGE
    wait_seconds = 100   # upper bound for each explicit WebDriverWait
    max_retries = 10     # consecutive stale-page retries before giving up
    driver = None
    executor = ThreadPoolExecutor(max_workers=1)

    # Initialize the thread timer
    try:
        executor.submit(time_monitoring_task, duration, stop_event)
    except Exception as e:
        print(f"[{current_thread_name}] Could not spawn time thread! Exiting!")
        print(f"{e}")
        executor.shutdown(wait=False)
        return all_jobs_lost

    # Then start scraping
    try:
        hash_set = set()

        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(30)
        driver.get(url)
        time.sleep(10)

        # Walk: click through the pagination buttons until `start` is showing.
        walk_start = 1
        while walk_start < start and not stop_event.is_set():
            walk_start += 1
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            button_locator = (By.XPATH, _page_button_xpath(walk_start))

            try:
                WebDriverWait(driver, wait_seconds).until(EC.element_to_be_clickable(button_locator))
            except TimeoutException:
                print(f"Could not find the pagination button for page {walk_start} within the specified time.")

            # If the button really is absent this raises and the range is
            # reported as lost by the handler below.
            driver.find_element(*button_locator).click()

        if stop_event.is_set():
            logger.error("Thread Timed out during Walk!")
            return all_jobs_lost
        logger.debug(f"[{current_thread_name}] Finished Walk!")

        retry_cnt = 0
        total_lost_jobs = 0
        while page <= finish and retry_cnt != max_retries and not stop_event.is_set():
            WebDriverWait(driver, wait_seconds).until(EC.presence_of_all_elements_located((By.TAG_NAME, "li")))
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Core scraping logic
            skipped_jobs = 0
            duplicate_cnt = 0
            for li in soup.find_all('li'):
                job_title_raw = li.find('a', {'data-automation-id': 'jobTitle'})
                location_raw = li.find('div', {'data-automation-id': 'locations'})
                postedTime_raw = li.find('div', {'data-automation-id': 'postedOn'})
                jobID_raw = li.find('ul', {'data-automation-id': 'subtitle'})

                missing = [
                    field is None
                    for field in (job_title_raw, location_raw, postedTime_raw, jobID_raw)
                ]
                if all(missing):
                    continue  # an <li> with none of the job fields is not a posting
                if any(missing):
                    # Partially populated posting: count it as lost. (The
                    # original XOR test missed rows with exactly two gaps.)
                    skipped_jobs += 1
                    continue

                job_title = job_title_raw.get_text()
                location = _last_dd_text(location_raw)
                postedTime = _last_dd_text(postedTime_raw)
                job_link = (url.split('.com')[0] + '.com' if '.com' in url else url) + job_title_raw['href']

                readableID = "".join(item.get_text() for item in jobID_raw.find_all('li'))
                # Store the hex digest: hash *objects* compare by identity, so
                # they can never be found in the set again.
                jobID = hashlib.sha256(readableID.encode()).hexdigest()

                if jobID in hash_set:
                    duplicate_cnt += 1
                    continue
                hash_set.add(jobID)

                if _title_matches_terms(job_title):
                    with _WORKDAY_JOBS_LOCK:
                        workday_jobs["Job_Title"].append(job_title)
                        workday_jobs["Job_Location"].append(location)
                        workday_jobs["Job_Posted_Time"].append(postedTime)
                        workday_jobs["Job_Link"].append(job_link)
                        workday_jobs["Job_ID"].append(jobID)
                        workday_jobs["Job_Meta"].append(readableID)

            if duplicate_cnt <= 5: # Having some duplicates is fine (if possible), but a whole page is unlikely
                total_lost_jobs += skipped_jobs
                total_lost_jobs += duplicate_cnt
                retry_cnt = 0  # the retry budget applies per page

                page += 1
                if page > finish:
                    break

                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                next_locator = (By.XPATH, _page_button_xpath(page))
                WebDriverWait(driver, wait_seconds).until(EC.element_to_be_clickable(next_locator))
                driver.find_element(*next_locator).click()

                WebDriverWait(driver, wait_seconds).until(EC.presence_of_all_elements_located((By.TAG_NAME, "li")))
                WebDriverWait(driver, wait_seconds).until(EC.presence_of_all_elements_located((By.TAG_NAME, "button")))
            else:
                # Mostly duplicates means the previous page is still showing:
                # wait and parse again. (The original only retried while still
                # on the first page and otherwise spun without sleeping.)
                retry_cnt += 1
                print(f"[{current_thread_name}] Current Page Contained too many Duplicates! ({duplicate_cnt}) Retrying...")
                time.sleep(3)

        if retry_cnt == max_retries:
            logger.error("Hit Max Retry Count!")
            total_lost_jobs = all_jobs_lost
        elif stop_event.is_set():
            logger.error("Thread Timed out!")
            total_lost_jobs = all_jobs_lost

        logger.info(f"[{current_thread_name}] Total Jobs Lost: {total_lost_jobs}")
        return total_lost_jobs
    except Exception:
        print(f"[ERROR] {current_thread_name} ran into an error!")
        print(f"[ERROR] on page {page}!")
        logger.exception(f"Error occcured for {url}")
        return all_jobs_lost
    finally:
        # Runs on every exit path so neither the browser nor the watchdog
        # thread outlives this call.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                logger.warning(f"[{current_thread_name}] Could not quit the browser cleanly")
        stop_event.set()
        executor.shutdown(wait=True)

In [None]:
def workday_job_scraper_multithread(url):
    """
    Scrape one job board, splitting its result pages across three threads when
    there are at least three pages.

    The board is opened once to read the total job count, the page count is
    derived from it (20 postings per page), and `workday_job_scraper` is run on
    the whole range or on three partitions. For the threaded path the loss
    rate and elapsed time are appended to the module-level `job_loss_rate_arr`,
    `actual_completion_times` and `historical_EMA_Predictions` lists.

    Returns:
    - True when the link was skipped on purpose or scraped.
    - False when the total number of jobs could not be determined.
    """
    jobs_per_page = 20

    if (len(url.split("/")) >= 5 and "en-US" not in url):
        logger.info(f"Skipped link '{url}' due to job workday referring outside the U.S")
        return True

    start_time = time.time()
    estimated_time = estimate_current_job_time(historical_EMA_Predictions, actual_completion_times)
    ema_with_constant = estimated_time + 300

    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.set_page_load_timeout(30)
        driver.get(url)
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")
    except Exception as e:
        print(f"Could not get total number of jobs!")
        print(f"Error occcured at for {url} : {e}")
        return False
    finally:
        # Quit on every path; previously a failed page load left the browser running.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                logger.warning(f"Could not quit the probe browser for {url}")

    p_element = soup.find('p', {'data-automation-id': 'jobOutOfText'})
    if p_element is None:
        return False

    try:
        # The total is the fifth whitespace-separated token of the counter text;
        # tolerate a thousands separator.
        total_jobs = int(p_element.get_text().split()[4].replace(",", ""))
    except (IndexError, ValueError):
        logger.warning(f"Unexpected job counter text for {url}: {p_element.get_text()!r}")
        return False

    # Round up so a final, partially filled page is scraped too (floor division
    # dropped it and produced zero pages for boards with fewer than 20 jobs).
    total_pages = (total_jobs + jobs_per_page - 1) // jobs_per_page

    print(f"*** FOR {url} ***")
    print(f"Total number of jobs is: {total_jobs}")
    print(f"Total number of pages is: {total_pages}")
    print(f"Estimated time for completion of longest running thread: {estimated_time}")

    if total_pages <= 0:
        logger.info(f"No jobs listed for '{url}', nothing to scrape")
        return True

    if (total_pages < 3):
        workday_job_scraper(url, 1, total_pages, ema_with_constant)
    else:
        partition = divide_pages_into_three_parts(total_pages)

        logger.info(f"Partitioning pages scheme per thread: {partition}")
        logger.debug("Executing Threading!")

        # Leaving the `with` block waits for all three workers to finish.
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [
                executor.submit(workday_job_scraper, url, first_page, last_page, ema_with_constant)
                for first_page, last_page in partition
            ]

        overall_jobs_lost = sum(future.result() for future in futures)
        percentage = (overall_jobs_lost / total_jobs) * 100
        job_loss_rate_arr.append(percentage)

        elapsed_time = time.time() - start_time
        actual_completion_times.append(elapsed_time)
        historical_EMA_Predictions.append(estimated_time)

        print("\n --- Scrape Statistics --- ")
        print(f"Execution time: {elapsed_time} seconds")
        print(f"Jobs Lost this Workday Link: {overall_jobs_lost}")
        print(f"Jobs Loss: {percentage}%")
        print(f"Average Job Loss Rate: {math.trunc((sum(job_loss_rate_arr) / len(job_loss_rate_arr)) * 100) / 100}% \n\n")

    return True

In [None]:
# Column store filled in place by workday_job_scraper: one list per output column.
workday_jobs = {
    "Job_Title": [],
    "Job_Location": [],
    "Job_Posted_Time": [],
    "Job_Link": [],
    "Job_ID": [],
    "Job_Meta": [],
}

# Loss percentage per board, appended by the threaded scrape path.
job_loss_rate_arr = []

# Seeds for the runtime estimate: a previous prediction of 0 seconds and a
# first "observed" completion time of 200 seconds.
historical_EMA_Predictions = [0]
actual_completion_times = [200]

df_workday["Application-Links"].progress_apply(workday_job_scraper_multithread)

In [None]:
jobs_workday_master = pd.DataFrame(workday_jobs)

In [None]:
len(jobs_workday_master)

In [None]:
jobs_workday_master

In [None]:
def converTimeToNum(x):
    """
    Convert a "Posted ..." label into a number of days.

    "Posted Today" -> 0, "Posted Yesterday" -> 1; otherwise the second word is
    taken as the day count, with "30+" treated as 30.
    """
    if x == "Posted Today":
        return 0
    if x == "Posted Yesterday":
        return 1

    day_token = x.split()[1]
    return 30 if day_token == "30+" else int(day_token)

In [None]:
jobs_workday_master["Job_Posted_Time"] = jobs_workday_master["Job_Posted_Time"].progress_apply(converTimeToNum)

In [None]:
jobs_workday_master

In [None]:
df_sorted = jobs_workday_master.sort_values(by='Job_Posted_Time')

### Now we do a criteria separation

In [None]:
def extract_string(url):
    """
    Return the first host label of an http(s) URL, i.e. the text between
    '//' and the first '.' (e.g. "acme" for "https://acme.wd1.example.com/x").

    Returns None when `url` is not a string (such as a NaN read from a CSV) or
    does not contain such a URL.
    """
    # NOTE: this notebook defines extract_string in more than one cell; keep
    # the copies identical or consolidate them.
    if not isinstance(url, str):
        return None
    match = re.search(r'https?://([^./]+)\.', url)
    if match:
        return match.group(1)
    return None

In [None]:
df_sorted["Company"] = df_sorted["Job_Link"].progress_apply(extract_string)

In [None]:
df_sorted

In [None]:
# Drop postings whose title contains any of these seniority / role markers.
# The markers are joined into one alternation and matched as a case-sensitive
# regular expression anywhere in the title; missing titles are kept.
exclude_strings_titles = [
    "Intern", "Internship", "Temporary", "Senior", "Lead", "Principal",
    "Staff", "Manager", "Director", "Head", "Chief", "Architect", "VP",
    "Vice President", "Manager", "Sr",
]
pattern_titles = '|'.join(exclude_strings_titles)
title_is_excluded = df_sorted['Job_Title'].str.contains(pattern_titles, na=False)
df_filtered = df_sorted[~title_is_excluded]

In [None]:
# Drop postings located in any of these non-U.S. places.
exclude_strings_location = ["Mexico", "India", "Poland", "Toronto", "Ireland", "Bangalore", "China", "Pune", "Singapore", "Bengaluru", "Israel", "Noida", "Manila", "Gurgaon"]
# Match whole words only so "India" no longer removes "Indiana"/"Indianapolis",
# and guard "Mexico" with a lookbehind so the U.S. state "New Mexico" is kept.
pattern_locations = r'\b(?:' + '|'.join(
    r'(?<!New )Mexico' if place == "Mexico" else re.escape(place)
    for place in exclude_strings_location
) + r')\b'
df_filtered = df_filtered[~df_filtered['Job_Location'].str.contains(pattern_locations, na=False)]

In [None]:
df_filtered = df_filtered[df_filtered['Job_Posted_Time'] < 7]

In [None]:
df_filtered

In [None]:
final_df = df_filtered.drop_duplicates(subset=['Job_Meta'], keep='first')

In [None]:
final_df

In [None]:
final_df.to_csv("./workday_jobs.csv")

In [None]:
df = pd.read_csv("./test_workday.csv")

In [None]:
df

In [None]:
def extract_string(url):
    """
    Return the first host label of an http(s) URL, i.e. the text between
    '//' and the first '.' (e.g. "acme" for "https://acme.wd1.example.com/x").

    Returns None when `url` is not a string (such as a NaN read from a CSV) or
    does not contain such a URL.
    """
    # NOTE: this notebook defines extract_string in more than one cell; keep
    # the copies identical or consolidate them.
    if not isinstance(url, str):
        return None
    match = re.search(r'https?://([^./]+)\.', url)
    if match:
        return match.group(1)
    return None

In [None]:
def converTimeToNum2(x):
    """
    Convert a "Posted ..." label into a number of days, or None if it cannot
    be parsed.

    "Posted Today" -> 0, "Posted Yesterday" -> 1; otherwise the second word is
    taken as the day count, with "30+" treated as 30. Any value that does not
    fit this shape yields None so the row can be dropped afterwards: a
    non-numeric count (ValueError), a non-string such as NaN or None
    (AttributeError), or a label with fewer than two words (IndexError).
    """
    try:
        if (x == "Posted Today"):
            return 0
        elif (x == "Posted Yesterday"):
            return 1
        num = x.split()[1]
        if (num == "30+"):
            return 30
        return int(num)
    except (ValueError, AttributeError, IndexError):
        return None

In [None]:
df["Job_Posted_Time"] = df["Job_Posted_Time"].progress_apply(converTimeToNum2)


In [None]:
df = df.dropna(subset=["Job_Posted_Time"])

In [None]:
df_sorted = df.sort_values(by='Job_Posted_Time')

In [None]:
df_sorted["Company"] = df_sorted["Job_Link"].progress_apply(extract_string)

In [None]:
# Drop postings whose title contains any of these seniority / role markers.
# The markers are joined into one alternation and matched as a case-sensitive
# regular expression anywhere in the title; missing titles are kept.
exclude_strings_titles = [
    "Intern", "Internship", "Temporary", "Senior", "Lead", "Principal",
    "Staff", "Manager", "Director", "Head", "Chief", "Architect", "VP",
    "Vice President", "Manager", "Sr",
]
pattern_titles = '|'.join(exclude_strings_titles)
title_is_excluded = df_sorted['Job_Title'].str.contains(pattern_titles, na=False)
df_filtered = df_sorted[~title_is_excluded]


In [None]:
# Drop postings located in any of these non-U.S. places.
exclude_strings_location = [
    "Mexico", "India", "Poland", "Toronto", "Ireland", 
    "Bangalore", "China", "Pune", "Singapore", "Bengaluru", 
    "Israel", "Noida", "Manila", "Gurgaon", "Prague", 
    "Cairo", "Seoul", "Mumbai", "Lund", "Madrid"
]
# Match whole words only so "India" no longer removes "Indiana"/"Indianapolis",
# and guard "Mexico" with a lookbehind so the U.S. state "New Mexico" is kept.
pattern_locations = r'\b(?:' + '|'.join(
    r'(?<!New )Mexico' if place == "Mexico" else re.escape(place)
    for place in exclude_strings_location
) + r')\b'
df_filtered = df_filtered[~df_filtered['Job_Location'].str.contains(pattern_locations, na=False)]


In [None]:
df_filtered = df_filtered[df_filtered['Job_Posted_Time'] < 7]

In [None]:
df_filtered

In [None]:
df_filtered.to_csv("./test.csv")

In [None]:
# Scratch cell: emit one line at each loguru level to check the handler format
# configured at the top of the notebook.
logger.info("That's it, beautiful and simple logging!")
logger.debug("That's it, beautiful and simple logging!")
logger.warning("That's it, beautiful and simple logging!")
logger.error("That's it, beautiful and simple logging!")
logger.success("That's it, beautiful and simple logging!")
logger.critical("That's it, beautiful and simple logging!")
# NOTE(review): called outside an `except` block, so there is no active
# exception to attach; the extra "ERROR!" argument has no placeholder in the
# message to fill — confirm this line is only a demo.
logger.exception("That's it, beautiful and simple logging!", "ERROR!")

In [None]:
from pymongo import MongoClient 

In [None]:
import re
import pandas as pd

In [None]:
import os

# The connection string carries a username and password, so it is read from
# the environment instead of being stored in the notebook. The credential that
# used to be hard-coded here should be rotated.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError("Set the MONGODB_URI environment variable to the MongoDB connection string.")

# CSV of unique company URLs; set COMPANY_URLS_CSV to point at a different
# location than the original author's local path.
df = pd.read_csv(os.environ.get("COMPANY_URLS_CSV", "/Users/zacharyg/Downloads/company_urls_unique.csv"))

In [None]:
df

In [None]:
# Open the MongoDB connection and select the collection that the following
# cells write company name/url documents into and read them back from.
client = MongoClient(uri)
db = client["jobscrper"]
collection = db["company_metadata"]

In [None]:
def extract_string(url):
    """
    Return the first host label of an http(s) URL, i.e. the text between
    '//' and the first '.' (e.g. "acme" for "https://acme.wd1.example.com/x").

    Returns None when `url` is not a string (such as a NaN read from a CSV) or
    does not contain such a URL.
    """
    # NOTE: this notebook defines extract_string in more than one cell; keep
    # the copies identical or consolidate them.
    if not isinstance(url, str):
        return None
    match = re.search(r'https?://([^./]+)\.', url)
    if match:
        return match.group(1)
    return None

In [None]:
# Store one document per company URL. The write is an upsert keyed on the URL,
# so re-running this cell updates existing documents instead of inserting
# duplicates.
for index, row in df.iterrows():
    link = row["Application-Links"]
    # Skip blank / missing links (e.g. NaN from the CSV).
    if not isinstance(link, str) or not link:
        continue
    document = {
        "name": extract_string(link),
        "url": link
    }
    collection.update_one({"url": link}, {"$set": document}, upsert=True)

In [None]:
# Read every stored document back and gather the URLs into a single column.
all_documents = collection.find()
test = {"urls": [stored["url"] for stored in all_documents]}

In [None]:
df = pd.DataFrame(test)

In [None]:
df

In [None]:
import redis

In [None]:
# Client for a Redis server on this machine (port 6379, database 0), used by
# the scratch cells below.
r = redis.Redis(host='localhost', port=6379, db=0)

In [None]:
r.set('foo', 'bar')
r.get('foo')

In [None]:
keys = r.keys('*')
keys

In [None]:
r.delete('25209055324c93f6b8be881c59c691cdfe29f95e92c1eb01accb27713a6affba')

In [None]:
keys = r.keys('*')
keys

In [None]:
print(r.get('foo'))

In [None]:
len('https://genesys.wd1.myworkdayjobs.com/es/Genesys'.split("/"))

## Using Playwright

In [1]:
url = "https://cvshealth.wd1.myworkdayjobs.com/CVS_Health_Careers"

In [7]:
import time
import asyncio
from playwright.async_api import async_playwright

In [8]:
playwright = await async_playwright().start()
browser = await playwright.chromium.launch()
page = await browser.new_page()
await page.goto(url)

await asyncio.sleep(5)

content = await page.content()

await browser.close()
await playwright.stop()

print(content)

<!DOCTYPE html><html lang="en-US" dir="ltr" data-react-helmet="dir,lang"><head>
    <title>Careers</title>
    <!-- Application Properties -->
    <meta http-equiv="X-UA-Compatible" content="chrome=1;IE=EDGE">
    <meta http-equiv="content-type" content="text/html; charset=UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=2.0">

    <link rel="canonical" href="https://cvshealth.wd1.myworkdayjobs.com/CVS_Health_Careers">

    <!-- OpenGraph Tags -->
    <meta name="title" property="og:title">
    <meta name="description" property="og:description" content="Our Work Experience is the combination of everything that's unique about us: our culture, our core values, our company meetings, our commitment to sustainability, our recognition programs, but most importantly, it's our people. Our employees are self-disciplined, hard working, curious, trustworthy, humble, and truthful. They make choices according to what is best for the team, they live for

In [10]:
import requests

def get_page_content(url, timeout=30):
    """
    Fetch `url` with a browser-like User-Agent and return the response body as text.

    Parameters:
    - url: address to request.
    - timeout: seconds to wait for the server before giving up. Without a
      timeout `requests` can block indefinitely on an unresponsive host.

    Returns:
    - str: the decoded response body (no JavaScript is executed).
    """
    # Optional headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text

# Usage
content = get_page_content(url)
print(content)

<!DOCTYPE html>
<html lang="en-US">
<head>
    <title></title>
    <!-- Application Properties -->
    <meta http-equiv="X-UA-Compatible" content="chrome=1;IE=EDGE"/>
    <meta http-equiv="content-type" content="text/html; charset=UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=2.0">

    <link rel="canonical" href="https://cvshealth.wd1.myworkdayjobs.com/CVS_Health_Careers" />

    <!-- OpenGraph Tags -->
    <meta name="title" property="og:title">
    <meta name="description" property="og:description" content="Our Work Experience is the combination of everything that&#39;s unique about us: our culture, our core values, our company meetings, our commitment to sustainability, our recognition programs, but most importantly, it&#39;s our people. Our employees are self-disciplined, hard working, curious, trustworthy, humble, and truthful. They make choices according to what is best for the team, they live for opportunities to collaborate and

In [None]:
import json
import subprocess


def get_rendered_content(url, timeout=None):
    """
    Render `url` with a small Node.js script (axios + jsdom) and return what it reports.

    The script downloads the page, loads it into jsdom with script execution
    enabled, waits one second, and prints a JSON object with the serialized
    HTML and every link's href/text. If the script itself catches an error it
    prints `null`.

    Parameters:
    - url: page to render; passed to the script as its first argument.
    - timeout: optional limit in seconds for the node process (None = no limit).

    Returns:
    - dict with "html" and "links" keys, or None when the script reported an error.

    Raises:
    - ValueError: if node printed nothing (for example it is not set up or a
      module is missing); the message includes node's stderr.
    - subprocess.TimeoutExpired: if `timeout` is given and exceeded.
    """
    node_script = """
    const jsdom = require('jsdom');
    const axios = require('axios');
    const { JSDOM } = jsdom;

    async function getContent(url) {
        try {
            // Get the page
            const response = await axios.get(url);
            
            // Create minimal DOM environment
            const dom = new JSDOM(response.data, {
                url: url,
                runScripts: 'dangerously',  // Allows JS execution
            });

            const { window } = dom;
            const { document } = window;

            // Add axios to window for any AJAX calls
            window.axios = axios;

            // Let any scripts run
            await new Promise(resolve => setTimeout(resolve, 1000));

            // Get your content (modify selectors as needed)
            const content = {
                html: dom.serialize(),
                // Example: get all links
                links: Array.from(document.querySelectorAll('a')).map(a => ({
                    href: a.href,
                    text: a.textContent.trim()
                }))
            };

            return content;

        } catch (error) {
            console.error('Error:', error.message);
            return null;
        }
    }

    getContent(process.argv[1])
        .then(result => console.log(JSON.stringify(result)))
        .catch(error => console.error(error));
    """

    result = subprocess.run(
        ['node', '-e', node_script, url],
        capture_output=True,
        text=True,
        timeout=timeout
    )

    # An empty stdout means node failed before printing any JSON; surface its
    # stderr instead of an opaque JSON decoding error.
    if not result.stdout.strip():
        raise ValueError(
            f"node produced no output for {url!r} "
            f"(exit code {result.returncode}): {result.stderr.strip()}"
        )

    return json.loads(result.stdout)