In [3]:
import pandas as pd
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from openpyxl import load_workbook
from openpyxl.styles import Font
import time
import urllib.parse
from IPython.display import FileLink

_INDEED_BASE_URL = "https://www.indeed.com"

# Characters that are not allowed in file names on at least one major platform;
# each is replaced by "_" when the output file name is derived from user input.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

# Maps the (lower-cased) job type typed by the user to the `sc` query-string
# filter that is appended to the search URL.
_JOB_TYPE_FILTERS = {
    "full-time": "sc=0kf:jtype(fulltime):",
    "part-time": "sc=0kf:jtype(parttime):",
    "contract": "sc=0kf:jtype(contract):",
    "internship": "sc=0kf:jtype(internship):",
    "temporary": "sc=0kf:jtype(temporary):",
}


def _build_search_url(search_keyword, category, location, job_type, page_num):
    """Return the Indeed search URL for one results page (10 results per page)."""
    # quote_plus turns spaces into "+" (as before) and also escapes characters
    # such as "+", "&" and "#" that would otherwise corrupt the query string.
    query = urllib.parse.quote_plus(search_keyword)
    location_encoded = urllib.parse.quote(location)
    category_param = (
        f"&sc=0kf:{urllib.parse.quote_plus(category, safe='():;')}:" if category else ""
    )
    job_type_param = (
        f"&{_JOB_TYPE_FILTERS[job_type]}" if job_type in _JOB_TYPE_FILTERS else ""
    )
    return (
        f"{_INDEED_BASE_URL}/jobs?q={query}{category_param}"
        f"&l={location_encoded}{job_type_param}&start={page_num * 10}"
    )


def _text_or_na(card, css_selector):
    """Return the stripped text of the first match inside *card*, or "N/A"."""
    try:
        return card.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except NoSuchElementException:
        return "N/A"


def _parse_job_card(card):
    """Extract title, company, location, salary and link from one job card."""
    title_element = card.find_element(By.TAG_NAME, "h2").find_element(By.TAG_NAME, "a")
    href = title_element.get_attribute("href")
    return {
        "Title": title_element.text.strip(),
        "Company": _text_or_na(card, "[data-testid='company-name']"),
        "Location": _text_or_na(card, "[data-testid='text-location']"),
        "Salary": _text_or_na(card, "span[data-testid='attribute_snippet-text']"),
        # urljoin leaves an absolute href untouched and resolves a relative one,
        # so the domain can never end up duplicated in the link.
        "Link": urllib.parse.urljoin(_INDEED_BASE_URL, href) if href else "",
    }


def _scrape_page(driver, url, page_num):
    """Load one results page and return the list of job dicts found on it.

    Raises WebDriverException if the page cannot be loaded or searched;
    failures on individual job cards are reported and skipped.
    """
    driver.get(url)
    time.sleep(5)  # Wait for page to load

    # Accept cookies popup if exists. This is best-effort: a missing or
    # non-clickable banner must not discard the whole page.
    try:
        driver.find_element(By.ID, "onetrust-accept-btn-handler").click()
        time.sleep(2)
    except WebDriverException:
        pass

    job_cards = driver.find_elements(By.CLASS_NAME, "job_seen_beacon")
    if not job_cards:
        print(f"⚠️ No jobs found on Page {page_num + 1}.")
        return []

    print(f"✅ Page {page_num + 1}: Found {len(job_cards)} jobs.")

    jobs = []
    for card in job_cards:
        try:
            job = _parse_job_card(card)
        except WebDriverException as e:
            print(f"⚠️ Skipping job due to error: {e}")
            continue
        jobs.append(job)
        print(f"   📌 {job['Title']} at {job['Company']} ({job['Location']}) - 💰 {job['Salary']}")
    return jobs


def _write_excel(jobs, output_file):
    """Write *jobs* to *output_file* with a clickable "Apply Now" column."""
    df = pd.DataFrame(jobs)
    links = df.pop("Link")  # Raw link column is not written to the sheet
    df["Apply Now"] = "📝 Apply Now"
    df.to_excel(output_file, index=False, engine="openpyxl")

    wb = load_workbook(output_file)
    ws = wb.active
    link_font = Font(color="0000FF", underline="single")  # Blue color with underline
    apply_column = ws.max_column  # "Apply Now" is the last column written above

    # Native cell hyperlinks are used instead of a =HYPERLINK("...") formula:
    # Excel rejects string literals longer than 255 characters inside formulas,
    # and job links with tracking parameters regularly exceed that length.
    for row_index, url in enumerate(links, start=2):  # Row 1 is the header
        cell = ws.cell(row=row_index, column=apply_column)
        if url:
            cell.hyperlink = url
        cell.font = link_font

    wb.save(output_file)


def scrape_indeed_jobs():
    """
    Interactive Indeed Job Scraper with Contract Role Filtering.
    - Prompts user for job title, category, location, job type (e.g., Contract), and number of pages.
    - Scrapes Indeed job listings dynamically based on user input.
    - Saves results in an Excel file with clickable "Apply Now" links.

    Returns:
        The path of the Excel file that was written, or None when the page
        count is not a number, the browser cannot be launched, or no jobs
        were scraped.
    """

    # 🔹 Step 1: Get User Input
    search_keyword = input("Enter job title or keyword (e.g., AI Engineer): ").strip()
    category = input("Enter job category (optional, press Enter to skip): ").strip()
    location = input("Enter job location (e.g., Chicago, IL): ").strip()
    job_type = input("Enter job type (Full-time, Part-time, Contract, Internship, Temporary) or leave blank for all: ").strip().lower()
    try:
        num_pages = int(input("Enter number of pages to scrape (e.g., 5): ").strip())
    except ValueError:
        print("❌ Number of pages must be a whole number. Exiting...")
        return None

    # 🔹 Step 2: Derive a file name that is safe on every platform
    output_file = (
        f"Indeed_{search_keyword}_{location}_{job_type}.xlsx"
        .replace(" ", "_")
        .translate(_UNSAFE_FILENAME_CHARS)
    )

    # 🔹 Step 3: Set up Undetected ChromeDriver
    options = uc.ChromeOptions()
    options.headless = False  # Set to True for headless mode
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36")
    options.add_argument("accept-language=en-US,en;q=0.9")

    try:
        driver = uc.Chrome(options=options)
    except WebDriverException as e:
        print(f"❌ Failed to launch browser: {e}")
        return None

    # 🔹 Step 4: Scrape page by page; always close the browser afterwards,
    # even if the run is interrupted or fails unexpectedly.
    all_jobs = []
    try:
        for page_num in range(num_pages):
            url = _build_search_url(search_keyword, category, location, job_type, page_num)
            print(f"🔄 Scraping Page {page_num + 1} - URL: {url}")
            try:
                all_jobs.extend(_scrape_page(driver, url, page_num))
            except WebDriverException as e:
                print(f"❌ Error navigating to page {page_num + 1}: {e}")
    finally:
        driver.quit()

    if not all_jobs:
        print("❌ No jobs found. Exiting...")
        return None

    # 🔹 Step 5: Save the results
    _write_excel(all_jobs, output_file)

    print(f"\n📄 Excel file '{output_file}' saved successfully with clickable 'Apply Now' links!")

    return output_file

# Kick off the interactive scrape; the result is the path of the Excel file,
# or None when nothing was saved.
output_file = scrape_indeed_jobs()

# When a file was produced, render a clickable download link in the notebook.
if output_file:
    download_link = FileLink(output_file)
    display(download_link)


Enter job title or keyword (e.g., AI Engineer):  devops
Enter job category (optional, press Enter to skip):  
Enter job location (e.g., Chicago, IL):  Orlando, FL
Enter job type (Full-time, Part-time, Contract, Internship, Temporary) or leave blank for all:  Full-time
Enter number of pages to scrape (e.g., 5):  50


🔄 Scraping Page 1 - URL: https://www.indeed.com/jobs?q=devops&l=Orlando%2C%20FL&sc=0kf:jtype(fulltime):&start=0
✅ Page 1: Found 15 jobs.
   📌 Staff Build and Release (DevOps) Engineer at GE Vernova (Melbourne, FL 32934) - 💰 N/A
   📌 Senior Manager - DevOps Engineering at Marriott Vacations Worldwide (Orlando, FL 32836) - 💰 N/A
   📌 Sr. AWS DevOps Engineer at CAPCO (Hybrid work in Orlando, FL) - 💰 N/A
   📌 Vice President, Site Reliability/DevOps Engineer (Dev Infrastructure Platform) at BNY Mellon (Lake Mary, FL 32746) - 💰 N/A
   📌 Senior AWS DevOps Engineer at Deloitte (Lake Mary, FL 32746) - 💰 N/A
   📌 Software Engineer at Disney Experiences (Orlando, FL) - 💰 N/A
   📌 Deloitte Consulting, US Delivery Center, Government & Public Services - Solution Analyst (Software Developer) at Deloitte (Lake Mary, FL 32746) - 💰 N/A
   📌 Senior Python Developer/DevOps Engineer at Six Feet Up Inc. (Windermere, FL) - 💰 N/A
   📌 Software Engineer (DevSecOps) /Embedded/C++ /Orlando, FL at Lockheed Martin