In [2]:
#Made by Durga Linares
#Slight adjustments to save folder by Alfredo Espinoza
#!pip install -r requirements.txt
# --- LIBRARIES AND CONFIGURATION ---
import os
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Print a banner so the first line of the run's output marks where the script started.
print("--- STARTING SCRAPING SCRIPT ---")

# --- ROBUST DESCRIPTION EXTRACTION FUNCTION ---
def extract_job_description(driver, max_length=800):
    """Return the cleaned job description from the page loaded in ``driver``.

    A fixed list of XPath selectors is tried in order; the first one that
    matches an element with non-blank text wins. Everything from the first
    occurrence of the word "beneficios" (in any letter case) onwards is
    dropped, runs of whitespace are collapsed to single spaces, and the
    result is capped at ``max_length`` characters.

    Args:
        driver: Object exposing ``find_element(by, value)`` (the Selenium
            WebDriver in this script), already positioned on a job posting.
        max_length: Maximum number of characters to return. Defaults to 800.

    Returns:
        The cleaned description, or ``"Not found"`` when no selector yields
        non-blank text.
    """
    description_selectors = [
        "//*[contains(text(), 'Descripción del puesto')]/following-sibling::*[1]",
        "//*[contains(text(), 'Descripción')]/following::p[1]",
        "//*[contains(text(), 'DESCRIPCIÓN')]/following::div[1]",
        "//h3[contains(., 'Descripción')]/following-sibling::p",
        "//div[contains(@class, 'description')]",
        "//div[contains(@class, 'job-description')]",
        "//div[contains(@class, 'job-detail-description')]//p",
    ]

    for selector in description_selectors:
        try:
            element = driver.find_element(By.XPATH, selector)
            text = element.get_attribute('innerText') or element.text
        except Exception:
            # Best-effort lookup: any failure on this selector simply means
            # "try the next one", so the scrape never aborts here.
            continue

        if not text or not text.strip():
            continue

        # Case-insensitive cut: the heading may be written "BENEFICIOS",
        # "Beneficios" or "beneficios" depending on the posting.
        description = re.split('beneficios', text, maxsplit=1, flags=re.IGNORECASE)[0]

        # split()/join collapses newlines, carriage returns and repeated
        # spaces in one pass and trims both ends.
        return ' '.join(description.split())[:max_length]

    return "Not found"

# ==============================================================================
# --- STAGE 1: EXTRACT JOB POSTING LINKS FROM ALL PAGES ---
# ==============================================================================

# Setup driver: open Chrome on the filtered Bumeran listing and give the page
# a fixed pause to render before interacting with it.
driver = webdriver.Chrome()
url = "https://www.bumeran.com.pe/en-lima/empleos-area-tecnologia-sistemas-y-telecomunicaciones-subarea-programacion-full-time-publicacion-menor-a-15-dias.html"
print(f"Navigating to: {url}")
driver.get(url)
time.sleep(5)

# Handle cookies popup. Dismissing it is best-effort: the scrape continues
# whether or not the banner is present or clickable.
try:
    cookie_button = driver.find_element(By.ID, "onetrust-accept-btn-handler")
    cookie_button.click()
    print("Cookies accepted")
    time.sleep(3)
except NoSuchElementException:
    # The banner is simply not on the page.
    print("No cookies popup")
except Exception as exc:
    # The banner exists but could not be clicked. Report it instead of
    # mislabelling it as "no popup"; unlike a bare ``except:`` this still lets
    # KeyboardInterrupt / SystemExit stop the script.
    print(f"Could not dismiss cookies popup: {exc}")

# Extract job links from ALL pages
# The page count and the listing XPath are named once so the loop bounds, the
# "last page" check and the progress messages cannot drift apart.
LAST_PAGE = 6
LISTING_LINKS_XPATH = '//*[@id="listado-avisos"]/div/a'

job_links = []       # posting URLs in discovery order (consumed by stage 2)
seen_links = set()   # the same URLs, kept for O(1) duplicate checks
print(f"\n--- STAGE 1: Extracting job links from pages 1 to {LAST_PAGE} ---")

for page in range(1, LAST_PAGE + 1):
    print(f"📄 Processing page {page}...")

    try:
        # Wait for job listings to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, LISTING_LINKS_XPATH))
        )

        jobs = driver.find_elements(By.XPATH, LISTING_LINKS_XPATH)

        links_on_this_page = 0
        for job in jobs:
            link = job.get_attribute("href")
            # Keep only posting URLs: under /empleos/, ending in .html, with
            # no fragment and no javascript: pseudo-link.
            is_posting_url = bool(
                link
                and '/empleos/' in link
                and link.endswith('.html')
                and '#' not in link
                and 'javascript' not in link.lower()
            )
            if is_posting_url and link not in seen_links:
                seen_links.add(link)
                job_links.append(link)
                links_on_this_page += 1

        print(f"✅ Found {links_on_this_page} valid links on page {page}")
        print(f"📊 Total links so far: {len(job_links)}")

        # Navigate to next page (only if not last page)
        if page < LAST_PAGE:
            try:
                # The pager exposes each page as a link whose text is its number.
                next_button = driver.find_element(By.LINK_TEXT, str(page + 1))
                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", next_button)
                print(f"🔀 Clicked page {page + 1} button")
                time.sleep(5)

                # Wait for new page to load
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, LISTING_LINKS_XPATH))
                )

            except Exception as e:
                # Without a next page there is nothing more to collect.
                print(f"❌ Could not navigate to page {page + 1}: {e}")
                break
        else:
            print(f"✅ Reached the last page ({LAST_PAGE})")

    except Exception as e:
        # Keep whatever was collected so far and move on to stage 2.
        print(f"❌ Error on page {page}: {e}")
        break

print(f"\n✅ Extraction completed! Total job links found: {len(job_links)}")

# ==============================================================================
# --- STAGE 2: SCRAPE JOB DETAILS FROM ALL LINKS ---
# ==============================================================================

scraped_data = []
job_count = len(job_links)
print(f"\n--- STAGE 2: Scraping details from {job_count} jobs ---")

# The posting title doubles as the "page has loaded" marker.
TITLE_XPATH = "//div[@id='header-component']//h1"


def _text_or(xpath, fallback, clean=str.strip):
    """Return the cleaned text of the first element matching ``xpath``.

    ``clean`` is applied to the element's text; ``fallback`` is returned
    when the element is not on the page.
    """
    try:
        return clean(driver.find_element(By.XPATH, xpath).text)
    except NoSuchElementException:
        return fallback


# Visit every collected posting and record one dict of fields per job.
for job_number, link in enumerate(job_links, start=1):
    try:
        print(f"🔍 Processing job {job_number}/{job_count}")
        driver.get(link)
        wait = WebDriverWait(driver, 10)

        # A posting whose title never appears is treated as not loaded.
        try:
            wait.until(EC.presence_of_element_located((By.XPATH, TITLE_XPATH)))
        except TimeoutException:
            print("   ⚠️ Page didn't load properly, skipping...")
            continue

        # Open the 'Detalle del empleo' tab when the posting has one;
        # its absence is not an error.
        try:
            detalle_button = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a[normalize-space()='Detalle del empleo']"))
            )
            driver.execute_script("arguments[0].click();", detalle_button)
            time.sleep(3)
        except (NoSuchElementException, TimeoutException):
            pass

        # Key order here is the column order of the resulting table.
        scraped_data.append({
            'Job Title': _text_or(TITLE_XPATH, "Not found"),
            'Description': extract_job_description(driver),
            # The district is the part of the location text before the first comma.
            'District': _text_or(
                "//i[@name='icon-light-location-pin']/following-sibling::h2",
                "Not specified",
                clean=lambda raw: raw.split(',')[0].strip(),
            ),
            'Work Mode': _text_or(
                "//i[@name='icon-light-office']/following-sibling::p",
                "Not specified",
            ),
        })
        print(f"✅ Completed job {job_number}/{job_count}")

    except Exception as e:
        # One broken posting must not stop the remaining ones.
        print(f"❌ Error processing job {job_number}: {e!s}")

# ==============================================================================
# --- CREATE CSV AND DISPLAY RESULTS ---
# ==============================================================================

def _preview(value, limit):
    """Return ``value`` as text, cut to ``limit`` characters plus '...' when longer."""
    text = str(value)
    return text[:limit] + '...' if len(text) > limit else text


try:
    if scraped_data:
        df = pd.DataFrame(scraped_data)

        # Display compact table
        print("\n" + "="*60)
        print("📋 EXTRACTED JOB DATA")
        print("="*60)

        # Truncate only a copy for on-screen display; the CSV keeps full text.
        display_df = df.copy()
        display_df['Job Title'] = display_df['Job Title'].apply(lambda x: _preview(x, 30))
        display_df['Description'] = display_df['Description'].apply(lambda x: _preview(x, 40))

        print(display_df.to_string(index=False))

        # Save to CSV inside an "output" folder, creating the folder if needed.
        output_folder = "output"
        os.makedirs(output_folder, exist_ok=True)
        output_filename = os.path.join(output_folder, "bumeran_programacion_jobs.csv")
        # The file is written with a UTF-8 byte-order mark ('utf-8-sig').
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')

        print(f"\n💾 CSV file saved: '{output_filename}'")
        print(f"📊 Total jobs extracted: {len(df)}")

    else:
        print("❌ No data extracted")
finally:
    # Close browser even when building or saving the table fails (for example
    # the CSV cannot be written), so no Chrome session is left behind.
    driver.quit()

print("\n✅ SCRAPING COMPLETED! Browser closed.")

--- STARTING SCRAPING SCRIPT ---
Navigating to: https://www.bumeran.com.pe/en-lima/empleos-area-tecnologia-sistemas-y-telecomunicaciones-subarea-programacion-full-time-publicacion-menor-a-15-dias.html
No cookies popup

--- STAGE 1: Extracting job links from pages 1 to 6 ---
📄 Processing page 1...
✅ Found 20 valid links on page 1
📊 Total links so far: 20
🔀 Clicked page 2 button
📄 Processing page 2...
✅ Found 20 valid links on page 2
📊 Total links so far: 40
🔀 Clicked page 3 button
📄 Processing page 3...
✅ Found 19 valid links on page 3
📊 Total links so far: 59
🔀 Clicked page 4 button
📄 Processing page 4...
✅ Found 20 valid links on page 4
📊 Total links so far: 79
🔀 Clicked page 5 button
📄 Processing page 5...
✅ Found 19 valid links on page 5
📊 Total links so far: 98
🔀 Clicked page 6 button
📄 Processing page 6...
✅ Found 9 valid links on page 6
📊 Total links so far: 107
✅ Reached the last page (6)

✅ Extraction completed! Total job links found: 107

--- STAGE 2: Scraping details from 107