In [3]:
pip install selenium webdriver-manager pandas beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [50]:
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

def get_info(title, location, page=1):  # ✅ test with 1 page first
    full_job = []
    options = uc.ChromeOptions()
    options.binary_location = '/usr/bin/google-chrome'
    driver = uc.Chrome(options=options, version_main=145)

    for p in range(0, page * 20, 20):
        listing_url = f"https://indeed.com/jobs?q={title}&l={location}&start={p}"
        driver.get(listing_url)
        time.sleep(10)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        cards = soup.find_all('div', class_=lambda c: c and 'job_seen_beacon' in c)
        card_count = len(cards)
        print(f"Found {card_count} cards on page {p}")

        seen_descriptions = set()
        idx = 0

        while idx < card_count:
            try:
                if listing_url not in driver.current_url:
                    driver.get(listing_url)
                    time.sleep(8)

                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.job_seen_beacon"))
                )
                time.sleep(2)
                live_cards = driver.find_elements(By.CSS_SELECTOR, "div.job_seen_beacon")

                if idx >= len(live_cards):
                    print(f"  Card {idx}: not in DOM, skipping")
                    idx += 1
                    continue

                # ✅ Snapshot BEFORE click
                try:
                    old_desc = driver.find_element(By.ID, "jobDescriptionText").text.strip()
                except:
                    old_desc = "__EMPTY__"

                print(f"\n  [Card {idx}] BEFORE click — desc starts with: {old_desc[:80]!r}")

                url_before = driver.current_url

                # ✅ Click the job TITLE link directly — more reliable than clicking the card div
                try:
                    title_link = live_cards[idx].find_element(By.CSS_SELECTOR, "h2 a, a[data-jk]")
                    driver.execute_script("arguments[0].scrollIntoView({block:'center'});", title_link)
                    time.sleep(1)
                    ActionChains(driver).move_to_element(title_link).click().perform()
                except Exception as ce:
                    print(f"  Title link click failed ({ce}), falling back to JS click on card")
                    driver.execute_script("arguments[0].scrollIntoView({block:'center'});", live_cards[idx])
                    time.sleep(1)
                    driver.execute_script("arguments[0].click();", live_cards[idx])

                time.sleep(3)
                print(f"  [Card {idx}] AFTER click — URL: {driver.current_url[:80]!r}")

                # ✅ Full page redirect handling
                if driver.current_url != url_before and listing_url not in driver.current_url:
                    print(f"  [Card {idx}] Full page redirect detected")
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.ID, "jobDescriptionText"))
                        )
                        description = driver.find_element(By.ID, "jobDescriptionText").text.strip()
                    except:
                        description = "N/A"
                    try:
                        salary = driver.find_element(By.CSS_SELECTOR, "div#jobDetailsSection").text.strip()
                    except:
                        salary = "Not Listed"
                    try:
                        date = driver.find_element(By.ID, "jobsearch-JobMetadataFooter").text.strip()
                    except:
                        date = "N/A"
                    driver.back()
                    time.sleep(8)

                else:
                    # ✅ Side panel mode
                    print(f"  [Card {idx}] Side panel mode — waiting for desc change...")
                    description = "__NOT_LOADED__"
                    for attempt in range(3):  # retry up to 3 times
                        try:
                            WebDriverWait(driver, 10).until(lambda d: (
                                d.find_element(By.ID, "jobDescriptionText").text.strip() != old_desc and
                                d.find_element(By.ID, "jobDescriptionText").text.strip() != ""
                            ))
                            new_desc = driver.find_element(By.ID, "jobDescriptionText").text.strip()
                            print(f"  [Card {idx}] AFTER wait — desc starts with: {new_desc[:80]!r}")

                            if new_desc in seen_descriptions:
                                print(f"  [Card {idx}] Desc already seen! Re-clicking...")
                                driver.execute_script("arguments[0].click();", live_cards[idx])
                                time.sleep(3)
                                continue
                            description = new_desc
                            break
                        except:
                            print(f"  [Card {idx}] Wait timed out (attempt {attempt+1}), re-clicking...")
                            try:
                                live_cards = driver.find_elements(By.CSS_SELECTOR, "div.job_seen_beacon")
                                driver.execute_script("arguments[0].click();", live_cards[idx])
                            except:
                                pass
                            time.sleep(3)

                    salary = "Not Listed"
                    for sel in [
                        "div#jobDetailsSection",
                        "[data-testid='jobsearch-OtherJobDetailsContainer']",
                        "[data-testid='attribute_snippet_testid']",
                        "[class*='salary']", "[class*='Salary']"
                    ]:
                        try:
                            txt = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
                            if txt:
                                salary = txt
                                break
                        except:
                            continue

                    date = "N/A"
                    for sel in [
                        "div#jobsearch-JobMetadataFooter",
                        "[data-testid='jobsearch-JobMetadataFooter']",
                        "[class*='JobMetadataFooter']", "span[class*='date']"
                    ]:
                        try:
                            txt = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
                            if txt:
                                date = txt
                                break
                        except:
                            continue

                seen_descriptions.add(description)

                card = cards[idx]
                jobs = {
                    'title': card.find('h2').text.strip() if card.find('h2') else "N/A",
                    'company': card.find('span', {'data-testid': 'company-name'}).text.strip() if card.find('span', {'data-testid': 'company-name'}) else "N/A",
                    'location': card.find('div', {'data-testid': 'text-location'}).text.strip() if card.find('div', {'data-testid': 'text-location'}) else "N/A",
                    'salary': salary,
                    'date': date,
                    'description': description
                }
                full_job.append(jobs)
                print(f"  [Card {idx}] ✅ Saved — salary={salary!r} date={date!r}")
                time.sleep(2)
                idx += 1

            except Exception as e:
                print(f"  Error in card {idx}: {e}")
                try:
                    driver.get(listing_url)
                    time.sleep(8)
                except:
                    pass
                idx += 1
                continue

    driver.quit()
    return pd.DataFrame(full_job)

df = get_info("Data", "france", page=1)
df

Found 19 cards on page 0

  [Card 0] BEFORE click — desc starts with: 'The Master Data Administrator (MDA) plays a critical role in maintaining the acc'
  [Card 0] AFTER click — URL: 'https://www.indeed.com/jobs?q=Data&l=france&start=0&vjk=e238e5f5b1526ccb'
  [Card 0] Full page redirect detected
  [Card 0] ✅ Saved — salary='Job details\nPay\n$25 an hour\nJob type\nPart-time\n&nbsp;' date='N/A'

  [Card 1] BEFORE click — desc starts with: 'Data Integration Specialist\nIndianapolis (Periodic travel to branches)\n\nAbout th'
  [Card 1] AFTER click — URL: 'https://www.indeed.com/jobs?q=Data&l=france&start=0&vjk=e238e5f5b1526ccb'
  [Card 1] Full page redirect detected
  [Card 1] ✅ Saved — salary='Job details\nPay\n$25 an hour\nJob type\nPart-time\n&nbsp;' date='N/A'

  [Card 2] BEFORE click — desc starts with: 'Data Integration Specialist\nIndianapolis (Periodic travel to branches)\n\nAbout th'
  Title link click failed (Message: element not interactable: https://www.indeed.com/viewjob?jk=f

Unnamed: 0,title,company,location,salary,date,description
0,Data Integration Specialist,"Ryan Fireprotection, Inc.",United States,Job details\nPay\n$25 an hour\nJob type\nPart-...,,The Master Data Administrator (MDA) plays a cr...
1,Master Data Administrator - Part Time (25 hour...,Current Lighting Employee Co LLC,"Mayfield Heights, OH 44124",Job details\nPay\n$25 an hour\nJob type\nPart-...,,The Master Data Administrator (MDA) plays a cr...
2,Data Steward,Reckitt,"Wilson, NC 27893",Not Listed,If you require alternative methods of applicat...,__NOT_LOADED__
3,Data Entry,ATC,"Indianapolis, IN",Not Listed,If you require alternative methods of applicat...,__NOT_LOADED__
4,QUALITY ANALYST,Mercy Medical Center,"Baltimore, MD 21202 (Downtown area)",Job details\nJob type\nContract\n&nbsp;,Report job,__NOT_LOADED__
5,Product Data Specialist,Luma Financial Technologies,"Cincinnati, OH 45202 (Mount Adams area)",Job details\nJob type\nContract\n&nbsp;,Report job,__NOT_LOADED__
6,Data QA Specialist,1X Technologies AS,"San Carlos, CA",Not Listed,,Data Integration Specialist\nIndianapolis (Per...
7,Data Entry Clerk,ATC,"Indianapolis, IN",Job details\nJob type\nFull-time\nEncouraged t...,,Labcorp is seeking a Data Entry Operator to jo...
8,Processor - Data (Remote),Essilor and Subsidiaries,"Dallas, TX 75234",Job details\nPay\n$28 - $51 an hour\nJob type\...,,About 1X\nWe build humanoid robots that work a...
9,Jr Data Analyst,The Restaurant Store,"Lancaster, PA 17602","Job details\nPay\n$60,100 - $121,600 a year\nJ...",,Why join Stryker?\nLooking for a place that va...


In [5]:
import sys
print(sys.executable)

/home/abdellah/anaconda3/bin/python


In [4]:
pip list 

Package                           Version
--------------------------------- --------------------
aext-assistant                    4.20.0
aext-assistant-server             4.20.0
aext-core                         4.20.0
aext-core-server                  4.20.0
aext_environments_server          4.20.0
aext-panels                       4.20.0
aext-panels-server                4.20.0
aext-project-filebrowser-server   4.20.0
aext-share-notebook               4.20.0
aext-share-notebook-server        4.20.0
aext-shared                       4.20.0
aext-toolbox                      4.20.0
aiobotocore                       2.25.0
aiodns                            3.5.0
aiohappyeyeballs                  2.6.1
aiohttp                           3.13.2
aioitertools                      0.12.0
aiosignal                         1.4.0
alabaster                         0.7.16
alembic                           1.18.3
altair                            5.5.0
anaconda-anon-usage               0.7.5
anacon

In [6]:
pip install undetected_chromedriver

Collecting undetected_chromedriver
  Using cached undetected-chromedriver-3.5.5.tar.gz (65 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting websockets (from undetected_chromedriver)
  Downloading websockets-16.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (6.8 kB)
Downloading websockets-16.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl (184 kB)
Building wheels for collected packages: undetected_chromedriver
  Building wheel for undetected_chromedriver (pyproject.toml) ... [?25ldone
[?25h  Created wheel for undetected_chromedriver: filename=undetected_chromedriver-3.5.5-py3-none-any.whl size=47130 sha256=6727f0a35ac37acb90438e0d24afbc69f29b2d85d9cfb8ae301b0b388e051f92
  Stored in directory: /home/abdellah/.cache/pip/wheels/7a/5f/c1/06f68421cc7172ef51504631252870bcb3a2fdf3b6a025f362
S

In [33]:
pd.set_option('display.max_columns', None)

In [45]:
import undetected_chromedriver as uc from bs4 import BeautifulSoup import time import pandas as pd from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.action_chains import ActionChains def get_info(title, location, page=3): full_job = [] options = uc.ChromeOptions() options.binary_location = '/usr/bin/google-chrome' driver = uc.Chrome(options=options, version_main=145) for p in range(0, page * 20, 20): driver.get(f"https://indeed.com/jobs?q={title}&l={location}&start={p}") time.sleep(10) original_url = driver.current_url soup = BeautifulSoup(driver.page_source, 'html.parser') cards = soup.find_all('div', class_=lambda c: c and 'job_seen_beacon' in c) card_count = len(cards) print(f"Found {card_count} cards on page {p}") for idx in range(card_count): try: if driver.current_url != original_url: driver.get(original_url) time.sleep(8) live_cards = driver.find_elements(By.CSS_SELECTOR, "div.job_seen_beacon") if idx >= len(live_cards): continue # Capture old desc BEFORE click try: old_desc = driver.find_element(By.ID, "jobDescriptionText").text.strip() except: old_desc = "__EMPTY__" # Scroll into view driver.execute_script("arguments[0].scrollIntoView({block:'center'});", live_cards[idx]) time.sleep(1) # ✅ Try real ActionChains click first, fallback to JS click try: ActionChains(driver).move_to_element(live_cards[idx]).click().perform() except: driver.execute_script("arguments[0].click();", live_cards[idx]) time.sleep(3) # wait for panel to load # ✅ DEBUG: print what's in the right panel area try: panel = driver.find_element(By.ID, "jobDescriptionText") new_desc = panel.text.strip() print(f" DESC preview: {new_desc[:80]!r}") except Exception as e: print(f" jobDescriptionText NOT FOUND: {e}") # ✅ Print all IDs/classes visible to debug panel_candidates = driver.find_elements(By.CSS_SELECTOR, "[id*='job'], [id*='Job'], [id*='desc'], [id*='panel']") for el in panel_candidates[:5]: print(f" candidate: id={el.get_attribute('id')} class={el.get_attribute('class')[:60]}") new_desc = "N/A" # ✅ Wait for change if new_desc not in ("N/A", "", old_desc): description = new_desc else: try: wait = WebDriverWait(driver, 10) wait.until(lambda d: d.find_element(By.ID, "jobDescriptionText").text.strip() not in ("", old_desc)) description = driver.find_element(By.ID, "jobDescriptionText").text.strip() except: description = new_desc # use whatever we got # ✅ Salary — try multiple selectors and print what we find salary = "Not Listed" salary_selectors = [ "div#jobDetailsSection", "[data-testid='jobsearch-OtherJobDetailsContainer']", "[data-testid='attribute_snippet_testid']", "[class*='salary']", "[class*='Salary']", "[data-testid*='salary']", ] for sel in salary_selectors: try: el = driver.find_element(By.CSS_SELECTOR, sel) txt = el.text.strip() if txt: print(f" SALARY via '{sel}': {txt[:80]!r}") salary = txt break except: continue # ✅ Date — try multiple selectors date = "N/A" date_selectors = [ "div#jobsearch-JobMetadataFooter", "[data-testid='jobsearch-JobMetadataFooter']", "[class*='JobMetadataFooter']", "[class*='metadata']", "span[class*='date']", ] for sel in date_selectors: try: el = driver.find_element(By.CSS_SELECTOR, sel) txt = el.text.strip() if txt: print(f" DATE via '{sel}': {txt[:80]!r}") date = txt break except: continue card = cards[idx] jobs = { 'title': card.find('h2').text.strip() if card.find('h2') else "N/A", 'company': card.find('span', {'data-testid': 'company-name'}).text.strip() if card.find('span', {'data-testid': 'company-name'}) else "N/A", 'location': card.find('div', {'data-testid': 'text-location'}).text.strip() if card.find('div', {'data-testid': 'text-location'}) else "N/A", 'salary': salary, 'date': date, 'description': description } full_job.append(jobs) time.sleep(2) except Exception as e: print(f"Error in card {idx}: {e}") continue driver.quit() return pd.DataFrame(full_job) df = get_info("Data", "france", page=3) df

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 

Error: Message: 



In [34]:
df['salary']

0     Job details\nPay\n$88,000 - $132,000 a year\n&...
1     Job details\nPay\n$88,000 - $132,000 a year\n&...
2     Job details\nPay\n$88,000 - $132,000 a year\n&...
3     Job details\nPay\n$88,000 - $132,000 a year\n&...
4     Job details\nPay\n$88,000 - $132,000 a year\n&...
5     Job details\nPay\n$88,000 - $132,000 a year\n&...
6     Job details\nPay\n$88,000 - $132,000 a year\n&...
7     Job details\nPay\n$88,000 - $132,000 a year\n&...
8     Job details\nPay\n$88,000 - $132,000 a year\n&...
9     Job details\nPay\n$88,000 - $132,000 a year\n&...
10    Job details\nPay\n$88,000 - $132,000 a year\n&...
11    Job details\nPay\n$88,000 - $132,000 a year\n&...
12    Job details\nPay\n$88,000 - $132,000 a year\n&...
13    Job details\nPay\n$88,000 - $132,000 a year\n&...
14    Job details\nPay\n$88,000 - $132,000 a year\n&...
15    Job details\nPay\n$88,000 - $132,000 a year\n&...
Name: salary, dtype: object