Necessary Libraries

In [1]:
# Install the scraping dependencies into the environment of the running kernel.
# `%pip` (rather than `!pip`) guarantees the packages land where this notebook's
# interpreter can import them; a single call lets pip resolve both together.
%pip install -q selenium beautifulsoup4


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime
import os


Chrome Driver

In [3]:
# Refresh the apt package index, then install the Chromium browser and its
# chromedriver. The scraper below starts a headless `webdriver.Chrome` session
# and reports an initialization error if these two packages are missing.
!apt-get update -qq
!apt-get install -y chromium-chromedriver chromium-browser -qq

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


Scraper Function

In [4]:
# Site root used to turn root-relative job links into absolute URLs.
_KARKIDI_BASE_URL = "https://www.karkidi.com"
# Placeholder stored for any field that could not be located in the HTML.
_FIELD_NOT_FOUND = "Not found"


def _parse_job_block(job_html_block):
    """Extract the fields of one job posting from its ``div.ads-details`` block.

    Args:
        job_html_block: BeautifulSoup tag for a single job listing.

    Returns:
        A dict with the keys ``Title``, ``Company``, ``Location``,
        ``Experience``, ``Summary``, ``Skills``, ``Posting_Date``, ``Job_URL``
        and ``Job_Type`` (missing values are ``"Not found"``), or ``None`` if
        parsing the block raised; the error is printed in that case.
    """
    fields = dict.fromkeys(
        ["Title", "Company", "Location", "Experience", "Summary", "Skills",
         "Posting_Date", "Job_URL", "Job_Type"],
        _FIELD_NOT_FOUND,
    )

    try:
        cmp_info_div = job_html_block.find("div", class_="cmp-info")
        if cmp_info_div:
            title_link_tag = cmp_info_div.find(
                "a", href=lambda href_val: href_val and "job-details" in href_val
            )
            if title_link_tag:
                h4_tag = title_link_tag.find("h4")
                if h4_tag:
                    fields["Title"] = h4_tag.get_text(strip=True)
                job_url_value = title_link_tag.get('href')
                if job_url_value:
                    # Only root-relative and absolute links are accepted; any
                    # other form leaves Job_URL as "Not found".
                    if job_url_value.startswith('/'):
                        fields["Job_URL"] = _KARKIDI_BASE_URL + job_url_value
                    elif job_url_value.startswith('http'):
                        fields["Job_URL"] = job_url_value

            company_tag = cmp_info_div.find(
                "a", href=lambda href_val: href_val and "Employer-Profile" in href_val
            )
            if company_tag:
                fields["Company"] = company_tag.get_text(strip=True)

            # The location text lives in the <p> that wraps the map-marker icon.
            map_marker_icon = cmp_info_div.find("i", class_="fa-map-marker")
            if map_marker_icon and map_marker_icon.parent and map_marker_icon.parent.name == 'p':
                fields["Location"] = map_marker_icon.parent.get_text(strip=True)

            experience_tag = cmp_info_div.find("p", class_="emp-exp")
            if experience_tag:
                fields["Experience"] = experience_tag.get_text(strip=True)

        hour_details_div = job_html_block.find("div", class_="hour-details")
        if hour_details_div:
            date_p_tag = hour_details_div.find("p")
            if date_p_tag:
                fields["Posting_Date"] = date_p_tag.get_text(strip=True)

            # Job type: prefer a span whose class names the type; otherwise fall
            # back to a "label-warning" span, but only if its text is a known type.
            job_type_span = hour_details_div.find(
                "span",
                class_=lambda c_val: c_val and any(
                    kw_type in c_val
                    for kw_type in ['fulltime', 'parttime', 'contract', 'internship']
                ),
            )
            if job_type_span:
                fields["Job_Type"] = job_type_span.get_text(strip=True)
            else:
                potential_job_type_span = hour_details_div.find("span", class_="label-warning")
                if potential_job_type_span:
                    potential_text = potential_job_type_span.get_text(strip=True)
                    if potential_text in ["Full Time", "Part Time", "Contract"]:
                        fields["Job_Type"] = potential_text

        # Summary: collect the <p>/<ul> siblings that follow the "Summary" label,
        # stopping at the cell that introduces "Key Skills".
        summary_span_tag = job_html_block.find("span", class_="left-content", string="Summary")
        if summary_span_tag:
            summary_content_parts = []
            current_element = summary_span_tag.find_next_sibling()
            while current_element:
                if current_element.name == 'div' and 'msg-cell' in current_element.get('class', []):
                    if current_element.find("span", class_="left-content", string="Key Skills"):
                        break
                if current_element.name in ['p', 'ul']:
                    summary_content_parts.append(
                        current_element.get_text(separator=' ', strip=True)
                    )
                current_element = current_element.find_next_sibling()

            if summary_content_parts:
                fields["Summary"] = " ".join(summary_content_parts).strip()
            else:
                # No sibling content: fall back to the next <p> anywhere after the label.
                fallback_summary_p = summary_span_tag.find_next("p")
                if fallback_summary_p:
                    fields["Summary"] = fallback_summary_p.get_text(strip=True)

        # Skills: prefer the sibling <p class="text-greey">, else the next <p>.
        skills_span_tag = job_html_block.find("span", class_="left-content", string="Key Skills")
        if skills_span_tag:
            skills_p_tag = skills_span_tag.find_next_sibling("p", class_="text-greey")
            if not skills_p_tag:
                skills_p_tag = skills_span_tag.find_next("p")
            if skills_p_tag:
                fields["Skills"] = skills_p_tag.get_text(strip=True)

    except Exception as e:
        known_title = fields["Title"] if fields["Title"] != _FIELD_NOT_FOUND else "Unknown"
        print(f"Error parsing a job block (Title: '{known_title}'): {e}")
        return None

    return fields


def _go_to_next_page(driver, current_page_number):
    """Click the pagination "Next" link; return True if a click was performed.

    Returns False (after printing the reason) when the link is missing,
    disabled, hidden, not interactable, or any other error occurs.
    """
    try:
        next_li_element = driver.find_element(
            By.CSS_SELECTOR, "div#paging-bottom ul.pagination li.next"
        )

        if "disabled" in (next_li_element.get_attribute("class") or "").split():
            print("Next button's parent <li> is disabled. Assuming end of results.")
            return False

        next_button_link = next_li_element.find_element(By.TAG_NAME, "a")
        if not (next_button_link.is_displayed() and next_button_link.is_enabled()):
            print(f"'Next' button link found but not displayed/enabled on page {current_page_number}. End of results.")
            return False

        print(f"Found 'Next' button, clicking to go to page {current_page_number + 1}...")
        # Scroll first so the click is not intercepted by off-screen layout.
        driver.execute_script("arguments[0].scrollIntoView(true);", next_button_link)
        time.sleep(0.5)  # Brief pause after scroll
        next_button_link.click()
        time.sleep(3)  # Wait for new page content to load
        return True
    except NoSuchElementException:
        print(f"'Next' button's parent <li> (li.next) not found on page {current_page_number}. End of results.")
    except ElementNotInteractableException:
        print(f"'Next' button found but not interactable on page {current_page_number}. End of results.")
    except Exception as e_click:
        print(f"An error occurred trying to find/click 'Next' on page {current_page_number}: {e_click}")
    return False


def scrape_karkidi_jobs_selenium_colab_clickable(keyword="data science", max_page_clicks=20):
    """Scrape karkidi.com job listings for one keyword using headless Chrome.

    Loads the first results page for ``keyword`` and then repeatedly clicks the
    pagination "Next" link, parsing every ``div.ads-details`` block on each
    page. Jobs are de-duplicated by URL within the run; jobs without a usable
    URL are skipped.

    Args:
        keyword: Search phrase. It is percent-encoded before being placed in
            the query string, so characters such as ``+`` or ``&`` are safe.
        max_page_clicks: Maximum number of "Next" clicks, i.e. at most
            ``max_page_clicks + 1`` pages are processed.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title``, ``Company``,
        ``Location``, ``Experience``, ``Summary``, ``Skills``,
        ``Posting_Date``, ``Job_URL``, ``Job_Type`` and ``Scraped_Timestamp``.
        It is empty if the WebDriver could not start, the first page could not
        be loaded, or nothing was found.

    Raises:
        Any unexpected exception raised while reading a page propagates to the
        caller, but the browser session is always closed first.
    """
    # Local import keeps this function self-contained within the notebook.
    from urllib.parse import quote

    # Flags needed for headless execution in Colab.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 YourJobScraperSeleniumColab/1.0")

    try:
        driver = webdriver.Chrome(options=options)
    except Exception as e:
        print(f"Error initializing WebDriver in Colab: {e}")
        print("Ensure chromium-chromedriver and chromium-browser are installed via apt-get.")
        return pd.DataFrame()

    jobs_list = []
    seen_job_urls = set()
    # Percent-encode every reserved character, not just spaces.
    encoded_keyword = quote(keyword, safe="")
    initial_url = f"{_KARKIDI_BASE_URL}/Find-Jobs/1/all/India?search={encoded_keyword}"

    # try/finally guarantees the Chrome process is shut down on every exit path.
    try:
        print(f"\n--- Starting scrape (Selenium Clickable) for keyword: '{keyword}' ---")
        print(f"Loading initial page: {initial_url}")
        try:
            driver.get(initial_url)
            time.sleep(3)  # Give the first page time to render
        except Exception as e:
            print(f"Error navigating to initial URL {initial_url} with Selenium: {e}")
            return pd.DataFrame()

        for page_click_count in range(max_page_clicks + 1):
            page_number = page_click_count + 1
            if page_click_count > 0:
                print(f"Processing page {page_number} (after click)")
            else:
                print("Processing page 1 (initial load)")

            soup = BeautifulSoup(driver.page_source, "html.parser")
            job_blocks = soup.find_all("div", class_="ads-details")

            if not job_blocks:
                if page_click_count == 0:
                    print(f"No job blocks found on the first page for keyword '{keyword}'. Stopping for this keyword.")
                else:
                    print(f"No job blocks found on page {page_number} after click. Stopping for this keyword.")
                break

            new_jobs_found_on_this_page_count = 0
            for job_html_block in job_blocks:
                scraped_at = datetime.now().isoformat()
                job = _parse_job_block(job_html_block)
                if job is None:
                    # Parsing failed; the helper already reported the error.
                    continue

                job_url = job["Job_URL"]
                if job_url == _FIELD_NOT_FOUND:
                    title = job["Title"]
                    print(f"Skipping job (Title: '{title}') due to 'Not found' Job_URL.")
                    continue
                if job_url in seen_job_urls:
                    continue

                seen_job_urls.add(job_url)
                job["Scraped_Timestamp"] = scraped_at
                jobs_list.append(job)
                new_jobs_found_on_this_page_count += 1

            # After the first page, a page with no new unique jobs means the
            # click did not advance to fresh results.
            if new_jobs_found_on_this_page_count == 0 and page_click_count > 0:
                print(f"No new unique jobs found on page {page_number} after click. Stopping for keyword '{keyword}'.")
                break

            if page_click_count >= max_page_clicks:  # Reached max click attempts
                print(f"Reached max_page_clicks limit of {max_page_clicks} for keyword '{keyword}'.")
                break

            if not _go_to_next_page(driver, page_number):
                break
    finally:
        driver.quit()  # Close the browser session

    print(f"--- Finished keyword (Selenium Clickable): '{keyword}'. Found {len(jobs_list)} unique jobs for this run. ---")
    return pd.DataFrame(jobs_list)

if __name__ == "__main__":
    # Driver: scrape every keyword below, merge the per-keyword frames,
    # de-duplicate globally by Job_URL, save one CSV and show a short preview.
    keywords_to_scrape = [
        "Data Scientist", "Machine Learning Engineer", "AI Engineer", "Data Analyst",
        "Business Intelligence Developer", "Business Intelligence Analyst",
        "Big Data Engineer", "Data Engineer", "Quantitative Analyst",
        "NLP Engineer", "Computer Vision Engineer", "Statistician",
        "AI Researcher", "Deep Learning Engineer", "MLOps Engineer",
        "Analytics Manager", "Data Science Manager", "AI Product Manager",
        "Data Visualization Specialist", "Business Systems Analyst"
    ]

    all_jobs_dfs_selenium_clickable = []
    data_dir = 'data'
    # exist_ok avoids the check-then-create race and makes the cell re-runnable.
    os.makedirs(data_dir, exist_ok=True)

    for kw_index, kw in enumerate(keywords_to_scrape):
        # Pause only between keywords; there is nothing to wait for after the last one.
        if kw_index > 0:
            print("Pausing for 5 seconds before next keyword (Selenium run)...")
            time.sleep(5)

        # The scraper stops earlier if no new jobs appear or 'Next' is disabled/not found.
        df_keyword_selenium = scrape_karkidi_jobs_selenium_colab_clickable(keyword=kw, max_page_clicks=20)
        if not df_keyword_selenium.empty:
            all_jobs_dfs_selenium_clickable.append(df_keyword_selenium)

    if all_jobs_dfs_selenium_clickable:
        final_df_selenium_clickable = pd.concat(all_jobs_dfs_selenium_clickable, ignore_index=True)
        print(f"\nTotal jobs scraped (Selenium Clickable) before global de-duplication: {len(final_df_selenium_clickable)}")

        # Global de-duplication based on Job_URL: the same posting can match
        # several keywords, and each keyword run only de-duplicates within itself.
        if 'Job_URL' in final_df_selenium_clickable.columns:
            valid_url_jobs_df = final_df_selenium_clickable[
                final_df_selenium_clickable['Job_URL'] != "Not found"
            ]
            if not valid_url_jobs_df.empty:
                initial_valid_count = len(valid_url_jobs_df)
                # Non-mutating de-duplication; the index is rebuilt so it has no
                # gaps left behind by the removed rows.
                final_df_cleaned_selenium_clickable = (
                    valid_url_jobs_df
                    .drop_duplicates(subset=['Job_URL'], keep='first')
                    .reset_index(drop=True)
                )
                print(f"Dropped {initial_valid_count - len(final_df_cleaned_selenium_clickable)} duplicates from Selenium jobs with valid URLs.")
            else:
                final_df_cleaned_selenium_clickable = pd.DataFrame()  # No valid URL jobs found
        else:
            # Not expected: every scraped record carries a Job_URL key.
            final_df_cleaned_selenium_clickable = final_df_selenium_clickable

        if not final_df_cleaned_selenium_clickable.empty:
            print(f"\nTotal unique jobs (Selenium Clickable, with valid URLs) saved: {len(final_df_cleaned_selenium_clickable)}")
            csv_path_selenium_clickable = os.path.join(data_dir, 'karkidi_jobs_selenium_BULK_datascience.csv')
            final_df_cleaned_selenium_clickable.to_csv(csv_path_selenium_clickable, index=False)
            print(f"Successfully saved {len(final_df_cleaned_selenium_clickable)} unique jobs to {csv_path_selenium_clickable}")

            print("\nSample of final compiled Selenium data (first 5 rows):")
            # Imported here so the block also works when run outside a notebook
            # kernel, where `display` is not predefined.
            from IPython.display import display
            display(final_df_cleaned_selenium_clickable.head())

            print("\nFinal Selenium DataFrame Info:")
            final_df_cleaned_selenium_clickable.info()

        else:
            print("No unique jobs with valid URLs to save from Selenium scrape after de-duplication.")
    else:
        print("No jobs were scraped using Selenium across all specified keywords.")


--- Starting scrape (Selenium Clickable) for keyword: 'Data Scientist' ---
Loading initial page: https://www.karkidi.com/Find-Jobs/1/all/India?search=Data%20Scientist
Processing page 1 (initial load)
Found 'Next' button, clicking to go to page 2...
Processing page 2 (after click)
Found 'Next' button, clicking to go to page 3...
Processing page 3 (after click)
Found 'Next' button, clicking to go to page 4...
Processing page 4 (after click)
Found 'Next' button, clicking to go to page 5...
Processing page 5 (after click)
Found 'Next' button, clicking to go to page 6...
Processing page 6 (after click)
Found 'Next' button, clicking to go to page 7...
Processing page 7 (after click)
Found 'Next' button, clicking to go to page 8...
Processing page 8 (after click)
Found 'Next' button, clicking to go to page 9...
Processing page 9 (after click)
Found 'Next' button, clicking to go to page 10...
Processing page 10 (after click)
Found 'Next' button, clicking to go to page 11...
Processing page 11

Unnamed: 0,Title,Company,Location,Experience,Summary,Skills,Posting_Date,Job_URL,Job_Type,Scraped_Timestamp
0,Machine Learning Physical Design Engineer,Google,"Bengaluru, Karnataka, India",4-6 year,Minimum qualifications: Bachelor's degree in E...,"Aartificial intelligence,Algorithms,Data struc...",20 May 2025,https://www.karkidi.com/job-details/89536-mach...,Full Time,2025-05-20T17:56:56.028158
1,"Staff Software Engineer - Monetization, Poe (R...","Quora, Inc.",India,8-10 year,About Quora: Quora’s mission is to grow and sh...,"Aartificial intelligence,Analytical and Proble...",17 May 2025,https://www.karkidi.com/job-details/89511-staf...,Full Time,2025-05-20T17:56:56.029094
2,Staff Backend Engineer - Bot Creator Ecosystem...,"Quora, Inc.",India,8-10 year,About Quora: Quora’s mission is to grow and sh...,"Aartificial intelligence,API,Data science tech...",14 May 2025,https://www.karkidi.com/job-details/89486-staf...,Full Time,2025-05-20T17:56:56.029800
3,Senior Backend Engineer - Bot Creator Ecosyste...,"Quora, Inc.",India,6-8 year,About Quora: Quora’s mission is to grow and sh...,"Aartificial intelligence,API,Data science tech...",14 May 2025,https://www.karkidi.com/job-details/89469-seni...,Full Time,2025-05-20T17:56:56.030515
4,Data Scientist Lead - AIML,JPMorgan Chase,"Bengaluru, Karnataka, India",6-8 year,We have an opportunity to impact your career a...,"Aartificial intelligence,Data science techniqu...",13 May 2025,https://www.karkidi.com/job-details/89455-data...,Full Time,2025-05-20T17:56:56.031249



Final Selenium DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 210 entries, 0 to 209
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              210 non-null    object
 1   Company            210 non-null    object
 2   Location           210 non-null    object
 3   Experience         210 non-null    object
 4   Summary            210 non-null    object
 5   Skills             210 non-null    object
 6   Posting_Date       210 non-null    object
 7   Job_URL            210 non-null    object
 8   Job_Type           210 non-null    object
 9   Scraped_Timestamp  210 non-null    object
dtypes: object(10)
memory usage: 18.0+ KB
