In [11]:
# pip install -U python-jobspy

In [None]:
# Install required dependencies for the local jobspy module (using --user to avoid permission issues)
import subprocess
import sys

# Install dependencies that jobspy might need
packages = [
    'markdownify', 
    'tls-client', 
    'regex',
    'pandas',  # Core dependency
    'requests',  # Likely needed for web scraping
    'beautifulsoup4',  # Likely needed for HTML parsing
    'lxml',  # Parser for BeautifulSoup
]
print("Installing/updating dependencies...")
for pkg in packages:
    try:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--user', pkg], 
                             stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        print(f"✓ {pkg} installed/updated")
    except subprocess.CalledProcessError:
        print(f"⚠ {pkg} installation had issues (may already be installed)")
print("Dependencies check complete.")

In [None]:
import csv
import sys
import os
import traceback

try:
    print("Step 1: Setting up paths...")
    # Get the workspace directory (one level up from Scrapers/)
    workspace_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
    jobspy_dir = os.path.join(workspace_dir, 'jobspy')
    jobspy_init = os.path.join(jobspy_dir, '__init__.py')
    
    print(f"  - Workspace dir: {workspace_dir}")
    print(f"  - Jobspy dir: {jobspy_dir}")
    
    # Verify the jobspy folder and __init__.py exist
    if not os.path.exists(jobspy_dir):
        raise ImportError(f"jobspy folder not found at: {jobspy_dir}")
    if not os.path.exists(jobspy_init):
        raise ImportError(f"jobspy/__init__.py not found at: {jobspy_init}")
    print("  ✓ Paths verified")
    
except Exception as e:
    print(f"❌ Error in path setup: {type(e).__name__}: {e}")
    traceback.print_exc()
    raise

try:
    print("\nStep 2: Configuring sys.path...")
    # Ensure workspace directory is at the front of sys.path BEFORE any imports
    # This must happen before importing jobspy
    if workspace_dir in sys.path:
        sys.path.remove(workspace_dir)
    sys.path.insert(0, workspace_dir)
    print(f"  ✓ Workspace directory added to sys.path (position 0)")
    print(f"  - First 3 entries in sys.path: {sys.path[:3]}")
    
except Exception as e:
    print(f"❌ Error configuring sys.path: {type(e).__name__}: {e}")
    traceback.print_exc()
    raise

try:
    print("\nStep 3: Importing jobspy module...")
    
    # First, try importing basic dependencies that jobspy needs
    print("  - Checking dependencies...")
    try:
        import pandas as pd
        print("    ✓ pandas")
    except ImportError as e:
        print(f"    ❌ pandas: {e}")
        raise
    
    try:
        from concurrent.futures import ThreadPoolExecutor
        print("    ✓ concurrent.futures")
    except ImportError as e:
        print(f"    ❌ concurrent.futures: {e}")
        raise
    
    # Try importing jobspy step by step to find where it fails
    print("  - Importing jobspy submodules...")
    
    try:
        import jobspy.model
        print("    ✓ jobspy.model")
    except Exception as e:
        print(f"    ❌ jobspy.model: {type(e).__name__}: {e}")
        traceback.print_exc()
        raise
    
    try:
        import jobspy.util
        print("    ✓ jobspy.util")
    except Exception as e:
        print(f"    ❌ jobspy.util: {type(e).__name__}: {e}")
        traceback.print_exc()
        raise
    
    # Try importing scrapers one by one
    scrapers = ['bayt', 'bdjobs', 'glassdoor', 'google', 'indeed', 'linkedin', 'naukri', 'ziprecruiter']
    for scraper_name in scrapers:
        try:
            module = __import__(f'jobspy.{scraper_name}', fromlist=[scraper_name])
            print(f"    ✓ jobspy.{scraper_name}")
        except Exception as e:
            print(f"    ⚠ jobspy.{scraper_name}: {type(e).__name__}: {e}")
            # Don't raise - some scrapers might have optional dependencies
    
    # Now try importing the main module
    print("  - Importing main jobspy module...")
    try:
        import jobspy
        print("    ✓ jobspy imported")
    except Exception as e:
        print(f"    ❌ Error importing jobspy: {type(e).__name__}: {e}")
        traceback.print_exc()
        raise
    
    # Verify we're using the local version
    local_jobspy_path = os.path.abspath(os.path.dirname(jobspy.__file__))
    expected_path = os.path.abspath(jobspy_dir)
    if local_jobspy_path == expected_path:
        print(f"  ✓ Successfully loaded jobspy from: {local_jobspy_path}")
    else:
        print(f"  ⚠ Warning: jobspy loaded from {local_jobspy_path}, expected {expected_path}")
    
    # Now try importing scrape_jobs
    print("  - Importing scrape_jobs...")
    try:
        from jobspy import scrape_jobs
        print("  ✓ scrape_jobs imported successfully")
    except Exception as e:
        print(f"  ❌ Error importing scrape_jobs: {type(e).__name__}: {e}")
        traceback.print_exc()
        raise
    
    # Check if scrape_jobs has google_search_term parameter
    import inspect
    sig = inspect.signature(scrape_jobs)
    if 'google_search_term' in sig.parameters:
        print(f"  ✓ scrape_jobs supports 'google_search_term' parameter")
    else:
        print(f"  ❌ scrape_jobs does NOT support 'google_search_term' parameter")
        print(f"  Available parameters: {list(sig.parameters.keys())}")
    
except ImportError as e:
    print(f"\n❌ ImportError: {e}")
    print("  This usually means a required dependency is missing.")
    traceback.print_exc()
    print("\n  Try installing missing dependencies or check the error above.")
    raise
except Exception as e:
    print(f"\n❌ Error importing jobspy: {type(e).__name__}: {e}")
    traceback.print_exc()
    raise

try:
    print("\nStep 4: Calling scrape_jobs...")
    jobs = scrape_jobs(
        site_name=["indeed", "linkedin", "zip_recruiter", "google"], # "glassdoor", "bayt", "naukri", "bdjobs"
        search_term="software engineer",
        google_search_term="software engineer jobs near San Francisco, CA since yesterday",
        location="San Francisco, CA",
        results_wanted=20,
        hours_old=72,
        country_indeed='USA',
        
        # linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
        # proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
    )
    print(f"  ✓ scrape_jobs completed successfully")
    print(f"  Found {len(jobs)} jobs")
    if len(jobs) > 0:
        print(f"  First few jobs:")
        print(jobs.head())
    else:
        print("  ⚠ No jobs found")
        
except TypeError as e:
    print(f"❌ TypeError in scrape_jobs: {e}")
    print("  This usually means a parameter is not supported")
    traceback.print_exc()
    raise
except Exception as e:
    print(f"❌ Error calling scrape_jobs: {type(e).__name__}: {e}")
    traceback.print_exc()
    raise

try:
    print("\nStep 5: Saving to CSV...")
    jobs.to_csv("jobs.csv", quoting=csv.QUOTE_NONNUMERIC, escapechar="\\", index=False)
    print(f"  ✓ Jobs saved to jobs.csv")
    
except Exception as e:
    print(f"❌ Error saving CSV: {type(e).__name__}: {e}")
    traceback.print_exc()
    raise

print("\n✅ All steps completed successfully!")



Step 1: Setting up paths...
  - Workspace dir: /home/jovyan
  - Jobspy dir: /home/jovyan/jobspy
  ✓ Paths verified

Step 2: Configuring sys.path...
  ✓ Workspace directory added to sys.path (position 0)
  - First 3 entries in sys.path: ['/home/jovyan', '/opt/conda/lib/python313.zip', '/opt/conda/lib/python3.13']

Step 3: Importing jobspy module...
  - Checking dependencies...
    ✓ pandas
    ✓ concurrent.futures
  - Importing jobspy submodules...
