In [1]:
!pip install selenium beautifulsoup4 pandas


Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting websocket-client~=1.8 (from selenium)
  Using cached websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
   ---------------------------------------- 0.0/9.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.4 MB 660.6 kB/s eta 0:00:15
   -------------

In [2]:
import json
import pandas as pd
import time
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

In [3]:
# Function to calculate the posted date
def calculate_posted_date(text, today=None):
    """Convert a relative "posted ... ago" label into an absolute date string.

    The first "<number> <unit>" pair found in ``text`` (for example
    "3 days ago", "Posted 2 weeks ago", "1 month ago", "5 hours ago") is
    subtracted from the reference date.

    Args:
        text: Relative posting label.
        today: Reference ``datetime`` to subtract from. Defaults to
            ``datetime.today()``; pass a fixed value for reproducible results.

    Returns:
        The computed date formatted as ``DD-MM-YYYY``, or the string "null"
        when ``text`` contains no recognisable "<number> <unit>" pair.
    """
    import re

    # Months and years are approximated (30 / 365 days): the label itself is
    # only a coarse indication of age.
    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}

    found = re.search(
        r'(\d+)\s*(second|minute|hour|day|week|month|year)s?\b',
        text or '',
        re.IGNORECASE,
    )
    if found is None:
        return "null"

    amount = int(found.group(1))
    unit = found.group(2).lower()
    if unit in days_per_unit:
        age = timedelta(days=amount * days_per_unit[unit])
    else:
        # second / minute / hour map directly onto timedelta keyword arguments
        age = timedelta(**{unit + 's': amount})

    reference = today if today is not None else datetime.today()
    return (reference - age).strftime('%d-%m-%Y')
    

In [4]:
# Function to scrape job details from a single job post
def scrape_job_details(job_soup):
    """Extract the fields of interest from one parsed job-posting page.

    Args:
        job_soup: Parsed job page offering BeautifulSoup's ``find`` and
            ``find_all`` lookups.

    Returns:
        A dict with the keys ``company``, ``job_title``, ``linkedin_job_id``,
        ``location``, ``posted_on``, ``posted_date``, ``employment_type`` and
        ``seniority_level``. A field whose element is missing from the page is
        stored as the string "null".
    """
    def _text_of(tag_name, css_class):
        # Single lookup per field; "null" marks an element absent from the page.
        node = job_soup.find(tag_name, class_=css_class)
        return node.text.strip() if node else "null"

    job_details = {}
    job_details['company'] = _text_of('a', 'topcard__org-name-link')
    job_details['job_title'] = _text_of('h1', 'topcard__title')

    id_meta = job_soup.find('meta', {'name': 'decoratedJobPostingId'})
    # .get() so a <meta> tag lacking a content attribute yields "null" instead of KeyError
    job_details['linkedin_job_id'] = id_meta.get('content', "null") if id_meta else "null"

    job_details['location'] = _text_of('span', 'topcard__flavor topcard__flavor--bullet')

    posted_on_text = _text_of('span', 'posted-time-ago__text')
    job_details['posted_on'] = posted_on_text
    posted_date = "null"
    if posted_on_text != "null":
        try:
            posted_date = calculate_posted_date(posted_on_text)
        except (ValueError, IndexError):
            # An unparseable label must not abort the scrape of the other fields.
            posted_date = "null"
    job_details['posted_date'] = posted_date

    # Both criteria share one CSS class, so a single find() can only ever
    # return the first of them; collect all matches and pick by position.
    # NOTE(review): assumes the page lists "Seniority level" first and
    # "Employment type" second - confirm against the live markup.
    criteria = [
        span.text.strip()
        for span in job_soup.find_all('span', class_='job-criteria__text job-criteria__text--criteria')
    ]
    job_details['employment_type'] = criteria[1] if len(criteria) > 1 else "null"
    job_details['seniority_level'] = criteria[0] if criteria else "null"

    return job_details

In [5]:
# URLs of the job listings to scrape.
# Each entry is a LinkedIn job-search results page with location=India. The
# entries differ mainly in their f_C value (presumably a company filter -
# TODO confirm); the third also carries f_TPR=r86400 (presumably a
# posting-time window - TODO confirm).
# The scraping loop visits them in the order listed here.
urls = [
    "https://www.linkedin.com/jobs/search?location=India&geoId=102713980&f_C=1035&position=1&pageNum=0",
    "https://www.linkedin.com/jobs/search?keywords=&location=India&geoId=102713980&f_C=1441",
    "https://www.linkedin.com/jobs/search?keywords=&location=India&geoId=102713980&f_TPR=r86400&f_C=1586&position=1&pageNum=0"
]

In [6]:
# Start the Selenium WebDriver.
# This single Chrome session is reused for every page load in the scraping
# loop and is shut down by the driver.quit() cell further down.
driver = webdriver.Chrome()  # Ensure you have the appropriate driver for your browser


In [7]:
# List to hold all job details: one dict per scraped job posting.
# The scraping loop appends to this list, so re-run this cell before
# re-running the loop; otherwise records from the previous run are kept.
all_jobs = []


In [8]:
# Loop over the URLs and scrape data
for url in urls:
    driver.get(url)
    time.sleep(5)  # Wait for the page to load completely

    # Scroll down to load more jobs: keep scrolling to the bottom until the
    # page height stops growing, i.e. no further results were appended.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Adjust as necessary for your connection speed
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse the page with BeautifulSoup. The listing is snapshotted here, so
    # navigating the driver to individual job pages below does not affect it.
    # NOTE(review): the class names depend on the site's current markup - confirm
    # they still match if no cards are found.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    job_cards = soup.find_all('li', class_='result-card job-result-card')

    for job_card in job_cards:
        link_tag = job_card.find('a', class_='result-card__full-card-link')
        job_link = link_tag.get('href') if link_tag else None
        if not job_link:
            # A card without a usable link cannot be opened; skip it instead
            # of failing the whole run with a TypeError on None['href'].
            continue
        driver.get(job_link)
        time.sleep(3)  # Wait for the job page to load completely
        job_soup = BeautifulSoup(driver.page_source, 'html.parser')
        job_details = scrape_job_details(job_soup)
        all_jobs.append(job_details)

    # Break the loop if we've reached at least 50 jobs.
    # The check runs once per listing URL, so the final count may exceed 50.
    if len(all_jobs) >= 50:
        break

In [9]:
# Close the driver now that scraping is finished.
# Run this cell even if the scraping loop stopped with an error, so the
# WebDriver session started earlier is not left open.
driver.quit()

In [10]:
# Write the collected job records to 'jobs_data.json', pretty-printed with a
# 4-space indent (the CSV export is handled in the following cell).
with open('jobs_data.json', 'w') as out_file:
    out_file.write(json.dumps(all_jobs, indent=4))


In [11]:
# Build a table with one row per scraped job and write it to CSV;
# index=False leaves the DataFrame's row index out of the file.
df = pd.DataFrame(all_jobs)
df.to_csv('jobs_data.csv', index=False)

# Final status message; the JSON file it mentions is written by the previous cell.
print("Scraping completed. Data saved to 'jobs_data.json' and 'jobs_data.csv'.")

Scraping completed. Data saved to 'jobs_data.json' and 'jobs_data.csv'.
