<a href="https://colab.research.google.com/github/bhargavasomya/linkedin-jobs-analysis/blob/main/linkedin_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# the next 2 libraries are used to create a random sleep function
import random
import time
# used to percent-encode search terms placed in the URL query string
from urllib.parse import quote

# used to make HTTP requests to web servers.
import requests
# for parsing HTML and XML documents
from bs4 import BeautifulSoup
# used to store extracted data in dataframe
import pandas as pd

In [8]:
# Define job position and location.
# `position` is plain text and gets percent-encoded below.
# `location` is already percent-encoded ('%20' for spaces) and is inserted into
# the search URL as-is, so keep it in encoded form when changing it.
position = 'Data Scientist'
location = 'Greater%20Seattle%20Area'

# Convert position to a URL-friendly format.
# quote(..., safe='') escapes spaces as '%20' and also escapes reserved
# characters ('+', '&', '#', '?', '/') that a plain space replacement would
# leave untouched and that would otherwise alter the query string
# (e.g. 'C++ Developer' or 'R&D Engineer').
url_friendly_position = quote(position, safe='')

In [9]:
# User-Agent strings used to make requests look like they come from real browsers.
_user_agents = (
    'Mozilla/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
    'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Mobile Safari/537.36',
    # NOTE(review): identical to the second entry, so that agent is twice as
    # likely to be drawn from the pool as the others - confirm this is intended.
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
)

# One header dict per User-Agent, in the same order as listed above.
headers = [{'User-Agent': agent} for agent in _user_agents]

In [10]:
# Scrape LinkedIn job search result pages, fetch each job's description page,
# and save everything collected to 'linkedin_filtered_jobs2.csv'.
# Reads `url_friendly_position`, `location` and `headers` defined in earlier cells.

# Initialize list to store job data (one dict per job posting)
jobs_data = []

# Number of result pages to request
num_pages = 5  # Adjust as needed

# Pagination step for the `start` query parameter
# (5 pages -> start offsets 0, 25, 50, 75, 100)
JOBS_PER_PAGE = 25

# Seconds to wait for a response. Without a timeout, requests can wait
# indefinitely on a stalled connection and hang the whole cell.
REQUEST_TIMEOUT = 15

# Column order of the output CSV; matches the keys stored for every job.
# Passing it explicitly keeps the header row even when nothing was scraped.
CSV_COLUMNS = ["Job Title", "Company", "Location", "Job Description", "Job URL"]

for page in range(num_pages):
    start = page * JOBS_PER_PAGE  # Pagination step

    # Construct URL with entry & mid-level filter
    # For internships update the URL with f_E=1
    # For Manager roles f_E=5,6
    url_search = f'https://www.linkedin.com/jobs/search/?keywords={url_friendly_position}&location={location}&f_E=2,3,4&start={start}'

    # Select a random header to avoid detection
    head = random.choice(headers)
    print(f"\nFetching jobs from page {page + 1} - URL: {url_search}")

    # Send request to fetch job listings. A network error stops pagination
    # (same as a non-200 status) instead of aborting the cell, so the jobs
    # collected so far are still written to the CSV below.
    try:
        response = requests.get(url_search, headers=head, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(f"Failed to retrieve jobs from page {page + 1}: {e}")
        break

    if response.status_code != 200:
        print(f"Failed to retrieve jobs from page {page + 1}, status code: {response.status_code}")
        break

    # Parse the response
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find job list container
    joblist = soup.find('ul', class_="jobs-search__results-list")
    if joblist is None:
        print(f"No job listings found on page {page + 1}")
        continue

    # Find all job postings
    alljobs = joblist.find_all('li')

    # Loop through each job posting
    for job in alljobs:
        try:
            # Extract job info container
            info = job.find('div', class_="base-search-card__info")
            if info is None:
                print("Error extracting job details: job card has no info container")
                continue

            # Locate title, company and link before reading them, so a card
            # with missing markup is reported clearly and skipped
            title_element = info.find('h3', class_="base-search-card__title")
            company_element = info.find('h4', class_="base-search-card__subtitle")
            link_element = job.find('a', class_="base-card__full-link")
            joburl = link_element.get('href') if link_element is not None else None

            if title_element is None or company_element is None or not joburl:
                print("Error extracting job details: job card is missing title, company, or link")
                continue

            title = title_element.text.strip()
            company = company_element.text.strip()

            # Extract job location
            location_element = job.find('span', class_="job-search-card__location")
            location_job = location_element.text.strip() if location_element is not None else "Not specified"

            print(f"Fetching job description for: {title} at {company}")

            # Request job page to extract description. A network error is
            # treated like a non-200 response so the job is still recorded.
            try:
                job_response = requests.get(joburl, headers=head, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(f"Request for job page failed: {e}")
                job_response = None
            time.sleep(random.uniform(3, 7))  # Adding a delay to avoid detection

            if job_response is not None and job_response.status_code == 200:
                job_soup = BeautifulSoup(job_response.text, 'html.parser')

                # Extract job description
                description_section = job_soup.find('div', class_="show-more-less-html__markup")
                job_description = description_section.text.strip() if description_section is not None else "No description available"
            else:
                job_description = "Failed to retrieve"

            # Store job data
            jobs_data.append({
                "Job Title": title,
                "Company": company,
                "Location": location_job,
                "Job Description": job_description,
                "Job URL": joburl
            })

        except Exception as e:
            # Last-resort guard: one malformed card must not stop the whole scrape
            print(f"Error extracting job details: {e}")

    # Random delay before fetching next page
    time.sleep(random.uniform(5, 10))

# Save to CSV
df = pd.DataFrame(jobs_data, columns=CSV_COLUMNS)
df.to_csv("linkedin_filtered_jobs2.csv", index=False)

if jobs_data:
    print("\n Job details saved to 'linkedin_filtered_jobs2.csv' successfully!")
else:
    print("\n No jobs were collected; 'linkedin_filtered_jobs2.csv' contains only the header row.")



Fetching jobs from page 1 - URL: https://www.linkedin.com/jobs/search/?keywords=Data%20Scientist&location=Greater%20Seattle%20Area&f_E=2,3,4&start=0
Fetching job description for: Data Scientist - Bellevue at Resulticks
Fetching job description for: Data Scientist at Synergis
Fetching job description for: Data Scientist at IntePros
Fetching job description for: Data Scientist / Senior Data Scientist, Analytics at DoorDash
Fetching job description for: Machine Learning Engineer, Gen AI at Tecton
Fetching job description for: ML ( Machine Learning ) Engineers at Bright Mind Solutions LLC
Fetching job description for: Junior ML Engineer (Remote) at SynergisticIT
Fetching job description for: Machine Learning Engineer at Rec Room
Fetching job description for: Senior Data Scientist at LatentView Analytics
Fetching job description for: Entry Level Data Scientist (Remote) at SynergisticIT
Fetching job description for: Data Scientist at People Tech Group Inc
Fetching job description for: Data 