# Naukri Web Scraping

### Naukri Web Scraping is a project that scrapes job postings for any domain, along with their details, and analyses the scraped data.

In [None]:
# Cell 1: Imports
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import time
import re

# Show plots inline
%matplotlib inline


In [None]:
# Cell 2: Initialize WebDriver
def initialize_driver():
    """Create a new Firefox WebDriver instance and return it."""
    return webdriver.Firefox()

# Create the WebDriver instance that is passed to scrape_naukri_jobs() and
# closed with driver.quit() in later cells.
driver = initialize_driver()


In [None]:
# Cell 3: Scraping Function (customizable role + pages)
def scrape_naukri_jobs(driver, job_role, num_pages):
    """
    Scrape job data from Naukri.com for a given role and number of pages.

    Every element with class ``srp-jobtuple-wrapper`` on a result page is
    treated as one job card. A card that lacks any required field (title,
    company, location, experience) is skipped. Salary and skills are
    optional and recorded as "NA" when absent.

    Args:
        driver: Selenium WebDriver
        job_role (str): Job keyword/role to search (e.g., "python tester")
        num_pages (int): Number of pages to scrape

    Returns:
        dict: Scraped job data. Maps each of "job_no", "roles", "companies",
            "locations", "experience", "salaries" and "skills" to a list;
            all lists have the same length (one entry per scraped card).
            "job_no" is the 1-based position of the card across all pages
            visited, so numbers of skipped cards are left unused.
    """
    jobs = {
        "job_no": [],
        "roles": [],
        "companies": [],
        "locations": [],
        "experience": [],
        "salaries": [],
        "skills": []
    }

    job_role = job_role.replace(" ", "-")  # URL-friendly

    # The implicit wait is a session-wide setting, so it is configured once
    # instead of being re-sent to the browser for every job card.
    driver.implicitly_wait(2)

    # Number of cards found on the pages already visited. Using a running
    # total keeps job numbers unique even when pages hold different numbers
    # of cards (multiplying the page index by the current page's length
    # does not).
    cards_seen = 0

    for i in range(num_pages):
        # NOTE(review): the page number in the URL starts at 0 here; confirm
        # that Naukri serves "-jobs-0" as the first result page.
        url = f"https://www.naukri.com/{job_role}-jobs-{i}"
        print(f"Scraping page {i+1}/{num_pages}: {url}")
        driver.get(url)
        # Fixed pause after navigation, presumably to let the listings render.
        time.sleep(3)

        lst = driver.find_elements(By.CLASS_NAME, "srp-jobtuple-wrapper")

        for index, job in enumerate(lst, start=1):
            jobno = cards_seen + index

            # Only the required lookups can raise NoSuchElementException.
            try:
                role = job.find_element(By.CLASS_NAME, "title").text
                company = job.find_element(By.CLASS_NAME, "comp-name").text
                location = job.find_element(By.CLASS_NAME, "loc-wrap").text
                exp = job.find_element(By.CLASS_NAME, "exp-wrap").text
            except NoSuchElementException:
                # Incomplete card: skip it entirely so all columns stay aligned.
                continue

            # Salary (optional)
            salary_elements = job.find_elements(By.CLASS_NAME, "sal-wrap")
            salary = salary_elements[0].text if salary_elements else "NA"

            # Skills (optional)
            skill_elements = job.find_elements(By.CSS_SELECTOR, ".tags-gt li")
            skill = ",".join([li.text for li in skill_elements]) if skill_elements else "NA"

            jobs["job_no"].append(jobno)
            jobs["roles"].append(role)
            jobs["companies"].append(company)
            jobs["locations"].append(location)
            jobs["experience"].append(exp)
            jobs["salaries"].append(salary)
            jobs["skills"].append(skill)

        cards_seen += len(lst)
    return jobs


In [None]:
# Cell 4: Process Data
def process_job_data(jobs_data):
    """
    Turn the scraped job dictionary into a cleaned DataFrame.

    Every column (including ``job_no``) is converted to lowercase strings.
    ``skills`` and ``locations`` are then split on commas into lists, with
    surrounding whitespace removed from each item so that e.g.
    "bengaluru, mumbai" yields ["bengaluru", "mumbai"] rather than
    ["bengaluru", " mumbai"]. The " lacs pa" and " yrs" unit suffixes are
    removed from ``salaries`` and ``experience``.

    Args:
        jobs_data (dict): Column name -> list of values, with at least the
            keys "skills", "locations", "salaries" and "experience".

    Returns:
        pandas.DataFrame: One row per job with the cleaned columns.
    """
    df_raw = pd.DataFrame.from_dict(jobs_data)
    df_raw = df_raw.apply(lambda col: col.astype(str).str.lower())

    # Split the comma-separated columns into lists of trimmed items.
    for column in ('skills', 'locations'):
        df_raw[column] = [
            [item.strip() for item in value.split(",")]
            for value in df_raw[column]
        ]

    # Clean salaries and experience (values are already lowercase here).
    df_raw['salaries'] = df_raw['salaries'].str.replace(' lacs pa', '', regex=False)
    df_raw['experience'] = df_raw['experience'].str.replace(' yrs', '', regex=False)

    return df_raw


In [None]:
# Cell 5: Analysis Functions
def analyze_experience(df):
    """Show the ten most frequent values of df['experience'] as a table and a bar chart."""
    print("\n📊 Experience Analysis")

    # value_counts() sorts by frequency, so head(10) is the top ten.
    top_experience = df['experience'].value_counts().head(10)
    display(top_experience)

    plt.figure(figsize=(10, 6))
    top_experience.plot(kind='bar', color='skyblue')
    plt.title('Experience Range')
    plt.show()

def analyze_salary(df):
    """Show the ten most frequent values of df['salaries'] as a table and a bar chart."""
    print("\n📊 Salary Analysis")

    # value_counts() sorts by frequency, so head(10) is the top ten.
    top_salaries = df['salaries'].value_counts().head(10)
    display(top_salaries)

    plt.figure(figsize=(10, 6))
    top_salaries.plot(kind='bar', color='orange')
    plt.title('Salary Range')
    plt.show()

def clean_and_analyze_locations(df):
    """
    Clean the job locations, then show their frequencies and a word cloud.

    Expects ``df`` to have a ``job_no`` column and a ``locations`` column
    holding a list of location strings per row. Each list is exploded to one
    row per location, the names are cleaned and a few city spellings are
    normalized. The ten most frequent locations are displayed as a table
    and a bar chart, followed by a word cloud of all locations.

    Args:
        df (pandas.DataFrame): Processed job data.
    """
    print("\n📊 Location Analysis")

    # One row per (job, location). The explicit copy makes df_location an
    # independent frame, so the column assignments below do not trigger
    # pandas' SettingWithCopyWarning.
    df_location = df.explode('locations')[['job_no', 'locations']].copy()

    # Basic cleaning. The pattern is meant to drop parenthesised text, a
    # "hybrid -" prefix, the word "new" and anything after a "/". Its `\s*`
    # alternative removes all whitespace, so multi-word names are collapsed
    # (e.g. "navi mumbai" -> "navimumbai") before normalization.
    cleanup = re.compile(r'\(.*\)|hybrid\s*-\s*|\bnew\s|\s*|/.*$')
    locations = df_location['locations'].apply(lambda x: cleanup.sub('', x).strip())

    # Normalization: any word containing the fragment becomes the canonical name.
    canonical_names = (
        (r'\b\w*mumbai\w*\b', 'mumbai'),
        (r'\b\w*delhi\w*\b', 'delhi'),
        (r'\b\w*bangal\w*\b', 'bengaluru'),
        (r'\b\w*noida\w*\b', 'noida'),
    )
    for pattern, canonical in canonical_names:
        locations = locations.str.replace(pattern, canonical, regex=True)
    df_location['locations'] = locations

    location_counts = df_location['locations'].value_counts()
    display(location_counts.head(10))

    plt.figure(figsize=(10, 6))
    location_counts.head(10).plot(kind='bar', color='skyblue')
    plt.title('Job Locations')
    plt.show()

    # Wordcloud
    location_string = ', '.join(df_location['locations'].dropna())
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(location_string)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

def analyze_skills(df):
    """
    Report the number of distinct skills and draw a word cloud of them.

    Expects ``df`` to have a ``job_no`` column and a ``skills`` column
    holding a list of skill strings per row. The "na" placeholder recorded
    for jobs without listed skills is excluded, so it is neither counted as
    a skill nor drawn in the word cloud.

    Args:
        df (pandas.DataFrame): Processed job data.
    """
    print("\n📊 Skills Analysis")

    # One row per (job, skill).
    df_skills = df.explode('skills')[['job_no', 'skills']]

    # Drop missing entries and the "na" placeholder.
    skills = df_skills['skills'].dropna()
    skills = skills[skills != 'na']

    distinct_skill = skills.unique()
    print(f"Total distinct skills: {len(distinct_skill)}")

    # WordCloud.generate() cannot build a cloud from empty text.
    if skills.empty:
        print("⚠️ No skills available for the word cloud")
        return

    # Wordcloud
    skills_string = ', '.join(skills)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(skills_string)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


In [None]:
# Cell 6: Run Scraper (choose role & pages here)
# Search settings: edit these two values to change what gets scraped.
job_role = "python tester"   # 🔹 Any job role works here (e.g. "selenium", "qa automation")
num_pages = 1               # 🔹 How many result pages to scrape

jobs_data = scrape_naukri_jobs(driver, job_role=job_role, num_pages=num_pages)

if not jobs_data["job_no"]:
    # Nothing was collected, so there is nothing to process or save.
    print("⚠️ No jobs scraped")
else:
    df_raw = process_job_data(jobs_data)
    display(df_raw.head())

    # Output file name: spaces in the role become underscores.
    csv_name = f'Naukri_{job_role.replace(" ","_")}.csv'
    df_raw.to_csv(csv_name, index=False)
    print(f"✅ Raw data saved as {csv_name}")


In [None]:
# Cell 7: Run Detailed Analysis
# Run every analysis, in order, on the processed data.
# df_raw only exists if the scraping cell above collected at least one job.
for run_analysis in (
    analyze_experience,
    analyze_salary,
    clean_and_analyze_locations,
    analyze_skills,
):
    run_analysis(df_raw)


In [None]:
# Cell 8: Close WebDriver
# Quit the WebDriver created by initialize_driver() once scraping is finished.
driver.quit()
