In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np

import traceback
import logging

# Set up basic logging configuration
logging.basicConfig(
    level=logging.INFO,                     # Set the logging level to INFO
    format='%(asctime)s - %(levelname)s - %(message)s',  # Customize the log format
    handlers=[logging.StreamHandler()]      # Ensure logs are sent to the console (Jupyter output)
)

In [2]:
def get_jobs_data(df, bs_data):

    all_jobs = bs_data.find_all("div", attrs={"class": "cust-job-tuple layout-wrapper lay-2 sjw__tuple"})  # main container holds list of jobs

    for each_job in all_jobs:
        job_info = {}  # Creating the empty Dictionary
        
        try:
            job_info['title'] = each_job.find("div", attrs={"class": "row1"}).text
        except:
            job_info['title'] = np.nan
        
        company_details = each_job.find("div", attrs={"class": "row2"}).find_all("a", attrs={})
        
        try:
            job_info['companyName'] = company_details[0].text
        except:
            job_info['companyName'] = np.nan
        
        try:
            job_info['rating'] = company_details[1].text
        except:
            job_info['rating'] = np.nan
        
        try: 
            job_info['reviews'] = company_details[2].text
        except:
            job_info['reviews'] = np.nan
        
        other_details = each_job.find("div", attrs={"class": "row3"}).find_all("span", attrs={})
        
        try:    
            job_info['experience'] = other_details[2].text.strip()
        except:
            job_info['experience'] = np.nan
        
        try:    
            job_info['salary'] = other_details[4].text.strip()
        except:
            job_info['salary'] = np.nan
        
        try:
            job_info['location'] = other_details[6].text.strip()
        except:
            job_info['location'] = np.nan
        
        try:
            job_info['qualification_required'] = each_job.find("div", attrs={"class": "row4"}).text
        except:
            job_info['qualification_required'] = np.nan
        
        tags = each_job.find("div", attrs={"class": "row5"}).find_all("li", attrs={"class": "dot-gt tag-li"})
        
        try:
            job_info['skills'] = ",".join([tag.text.strip() for tag in tags])
        except:
            job_info['skills'] = np.nan
        
        try:
            job_info['day_posted'] = each_job.find("div", attrs={"class": "row6"}).text
        except:
            job_info['day_posted'] = np.nan
            
        each_df = pd.DataFrame(job_info, index=[0])
        df = pd.concat((df, each_df), ignore_index=True)
        
    return df


In [3]:
# Initialize DataFrame
df = pd.DataFrame()

# Set up Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--window-size=1920x1080")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

service = Service(r"C:\Users\sweth\Desktop\Innomatics\ClassRoom\WebScraping\Latest\chromedriver.exe")

try:
    
    driver = webdriver.Chrome(service=service, options=chrome_options)
    logging.info("WebDriver initialized successfully.")
    
    roles = ["Data Scientist", "Data Analyst", "Data Engineer", "Python Developer"]
    for each_role in roles:
        each_role.replace(" ", "+")
        
        for each_page in range(1,21):
            try:
                page_link = f"https://www.naukri.com/{each_role}-jobs-{each_page}?k={each_role}&nignbevent_src=jobsearchDeskGNB"
                
                logging.info(f"Processing page {each_page}")
                logging.info(f"Page link: {page_link}")
                
                driver.get(page_link)
                
                # Let the page load completely
                time.sleep(5)
                
                logging.info("Parsing the page source with BeautifulSoup")
                bs_data = BeautifulSoup(driver.page_source, 'html.parser')

                # Executing get_jobs_data method 
                df = get_jobs_data(df, bs_data)
                logging.info(f"Processed page {each_page} for role {each_role}")

            except Exception:
                logging.error(f"Error processing page {each_page} for role {each_role}: {traceback.format_exc()}")

except Exception as e:
    
    logging.warning(f"Failed to initialize WebDriver or process roles: {e}")
    
finally:
    
    driver.quit()
    logging.info("WebDriver closed successfully.")


2024-08-28 08:46:54,728 - INFO - WebDriver initialized successfully.
2024-08-28 08:46:54,729 - INFO - Processing page 1
2024-08-28 08:46:54,730 - INFO - Page link: https://www.naukri.com/Data Scientist-jobs-1?k=Data Scientist&nignbevent_src=jobsearchDeskGNB
2024-08-28 08:47:01,489 - INFO - Parsing the page source with BeautifulSoup
2024-08-28 08:47:01,638 - INFO - Processed page 1 for role Data Scientist
2024-08-28 08:47:01,638 - INFO - Processing page 2
2024-08-28 08:47:01,640 - INFO - Page link: https://www.naukri.com/Data Scientist-jobs-2?k=Data Scientist&nignbevent_src=jobsearchDeskGNB
2024-08-28 08:47:07,122 - INFO - Parsing the page source with BeautifulSoup
2024-08-28 08:47:07,338 - INFO - Processed page 2 for role Data Scientist
2024-08-28 08:47:07,339 - INFO - Processing page 3
2024-08-28 08:47:07,339 - INFO - Page link: https://www.naukri.com/Data Scientist-jobs-3?k=Data Scientist&nignbevent_src=jobsearchDeskGNB
2024-08-28 08:47:12,648 - INFO - Parsing the page source with Be

In [5]:
df.to_csv(r"C:\Users\sweth\Desktop\Innomatics\ClassRoom\WebScraping\Naukri_Data_Large.csv", index=False)
df

Unnamed: 0,title,companyName,rating,reviews,experience,salary,location,qualification_required,skills,day_posted
0,Data Scientist: Artificial Intelligence,IBM,4.1,20349 Reviews,5-7 Yrs,Not disclosed,Hyderabad,Your Role and ResponsibilitiesWork with broade...,"python,data analytics,artificial intelligence,...",1 Day Agosave
1,Python Backend Developer / Data Scientist,UBS India,3.9,1320 Reviews,0-10 Yrs,Not disclosed,Mumbai,Overall Software development experience 8+year...,"rest,python,software development,rdbms,natural...",1 Day Agosave
2,Data Scientist - BLR/ HYD/ GGN,Genpact,3.9,27612 Reviews,6-11 Yrs,Not disclosed,"Hybrid - Hyderabad, Gurugram, Bengaluru","Furthermore, please do note that Genpact does ...","Data Science,Predictive Modeling,Decision Tree...",4 Days Agosave
3,Data Analyst / Data Scientist,Ifood Web Media Technology,3.9,14 Reviews,5-8 Yrs,Not disclosed,"Hybrid - Hyderabad, Chennai, Bengaluru","Highly proficient in Excel data mining, data c...","Advanced Analytics,Data Analyst,Data Analytics...",5 Days Agosave
4,Hiring For Data Scientist,Trianz,3.7,375 Reviews,5-8 Yrs,9.5-17 Lacs PA,"Hybrid - Hyderabad, Bengaluru",Qualifications: . Educational Background: BE /...,"Data Science,Machine Learning,Tensorflow,GenAI...",1 Day Agosave
...,...,...,...,...,...,...,...,...,...,...
1595,Python Developer,IBM,4.1,20349 Reviews,3-8 Yrs,Not disclosed,Ahmedabad,"Define, analyse, and review technical architec...","sql,api,mongodb,python development,couchdb,css...",1 Day Agosave
1596,Python Software Developer ( Pune),Infosys,3.8,34379 Reviews,3-8 Yrs,Not disclosed,"Hyderabad, Chennai, Bengaluru",Hiring Python Developer with experience range ...,"Python Framework,Python Development,Python,Sof...",6 Days Agosave
1597,Python Developer,HTC Global Services,3.7,1214 Reviews,7-9 Yrs,Not disclosed,Chennai,Proven experience across the full system devel...,"Production support,Data management,Management ...",1 Day Agosave
1598,Python Developer,Bonami Software,4.0,69 Reviews,0 Yrs,2.5-3.5 Lacs PA,Noida(Sector-63 Noida),Must have hands-on experience in Data Structur...,"Software Engineering,Django,Python,Data Struct...",6 Days Agosave


In [None]:
"""
all_jobs = bs_data.find_all("div", attrs={"class": "cust-job-tuple layout-wrapper lay-2 sjw__tuple"})  # main container holds list of jobs

for each_job in all_jobs:
    job_info = {}  # Creating the empty 
    
    try:
        job_info['title'] = each_job.find("div", attrs={"class": "row1"}).text
    except:
        job_info['title'] = np.nan

    company_details = each_job.find("div", attrs={"class": "row2"}).find_all("a", attrs={})

    try:
        job_info['companyName'] = company_details[0].text
    except:
        job_info['companyName'] = np.nan

    try:
        job_info['rating'] = company_details[1].text
    except:
        job_info['rating'] = np.nan

    try: 
        job_info['reviews'] = company_details[2].text
    except:
        job_info['reviews'] = np.nan

    other_details = each_job.find("div", attrs={"class": "row3"}).find_all("span", attrs={})

    try:    
        job_info['experience'] = other_details[2].text.strip()
    except:
        job_info['experience'] = np.nan

    try:    
        job_info['salary'] = other_details[4].text.strip()
    except:
        job_info['salary'] = np.nan

    try:
        job_info['location'] = other_details[6].text.strip()
    except:
        job_info['location'] = np.nan

    try:
        job_info['qualification_required'] = each_job.find("div", attrs={"class": "row4"}).text
    except:
        job_info['qualification_required'] = np.nan

    tags = each_job.find("div", attrs={"class": "row5"}).find_all("li", attrs={"class": "dot-gt tag-li"})

    try:
        job_info['skills'] = ",".join([tag.text.strip() for tag in tags])
    except:
        job_info['skills'] = np.nan

    try:
        job_info['day_posted'] = each_job.find("div", attrs={"class": "row6"}).text
    except:
        job_info['day_posted'] = np.nan

    each_df = pd.DataFrame(job_info, index=[0])
    df = pd.concat((df, each_df), ignore_index=True)
"""

In [None]:
df