In [2]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import re

In [57]:
def get_webpage(company, page):
    """Load one AmbitionBox salaries page in Safari and return it parsed.

    Parameters
    ----------
    company : str
        Company slug interpolated into the URL path (e.g. ``'tcs'``).
    page : int
        Value of the ``page`` query parameter.

    Returns
    -------
    bs4.BeautifulSoup
        The page source as captured after a fixed 3-second wait, parsed with
        the ``html.parser`` backend.

    Notes
    -----
    A fresh Safari WebDriver session is started for every call and is always
    shut down, even when loading the page raises.
    """
    url = f"https://www.ambitionbox.com/salaries/{company}-salaries?page={page}"

    # Set up the Selenium WebDriver with Safari
    driver = webdriver.Safari()
    try:
        driver.get(url)
        time.sleep(3)  # wait for 3 seconds, adjust as necessary
        html_content = driver.page_source
    finally:
        # Without this guard a failed page load would leave the browser
        # session running.
        driver.quit()

    return BeautifulSoup(html_content, "html.parser")

In [58]:
def get_salary_in_numbers(string):
    """Convert a salary figure with an Indian unit suffix to a plain number.

    Recognised suffixes (matched case-insensitively):

    * ``L``  - lakh, multiplied by 100000
    * ``K``  - thousand, multiplied by 1000
    * ``Cr`` - crore, multiplied by 10000000

    A string without one of these suffixes is parsed as a bare number.
    Surrounding whitespace and thousands-separator commas are ignored.

    Parameters
    ----------
    string : str
        Salary text such as ``'4.6L'``, ``'500K'``, ``'1.2Cr'`` or ``'12345'``.

    Returns
    -------
    float
        The amount rounded to the nearest thousand.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed as a float.
    """
    multipliers = (('cr', 10000000), ('l', 100000), ('k', 1000))

    cleaned = string.strip().replace(',', '')
    lowered = cleaned.lower()

    for suffix, factor in multipliers:
        if lowered.endswith(suffix):
            number = float(cleaned[:-len(suffix)]) * factor
            break
    else:
        # No known unit suffix: treat the text as a plain number.
        number = float(cleaned)

    return round(number, -3)

In [59]:
def get_details(soup):
    """Extract per-role experience and salary figures from a parsed page.

    Every ``div.card-content`` element is paired positionally with a
    ``div.avg-salary`` element, and one record is built per pair.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed salaries page, as returned by ``get_webpage``.

    Returns
    -------
    dict
        Maps role name to a dict with the keys ``'Minimum experience'``,
        ``'Maximum experience'`` (both strings of digits taken from the page
        text), ``'Average salary'`` (float rounded to the nearest hundred),
        ``'Minimum salary'`` and ``'Maximum salary'`` (converted by
        ``get_salary_in_numbers``). When a role name occurs more than once,
        the last occurrence wins.

    Notes
    -----
    A card that lacks an expected tag, or whose text does not match the
    expected experience / salary-range format, is skipped so that one
    malformed card does not abort the whole scrape.
    """
    exp_pattern = r"Experience:(\d+)-(\d+)\s+yrs"

    salary_pattern = r"₹(\d+(\.\d+)?)([a-zA-Z]+)\/yr\s*-\s*₹(\d+(\.\d+)?)([a-zA-Z]+)\/yr"

    job_role = {}

    # NOTE(review): the two lists are matched purely by position; this assumes
    # every card-content div has exactly one avg-salary div - TODO confirm.
    cards = soup.find_all('div', class_="card-content")
    salary_blocks = soup.find_all('div', class_="avg-salary")

    for job, salary in zip(cards, salary_blocks):
        role_tags = job.find_all('p')
        exp_tags = job.find_all('span')
        average_tags = salary.find_all('div')
        range_tags = salary.find_all('p')
        if not (role_tags and exp_tags and average_tags and range_tags):
            continue  # card is missing one of the expected elements

        role = role_tags[0].text.strip()

        # Non-breaking spaces are dropped before matching the pattern.
        text = exp_tags[0].text.strip().replace("\xa0", '')
        exp_match = re.search(exp_pattern, text)
        salary_match = re.search(salary_pattern, range_tags[0].text)
        if exp_match is None or salary_match is None:
            continue  # text is not in the expected format

        # Keep only digits and dots (drops currency sign, commas, whitespace).
        # NOTE(review): assumes the average is shown as a full figure without
        # a unit suffix such as "L" - TODO confirm.
        average = re.sub(r'[^\d.]', '', average_tags[0].text)

        try:
            average_salary = round(float(average), -2)
            min_salary = get_salary_in_numbers(salary_match.group(1) + salary_match.group(3))
            max_salary = get_salary_in_numbers(salary_match.group(4) + salary_match.group(6))
        except ValueError:
            continue  # a figure could not be parsed as a number

        job_role[role] = {
            'Minimum experience': exp_match.group(1),
            'Maximum experience': exp_match.group(2),
            'Average salary': average_salary,
            'Minimum salary': min_salary,
            'Maximum salary': max_salary,
        }

    return job_role

# Companies list

In [60]:
# Company slugs as they appear in the AmbitionBox salaries URL.
companies_list = ['cognizant', 'tcs', 'wipro', 'infosys', 'hcl', 'accenture']

# Scrape each company and write one CSV per company, named '<company>.csv'.
# NOTE(review): the [1:] slice skips the first entry ('cognizant'), so a clean
# Restart & Run All never produces cognizant.csv - confirm this is intended.
for company in companies_list[1:]:
    company_job_roles={}
    for i in range(1,21):       # pages 1 through 20 (original note: each page has 10 jobs)
        soup = get_webpage(company, i)
        job_roles = get_details(soup)
        # Records are keyed by role name, so a role seen again on a later page
        # replaces the earlier record.
        company_job_roles.update(job_roles)
        
    # The dict of dicts has roles as columns; transpose so each role is a row,
    # then move the role name out of the index into a 'Job roles' column.
    df = pd.DataFrame(company_job_roles).T.reset_index().rename(columns={'index':'Job roles'})
    df.to_csv(f'{company}.csv')