In [88]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

In [89]:
base_url = 'https://www.ambitionbox.com/list-of-companies?page='
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'https://google.com',
}

In [135]:
data = []

# Function to convert a string with "k" or "Lakh" notation to a number
def convert_to_number(text):
    text = text.replace('Lah+', 'Lakh+').replace('Lah', 'Lakh').replace('Plus', '+')
    text = re.sub(r'\s+', '', text)  # Remove whitespace
    
    match = re.match(r'([\d.]+)([a-zA-Z]*)', text)
    if not match:
        return text
    
    number, suffix = match.groups()
    number = float(number)
    
    if 'k' in suffix.lower():
        return number * 1000
    elif 'lakh' in suffix.lower() or 'l' in suffix.lower():
        return number * 100000
    elif '+' in suffix:
        return number
    
    return number

# Function to parse a range and return the average
def parse_range(range_text):
    range_text = range_text.replace(',', '')  # Remove commas for better parsing
    if '-' in range_text:
        low, high = range_text.split('-')
        low, high = convert_to_number(low.strip()), convert_to_number(high.strip())
        return (low + high) / 2
    elif '+' in range_text:
        return convert_to_number(range_text.replace('+', '').strip())
    return convert_to_number(range_text.strip())

# Function to scrape data from a single page
def scrape_page(page_number):
    url = base_url + str(page_number)
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return

    webpage = BeautifulSoup(response.text, 'lxml')

    for company in webpage.find_all('div', class_='companyCardWrapper'):
        one_company = {}
        
        name_tag = company.find('h2', class_='companyCardWrapper__companyName')
        if name_tag:
            one_company["name"] = name_tag.text.strip()
        
        rating_tag = company.find('span', class_='companyCardWrapper__companyRatingValue')
        if rating_tag:
            one_company["rating"] = float(rating_tag.text.strip())
        
        other_info_tag = company.find('span', class_='companyCardWrapper__interLinking')
        if other_info_tag:
            other_info_list = other_info_tag.text.strip().split("|")
            one_company["sector"] = other_info_list[0].strip()
            for item in other_info_list[1:]:
                if 'Employees' in item:
                    one_company["no_of_employees"] = item.strip()
                elif 'years old' in item:
                    one_company["age_of_company"] = int(item.split()[0].strip())
                elif 'more' in item:
                    one_company["headquarters_location"] = item.split("+")[0].strip()
                else:
                    one_company["ownership_status"] = item.strip()
        
        actions = company.find_all('a', class_='companyCardWrapper__ActionWrapper')
        for action in actions:
            if 'Reviews' in action.text:
                continue
            elif 'Salaries' in action.text:
                one_company['salary'] = parse_range(action.text.strip().split()[0])
            elif 'Jobs' in action.text:
                one_company['available_jobs'] = convert_to_number(action.text.strip().split()[0])
        
        data.append(one_company)

In [141]:
# Timing the scraping process
start_time = time.time()

n_pages = 10  # Number of pages to scrape

for page_number in range(1, n_pages + 1):
    scrape_page(page_number)
    time.sleep(1)

end_time = time.time()
time_taken = end_time - start_time
print(f'It took {time_taken} seconds to Scrape {n_pages} Pages')

It took 26.17005944252014 seconds to Scrape 10 Pages


In [142]:
df = pd.DataFrame(data)
df

Unnamed: 0,name,rating,sector,no_of_employees,ownership_status,age_of_company,headquarters_location,salary,available_jobs
0,TCS,3.8,IT Services & Consulting,1 Lakh+ Employees,Public,56.0,Mumbai,870000.0,1100.0
1,Accenture,4.0,IT Services & Consulting,1 Lakh+ Employees,Public,35.0,Dublin,590000.0,43200.0
2,Cognizant,3.9,IT Services & Consulting,1 Lakh+ Employees,Forbes Global 2000,30.0,Teaneck. New Jersey.,570000.0,559.0
3,Wipro,3.8,IT Services & Consulting,1 Lakh+ Employees,Public,79.0,Bangalore/Bengaluru,430000.0,226.0
4,Capgemini,3.8,IT Services & Consulting,1 Lakh+ Employees,Public,57.0,Paris,430000.0,568.0
...,...,...,...,...,...,...,...,...,...
295,Publicis Sapient,3.6,IT Services & Consulting,10k-50k Employees,,34.0,Gurgaon / Gurugram,28000.0,24.0
296,Atos,3.8,IT Services & Consulting,10k-50k Employees,Forbes Global 2000,27.0,Gurgaon / Gurugram,27500.0,--
297,Utkarsh Small Finance Bank,4.0,Banking,10k-50k Employees,Public,15.0,Gurgaon / Gurugram,9100.0,43.0
298,DCB Bank,3.8,Banking,5k-10k Employees,Fortune India 500,94.0,Gurgaon / Gurugram,8800.0,57.0


In [143]:
df.to_csv('company_data.csv', index=False)

In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   40 non-null     object 
 1   rating                 40 non-null     float64
 2   sector                 40 non-null     object 
 3   no_of_employees        40 non-null     object 
 4   ownership_status       32 non-null     object 
 5   age_of_company         40 non-null     int64  
 6   headquarters_location  40 non-null     object 
 7   salary                 40 non-null     float64
 8   available_jobs         40 non-null     float64
dtypes: float64(3), int64(1), object(5)
memory usage: 2.9+ KB


In [139]:
df.describe()

Unnamed: 0,rating,age_of_company,salary,available_jobs
count,40.0,40.0,40.0,40.0
mean,3.9225,38.9,161575.0,1338.0
std,0.263592,32.81635,189689.820699,6797.610232
min,3.1,6.0,19200.0,6.0
25%,3.8,22.0,48575.0,73.75
50%,3.9,30.0,79100.0,156.0
75%,4.1,43.75,182500.0,323.0
max,4.7,179.0,870000.0,43200.0
