# Web Scraping "List of companies in India" from AmbitionBox Website

## Import Packages

In [None]:
import time
from datetime import timedelta
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Request and Fetch the Webpage

In [None]:
# request the first listing page without passing any custom headers;
# the server answers with 403 (see the output below)
requests.get("https://www.ambitionbox.com/list-of-companies?page=1")

<Response [403]>

In [None]:
# look at the body of that response: it is an "Access Denied" page.
# NOTE(review): this looks like server-side filtering of non-browser clients rather than
# robots.txt, since the same URL succeeds below once a browser User-Agent header is sent.
requests.get("https://www.ambitionbox.com/list-of-companies?page=1").text

'<HTML><HEAD>\n<TITLE>Access Denied</TITLE>\n</HEAD><BODY>\n<H1>Access Denied</H1>\n \nYou don\'t have permission to access "http&#58;&#47;&#47;www&#46;ambitionbox&#46;com&#47;list&#45;of&#45;companies&#63;" on this server.<P>\nReference&#32;&#35;18&#46;3035d917&#46;1587481164&#46;184b9b9f\n</BODY>\n</HTML>\n'

In [None]:
# Request headers copied from a Google Chrome browser, so that the request
# looks like it comes from a browser instead of a script.
chrome_user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
)
header = {"User-Agent": chrome_user_agent}

In [None]:
# Request the page again, this time sending the browser-like header.
# - timeout: without one, requests can wait forever on an unresponsive server
# - raise_for_status: fail here on a 4xx/5xx answer instead of parsing an error page later
response = requests.get(
    "https://www.ambitionbox.com/list-of-companies?page=1",
    headers=header,
    timeout=30,
)
response.raise_for_status()

In [None]:
# preview the first 500 characters of the received page source
response.text[:500]

'<!doctype html>\n<html data-n-head-ssr lang="en" data-n-head="%7B%22lang%22:%7B%22ssr%22:%22en%22%7D%7D">\n  <head >\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge"> \n    \n    <script type="text/javascript">window.NREUM||(NREUM={}),NREUM.init={distributed_tracing:{enabled:!0}},window.NREUM||(NREUM={}),__nr_require=function(n,r,t){function o(e){if(!r[e]){var t=r[e]={exports:{}};n[e][0].call(t'

## Pass the fetched webpage response to Beautiful Soup

In [None]:
# parse the fetched HTML with Beautiful Soup; the parser can be "html.parser" or "lxml"
# ("lxml" is used here)
soup = BeautifulSoup(response.text, 'lxml')

In [None]:
# The whole webpage is made of cards, and each card holds the info of one company.
# Inspecting the page shows that the cards are HTML "div"s with class-name = "company-content-wrapper".
# Extract the first card (find() returns only the first match) to see how data can be pulled out of it...

first_company_card = soup.find("div", class_="company-content-wrapper")

In [None]:
# print the first card's HTML, indented by prettify(), to inspect its structure
print(first_company_card.prettify())

<div class="company-content-wrapper">
 <div class="company-content">
  <div class="company-logo">
   <img alt="Tata Consultancy Services logo" class="lazy" data-src="https://static.ambitionbox.com/alpha/company/photos/logos/tcs.jpg" onerror="this.onerror=null;this.src='/static/icons/company-placeholder.svg';" src="https://static.ambitionbox.com/static/icons/company-placeholder.svg"/>
  </div>
  <div class="company-info-wrapper">
   <div class="company-info">
    <a href="/overview/tcs-overview?utm_campaign=lc_card&amp;utm_source=ambitionbox&amp;utm_medium=desktop">
     <h2 class="company-name bold-title-l" title="TCS">
      TCS
     </h2>
    </a>
    <div class="rating-wrapper">
     <p class="rating badge-large rating-35">
      <i class="icon icon-star">
      </i>
      3.9
     </p>
     <a class="review-count sbold-Labels" href="https://www.ambitionbox.com/reviews/tcs-reviews?utm_campaign=lc_ratings&amp;utm_source=ambitionbox&amp;utm_medium=desktop">
      <!-- -->
      16.1k 

### Let us try and extract the data from a single company card

### [1] company name

In [None]:
# 1. company name is inside the h2 tag: <h2 class="company-name bold-title-l" title="TCS">
#    find("h2") returns the first <h2> inside the card
first_company_card.find("h2")

<h2 class="company-name bold-title-l" title="TCS">
                TCS
              </h2>

In [None]:
# company name = text of the card's <h2>, with the surrounding whitespace removed
company_name_tag = first_company_card.find("h2")
company_name_tag.get_text().strip()

'TCS'

### [2] company rating

In [None]:
# 2. company rating lies inside the p tag: <p class="rating badge-large rating-35">
#    class_="rating" matches it although the tag carries several classes
first_company_card.find("p", class_="rating")

<p class="rating badge-large rating-35"><i class="icon icon-star"></i>
                 3.9
              </p>

In [None]:
# company rating = text of the <p class="rating ..."> tag, with the surrounding whitespace removed
rating_tag = first_company_card.find("p", class_="rating")
rating_tag.get_text().strip()

'3.9'

### [3] number of company reviews

In [None]:
# 3. number of company reviews lies inside the a tag: <a class="review-count sbold-Labels">
#    (the class string is matched exactly as written here)
first_company_card.find("a", class_="review-count sbold-Labels")

<a class="review-count sbold-Labels" href="https://www.ambitionbox.com/reviews/tcs-reviews?utm_campaign=lc_ratings&amp;utm_source=ambitionbox&amp;utm_medium=desktop"><!-- -->
                16.1k Reviews
              </a>

In [None]:
# review count label, e.g. '16.1k Reviews': text of the review-count link without surrounding whitespace
review_count_tag = first_company_card.find("a", class_="review-count sbold-Labels")
review_count_tag.get_text().strip()

'16.1k Reviews'

In [None]:
# drop the trailing " Reviews" so that only the count (e.g. '16.1k') remains
review_label = first_company_card.find("a", class_="review-count sbold-Labels").get_text().strip()
review_label.replace(" Reviews", "")

'16.1k'

### [4] domain, [5] location, [6] years old, [7] employee strength

In [None]:
# Now this is tricky!
# extract "infoEntity" containing: 
# 4. 'domain', 
# 5. 'location', 
# 6. 'years old'
# 7. 'employee strength'

In [None]:
# Try 1: collect every <p class="infoEntity sbold-list-header"> inside the card;
# each one holds an <i> icon followed by the value text
first_company_card.find_all("p", class_="infoEntity sbold-list-header")

[<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
               Public
             </p>,
 <p class="infoEntity sbold-list-header"><i class="icon-pin-drop"></i>
               Mumbai + 156 more
             </p>,
 <p class="infoEntity sbold-list-header"><i class="icon-access-time"></i>
               52 years old
             </p>,
 <p class="infoEntity sbold-list-header"><i class="icon-supervisor-account"></i>
               10000+ employees
             </p>]

In [None]:
# the first entry of that list (for this card: the one with the "icon-domain" icon)
first_company_card.find_all("p", class_="infoEntity sbold-list-header")[0]

<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
              Public
            </p>

In [None]:
# keep the list of infoEntity paragraphs in a variable so the cells below can index into it
inner_company_info_list = first_company_card.find_all("p", class_="infoEntity sbold-list-header")
inner_company_info_list

[<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
               Public
             </p>,
 <p class="infoEntity sbold-list-header"><i class="icon-pin-drop"></i>
               Mumbai + 156 more
             </p>,
 <p class="infoEntity sbold-list-header"><i class="icon-access-time"></i>
               52 years old
             </p>,
 <p class="infoEntity sbold-list-header"><i class="icon-supervisor-account"></i>
               10000+ employees
             </p>]

In [None]:
# first infoEntity paragraph of the card
inner_company_info_list[0]

<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
              Public
            </p>

In [None]:
# first CSS class of the <i> icon inside the fourth entry; this class tells which kind of info the entry holds
fourth_entry_icons = inner_company_info_list[3].find_all("i")
fourth_entry_icons[0]["class"][0]

'icon-supervisor-account'

In [None]:
# value text of the fourth entry, with the surrounding whitespace removed
inner_company_info_list[3].get_text().strip()

'10000+ employees'

In [None]:
# works, but not sure if it can be iterated through...

In [None]:
# Try 2

# let's try going through its parent tag instead: <div class="company-basic-info">
inner_company_info_card = first_company_card.find("div", class_="company-basic-info")
print(inner_company_info_card.prettify())

<div class="company-basic-info">
 <p class="infoEntity sbold-list-header">
  <i class="icon-domain">
  </i>
  Public
 </p>
 <p class="infoEntity sbold-list-header">
  <i class="icon-pin-drop">
  </i>
  Mumbai + 156 more
 </p>
 <p class="infoEntity sbold-list-header">
  <i class="icon-access-time">
  </i>
  52 years old
 </p>
 <p class="infoEntity sbold-list-header">
  <i class="icon-supervisor-account">
  </i>
  10000+ employees
 </p>
</div>



In [None]:
# every <i> icon nested inside the company-basic-info div
inner_company_info_card.find_all("i")

[<i class="icon-domain"></i>,
 <i class="icon-pin-drop"></i>,
 <i class="icon-access-time"></i>,
 <i class="icon-supervisor-account"></i>]

In [None]:
# first CSS class of the first icon inside the div
info_card_icons = inner_company_info_card.find_all("i")
info_card_icons[0]["class"][0]

'icon-domain'

In [None]:
# value text of the first <p> inside the div, with the surrounding whitespace removed
info_card_paragraphs = inner_company_info_card.find_all("p")
info_card_paragraphs[0].get_text().strip()

'Public'

In [None]:
# This works... let's put it in a single cell code !!

In [None]:
# scratch cell (can be ignored): shows the first infoEntity paragraph once more
first_company_card.find_all("p", class_="infoEntity sbold-list-header")[0]

<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
              Public
            </p>

In [None]:
# extract "infoEntity" containing 'domain', 'location', 'years old' & 'employee strength'
info_list = first_company_card.find_all("p", class_="infoEntity sbold-list-header")
dom = None
loc = None
old = None
emp = None

# Walk over whatever entries the card actually has (a card may have fewer than four)
# and tell them apart by the CSS class of their <i> icon rather than by position.
for info in info_list:
    icon = info.find("i")
    if icon is None or not icon.get("class"):
        # entry without a recognisable icon: nothing to classify
        continue

    icon_class = icon["class"][0]
    value = info.text.strip()

    if icon_class == 'icon-domain':
        dom = value
    elif icon_class == 'icon-pin-drop':
        loc = value
    elif icon_class == 'icon-access-time':
        old = value
    elif icon_class == 'icon-supervisor-account':
        emp = value

print("domain:", dom)
print("location:", loc)
print("years old:", old)
print("employee strength:", emp)

domain: Public
location: Mumbai + 156 more
years old: 52 years old
employee strength: 10000+ employees


### [8] company tags

In [None]:
# 8. company tags are inside a tags: <a class="ab_chip">
#    find_all returns one <a> element per tag
first_company_card.find_all("a", class_="ab_chip")

[<a class="ab_chip body-medium" href="/consulting-companies-in-india" title="Consulting companies in india">
               Consulting
             </a>,
 <a class="ab_chip body-medium" href="/it-or-ites-companies-in-india" title="IT/ITES companies in india">
               IT/ITES
             </a>,
 <a class="ab_chip body-medium" href="/mnc-companies-in-india" title="MNC companies in india">
               MNC
             </a>,
 <a class="ab_chip body-medium" href="/fortune500-companies-in-india" title="Fortune500 companies in india">
               Fortune500
             </a>]

In [None]:
# company tags: one whitespace-stripped label per <a class="ab_chip"> element
tags = [chip.get_text().strip() for chip in first_company_card.find_all("a", class_="ab_chip")]
tags

['Consulting', 'IT/ITES', 'MNC', 'Fortune500']

In [None]:
# Join the tag labels into one comma-separated string.
# The labels are read from the card again (instead of re-joining `tags`) so that this cell
# gives the same result when it is re-run: joining an already joined string would
# separate its individual characters.
tags = ', '.join(chip.text.strip() for chip in first_company_card.find_all("a", class_="ab_chip"))
tags

'Consulting, IT/ITES, MNC, Fortune500'

### [9] company description

In [None]:
# 9. company description is inside the p tag: <p class="description">
#    class_="description" matches it although the tag carries several classes
first_company_card.find("p", class_="description")

<p class="description body-small" itemprop="description">Tata Consultancy Services is an IT services, consulting and business solutions organisation that has been partnering with the world’s largest businesses in their transformation journeys for the last 50 years. 

A part of the Tata group, India's largest multinational business group, TCS has over 436,000 of the world’s best-trained consultants in 46 countries. The company is listed on the BSE (formerly Bombay Stock Exchange) and the NSE (National Stock Exchange) in India. 

TCS'​ proactive stance on climate change and award winning work with communities across the world have earned it a place in leading sustainability indices such as the Dow Jones Sustainability Index (DJSI), MSCI Global Sustainability Index and the FTSE4Good Emerging Index. 
</p>

In [None]:
# company description = text of the description paragraph, with the surrounding whitespace removed
description_tag = first_company_card.find("p", class_="description")
description_tag.get_text().strip()

"Tata Consultancy Services is an IT services, consulting and business solutions organisation that has been partnering with the world’s largest businesses in their transformation journeys for the last 50 years. \n\nA part of the Tata group, India's largest multinational business group, TCS has over 436,000 of the world’s best-trained consultants in 46 countries. The company is listed on the BSE (formerly Bombay Stock Exchange) and the NSE (National Stock Exchange) in India. \n\nTCS'\u200b proactive stance on climate change and award winning work with communities across the world have earned it a place in leading sustainability indices such as the Dow Jones Sustainability Index (DJSI), MSCI Global Sustainability Index and the FTSE4Good Emerging Index."

## Scraping a single webpage

In [None]:
# Now, Let's put it all together and scrape the webpage

In [None]:
# find all the company cards in the webpage (the HTML divs that enclose the data of each company)
# and show how many cards this page holds
company_cards = soup.find_all("div", class_="company-content-wrapper")
len(company_cards)

30

In [None]:
%%time

name = []
rating = []
reviews = []
domain = []
location = []
years_old = []
employee_strength = []
tags = []
about = []

for card in company_cards:
    # 1. name
    name.append(card.find("h2").text.strip())
    
    # 2. rating
    rating.append(card.find("p", class_="rating").text.strip())
    
    # 3. reviews
    reviews.append(card.find("a", class_="review-count sbold-Labels").text.strip().replace(" Reviews", ""))
    
    # 4. domain, 5. location, 6. years old & 7. employee strength
    info_list = card.find_all("p", class_="infoEntity sbold-list-header")
    dom = None
    loc = None
    old = None
    emp = None
    for i in range(4):
        try:
            if info_list[i].findChildren("i")[0]["class"][0] == 'icon-domain':
                dom = info_list[i].text.strip()

            if info_list[i].findChildren("i")[0]["class"][0] == 'icon-pin-drop':
                loc = info_list[i].text.strip()

            if info_list[i].findChildren("i")[0]["class"][0] == 'icon-access-time':
                old = info_list[i].text.strip()

            if info_list[i].findChildren("i")[0]["class"][0] == 'icon-supervisor-account':
                emp = info_list[i].text.strip()
        except:
            pass
    
    domain.append(dom)
    location.append(loc)
    years_old.append(old)
    employee_strength.append(emp)
    
    # 8. tags
    t = []
    for tag in card.find_all("a", class_="ab_chip"):
        t.append(tag.text.strip())
    t = ', '.join(t)
    tags.append(t)
    
    # 9. about
    about.append(card.find("p", class_="description").text.strip())
    
col_dic = {
    "name": name,
    "rating": rating,
    "reviews": reviews,
    "domain": domain,
    "location": location,
    "years_old": years_old,
    "employee_strength": employee_strength,
    "tags": tags,
    "about": about
}

df = pd.DataFrame(col_dic)

Wall time: 20.6 ms


In [None]:
# display the dataframe built from the company cards of the first page
df

Unnamed: 0,name,rating,reviews,domain,location,years_old,employee_strength,tags,about
0,TCS,3.9,16.1k,Public,Mumbai + 156 more,52 years old,10000+ employees,"Consulting, IT/ITES, MNC, Fortune500","Tata Consultancy Services is an IT services, c..."
1,Accenture,4.0,14.1k,Private,Dublin + 87 more,31 years old,10000+ employees,"Consulting, IT/ITES, MNC",Accenture is a leading global professional ser...
2,ICICI Bank,4.1,12.7k,Public,Mumbai + 724 more,26 years old,10000+ employees,"Banking/Insurance/Accounting, Financial Servic...",ICICI Bank is India's largest private sector b...
3,Cognizant,3.9,12.1k,Private,Teaneck + 44 more,26 years old,10000+ employees,"IT/ITES, MNC",Cognizant (NASDAQ-100: CTSH) is one of the wor...
4,HDFC Bank,4.0,10.9k,Public,Mumbai + 692 more,26 years old,10000+ employees,"Banking/Insurance/Accounting, Financial Servic...",HDFC Bank is one of India’s leading private ba...
5,Infosys,3.9,10.8k,Public,Bangalore + 64 more,39 years old,10000+ employees,"IT/ITES, Fortune500, MNC",Infosys is a global leader in next-generation ...
6,L&T,4.1,10.2k,Public,Mumbai + 333 more,82 years old,10000+ employees,"Construction, Fortune500, Conglomerate","Larsen & Toubro is a major technology, enginee..."
7,Capgemini,3.3,9.6k,Private,Paris + 42 more,53 years old,10000+ employees,"Computer Software, Consulting, IT/ITES, MNC","A global leader in consulting, technology serv..."
8,Tech Mahindra,3.5,9.1k,Public,Pune + 138 more,34 years old,10000+ employees,"Consulting, IT/ITES, MNC, Fortune500","Tech Mahindra represents the connected world, ..."
9,HCL Technologies,3.7,8.7k,Public,Noida + 106 more,14 years old,10000+ employees,"Consulting, IT/ITES, Fortune500, MNC","HCL Learning, a wholly owned subsidiary of HCL..."


## Scraping the whole Website!!

In [None]:
# ALL SET! LET'S PUT EVERYTHING TOGETHER AND SCRAPE THE WHOLE WEBSITE !!!

In [None]:
# Let's not scrape the whole website here... use a python script and run it in the terminal to do so!
# the external python script is attached with this notebook

In [None]:
def _text_or_none(node):
    """Return the whitespace-stripped text of a BeautifulSoup node, or None if the node is missing."""
    return node.text.strip() if node is not None else None


def _parse_company_card(card):
    """Extract the fields of one company from its card <div> and return them as a dict.

    A field whose tag is missing from the card is None. "tags" is a single
    comma-separated string (empty when the card has no tag chips).
    """
    # the four "infoEntity" entries are told apart by the class of their <i> icon
    icon_to_field = {
        "icon-domain": "domain",
        "icon-pin-drop": "location",
        "icon-access-time": "years_old",
        "icon-supervisor-account": "employee_strength",
    }

    review_label = _text_or_none(card.find("a", class_="review-count sbold-Labels"))

    record = {
        "name": _text_or_none(card.find("h2")),
        "rating": _text_or_none(card.find("p", class_="rating")),
        # keep only the count, e.g. '16.1k Reviews' -> '16.1k'
        "reviews": review_label.replace(" Reviews", "") if review_label is not None else None,
        "domain": None,
        "location": None,
        "years_old": None,
        "employee_strength": None,
        "tags": ', '.join(chip.text.strip() for chip in card.find_all("a", class_="ab_chip")),
        "about": _text_or_none(card.find("p", class_="description")),
    }

    for entry in card.find_all("p", class_="infoEntity sbold-list-header"):
        icon = entry.find("i")
        icon_classes = icon.get("class") if icon is not None else None
        field = icon_to_field.get(icon_classes[0]) if icon_classes else None
        if field is not None:
            record[field] = entry.text.strip()

    return record


start_time = time.time()

# The full listing has 15086 pages (4,52,568 unique companies found / 30 per page);
# only a small number of pages is scraped in this notebook.
total_number_of_webpages = 10

# seconds to wait for the server before giving up on a page
request_timeout_seconds = 30
# pause between two pages; raise it to throttle the scraper when fetching many pages
request_delay_seconds = 0

# browser-like request header, identical for every page
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}

# One dict per company. The dataframe is built once at the end instead of being grown
# page by page (DataFrame.append no longer exists in pandas 2.x and copies all rows on every call).
records = []
failed_pages = []

for page in range(1, total_number_of_webpages+1):
    print("scraping webpage number: {page} of {total}".format(page=page, total=total_number_of_webpages))
    loop_time = time.time()

    # set page url
    url = "https://www.ambitionbox.com/list-of-companies?page={}".format(page)

    # get page response from the website; a page that cannot be fetched is reported
    # and skipped so that one bad page neither stops the run nor counts as a success
    try:
        response = requests.get(url, headers=header, timeout=request_timeout_seconds)
        response.raise_for_status()
    except requests.RequestException as error:
        failed_pages.append(page)
        print("failed:", error)
        print()
        continue

    # pass the page to BeautifulSoup
    soup = BeautifulSoup(response.text, 'lxml')

    # find all the company cards from the webpage
    company_cards = soup.find_all("div", class_="company-content-wrapper")

    # extract all the required data from each company card
    records.extend(_parse_company_card(card) for card in company_cards)

    # success
    print("success!")
    print("time taken:", round((time.time()-loop_time)*1000, 2), "ms")
    print("total time elapsed:", str(timedelta(seconds=(time.time()-start_time))))
    print()

    if request_delay_seconds:
        time.sleep(request_delay_seconds)

# build the final dataframe (the whole website) in one go, with a fixed column order
dataframe_final = pd.DataFrame(
    records,
    columns=[
        "name",
        "rating",
        "reviews",
        "domain",
        "location",
        "years_old",
        "employee_strength",
        "tags",
        "about",
    ],
)

end_time = time.time()
if failed_pages:
    print("scraping finished, but these pages could not be fetched:", failed_pages)
else:
    print("full website scraped successfully!")
print("total time taken:", str(timedelta(seconds=(end_time - start_time))))
print()


scraping webpage number: 1 of 10
success!
time taken: 1703.35 ms
total time elapsed: 0:00:01.705312

scraping webpage number: 2 of 10
success!
time taken: 3859.78 ms
total time elapsed: 0:00:05.565094

scraping webpage number: 3 of 10
success!
time taken: 1935.15 ms
total time elapsed: 0:00:07.500243

scraping webpage number: 4 of 10
success!
time taken: 2529.02 ms
total time elapsed: 0:00:10.030256

scraping webpage number: 5 of 10
success!
time taken: 1083.29 ms
total time elapsed: 0:00:11.113542

scraping webpage number: 6 of 10
success!
time taken: 995.52 ms
total time elapsed: 0:00:12.109065

scraping webpage number: 7 of 10
success!
time taken: 1618.12 ms
total time elapsed: 0:00:13.727190

scraping webpage number: 8 of 10
success!
time taken: 1511.58 ms
total time elapsed: 0:00:15.238774

scraping webpage number: 9 of 10
success!
time taken: 1509.74 ms
total time elapsed: 0:00:16.748513

scraping webpage number: 10 of 10
success!
time taken: 1404.3 ms
total time elapsed: 0:00:18

## Let's see what our dataframe looks like

In [None]:
# first five rows of the combined dataframe
dataframe_final.head()

Unnamed: 0,name,rating,reviews,domain,location,years_old,employee_strength,tags,about
0,TCS,3.9,16.1k,Public,Mumbai + 156 more,52 years old,10000+ employees,"Consulting, IT/ITES, MNC, Fortune500","Tata Consultancy Services is an IT services, c..."
1,Accenture,4.0,14.1k,Private,Dublin + 87 more,31 years old,10000+ employees,"Consulting, IT/ITES, MNC",Accenture is a leading global professional ser...
2,ICICI Bank,4.1,12.7k,Public,Mumbai + 724 more,26 years old,10000+ employees,"Banking/Insurance/Accounting, Financial Servic...",ICICI Bank is India's largest private sector b...
3,Cognizant,3.9,12.1k,Private,Teaneck + 44 more,26 years old,10000+ employees,"IT/ITES, MNC",Cognizant (NASDAQ-100: CTSH) is one of the wor...
4,HDFC Bank,4.0,10.9k,Public,Mumbai + 692 more,26 years old,10000+ employees,"Banking/Insurance/Accounting, Financial Servic...",HDFC Bank is one of India’s leading private ba...


In [None]:
# Summary of the final dataframe: its (rows, columns) shape and the number of missing values per column
null_count_per_column = dataframe_final.isna().sum()

print("dataframe shape", dataframe_final.shape, end="\n\n")
print("column-wise null count", null_count_per_column, sep="\n", end="\n\n")

dataframe shape (300, 9)

column-wise null count
name                 0
rating               0
reviews              0
domain               9
location             0
years_old            1
employee_strength    1
tags                 0
about                2
dtype: int64



In [None]:
# per-column summary (count, unique, top, freq) of the scraped values
dataframe_final.describe()

Unnamed: 0,name,rating,reviews,domain,location,years_old,employee_strength,tags,about
count,300,300.0,300,291,300,299,299,300,298
unique,300,15.0,143,4,287,107,3,185,298
top,Reliance Life Insurance,4.2,1.1k,Private,Mumbai + 67 more,20 years old,10000+ employees,"Banking/Insurance/Accounting, Financial Services",Through helping other organizations mitigate r...
freq,1,61.0,28,142,3,15,246,13,1


## Exporting the dataframe into an external CSV

In [None]:
# export the data to an external csv
output_path = Path("dataset/List_of_companies_in_India_ipynb_demo.csv")

# to_csv does not create missing folders, so make sure "dataset/" exists first
output_path.parent.mkdir(parents=True, exist_ok=True)

dataframe_final.to_csv(output_path, encoding="utf-8")