## Código Básico

In [11]:
from bs4 import BeautifulSoup
import requests

  long_versions = long_entities_by_first_character[short]


In [3]:
# Glassdoor Brazil (.com.br) company pages targeted by this first attempt;
# the ibis URL is kept commented out as an alternative target.
#url = 'https://www.glassdoor.com.br/Avalia%C3%A7%C3%B5es/ibis-Avalia%C3%A7%C3%B5es-E1443740.htm'
url = 'https://www.glassdoor.com.br/Avalia%C3%A7%C3%B5es/Autoglass-Avalia%C3%A7%C3%B5es-E36078.htm?countryRedirect=true'

In [4]:
# Always pass a timeout: without one, requests can wait on the server forever.
# Glassdoor answers this plain request with "Forbidden" (see the output of the
# print cell), which is why the Scrapfly-based scraper further down is used.
page = requests.get(url, timeout=30)

In [5]:
# Parse the response body into a BeautifulSoup tree.
# NOTE(review): 'html' looks like a markup type rather than a specific parser,
# so the tree may depend on which parser libraries are installed — confirm and
# consider pinning one explicitly.
soup = BeautifulSoup(page.text, 'html')

In [6]:
# Print the whole parsed document to inspect what the server actually returned.
print(soup)

<html><body><p>Forbidden</p></body></html>


## Glassdoor

#### Tentativa normal

In [38]:
# import requests
# from fake_useragent import UserAgent

# ua = UserAgent()
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
# }

# url ="https://www.glassdoor.com.br/Avalia%C3%A7%C3%B5es/ibis-Avalia%C3%A7%C3%B5es-E1443740.htm"

# response = requests.get(url, headers=headers)
# print(response.text)

# #soup = BeautifulSoup(response.text, 'html')

# # find description in the HTML:
# #print(soup.find_all('div'))

# # div class="review-details_topReview__BIP4D"

#### Scrapfly

In [4]:
"""
This is an example web scraper for Glassdoor.com used in scrapfly blog article:
https://scrapfly.io/blog/how-to-scrape-glassdoor/

To run this scraper, set the environment variable SCRAPFLY_KEY to your Scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
from enum import Enum
import json
import os
import re
from typing import Dict, List, Optional, Tuple, TypedDict
from urllib.parse import quote, urljoin

from loguru import logger as log
from scrapfly import ScrapeApiResponse, ScrapeConfig, ScrapflyClient, ScrapflyScrapeError

# The API key is read from the SCRAPFLY_KEY environment variable so that the
# secret never lives in the notebook itself. A missing variable fails right
# here with a KeyError instead of later with an authentication error.
SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])

# Keyword arguments shared by every ScrapeConfig built in this notebook.
BASE_CONFIG = {
    # Glassdoor.com requires Anti Scraping Protection bypass feature.
    # for more: https://scrapfly.io/docs/scrape-api/anti-scraping-protection
    "asp": True,
    # NOTE(review): presumably selects the proxy country used for the requests
    # (Brazil) — confirm against the Scrapfly ScrapeConfig documentation.
    "country": "BR",
}

In [5]:
def find_hidden_data(result: ScrapeApiResponse) -> dict:
    """
    Pull the hidden Apollo GraphQL cache out of a Glassdoor page and return it
    with every `__ref` pointer replaced by the object it points to.

    The cache is read from the `script#__NEXT_DATA__` element when that element
    has text; otherwise it is cut out of the `apolloState` javascript variable
    found in the raw page content.

    Raises IndexError when neither source is present in the page.
    """

    def _expand(node, graph):
        # Walk the structure recursively; `graph` is the whole cache and is
        # where a {"__ref": key} pointer is looked up.
        if isinstance(node, list):
            return [_expand(item, graph) for item in node]
        if not isinstance(node, dict):
            return node
        if "__ref" in node:
            return _expand(graph[node["__ref"]], graph)
        return {key: _expand(value, graph) for key, value in node.items()}

    next_data = result.selector.css("script#__NEXT_DATA__::text").get()
    if next_data:
        apollo_cache = json.loads(next_data)["props"]["pageProps"]["apolloCache"]
    else:
        # fall back to the apolloState object embedded directly in the page
        raw_state = re.findall(r'apolloState":\s*({.+})};', result.content)[0]
        apollo_cache = json.loads(raw_state)

    # Start from ROOT_QUERY when the cache has a non-empty one, else from the
    # cache itself.
    entry_point = apollo_cache.get("ROOT_QUERY") or apollo_cache
    return _expand(entry_point, apollo_cache)


def parse_jobs(result: ScrapeApiResponse) -> Tuple[List[Dict], List[str]]:
    """
    Extract job headers and the URLs of the other result pages from a
    Glassdoor jobs page.

    Returns a `(jobs, other_pages)` tuple. Raises StopIteration when the hidden
    cache has no key starting with "jobListings".
    """
    hidden = find_hidden_data(result)
    matching = (value for key, value in hidden.items() if key.startswith("jobListings"))
    job_cache = next(matching)

    jobs = []
    for listing in job_cache["jobListings"]:
        jobs.append(listing["jobview"]["header"])

    other_pages = []
    for link in job_cache["paginationLinks"]:
        if link["isCurrentPage"] is not False:
            continue
        # pagination links are resolved against the URL of the scraped page
        other_pages.append(urljoin(result.context["url"], link["urlLink"]))

    return jobs, other_pages


async def scrape_jobs(url: str, max_pages: Optional[int] = None) -> List[Dict]:
    """
    Scrape a Glassdoor job listing page and the pages it links to.

    Args:
        url: URL of the first job listing page.
        max_pages: upper bound on the total number of pages scraped, the first
            page included. `None` (or 0) means no limit.

    Returns:
        The job headers collected from every page that was scraped
        successfully; failed pages are logged and skipped.
    """
    log.info("scraping job listings from {}", url)
    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))

    jobs, other_page_urls = parse_jobs(first_page)
    if max_pages:
        # The first page already counts towards the limit, so at most
        # max_pages - 1 further pages may be requested.
        other_page_urls = other_page_urls[: max(max_pages - 1, 0)]
    total_pages = len(other_page_urls) + 1

    log.info("scraped first page of jobs of {}, scraping remaining {} pages", url, total_pages - 1)
    other_pages = [ScrapeConfig(page_url, **BASE_CONFIG) for page_url in other_page_urls]
    async for result in SCRAPFLY.concurrent_scrape(other_pages):
        if not isinstance(result, ScrapflyScrapeError):
            jobs.extend(parse_jobs(result)[0])
        else:
            log.error(f"failed to scrape {result.api_response.config['url']}, got: {result.message}")
    log.info("scraped {} jobs from {} in {} pages", len(jobs), url, total_pages)
    return jobs


def parse_reviews(result: ScrapeApiResponse) -> Dict:
    """
    Return the review data held in the hidden cache of a Glassdoor reviews page.

    The first cache entry whose key starts with "employerReviews" and whose
    "reviews" value is non-empty is returned; StopIteration is raised when
    there is none.
    """
    hidden = find_hidden_data(result)
    candidates = (
        value
        for key, value in hidden.items()
        if key.startswith("employerReviews") and value.get("reviews")
    )
    return next(candidates)


async def scrape_reviews(url: str, max_pages: Optional[int] = None) -> Dict:
    """
    Scrape a Glassdoor reviews page and its follow-up pages.

    The dataset parsed from the first page is returned; reviews from the other
    pages are appended to its "reviews" list. `max_pages` caps the total number
    of pages, the first one included. Pages that fail are logged and skipped.
    """
    log.info("scraping reviews from {}", url)
    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url=url, **BASE_CONFIG))

    reviews = parse_reviews(first_page)
    total_pages = reviews["numberOfPages"]
    if max_pages and total_pages > max_pages:
        total_pages = max_pages

    log.info("scraped first page of reviews of {}, scraping remaining {} pages", url, total_pages - 1)

    # page 1 is already parsed, so the remaining requests start at page 2
    page_configs = []
    for page_number in range(2, total_pages + 1):
        page_url = Url.change_page(first_page.context["url"], page=page_number)
        page_configs.append(ScrapeConfig(url=page_url, **BASE_CONFIG))

    async for outcome in SCRAPFLY.concurrent_scrape(page_configs):
        if isinstance(outcome, ScrapflyScrapeError):
            log.error(f"failed to scrape {outcome.api_response.config['url']}, got: {outcome.message}")
            continue
        reviews["reviews"].extend(parse_reviews(outcome)["reviews"])

    log.info("scraped {} reviews from {} in {} pages", len(reviews["reviews"]), url, total_pages)
    return reviews


def parse_salaries(result: ScrapeApiResponse) -> Dict:
    """
    Return the salary data held in the hidden cache of a Glassdoor salaries page.

    The first cache entry whose key starts with "aggregatedSalaryEstimates" and
    whose "results" value is non-empty is returned; StopIteration is raised
    when there is none.
    """
    hidden = find_hidden_data(result)
    candidates = (
        value
        for key, value in hidden.items()
        if key.startswith("aggregatedSalaryEstimates") and value.get("results")
    )
    return next(candidates)


async def scrape_salaries(url: str, max_pages: Optional[int] = None) -> Dict:
    """
    Scrape a Glassdoor salaries page and its follow-up pages.

    The dataset parsed from the first page is returned; entries from the other
    pages are appended to its "results" list. `max_pages` caps the total number
    of pages, the first one included. Pages that fail are logged and skipped.
    """
    log.info("scraping salaries from {}", url)
    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url=url, **BASE_CONFIG))

    salaries = parse_salaries(first_page)
    total_pages = salaries["numPages"]
    if max_pages and total_pages > max_pages:
        total_pages = max_pages

    log.info("scraped first page of salaries of {}, scraping remaining {} pages", url, total_pages - 1)

    # page 1 is already parsed, so the remaining requests start at page 2
    page_configs = []
    for page_number in range(2, total_pages + 1):
        page_url = Url.change_page(first_page.context["url"], page=page_number)
        page_configs.append(ScrapeConfig(url=page_url, **BASE_CONFIG))

    async for outcome in SCRAPFLY.concurrent_scrape(page_configs):
        if isinstance(outcome, ScrapflyScrapeError):
            log.error(f"failed to scrape {outcome.api_response.config['url']}, got: {outcome.message}")
            continue
        salaries["results"].extend(parse_salaries(outcome)["results"])

    log.info("scraped {} salaries from {} in {} pages", len(salaries["results"]), url, total_pages)
    return salaries


class FoundCompany(TypedDict):
    """Type hint for one company search result, as built by `find_companies`."""
    # company name, copied from the "suggestion" field of the search result
    name: str
    # Glassdoor employer ID, copied from the "employerId" field
    id: str
    # page URLs generated by the `Url` helper from the name and ID above
    url_overview: str
    url_jobs: str
    url_reviews: str
    url_salaries: str


async def find_companies(query: str) -> List[FoundCompany]:
    """
    Find companies on Glassdoor by name, e.g. "ebay" will return "eBay" with ID 7853.

    Args:
        query: free-text company name typed by the user.

    Returns:
        One `FoundCompany` per suggestion of category "company", in the order
        Glassdoor returned them; an empty list when nothing matched.
    """
    # Percent-encode the query: spaces, "&", "#" and similar characters would
    # otherwise corrupt the query string of the typeahead request.
    search_url = (
        "https://www.glassdoor.com/searchsuggest/typeahead"
        "?numSuggestions=8&source=GD_V2&version=NEW&rf=full&fallback=token"
        f"&input={quote(query, safe='')}"
    )
    response = await SCRAPFLY.async_scrape(ScrapeConfig(url=search_url, **BASE_CONFIG))

    companies: List[FoundCompany] = []
    for suggestion in json.loads(response.content):
        # suggestions of other categories (or without one) are ignored
        if suggestion.get("category") != "company":
            continue
        name = suggestion["suggestion"]
        employer_id = suggestion["employerId"]
        companies.append(
            {
                "name": name,
                "id": employer_id,
                "url_overview": Url.overview(name, employer_id),
                "url_jobs": Url.jobs(name, employer_id),
                "url_reviews": Url.reviews(name, employer_id),
                "url_salaries": Url.salaries(name, employer_id),
            }
        )
    return companies


class Region(Enum):
    """
    glassdoor.com region codes.

    Each member's value is the string that the `Url` helpers append to a URL as
    the `filter.countryId` query parameter.
    """

    UNITED_STATES = "1"
    UNITED_KINGDOM = "2"
    CANADA_ENGLISH = "3"
    INDIA = "4"
    AUSTRALIA = "5"
    FRANCE = "6"
    GERMANY = "7"
    SPAIN = "8"
    BRAZIL = "9"
    NETHERLANDS = "10"
    AUSTRIA = "11"
    MEXICO = "12"
    ARGENTINA = "13"
    BELGIUM_NEDERLANDS = "14"
    BELGIUM_FRENCH = "15"
    SWITZERLAND_GERMAN = "16"
    SWITZERLAND_FRENCH = "17"
    IRELAND = "18"
    CANADA_FRENCH = "19"
    HONG_KONG = "20"
    NEW_ZEALAND = "21"
    SINGAPORE = "22"
    ITALY = "23"


class Url:
    """
    Helper URL generator that generates full URLs for glassdoor.com pages
    from given employer name and ID
    For example:
    > Url.overview("eBay Motors Group", "4189745")
    https://www.glassdoor.com/Overview/Working-at-eBay-Motors-Group-EI_IE4189745.11,28.htm

    Note that URL formatting is important when it comes to scraping Glassdoor
    as unusual URL formats can lead to scraper blocking.

    Every page method takes the employer name, the employer ID and an optional
    `Region`; when a region is given its value is added to the URL as the
    `filter.countryId` query parameter.
    """

    @staticmethod
    def _add_region(url: str, region: Optional["Region"]) -> str:
        """Return `url` with the country filter appended when `region` is set."""
        if not region:
            return url
        # Some base URLs already end with "?"; adding a second one would
        # produce a malformed "??filter.countryId=..." query string.
        separator = "" if url.endswith("?") else "?"
        return f"{url}{separator}filter.countryId={region.value}"

    @staticmethod
    def overview(employer: str, employer_id: str, region: Optional["Region"] = None) -> str:
        """Build the URL of the employer's overview page."""
        employer = employer.replace(" ", "-")
        prefix = "Working-at-"
        # glassdoor is allowing any prefix for employer name and
        # to indicate the prefix suffix numbers are used like:
        # https://www.glassdoor.com/Overview/Working-at-eBay-Motors-Group-EI_IE4189745.11,28.htm
        # 11,28 is the slice where employer name is.
        # The slice always starts right after the fixed prefix; searching for
        # the name instead would pick the wrong offset for a name that also
        # occurs inside "Working-at-" (e.g. "at").
        start = len(prefix)
        end = start + len(employer)
        url = f"https://www.glassdoor.com/Overview/{prefix}{employer}-EI_IE{employer_id}.{start},{end}.htm"
        return Url._add_region(url, region)

    @staticmethod
    def reviews(employer: str, employer_id: str, region: Optional["Region"] = None) -> str:
        """Build the URL of the employer's reviews page."""
        employer = employer.replace(" ", "-")
        url = f"https://www.glassdoor.com/Reviews/{employer}-Reviews-E{employer_id}.htm?"
        return Url._add_region(url, region)

    @staticmethod
    def salaries(employer: str, employer_id: str, region: Optional["Region"] = None) -> str:
        """Build the URL of the employer's salaries page."""
        employer = employer.replace(" ", "-")
        url = f"https://www.glassdoor.com/Salary/{employer}-Salaries-E{employer_id}.htm?"
        return Url._add_region(url, region)

    @staticmethod
    def jobs(employer: str, employer_id: str, region: Optional["Region"] = None) -> str:
        """Build the URL of the employer's jobs page."""
        employer = employer.replace(" ", "-")
        url = f"https://www.glassdoor.com/Jobs/{employer}-Jobs-E{employer_id}.htm?"
        return Url._add_region(url, region)

    @staticmethod
    def change_page(url: str, page: int) -> str:
        """
        Update page number in a glassdoor url.

        An existing `_P<number>` marker in front of `.htm` is replaced;
        otherwise `_P<page>` is inserted in front of `.htm`. An AssertionError
        is raised when the URL comes out unchanged.
        """
        # The dot is escaped so that only a literal ".htm" is rewritten; an
        # unescaped "." would also match sequences such as "Xhtm" elsewhere.
        new = re.sub(r"(?:_P\d+)?\.htm", f"_P{page}.htm", url)
        assert new != url, f"could not change the page of {url!r} to {page}"
        return new

In [9]:
# Import the necessary modules
import asyncio

# Define your search query
query = "ibis"  # Replace with the company you are interested in

# Find the company
async def get_company_info(query: str):
    companies = await find_companies(query)
    if companies:
        return companies[0]  # Get the first match
    else:
        raise ValueError("No company found")

# Run the function in the existing event loop
company_info = await get_company_info(query)
print(company_info)

{'name': 'IBIS', 'id': '1443740', 'url_overview': 'https://www.glassdoor.com/Overview/Working-at-IBIS-EI_IE1443740.11,15.htm', 'url_jobs': 'https://www.glassdoor.com/Jobs/IBIS-Jobs-E1443740.htm?', 'url_reviews': 'https://www.glassdoor.com/Reviews/IBIS-Reviews-E1443740.htm?', 'url_salaries': 'https://www.glassdoor.com/Salary/IBIS-Salaries-E1443740.htm?'}


In [47]:
# Display the reviews URL (assigned in the cell that calls Url.reviews).
reviews_url

'https://www.glassdoor.com/Reviews/IBIS-Reviews-E1443740.htm?'

In [46]:
# Define the URL for reviews based on the company_info obtained
company_id = company_info["id"]
reviews_url = Url.reviews(company_info["name"], company_id)

# Scrape the reviews
async def get_reviews(url: str):
    reviews = await scrape_reviews(url)
    return reviews

# Run the function to get reviews
reviews_data = await get_reviews(reviews_url)
#print(reviews_data)

[32m2024-09-15 18:30:56.683[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_reviews[0m:[36m79[0m - [1mscraping reviews from https://www.glassdoor.com/Reviews/IBIS-Reviews-E1443740.htm?[0m
[32m2024-09-15 18:31:11.380[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_reviews[0m:[36m87[0m - [1mscraped first page of reviews of https://www.glassdoor.com/Reviews/IBIS-Reviews-E1443740.htm?, scraping remaining 26 pages[0m
[32m2024-09-15 18:34:19.864[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_reviews[0m:[36m97[0m - [1mscraped 263 reviews from https://www.glassdoor.com/Reviews/IBIS-Reviews-E1443740.htm? in 27 pages[0m


In [37]:
from pprint import pprint
# Pretty-print the whole scraped payload, passing indent=4 to pprint.
pprint(reviews_data,  indent=4)

{   '__typename': 'EmployerReviewsRG',
    'allReviewsCount': 837,
    'currentPage': 1,
    'filteredReviewsCount': 263,
    'lastReviewDateTime': '2024-09-09T17:44:28.277',
    'numberOfPages': 27,
    'queryJobTitle': None,
    'queryLocation': None,
    'ratedReviewsCount': 709,
    'ratingCountDistribution': {   '__typename': 'RatingCountDistribution',
                                   'careerOpportunities': {   '_1': 79,
                                                              '_2': 101,
                                                              '_3': 126,
                                                              '_4': 127,
                                                              '_5': 144,
                                                              '__typename': 'FiveStarRatingCountDistribution'},
                                   'compensationAndBenefits': {   '_1': 79,
                                                                  '_2': 103,
           

                       'employer': {   '__typename': 'Employer',
                                       'activeStatus': 'INACTIVE',
                                       'approvalStatus': 'APPROVED',
                                       'bestLedCompanies({"onlyCurrent":true})': [   ],
                                       'bestPlacesToWork({"onlyCurrent":true})': [   ],
                                       'bestProfile': {   '__typename': 'EmployerProfile',
                                                          'id': 8038705,
                                                          'rowProfile': False},
                                       'counts': {   '__typename': 'EmployerCounts',
                                                     'benefitCount': 154,
                                                     'globalJobCount': {   '__typename': 'EmployerJobCountHolder',
                                                                           'jobCount': 80},
             

                                       'primaryIndustry': {   '__typename': 'EmployerIndustry',
                                                              'industryId': 200139,
                                                              'industryName': 'Hotéis '
                                                                              'e '
                                                                              'resorts',
                                                              'sectorId': 10025,
                                                              'sectorName': 'Hotel '
                                                                            'e '
                                                                            'acomodações '
                                                                            'de '
                                                                            'viagem'},
                                       'ratings': {  

                   {   '__typename': 'EmployerReviewRG',
                       'advice': None,
                       'adviceOriginal': None,
                       'cons': 'Muitas questões pessoais atrapalhando o '
                               'profissional',
                       'consOriginal': None,
                       'countHelpful': 0,
                       'countNotHelpful': 0,
                       'employer': {   '__typename': 'Employer',
                                       'activeStatus': 'INACTIVE',
                                       'approvalStatus': 'APPROVED',
                                       'bestLedCompanies({"onlyCurrent":true})': [   ],
                                       'bestPlacesToWork({"onlyCurrent":true})': [   ],
                                       'bestProfile': {   '__typename': 'EmployerProfile',
                                                          'id': 8038705,
                                                          'rowP

In [44]:
# def get_reviews_by_page(data, page_number):
#     page_key = f'page_{page_number}'
#     return data.get('reviews', {}).get(page_key, [])

# # Exemplo de uso:
# page_number = 2
# reviews_page_2 = get_reviews_by_page(reviews_data, page_number)
# print(reviews_page_2)

In [45]:
# Pretty-print only the "reviews" entry of the payload (the individual reviews).
pprint(reviews_data['reviews'], indent=4)

[   {   '__typename': 'EmployerReviewRG',
        'advice': None,
        'adviceOriginal': None,
        'cons': 'Não encontrei nenhum ponto negativo',
        'consOriginal': None,
        'countHelpful': 0,
        'countNotHelpful': 0,
        'employer': {   '__typename': 'Employer',
                        'activeStatus': 'INACTIVE',
                        'approvalStatus': 'APPROVED',
                        'bestLedCompanies({"onlyCurrent":true})': [],
                        'bestPlacesToWork({"onlyCurrent":true})': [],
                        'bestProfile': {   '__typename': 'EmployerProfile',
                                           'id': 8038705,
                                           'rowProfile': False},
                        'counts': {   '__typename': 'EmployerCounts',
                                      'benefitCount': 154,
                                      'globalJobCount': {   '__typename': 'EmployerJobCountHolder',
                                    

        'employer': {   '__typename': 'Employer',
                        'activeStatus': 'INACTIVE',
                        'approvalStatus': 'APPROVED',
                        'bestLedCompanies({"onlyCurrent":true})': [],
                        'bestPlacesToWork({"onlyCurrent":true})': [],
                        'bestProfile': {   '__typename': 'EmployerProfile',
                                           'id': 8038705,
                                           'rowProfile': False},
                        'counts': {   '__typename': 'EmployerCounts',
                                      'benefitCount': 154,
                                      'globalJobCount': {   '__typename': 'EmployerJobCountHolder',
                                                            'jobCount': 80},
                                      'interviewCount': 76,
                                      'photoCount': 4,
                                      'reviewCount': 837,
                          

                        'restOfWorldProfile': {   '__typename': 'EmployerProfile',
                                                  'id': 8038705,
                                                  'rowProfile': False},
                        'shortName': 'ibis',
                        'size': 'Mais de 10.000 funcionários',
                        'squareLogoUrl': 'https://media.glassdoor.com/sql/1443740/ibis-squarelogo-1496146680356.png',
                        'squareLogoUrl({"size":"LARGE"})': 'https://media.glassdoor.com/sqll/1443740/ibis-squarelogo-1496146680356.png',
                        'squareLogoUrl({"size":"REGULAR"})': 'https://media.glassdoor.com/sql/1443740/ibis-squarelogo-1496146680356.png',
                        'subsidiaries': [],
                        'website': 'ibis.accor.com'},
        'employerResponses': [],
        'employmentStatus': None,
        'featured': False,
        'flaggingDisabled': None,
        'isCurrentJob': True,
        'jobTitle': {  