In [1]:
import warnings
# Silence every warning for the rest of the session: only the "ignore" action
# is given, so the filter is not restricted to any category or module.
warnings.filterwarnings("ignore")

In [2]:
import requests
import pandas as pd

from re import sub
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

from typing import Iterable, Union

# Lift pandas' display limits so DataFrames are rendered with all of their
# columns and rows (None means "no limit" for both options).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Utilidades para *scrapear* Clutch.co

In [3]:
def get_search_page(category: str, page_number: int) -> BeautifulSoup:
  """Fetch and parse one results page of a Clutch Argentina category listing.

  Args:
    category: Category slug as it appears in the URL path
      (``https://clutch.co/ar/<category>``).
    page_number: Value sent as the ``page`` query parameter.

  Returns:
    The parsed HTML of the search page.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.Timeout: If the server does not respond within 30 seconds.
  """
  response = requests.get(
    f'https://clutch.co/ar/{category}',
    params={'page': page_number},
    timeout=30,  # without a timeout a stalled connection blocks the notebook forever
  )
  # An error page has neither results nor pagination links, so it would be
  # mistaken for an empty last page; fail loudly instead.
  response.raise_for_status()
  # Name the parser explicitly so the result does not depend on which optional
  # parsers happen to be installed.
  return BeautifulSoup(response.text, 'html.parser')

def get_company_profile(profile_link: str) -> BeautifulSoup:
  """Fetch and parse the Clutch profile page of a company.

  Args:
    profile_link: The ``href`` of the profile link, either relative to
      ``https://clutch.co/`` (with or without a leading slash) or absolute.

  Returns:
    The parsed HTML of whatever the server returned. The HTTP status is not
    checked, so an error page is parsed like any other page.

  Raises:
    requests.Timeout: If the server does not respond within 30 seconds.
  """
  from urllib.parse import urljoin

  # urljoin avoids the double slash ("https://clutch.co//profile/...") that
  # plain string formatting produces when the href already starts with "/".
  profile_url = urljoin('https://clutch.co/', profile_link)
  response = requests.get(profile_url, timeout=30)
  # Name the parser explicitly so the result does not depend on which optional
  # parsers happen to be installed.
  return BeautifulSoup(response.text, 'html.parser')

def is_last_page(beautiful_soup_page: BeautifulSoup) -> bool:
  """Tell whether a search page is the final page of its category.

  The page counts as the last one when its markup contains no element
  matching the ``li.page-item.last > a`` selector.
  """
  last_page_links = beautiful_soup_page.select('li.page-item.last > a')
  return len(last_page_links) == 0

def get_element_text_if_exists(elements: Iterable) -> Union[str, None]:
  """Return the stripped text of the first element, if there is one.

  Args:
    elements: A sized, indexable collection of objects exposing a ``text``
      attribute, or ``None``.

  Returns:
    ``elements[0].text`` with surrounding whitespace removed, or ``None``
    when ``elements`` is ``None`` or empty.
  """
  if elements is None or len(elements) == 0:
    return None

  first_element = elements[0]
  return first_element.text.strip()

def snake_case(value: str) -> str:
  """Convert a string to ``snake_case``.

  Every character that is not an ASCII letter, a digit, whitespace or an
  underscore is dropped (not replaced). The remaining text is split on
  whitespace, joined with underscores and lower-cased, e.g.
  ``'Min. project size'`` becomes ``'min_project_size'`` and
  ``'UX/UI Design'`` becomes ``'uxui_design'``.

  Args:
    value: The text to convert.

  Returns:
    The converted string; empty when nothing survives the filtering.
  """
  # Raw string: in a normal literal "\s" and "\_" are invalid escape sequences
  # that Python warns about. The underscore needs no escaping inside the class.
  cleaned = sub(r'[^0-9a-zA-Z\s_]+', '', value)
  return '_'.join(cleaned.split()).lower()

In [4]:
def _parse_tooltip(element):
  """Parse the HTML fragment in `data-content`; None if it is missing or empty."""
  tooltip = element.attrs.get('data-content')
  if not tooltip:
    return None
  return BeautifulSoup(tooltip, 'html.parser')


def create_row(provider_li: BeautifulSoup) -> dict:
  """Build one data row from a provider entry of the search results list.

  The row combines what the listing entry shows (name, rating, reviews,
  listing properties and "Service focus" percentages) with the "Client focus"
  breakdown, which is read from the company profile page. Fetching that page
  costs one extra HTTP request per provider.

  Pieces that are absent from the markup are skipped instead of raising, so a
  single incomplete entry does not abort the whole scrape.

  Args:
    provider_li: The element of one provider in the results list.

  Returns:
    A dict mapping column names to scraped text. ``company_name``, ``rating``
    and ``reviews`` are always present (possibly ``None``); the remaining keys
    depend on what the entry and its profile contain.
  """
  row = {}

  # Name, rating and review count shown on the listing entry itself
  company_info = provider_li.select('h3.company_info a')
  row['company_name'] = get_element_text_if_exists(company_info)
  row['rating'] = get_element_text_if_exists(provider_li.select('.rating'))
  row['reviews'] = get_element_text_if_exists(provider_li.select('.reviews-link'))

  # Listing properties: Min. project size, Avg. hourly rate, Employees, Location.
  # The column name is the text of the tooltip stored in `data-content`.
  for item in provider_li.select('.module-list > div'):
    tooltip = _parse_tooltip(item)
    if tooltip is None:
      continue  # no tooltip means no column name to store the value under
    row[tooltip.text] = get_element_text_if_exists(item.select('span'))

  # "Service focus" percentages: the tooltip holds the name in <b> and the value in <i>
  for percentage in provider_li.select('.chartAreaContainer > div'):
    content = _parse_tooltip(percentage)
    if content is None or content.b is None or content.i is None:
      continue
    row[f'service_focus_{content.b.text}'] = content.i.text

  # "Client focus" is only available on the company profile page
  profile_anchors = provider_li.select('li.website-profile > a')
  profile_link = profile_anchors[0].attrs.get('href') if profile_anchors else None
  if not profile_link:
    return row  # no profile link: keep what the listing entry provided

  profile = get_company_profile(profile_link)
  client_focus = profile.find("div", string='Client focus')
  # The values live in the element right after the "Client focus" heading
  client_focus_values = client_focus.find_next_sibling() if client_focus is not None else None

  if client_focus_values is not None:
    for client in client_focus_values.select('div.grid.custom_popover'):
      content = _parse_tooltip(client)
      if content is None or content.b is None:
        continue
      # Stripped for consistency with the fields read via get_element_text_if_exists
      row[f'client_focus_{content.b.text}'] = client.text.strip()

  return row

In [5]:
def get_companies(category: str) -> pd.DataFrame:
  """Scrape every company listed under a Clutch category.

  Search pages are requested one after another, starting at page 0, and each
  provider entry is turned into a row with ``create_row``. A progress bar
  counts the pages processed so far.

  Args:
    category: Category slug passed on to ``get_search_page``.

  Returns:
    A DataFrame with one row per provider entry found; empty when the
    category has no results.
  """
  page_number = 0
  rows = []

  with tqdm() as pbar:
    pbar.set_description(f"Categoria: {category}")

    while True:
      page = get_search_page(category, page_number)
      company_list = page.select('ul.directory-list li.provider-row')
      rows.extend(create_row(company) for company in company_list)
      # Tick only once the page has actually been processed
      pbar.update(1)

      # Stop on the last page. A page without providers also ends the loop, so
      # a pagination layout that keeps showing the "last" link cannot make the
      # loop request pages forever.
      if not company_list or is_last_page(page):
        break

      page_number += 1

  return pd.DataFrame(rows)

Categorías a *scrapear* de Clutch.co

In [6]:
# Category slugs to scrape, in the order they are processed
categories = [
    'app-developers',
    'web-developers',
    'developers',
]

Se *scrapean* las empresas de las categorías

In [7]:
# Scrape each category into its own DataFrame and stack them into one table.
# `companies` is bound once, directly to the final DataFrame, so the name never
# refers to an intermediate list. ignore_index=True renumbers the rows 0..n-1
# instead of repeating each category's own 0-based labels.
companies = pd.concat(
    [get_companies(category) for category in categories],
    ignore_index=True,
)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Algunas filas:

In [8]:
# Preview the first rows of the scraped table
companies.head()

Unnamed: 0,company_name,rating,reviews,Min. project size,Avg. hourly rate,Employees,Location,service_focus_Mobile App Development,service_focus_Custom Software Development,service_focus_Web Development,client_focus_Midmarket ($10M - $1B),client_focus_Enterprise (>$1B),client_focus_Small Business (<$10M),service_focus_Artificial Intelligence,service_focus_UX/UI Design,service_focus_Application Testing,service_focus_Architectural Design,service_focus_IT Staff Augmentation,service_focus_IT Strategy Consulting,service_focus_BI & Big Data Consulting & SI,service_focus_Blockchain,service_focus_Cloud Consulting & SI,service_focus_ERP Consulting and SI,service_focus_E-Commerce Development,service_focus_Enterprise App Modernization,service_focus_AR/VR Development,service_focus_CRM Consulting and SI,service_focus_Social Media Marketing,service_focus_Content Marketing,service_focus_Digital Strategy,service_focus_Web Design,service_focus_IT Managed Services,service_focus_Other,service_focus_Other IT Consulting and SI,service_focus_Application Management & Support,service_focus_IoT Development,service_focus_Product Design,service_focus_Other Application Development,service_focus_Search Engine Optimization,service_focus_Graphic Design,service_focus_HR Services,service_focus_Wearable App Development,service_focus_Email Marketing,service_focus_Branding,service_focus_Business Consulting,service_focus_Advertising,service_focus_Logo,service_focus_Video Production,service_focus_Market Research,service_focus_Pay Per Click,service_focus_Cybersecurity,service_focus_Accounting,service_focus_Translation,service_focus_Other Design,service_focus_Other Digital Marketing,service_focus_Mobile & App Marketing,service_focus_Sales Outsourcing,service_focus_Media Planning & Buying,service_focus_Back Office Outsourcing,service_focus_Conversion Optimization,service_focus_Corporate Training,service_focus_Marketing Strategy,service_focus_Public Relations,service_focus_Corporate 
Photography,service_focus_Packaging Design,service_focus_Print Design,service_focus_Call Center Services,service_focus_Customer Service Outsourcing,service_focus_Ocean Freight,service_focus_Logistics & Supply Chain Consulting,service_focus_Trucking,service_focus_Air Freight,service_focus_Freight forwarding,service_focus_Customs brokerage,"service_focus_Commercial Financing, Funding, & Investment",service_focus_Unified Communications Consulting & SI
0,404 // Software crafters,4.7,5 reviews,"$10,000+",$25 - $49 / hr,10 - 49,"Buenos Aires, Argentina",40%,30%,30%,80%,10%,10%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Rootstrap,4.8,34 reviews,"$50,000+",$100 - $149 / hr,50 - 249,"Buenos Aires, Argentina",25%,10%,40%,40%,25%,35%,25%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Patagonian,4.9,6 reviews,"$50,000+",$25 - $49 / hr,50 - 249,"General Roca, Argentina",40%,40%,,50%,10%,40%,,20%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,XOOR Inc.,5.0,21 reviews,"$50,000+",$50 - $99 / hr,10 - 49,"Mar del Plata, Argentina",50%,,50%,40%,,60%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Flux IT,4.9,11 reviews,"$10,000+",Undisclosed,50 - 249,"Gonnet, Argentina",30%,40%,30%,50%,25%,25%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Se renombran las columnas para que sean `snake_case`.

In [9]:
# Normalise every column label with snake_case; re-running this cell is
# harmless because already-normalised labels are left unchanged.
companies.columns = [snake_case(column_name) for column_name in companies.columns]

In [10]:
# Show the column labels to check the result of the renaming
companies.columns

Index(['company_name', 'rating', 'reviews', 'min_project_size',
       'avg_hourly_rate', 'employees', 'location',
       'service_focus_mobile_app_development',
       'service_focus_custom_software_development',
       'service_focus_web_development', 'client_focus_midmarket_10m_1b',
       'client_focus_enterprise_1b', 'client_focus_small_business_10m',
       'service_focus_artificial_intelligence', 'service_focus_uxui_design',
       'service_focus_application_testing',
       'service_focus_architectural_design',
       'service_focus_it_staff_augmentation',
       'service_focus_it_strategy_consulting',
       'service_focus_bi_big_data_consulting_si', 'service_focus_blockchain',
       'service_focus_cloud_consulting_si',
       'service_focus_erp_consulting_and_si',
       'service_focus_ecommerce_development',
       'service_focus_enterprise_app_modernization',
       'service_focus_arvr_development', 'service_focus_crm_consulting_and_si',
       'service_focus_social_media_

Se almacena la información tal cual es extraída para alimentar sucesivas etapas del proceso.

In [12]:
# Persist the table as scraped, without the row index, for the later stages
companies.to_csv(
    path_or_buf='../data/00 - raw/companies_raw.csv',
    index=False,
)