In [1]:
from datetime import datetime
import random
import re
import requests
import time

import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# load the HTML content from url
base_url = "https://www.kiwoko.com"
dogs_search_url = base_url + "/servicios/kiwokoadopta/animal/buscar/Perro"

In [3]:
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run without a GUI

# Create and initialize the Chrome driver
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options
)
driver.get(dogs_search_url)
wait = WebDriverWait(driver, 10)

# Wait for the cookie consent popup and reject all cookies
element = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler")))
element.click()

In [4]:
# Load all dog cards by clicking the "Show More" button
dog_entries = []
retries_left = 5
while True:
    try:
        # Get the current page source
        wait = WebDriverWait(driver, 10)
        content = driver.page_source

        # parse the HTML content using BeautifulSoup
        dogs_list = BeautifulSoup(content, "html.parser")
        new_dog_entries = dogs_list.select_one("#petSearch").select_one(".petContainer").find_all("div", recursive=False)

        # Check if new entries have been loaded
        if len(new_dog_entries) == len(dog_entries):
            if retries_left > 0:
                print(f"No new dog entries found. Retrying... ({retries_left} retries left)")
            else:
                retries_left -= 1
                print("No new dog entries found. Stopping the scraping process.")
                break
        else:
            # Reset retries if new entries are found
            retries_left = 5

            # Add new dog entries to the list
            dog_entries = new_dog_entries
            print(f"Found {len(dog_entries)} dog entries...")

        # Click the "Show More" button
        element = wait.until(EC.element_to_be_clickable((By.ID, "showMorePets")))
        driver.execute_script("arguments[0].click();", element)

        # Wait to avoid being blocked
        time.sleep(5 + random.random())

    except Exception as e:
        print("An error occurred:", e)
        break

# Print the number of dog entries found so far
print(f"Total dog entries found: {len(dog_entries)}")

Found 9 dog entries...
Found 18 dog entries...
Found 27 dog entries...
Found 36 dog entries...
Found 45 dog entries...
Found 54 dog entries...
Found 63 dog entries...
Found 72 dog entries...
Found 81 dog entries...
Found 90 dog entries...
Found 99 dog entries...
Found 108 dog entries...
Found 117 dog entries...
Found 126 dog entries...
Found 135 dog entries...
Found 144 dog entries...
Found 153 dog entries...
Found 162 dog entries...
Found 171 dog entries...
Found 180 dog entries...
Found 189 dog entries...
Found 198 dog entries...
Found 207 dog entries...
Found 216 dog entries...
Found 225 dog entries...
Found 234 dog entries...
Found 243 dog entries...
Found 252 dog entries...
Found 261 dog entries...
Found 270 dog entries...
Found 279 dog entries...
Found 288 dog entries...
Found 297 dog entries...
Found 306 dog entries...
Found 315 dog entries...
Found 324 dog entries...
Found 333 dog entries...
Found 342 dog entries...
Found 351 dog entries...
Found 360 dog entries...
Found 369 do

In [5]:
# shuffle the dog entries to randomize the order
random.shuffle(dog_entries)

# Extract the dogs information from each entry
dogs_dataset = {}
for current_dog in dog_entries:
    current_dog_info = {}

    # Get the URL of the dog's information page
    dog_info_url = base_url + current_dog.select_one("a").get("href")

    # Get the dog ID and name
    dog_id = dog_info_url.split("/")[-1]
    current_dog_info["name"] = current_dog.select_one(".petName").text.strip()

    # Get the dog gender
    dog_gender_icon_name = current_dog.select_one("img").get("src")
    if re.search("man", dog_gender_icon_name) is not None:
        current_dog_info["gender"] = "Macho"
    elif re.search("female", dog_gender_icon_name) is not None:
        current_dog_info["gender"] = "Hembra"
    else:
        current_dog_info["gender"] = None

    dog_age, dog_size = None, None
    for info in current_dog.find_all("p"):

        # Get the dog age
        if info.text.startswith("Edad:"):
            dog_age = int(info.text.split()[1].strip())

            if info.text.strip().endswith("meses"):
                # Convert months to years
                dog_age = round(dog_age / 12, ndigits=2)

            current_dog_info["age"] = dog_age

        # Get the dog size
        elif info.text.startswith("Tamaño:"):
            dog_size = info.text.split(": ")[1].strip()

            if dog_size == "Toy":
                dog_size = "Enano"
            elif dog_size == "S/P":
                dog_size = None

            current_dog_info["size"] = dog_size

    # Wait to avoid being blocked
    time.sleep(5 + random.random())

    try:
        response = requests.get(dog_info_url)
        response.raise_for_status()  # Raise an error for bad responses
        html_content = response.text

    except requests.HTTPError as e:
        print(f"Failed to retrieve the webpage. Status code: {e.response.status_code}")
        continue

    else:
        dog_extra_info = BeautifulSoup(html_content, "html.parser")

        current_dog_info["urgent_adoption"] = dog_extra_info.find("div", class_="urgentChip") is not None

        for data_item in dog_extra_info.find_all("div", class_="petDataItem"):
            data_item_info = data_item.find_all("p")

            if data_item_info[0].text.strip() == "Raza":
                current_dog_info["breed"] = data_item_info[1].text.strip()

            elif data_item_info[0].text.strip() == "Provincia":
                current_dog_info["province"] = data_item_info[1].text.strip()

            elif data_item_info[0].text.strip() == "Puedo viajar para adopción":
                current_dog_info["can_travel"] = True if data_item_info[1].text.strip() == "Sí" else False

            elif data_item_info[0].text.strip() == "Vacunado":
                current_dog_info["is_vaccinated"] = True if data_item_info[1].text.strip() == "Sí" else False

            elif data_item_info[0].text.strip() == "Desparasitado":
                current_dog_info["is_dewormed"] = True if data_item_info[1].text.strip() == "Sí" else False

            elif data_item_info[0].text.strip() == "Sano":
                current_dog_info["is_healthy"] = True if data_item_info[1].text.strip() == "Sí" else False

            elif data_item_info[0].text.strip() == "Esterilizado":
                current_dog_info["is_sterilized"] = True if data_item_info[1].text.strip() == "Sí" else False

            elif data_item_info[0].text.strip() == "Identificado":
                current_dog_info["is_identified"] = True if data_item_info[1].text.strip() == "Sí" else False

            elif data_item_info[0].text.strip() == "Microchip":
                current_dog_info["has_microchip"] = True if data_item_info[1].text.strip() == "Sí" else False

            elif data_item_info[0].text.strip() == "Pasaporte":
                current_dog_info["has_passport"] = True if data_item_info[1].text.strip() == "Sí" else False

            elif data_item_info[0].text.strip() == "Perros":
                current_dog_info["good_with_children"] = True if data_item_info[1].text.strip() == "Sí" else False

            elif data_item_info[0].text.strip() == "Gatos":
                current_dog_info["good_with_cats"] = True if data_item_info[1].text.strip() == "Sí" else False

            elif data_item_info[0].text.strip() == "Niños":
                current_dog_info["good_with_dogs"] = True if data_item_info[1].text.strip() == "Sí" else False

        if (personalities := dog_extra_info.find("div", class_="personalities")) is not None:
            for personality in personalities.find_all("p"):
                if personality.text.strip() == "Cariñoso":
                    current_dog_info["is_affectionate"] = True

                elif personality.text.strip() == "Hiperactivo":
                    current_dog_info["is_hyperactive"] = True

                elif personality.text.strip() == "Miedoso":
                    current_dog_info["is_fearful"] = True

                elif personality.text.strip() == "Sedentario":
                    current_dog_info["is_sedentary"] = True

                elif personality.text.strip() == "Sociable":
                    current_dog_info["is_sociable"] = True

                elif personality.text.strip() == "Tranquilo":
                    current_dog_info["is_calm"] = True

                elif personality.text.strip() == "Necesita cuidados veterinarios":
                    current_dog_info["needs_vet_care"] = True

        if (description_1 := dog_extra_info.select_one("#collapseSeven")) is not None:
            current_dog_info["description_1"] = re.sub(r"\s+", " ", description_1.find("p", class_="petDescription").text).strip()
        else:
            current_dog_info["description_1"] = None

        if (description_2 := dog_extra_info.select_one("#collapsefour")) is not None:
            current_dog_info["description_2"] = re.sub(r"\s+", " ", description_2.find("p", class_="petDescription").text).strip()
        else:
            current_dog_info["description_2"] = None

        current_dog_info["img_url"] = base_url + current_dog.find("div", class_="similarPetImage").get("style").split("url('")[1].split("')")[0]

    finally:
        # Add the dog info URL to the current dog's information
        current_dog_info["info_url"] = dog_info_url

        # Update the dataset entry with the new information
        dogs_dataset[dog_id] = current_dog_info

        # print("Dog info:", pd.Series(current_dog_info))
        print(f"{current_dog_info["name"]} has been added to the dataset with ID {dog_id}.")

JACKY has been added to the dataset with ID 12519.
BOMBON has been added to the dataset with ID 10947.
PRISCILA has been added to the dataset with ID 4465.
BORA has been added to the dataset with ID 12713.
COCO has been added to the dataset with ID 12868.
Cora has been added to the dataset with ID 11820.
PERIQUILLA has been added to the dataset with ID 6527.
Chispa has been added to the dataset with ID 1089.
Kiya has been added to the dataset with ID 3832.
PITINGO has been added to the dataset with ID 13008.
😍 El pequeño MONTY busca hogar has been added to the dataset with ID 9943.
OSCAR has been added to the dataset with ID 10786.
FLECHER has been added to the dataset with ID 6294.
NICO has been added to the dataset with ID 13393.
FREYA has been added to the dataset with ID 4528.
INDIA has been added to the dataset with ID 8299.
TOFFEE has been added to the dataset with ID 6616.
MUFASA has been added to the dataset with ID 3868.
TOGO has been added to the dataset with ID 12457.
Wilma-

In [6]:
# Convert the dataset dictionary to a DataFrame and display the first few rows
dogs_df = pd.DataFrame.from_dict(dogs_dataset, orient="index")
dogs_df.head()

Unnamed: 0,name,gender,age,size,urgent_adoption,breed,province,is_vaccinated,is_dewormed,is_healthy,...,description_1,description_2,img_url,info_url,can_travel,is_calm,needs_vet_care,is_hyperactive,is_fearful,is_sedentary
12519,JACKY,Macho,2.0,Mediano,True,Bretón,Madrid,True,True,True,...,JACKY es un bretoncito que nos ha conquistado ...,"Perro joven, alegre y cariñoso",https://www.kiwoko.com/servicios/kiwokoadopta/...,https://www.kiwoko.com/servicios/kiwokoadopta/...,,,,,,
10947,BOMBON,Macho,0.92,Mediano,False,MESTIZO,Madrid,True,True,True,...,"Bombón es un simpático cachorro de 8 meses, na...",,https://www.kiwoko.com/servicios/kiwokoadopta/...,https://www.kiwoko.com/servicios/kiwokoadopta/...,,,,,,
4465,PRISCILA,Hembra,2.0,Grande,False,MASTIN,Córdoba,True,True,True,...,"Una belleza de perrita, buena, tierna, sociabl...",,https://www.kiwoko.com/servicios/kiwokoadopta/...,https://www.kiwoko.com/servicios/kiwokoadopta/...,True,,,,,
12713,BORA,Hembra,0.25,Mediano,False,X Pastor Alemán,Córdoba,True,True,True,...,*BORA* Bora y su hermana son dos bebés encanta...,*BORA* Sexo: Hembra Raza: X Pastor Alemán Fech...,https://www.kiwoko.com/servicios/kiwokoadopta/...,https://www.kiwoko.com/servicios/kiwokoadopta/...,True,,,,,
12868,COCO,Macho,3.0,Grande,False,Mestizo,Barcelona,True,True,True,...,Se encontraba abandonado.,"Coco es un cruce de los más bonitos, probablem...",https://www.kiwoko.com/servicios/kiwokoadopta/...,https://www.kiwoko.com/servicios/kiwokoadopta/...,True,,,,,


In [7]:
dogs_df = dogs_df.sort_index()[[
    "name",
    "breed",
    "gender",
    "age",
    "size",
    "province",
    "can_travel",
    "urgent_adoption",
    "needs_vet_care",
    "is_vaccinated",
    "is_dewormed",
    "is_healthy",
    "is_sterilized",
    "is_identified",
    "has_microchip",
    "has_passport",
    "good_with_children",
    "good_with_cats",
    "good_with_dogs",
    "is_affectionate",
    "is_hyperactive",
    "is_fearful",
    "is_sociable",
    "is_calm",
    "is_sedentary",
    "description_1",
    "description_2",
    "img_url",
    "info_url",
]]
dogs_df.head()

Unnamed: 0,name,breed,gender,age,size,province,can_travel,urgent_adoption,needs_vet_care,is_vaccinated,...,is_affectionate,is_hyperactive,is_fearful,is_sociable,is_calm,is_sedentary,description_1,description_2,img_url,info_url
1001,TIRILLAS,Pitbull,Macho,6.0,Grande,Toledo,,False,,True,...,True,,,True,True,,A este bombón le encontramos junto a sus tres ...,,https://www.kiwoko.com/servicios/kiwokoadopta/...,https://www.kiwoko.com/servicios/kiwokoadopta/...
1002,LEÓNIDAS,Mestizo,Macho,5.0,Grande,Toledo,,False,,True,...,True,,True,True,True,,Leónidas te rescatamos de un síndrome de Noe y...,,https://www.kiwoko.com/servicios/kiwokoadopta/...,https://www.kiwoko.com/servicios/kiwokoadopta/...
10026,Rex y Chapi,Mestizo y bodeguero,Macho,5.0,Pequeño,Sevilla,True,False,,True,...,True,,,True,,,REX Y CHAPI 💜 La historia de estos dos bombone...,Activos y cariñosos,https://www.kiwoko.com/servicios/kiwokoadopta/...,https://www.kiwoko.com/servicios/kiwokoadopta/...
1004,SUKO,CRUCE LABRADOR,Macho,3.0,Mediano,Madrid,,False,,True,...,True,,,,,,SUKO es un cruce de labrador nacido en junio d...,,https://www.kiwoko.com/servicios/kiwokoadopta/...,https://www.kiwoko.com/servicios/kiwokoadopta/...
10059,León,Mestizo,Macho,5.0,Mediano,Córdoba,True,True,,True,...,,,True,,,,LEÓN LLEVA 5 AÑOS EN EL REFUGIO 😔💔 Es uno de l...,"Se entrega con chip, pasaporte, vacunas corres...",https://www.kiwoko.com/servicios/kiwokoadopta/...,https://www.kiwoko.com/servicios/kiwokoadopta/...


In [8]:
for col in [
    "can_travel",
    "urgent_adoption",
    "needs_vet_care",
    "is_vaccinated",
    "is_dewormed",
    "is_healthy",
    "is_sterilized",
    "is_identified",
    "has_microchip",
    "has_passport",
    "good_with_children",
    "good_with_cats",
    "good_with_dogs",
    "is_affectionate",
    "is_hyperactive",
    "is_fearful",
    "is_sociable",
    "is_calm",
    "is_sedentary",
]:
    dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")

  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dogs_df[col].replace({np.nan: False}).astype("bool")
  dogs_df[col] = dog

In [9]:
for col in ["breed", "province",]:
    dogs_df[col] = dogs_df[col].fillna("Desconocida").astype("category")

In [10]:
# Export the DataFrame to a CSV file
dogs_df.to_csv(f"./kiwoko_dogs_data-{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.csv", encoding="utf-8-sig")