In [None]:
%%shell
# Environment setup: install Chromium, its WebDriver and the Selenium Python
# bindings on an Ubuntu image.
#
# Ubuntu no longer distributes chromium-browser outside of snap, so the
# packages are pulled from the Debian buster archive instead.
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
# (each source line points at its own keyring file, created further below)
cat > /etc/apt/sources.list.d/debian.list << "EOF"
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
# Fetch the three Debian signing keys from the Ubuntu keyserver ...
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# ... and export each one into the keyring file named by "signed-by" above.
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << "EOF"
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt so the cell also completes when the
# notebook is executed unattended (Restart & Run All).
apt-get update
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.yvXJZjDP9d/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.HHBa20gbG2/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.EwdfWpCi4W/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
Get:2 http://deb.debian.org/debian bust



In [None]:
import os
import requests
import time
import pandas as pd
import random
import hashlib
import urllib.parse
from io import BytesIO
from PIL import Image

from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Seconds an explicit WebDriverWait keeps polling before it gives up.
WEBDRIVER_DELAY_TIME_INT = 20
# Seconds handed to driver.implicitly_wait() below.
TIMEOUT_INT = 20

service = Service(executable_path=r"/usr/bin/chromedriver")

chrome_options = webdriver.ChromeOptions()
# "--headless" is the only headless switch needed; the former
# `chrome_options.headless = True` assignment duplicated it through a setter
# that is deprecated in recent Selenium 4 releases.
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Chrome expects the size as "width,height"; the "1920x1080" spelling is not
# the documented form of this switch.
chrome_options.add_argument("--window-size=1920,1080")

driver = webdriver.Chrome(service=service, options=chrome_options)
# NOTE(review): the Selenium documentation advises against combining an
# implicit wait with explicit waits because the timeouts can compound; it is
# kept here so element lookups behave as before.
driver.implicitly_wait(TIMEOUT_INT)
wait = WebDriverWait(driver, WEBDRIVER_DELAY_TIME_INT)

In [None]:
def get_image_links_from_page(page_url, driver):
    """Load a listing page and collect the image links it shows.

    Args:
        page_url: URL of the listing page to open.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        A list of ``(src, title)`` tuples, one per image element whose
        ``src`` attribute is non-empty. An empty list is returned (and the
        error printed) when waiting for or reading the elements fails.
    """
    driver.get(page_url)

    # Wait on the driver that was passed in rather than on the module-level
    # `wait` object, which is tied to one specific driver instance.
    page_wait = WebDriverWait(driver, WEBDRIVER_DELAY_TIME_INT)
    try:
        # Only used as a readiness check: the container must be present
        # before the individual items are looked up.
        page_wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.FS5UE28h.container")
        ))
        image_items = page_wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "div.LQY5mtmC div.aLnnpRah.text-center"))
        )

        image_links = []
        for img_elem in image_items:
            img_tag = img_elem.find_element(By.CSS_SELECTOR, "div.Mw1EAtrx img, img")

            img_url = img_tag.get_attribute("src")
            img_title = img_tag.get_attribute("title")
            if img_url:
                image_links.append((img_url, img_title))

        return image_links
    except Exception as e:
        # Deliberately best-effort: a page that cannot be parsed contributes
        # no links instead of aborting the whole crawl.
        print(f"Error while trying to extract images: {e}")
        return []

def hash_image_content(url, timeout=20):
    """Download ``url`` and return the MD5 hex digest of the response body.

    The digest serves as a content fingerprint for de-duplication.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned; without it a stalled connection would block forever.

    Returns:
        The hex digest string when the server answers with status 200,
        otherwise ``None`` (a message is printed for non-200 statuses and
        for request errors).
    """
    try:
        # The context manager releases the connection even when the body of
        # a streamed non-200 response is never read.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code == 200:
                return hashlib.md5(response.content).hexdigest()
            print(f"Error downloading image from {url}; status: {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error with the image download for {url}: {e}")
        return None

def convert_webp_to_jpg(webp_data):
    """Re-encode WebP image bytes as JPEG.

    Args:
        webp_data: Raw bytes of an image file.

    Returns:
        JPEG-encoded bytes when the input is detected as WebP. Bytes of any
        other format, or bytes that cannot be processed, are returned
        unchanged (a message is printed in the failure case).
    """
    try:
        picture = Image.open(BytesIO(webp_data))

        # Anything that is not WebP is handed back untouched.
        if picture.format != 'WEBP':
            return webp_data

        # JPEG cannot store an alpha channel, so RGBA input is flattened to
        # RGB before encoding.
        jpeg_source = picture.convert('RGB') if picture.mode == 'RGBA' else picture

        encoded = BytesIO()
        jpeg_source.save(encoded, format="JPEG")
        return encoded.getvalue()
    except Exception as e:
        print(f"Error converting WebP to JPG: {e}")
        return webp_data

def download_image(img_url, img_name, folder_path, timeout=20):
    """Download an image and write it into ``folder_path``.

    The response body is passed through ``convert_webp_to_jpg`` before it is
    written to disk.

    Args:
        img_url: Address of the image to fetch.
        img_name: File name (including extension) to write inside
            ``folder_path``.
        folder_path: Existing directory that receives the file.
        timeout: Seconds to wait for the server before the request is
            abandoned; without it a stalled connection would block forever.

    Returns:
        ``True`` when the file was written, ``False`` when the server did not
        answer with status 200 or the request failed (a message is printed
        in both failure cases).
    """
    try:
        # The context manager releases the connection even when the body of
        # a streamed non-200 response is never read.
        with requests.get(img_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Error downloading image from {img_url}; status: {response.status_code}")
                return False
            img_data = convert_webp_to_jpg(response.content)
    except requests.exceptions.RequestException as e:
        print(f"Error with the image download for {img_url}: {e}")
        return False

    img_path = os.path.join(folder_path, f"{img_name}")
    with open(img_path, "wb") as f:
        f.write(img_data)
    return True

def process_image_page(image_url, img_title, folder_path, idx, tag, seen_hashes):
    """Save one image if its content has not been seen before.

    Args:
        image_url: Address of the image.
        img_title: Title text recorded in the metadata.
        folder_path: Directory the image file is written to.
        idx: Integer used to build the zero-padded file name.
        tag: Tag the image was found under; prefixes the file name.
        seen_hashes: Set of content hashes already saved. It is updated in
            place, and only after the image has actually been stored.

    Returns:
        A metadata dict (``file_name``, ``image_url``, ``image_title``,
        ``tag``) for a newly saved image, or ``None`` when the image could
        not be hashed, duplicates earlier content, or could not be saved.
    """
    img_hash = hash_image_content(image_url)
    if not img_hash or img_hash in seen_hashes:
        return None

    # NOTE(review): convert_webp_to_jpg hands non-WebP bytes back unchanged,
    # so the stored file may not be JPEG-encoded despite the ".jpg" suffix.
    new_file_name = f"{tag}_{idx:07d}.jpg"
    download_image(image_url, new_file_name, folder_path)

    # download_image only prints on failure, so confirm the file exists
    # before recording it. The hash is left out of seen_hashes on failure so
    # a later copy of the same image can still be saved.
    if not os.path.isfile(os.path.join(folder_path, new_file_name)):
        return None

    seen_hashes.add(img_hash)
    return {
        "file_name": new_file_name,
        "image_url": image_url,
        "image_title": img_title,
        "tag": tag
    }

def loop_over_pages(base_url, tags, total_pages, driver, folder_path):
    """Crawl the listing pages of every tag and save the unique images.

    For each tag, pages ``1..total_pages`` are visited to collect image
    links, then every link is passed to ``process_image_page``.

    Args:
        base_url: Site root, without a trailing slash.
        tags: Iterable of tag names to crawl.
        total_pages: Number of listing pages to visit per tag.
        driver: Selenium WebDriver used to load the pages.
        folder_path: Directory for the image files; created if missing.

    Returns:
        A list with one metadata dict per saved image, across all tags.
    """
    os.makedirs(folder_path, exist_ok=True)
    all_metadata = []
    seen_hashes = set()
    # URLs that already produced a saved file. Re-processing them would only
    # download the image again to find its hash in seen_hashes.
    saved_urls = set()

    for tag in tags:
        # Percent-encode the tag so spaces, slashes and similar characters
        # cannot corrupt the URL path.
        tag_in_url = urllib.parse.quote(str(tag), safe="")
        all_images = []

        for page in tqdm(range(1, total_pages + 1), desc=f"Extracting Images for {tag}", unit="page"):
            page_url = f"{base_url}/emoji-list/tag/{tag_in_url}?page={page}"
            all_images.extend(get_image_links_from_page(page_url, driver))

            # Pause between page loads to limit the request rate.
            time.sleep(1)

        # idx counts every collected link, including skipped ones, so file
        # names keep the same numbering as the position in the listing.
        for idx, (img_url, img_title) in enumerate(all_images, start=1):
            if img_url in saved_urls:
                continue
            metadata = process_image_page(img_url, img_title, folder_path, idx, tag, seen_hashes)
            if metadata:
                saved_urls.add(img_url)
                all_metadata.append(metadata)

    return all_metadata

def save_metadata(metadata_list, metadata_file):
    """Write the collected metadata records to a UTF-8 CSV file.

    Args:
        metadata_list: Records to store, one per row.
        metadata_file: Destination path of the CSV file.
    """
    table = pd.DataFrame(data=metadata_list)
    # index=False keeps the DataFrame row index out of the file.
    table.to_csv(path_or_buf=metadata_file, encoding="utf-8", index=False)

In [None]:
# Output locations.
os.makedirs("crawled_data", exist_ok=True)
folder_path = os.path.join("crawled_data", "images")
metadata_file = os.path.join("crawled_data", "metadata.csv")

# Crawl parameters.
base_url = "https://discords.com"
tags = ["Panda"]
total_pages = 1000

# The browser is only needed while the listing pages are crawled; the
# finally clause shuts it down even when the crawl raises.
try:
    metadata_list = loop_over_pages(base_url, tags, total_pages, driver, folder_path)
finally:
    driver.quit()
save_metadata(metadata_list, metadata_file)

print("Start downloading images...")
with tqdm(total=len(metadata_list), desc="Downloading Images", unit="image") as pbar:
    for metadata in metadata_list:
        img_url = metadata['image_url']
        file_name = metadata['file_name']
        # loop_over_pages already writes the files while crawling, so only
        # images missing on disk are fetched here (this acts as a retry for
        # downloads that failed) instead of downloading everything again.
        if not os.path.isfile(os.path.join(folder_path, file_name)):
            download_image(img_url, file_name, folder_path)
        pbar.update(1)

print("Download images completed.")

total_crawled_images = len(os.listdir(folder_path))
print(f"Total crawled images: {total_crawled_images}.")

Extracting Images for Panda:  57%|█████▋    | 567/1000 [38:23<33:33,  4.65s/page]

Error while trying to extract images: Message: 



Extracting Images for Panda:  57%|█████▋    | 568/1000 [38:45<1:10:55,  9.85s/page]

Error while trying to extract images: Message: 
Stacktrace:
#0 0x592a206807f9 <unknown>
#1 0x592a206203b3 <unknown>
#2 0x592a20368016 <unknown>
#3 0x592a2039c81e <unknown>
#4 0x592a203d28fb <unknown>
#5 0x592a203bfded <unknown>
#6 0x592a203d09e1 <unknown>
#7 0x592a203bfc93 <unknown>
#8 0x592a20391ce4 <unknown>
#9 0x592a203934d2 <unknown>
#10 0x592a2064c542 <unknown>
#11 0x592a2065bce7 <unknown>
#12 0x592a2065b9e4 <unknown>
#13 0x592a2066013a <unknown>
#14 0x592a2065c5b9 <unknown>
#15 0x592a20641e00 <unknown>
#16 0x592a206735d2 <unknown>
#17 0x592a20673778 <unknown>
#18 0x592a2068ba1f <unknown>
#19 0x7ae3c963fac3 <unknown>
#20 0x7ae3c96d1850 <unknown>



Extracting Images for Panda:  57%|█████▋    | 569/1000 [39:06<1:35:37, 13.31s/page]

Error while trying to extract images: Message: 
Stacktrace:
#0 0x592a206807f9 <unknown>
#1 0x592a206203b3 <unknown>
#2 0x592a20368016 <unknown>
#3 0x592a2039c81e <unknown>
#4 0x592a203d28fb <unknown>
#5 0x592a203bfded <unknown>
#6 0x592a203d09e1 <unknown>
#7 0x592a203bfc93 <unknown>
#8 0x592a20391ce4 <unknown>
#9 0x592a203934d2 <unknown>
#10 0x592a2064c542 <unknown>
#11 0x592a2065bce7 <unknown>
#12 0x592a2065b9e4 <unknown>
#13 0x592a2066013a <unknown>
#14 0x592a2065c5b9 <unknown>
#15 0x592a20641e00 <unknown>
#16 0x592a206735d2 <unknown>
#17 0x592a20673778 <unknown>
#18 0x592a2068ba1f <unknown>
#19 0x7ae3c963fac3 <unknown>
#20 0x7ae3c96d1850 <unknown>



Extracting Images for Panda: 100%|██████████| 1000/1000 [1:09:57<00:00,  4.20s/page]


Error downloading image from https://discords.com/_next/image?url=https%3A%2F%2Fcdn.discordapp.com%2Femojis%2F985017003288965210.png%3Fv%3D1&w=128&q=75; status: 500
Error downloading image from https://discords.com/_next/image?url=https%3A%2F%2Fcdn.discordapp.com%2Femojis%2F983666662668640286.png%3Fv%3D1&w=128&q=75; status: 500
Error downloading image from https://discords.com/_next/image?url=https%3A%2F%2Fcdn.discordapp.com%2Femojis%2F997109074258825226.png%3Fv%3D1&w=128&q=75; status: 500
Error downloading image from https://discords.com/_next/image?url=https%3A%2F%2Fcdn.discordapp.com%2Femojis%2F1013472727228751923.png%3Fv%3D1&w=128&q=75; status: 500
Error downloading image from https://discords.com/_next/image?url=https%3A%2F%2Fcdn.discordapp.com%2Femojis%2F809355888825597983.png%3Fv%3D1&w=128&q=75; status: 500
Start downloading images...


Downloading Images: 100%|██████████| 3942/3942 [13:57<00:00,  4.71image/s]


Download images completed.
Total crawled images: 3942.


In [None]:
!zip -r crawled_panda_emojis_1000_pages.zip crawled_data

  adding: crawled_data/ (stored 0%)
  adding: crawled_data/metadata.csv (deflated 86%)
  adding: crawled_data/images/ (stored 0%)
  adding: crawled_data/images/Panda_0000127.jpg (stored 0%)
  adding: crawled_data/images/Panda_0000278.jpg (stored 0%)
  adding: crawled_data/images/Panda_0018549.jpg (stored 0%)
  adding: crawled_data/images/Panda_0008522.jpg (stored 0%)
  adding: crawled_data/images/Panda_0017408.jpg (stored 0%)
  adding: crawled_data/images/Panda_0001191.jpg (stored 0%)
  adding: crawled_data/images/Panda_0005920.jpg (deflated 0%)
  adding: crawled_data/images/Panda_0019143.jpg (stored 0%)
  adding: crawled_data/images/Panda_0007375.jpg (stored 0%)
  adding: crawled_data/images/Panda_0013083.jpg (stored 0%)
  adding: crawled_data/images/Panda_0000886.jpg (stored 0%)
  adding: crawled_data/images/Panda_0010646.jpg (deflated 1%)
  adding: crawled_data/images/Panda_0001985.jpg (deflated 0%)
  adding: crawled_data/images/Panda_0014709.jpg (stored 0%)
  adding: crawled_data/i