# Download Logos

Acest notebook descarca logo-urile in folderul 'dataset/logos_raw/', dupa eliminarea domeniilor duplicate din fisierul de intrare. Logo-urile acceptate sunt cele care contin in denumire cuvantul "logo" si au ca extensie .png, .jpg, .jpeg, .webp sau .svg. Am paralelizat procesul trimitand cererile simultan prin mai multe fire de executie (120). Pentru fiecare domeniu, pipeline-ul a fost urmatorul:
1. sa extrag html-ul site-ului
2. sa gasesc calea fisierului de logo
3. sa downloadez fisierul de logo

In principiu, au fost 3 probleme cu care m-am confruntat:
- putine site-uri acceptau orice fel de cerere. Am rezolvat problema cat am putut de mult, uitandu-ma la erorile primite de la acele site-uri si imbunatatind headerele cererii si ritmul cererilor. Imbunatatiri notabile au fost: randomizarea User-Agent si Accept-Language, retry cu backoff, reincercarea link-urilor la anumite coduri de eroare HTTP, fallback la cloudscraper daca nu functiona metoda din biblioteca requests, pastrarea cookie-urilor si a sesiunilor prin session = requests.Session() si permiterea redirectionarii catre alte pagini prin session.get(url, timeout=10, allow_redirects=True)
- a fost greu de gasit link-ul fisierului de logo. Am folosit BeautifulSoup pentru parsarea html-ului si metode euristice: caut cuvantul "logo" si o extensie acceptata in atributele tag-urilor <img> si <source>.
- putine site-uri acceptau cererea de download a fisierului de logo, dar am abordat problema in aceeasi maniera in care am abordat-o pe prima

O imbunatatire buna ar fi implementarea unei logici de fallback la un browser headless, cum ar fi Selenium, pentru site-urile la care este necesara executia de JavaScript.

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import tldextract
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import openpyxl
import random
from urllib.parse import urlparse, urljoin
from fake_useragent import UserAgent
import cloudscraper
import os
from enum import Enum, auto
import re
from collections import Counter

In [2]:
# Module-wide configuration shared by the scraping functions.

# URL schemes tried for every domain, in this order.
PROTOCOLS = ['https://', 'http://']

# Browser-like default request headers applied to every session.
# NOTE(review): "br" is advertised in Accept-Encoding; confirm a brotli
# decoder is installed in the environment, otherwise brotli-compressed
# responses may not be decodable.
HEADERS = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-User": "?1",
    "Sec-Fetch-Dest": "document",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Sec-CH-UA": '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
}

# Pool of Accept-Language values; one is picked at random per request.
ACCEPT_LANGS = [
    "en-US,en;q=0.9",
    "ro-RO,ro;q=0.9,en-US;q=0.8",
    "fr-FR,fr;q=0.9,en;q=0.8",
    "de-DE,de;q=0.9,en;q=0.8",
    "es-ES,es;q=0.9,en;q=0.7",
]

# Directory that receives the downloaded logo files.
LOGO_FOLDER = 'dataset/logos_raw/'


In [3]:
class LogoError(Enum):
    """Outcome category recorded for each processed domain."""

    OK = 1                  # logo located and downloaded
    PAGE_LOAD_FAIL = 2      # the site's HTML could not be fetched
    NO_LOGO_FOUND = 3       # page loaded, but no logo URL was detected
    DOWNLOAD_LOGO_FAIL = 4  # logo URL found, but the download failed
    SAVE_FAIL = 5           # presumably a failure writing the file to disk
    UNKNOWN = 6             # presumably any failure outside the categories above

In [4]:
# File extensions accepted for a logo image.
_LOGO_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.svg', '.webp')

# (tag name, attributes to inspect) in search priority order.
_LOGO_SEARCH_PLAN = (
    ('img', ('src', 'data-src', 'data-lazy-src', 'srcset')),
    ('source', ('srcset', 'data-srcset')),
)


def _iter_candidate_urls(raw_value, is_srcset):
    """Yield the usable URL(s) contained in one attribute value.

    A ``srcset`` value is a comma-separated list of ``"url descriptor"``
    entries; any other attribute is treated as a single URL (only its first
    whitespace-separated token is kept). Empty entries and inline ``data:``
    URIs are skipped.
    """
    entries = raw_value.split(',') if is_srcset else [raw_value]
    for entry in entries:
        tokens = entry.split()
        if not tokens:
            # whitespace-only value or a stray comma: nothing to inspect
            continue
        candidate = tokens[0]
        if candidate.startswith("data:"):
            continue
        yield candidate


def _looks_like_logo(candidate):
    """Return True if the URL contains 'logo' and ends with an accepted extension.

    The check is applied both to the URL as written and to the URL with its
    query string / fragment removed, so cache-busted links such as
    ``logo.png?v=3`` are recognised as well.
    """
    lowered = candidate.lower()
    without_query = lowered.split('#', 1)[0].split('?', 1)[0]
    return any(
        'logo' in form and form.endswith(_LOGO_EXTENSIONS)
        for form in (lowered, without_query)
    )


def find_logo_url(html, base):
    """Return the absolute URL of the first logo-looking image in *html*.

    All ``<img>`` tags are inspected first, then all ``<source>`` tags. The
    first attribute value that contains the word "logo" and has an accepted
    image extension wins.

    Args:
        html: HTML markup of the page.
        base: URL used to resolve relative image paths.

    Returns:
        The absolute logo URL, or ``None`` when no candidate is found or the
        markup cannot be processed (the error is printed, not raised).
    """
    try:
        soup = BeautifulSoup(html, "html.parser")

        for tag_name, attrs in _LOGO_SEARCH_PLAN:
            for tag in soup.find_all(tag_name):
                for attr in attrs:
                    raw_value = tag.get(attr)
                    if not raw_value:
                        continue

                    is_srcset = attr.endswith('srcset')
                    for candidate in _iter_candidate_urls(raw_value, is_srcset):
                        if _looks_like_logo(candidate):
                            return urljoin(base, candidate)

        return None

    except Exception as e:
        # Best-effort parser: a malformed page must not abort the pipeline.
        print(f"[find_logo_url error] {e}")
        return None


In [5]:
# HTTP statuses that are worth retrying after a pause.
_DOWNLOAD_RETRY_STATUSES = (429, 502, 503, 504)


def _write_response_body(resp, filepath):
    """Stream the body of *resp* into *filepath* without leaving partial files.

    The data is written to a sibling ``.part`` file and moved into place only
    once the whole body has been received; on any failure the partial file is
    removed and the exception is re-raised.
    """
    tmp_path = filepath + ".part"
    try:
        with open(tmp_path, 'wb') as f:
            for chunk in resp.iter_content(1024):
                if chunk:
                    f.write(chunk)
        os.replace(tmp_path, filepath)
    except BaseException:
        try:
            os.remove(tmp_path)
        except OSError:
            # nothing was written (or it is already gone): nothing to clean up
            pass
        raise


def download_logo(domain, url, session=None, max_retries=3, base_delay=1.5):
    """Download the logo at *url* into ``LOGO_FOLDER``.

    The file is saved as ``<domain>_<basename of the URL path>``. Each attempt
    issues a single streamed GET; the response is always closed, and a failed
    transfer never leaves a truncated file behind.

    Args:
        domain: Domain the logo belongs to; used as the file-name prefix.
        url: Absolute URL of the logo file.
        session: Optional requests-compatible session to reuse; a new
            ``requests.Session`` with ``HEADERS`` is created when omitted.
        max_retries: Maximum number of attempts.
        base_delay: Base of the exponential backoff between attempts, in seconds.

    Returns:
        ``(True, ' ')`` on success, otherwise ``(False, <error message>)``.
    """
    ua = UserAgent()

    if not session:
        session = requests.Session()
        session.headers.update(HEADERS)

    session.headers["User-Agent"] = ua.random
    session.headers["Accept-Language"] = random.choice(ACCEPT_LANGS)

    last_err = None

    for attempt in range(1, max_retries + 1):
        try:
            # The context manager releases the connection even when the body
            # is never read (retryable / error statuses).
            with session.get(url, stream=True, timeout=10, allow_redirects=True) as resp:
                if resp.status_code in _DOWNLOAD_RETRY_STATUSES:
                    last_err = f"HTTP {resp.status_code}"
                elif resp.status_code != 200:
                    # non-retryable error
                    return False, f"HTTP {resp.status_code} - {resp.reason}"
                else:
                    basename = os.path.basename(urlparse(url).path)
                    safe_domain = domain.replace("/", "_").replace("\\", "_")
                    filepath = os.path.join(LOGO_FOLDER, f"{safe_domain}_{basename}")

                    os.makedirs(LOGO_FOLDER, exist_ok=True)
                    _write_response_body(resp, filepath)
                    return True, ' '

        except requests.exceptions.RequestException as e:
            last_err = str(e)
        except Exception as e:
            # other errors (I/O etc.) are not retried
            return False, str(e)

        # Exponential backoff + jitter; pointless after the final attempt.
        if attempt < max_retries:
            delay = base_delay * (2 ** (attempt - 1)) + random.uniform(0.5, 1.5)
            time.sleep(delay)

    # retries exhausted
    return False, (last_err or "Unknown error")

In [6]:
def _fetch_logo_for_domain(domain, session, result):
    """Load the domain's page, locate its logo and download it; fill in *result*."""
    error = None

    ua = UserAgent()

    if not session:
        session = requests.Session()
        session.headers.update(HEADERS)

    session.headers["User-Agent"] = ua.random
    session.headers["Accept-Language"] = random.choice(ACCEPT_LANGS)

    max_retries = 3
    base_delay = 1.5

    for protocol in PROTOCOLS:
        url = protocol + domain
        result['url'] = url

        for attempt in range(1, max_retries + 1):
            try:
                response = session.get(url, timeout=10, allow_redirects=True)
            except requests.exceptions.RequestException as e:
                error = str(e)
            else:
                if response.status_code == 200:
                    # Redirects are followed, so relative logo paths must be
                    # resolved against the final URL, not the requested one.
                    logo_url = find_logo_url(response.text, response.url or url)
                    if not logo_url:
                        result['error_type'] = LogoError.NO_LOGO_FOUND
                        return result

                    ok_download_logo, err = download_logo(domain, logo_url, session)
                    if not ok_download_logo:
                        result['error_type'], result['error'] = LogoError.DOWNLOAD_LOGO_FAIL, err
                        return result

                    result['success'], result['error_type'] = True, LogoError.OK
                    return result

                # Remember the status for retryable codes too, so exhausted
                # retries are reported with their real cause.
                error = f"HTTP {response.status_code}"
                if response.status_code not in (429, 503, 502, 504):
                    # non-retryable status: move on to the next protocol
                    break

            # Linear backoff + jitter; pointless after the final attempt.
            if attempt < max_retries:
                delay = base_delay * attempt + random.uniform(0.5, 1.5)
                time.sleep(delay)

    # CloudScraper extends Session: fall back to it once if the plain session failed.
    if not isinstance(session, cloudscraper.CloudScraper):
        scraper = cloudscraper.create_scraper()
        scraper.headers.update(HEADERS)
        return request_html(domain, scraper)

    result['error_type'], result['error'] = LogoError.PAGE_LOAD_FAIL, error
    return result


def request_html(domain, session = None):
    """Fetch a domain's home page, find its logo and download it.

    Every protocol in ``PROTOCOLS`` is tried with up to three attempts each.
    If the page cannot be loaded with the given (or a fresh ``requests``)
    session, the whole procedure is repeated once with a cloudscraper session.

    Args:
        domain: Bare domain name, without scheme.
        session: Optional requests-compatible session to reuse.

    Returns:
        A dict with keys ``domain``, ``url``, ``success``, ``error_type``
        (a ``LogoError`` member) and ``error`` (message or ``None``). The
        function does not raise: unexpected failures are reported as
        ``LogoError.UNKNOWN``.
    """
    result = {
        'domain': domain,
        'url': None,
        'success': False,
        'error_type': None,
        'error': None
    }

    try:
        return _fetch_logo_for_domain(domain, session, result)
    except Exception as e:
        # Worker boundary: one misbehaving domain must not take down the
        # whole thread pool run.
        result['success'] = False
        result['error_type'], result['error'] = LogoError.UNKNOWN, str(e)
        return result

In [None]:
# Process every unique domain in parallel and collect the outcomes.
df = pd.read_parquet("./logos.snappy.parquet", engine='fastparquet')
df = df.drop_duplicates(subset='domain', keep='first')

domains = df['domain'].tolist()
reached = 0
not_reached = 0
errors = []

start_time = time.time()

with ThreadPoolExecutor(max_workers=120) as executor:
    # Remember which domain each future belongs to so a crashed worker can
    # still be reported against its domain.
    future_to_domain = {executor.submit(request_html, d): d for d in domains}
    futures = list(future_to_domain)

    for future in tqdm(as_completed(futures), total=len(futures), desc="Procesare site-uri"):
        try:
            result = future.result()
        except Exception as exc:
            # A single failing worker must not abort the whole run.
            result = {
                'domain': future_to_domain[future],
                'url': None,
                'success': False,
                'error_type': LogoError.UNKNOWN,
                'error': str(exc),
            }

        if result['success']:
            reached += 1
        else:
            errors.append(result)
            not_reached += 1

end_time = time.time()
elapsed = end_time - start_time
# Guard against an empty input file.
percent_reached = (reached / len(domains)) * 100 if domains else 0.0

print(f'\nSite-uri succes: {reached}')
print(f'Site-uri la care nu a fost succes: {not_reached}')
print(f'Procentul de site-uri succes: {percent_reached:.2f}%')
print(f'Timp total: {elapsed:.2f} secunde')


if errors:
    errors_df = pd.DataFrame(errors)
    # The folder may be missing if no logo was downloaded during this run.
    os.makedirs('dataset', exist_ok=True)
    errors_df.to_excel('dataset/errors.xlsx', index=False)
    print("Erorile au fost salvate în 'errors.xlsx'")
else:
    print("Nu au fost erori de salvat.")

Procesare site-uri: 100%|██████████| 3416/3416 [09:20<00:00,  6.10it/s] 



Site-uri succes: 2356
Site-uri la care nu a fost succes: 1060
Procentul de site-uri succes: 68.97%
Timp total: 580.27 secunde
Erorile au fost salvate în 'errors.xlsx'


In [8]:
# Tally the failed domains per error category; entries whose error_type is
# empty are left out of the per-category counts but still count towards the total.
total = len(errors)
error_counts = Counter(filter(None, (entry['error_type'] for entry in errors)))

print("\n================ ERROR SUMMARY ================")
for err_type, count in error_counts.items():
    print(f"{err_type.name:<20} : {count}")

print(f"Fail              : {total}")
print("==============================================\n")


PAGE_LOAD_FAIL       : 311
NO_LOGO_FOUND        : 723
DOWNLOAD_LOGO_FAIL   : 26
Fail              : 1060

