In [1]:
!pip install xlsxwriter pydrive

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Collecting pydrive
  Downloading PyDrive-1.3.1.tar.gz (987 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.4/987.4 kB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pydrive
  Building wheel for pydrive (setup.py) ... [?25l[?25hdone
  Created wheel for pydrive: filename=PyDrive-1.3.1-py3-none-any.whl size=27433 sha256=cd1076bba3a1e8f331071be67ae058f443c4afdf6d2e14bd195d88b5b28dddd9
  Stored in directory: /root/.cache/pip/wheels/6c/10/da/a5b513f5b3916fc391c20ee7b4633e5cf3396d570cdd74970f
Successfully built pydrive
Installing collected packages: xlsxwriter, pydrive
Successfully installed pydrive-1.3.1 xlsx

In [6]:
#%% import
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import json
import re


#%% functions

def try_sitemap(base_url):
    """
    Try to read page URLs from one of the site's well-known sitemap files.

    The candidate files are requested in a fixed order; the first one that
    answers with HTTP 200 and contains at least one ``<loc>`` element is used.
    Every ``<loc>`` element of that file is returned as-is, so if the file is
    a sitemap index the entries are the child sitemap URLs (they are not
    followed here).

    Args:
        base_url: Root URL of the site. A missing trailing slash is tolerated.

    Returns:
        list[str] | None: Stripped text of every ``<loc>`` element of the
        first usable sitemap, or ``None`` when no candidate yielded any link.
    """
    # Normalise the root so "https://host" and "https://host/" build the same
    # candidate URLs instead of e.g. "https://hostsitemap.xml".
    root = base_url.rstrip('/') + '/'

    candidate_files = (
        "post-sitemap.xml",  # tried first: the file used by bukbuk.pl
        "sitemap.xml",
        "sitemap_index.xml",
        "sitemap-posts.xml",
        "wp-sitemap.xml",
    )

    for file_name in candidate_files:
        sitemap_url = root + file_name
        print(f"  Próbuję: {sitemap_url}")
        try:
            r = requests.get(sitemap_url, timeout=10)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, 'xml')
            links = [loc.text.strip() for loc in soup.find_all('loc')]
        except Exception as e:
            # Best effort: a failing candidate must not abort the search, but
            # the reason is reported instead of being dropped silently.
            print(f"  Błąd dla {sitemap_url}: {e}")
            continue

        if links:
            print(f"  ✓ Znaleziono sitemap: {sitemap_url}")
            print(f"  ✓ Linków: {len(links)}")
            return links

    return None


def get_article_links_from_page(page_url):
    """
    Collect one article link per ``<article>`` element of a listing page.

    For every ``<article>`` the ``h1``/``h2``/``h3`` headings are inspected in
    document order and the first heading link that points at bukbuk.pl is
    kept. Root-relative hrefs are resolved against ``https://bukbuk.pl``.

    Args:
        page_url: URL of the listing page to download.

    Returns:
        list[str]: Absolute article URLs in page order. An empty list is
        returned when the page cannot be downloaded or parsed (the error is
        printed, not raised).
    """
    from urllib.parse import urlparse  # local import keeps the block self-contained

    try:
        r = requests.get(page_url, timeout=10)
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'lxml')

        article_links = []

        for article in soup.find_all('article'):
            for header in article.find_all(['h1', 'h2', 'h3']):
                link = header.find('a', href=True)
                if not link:
                    continue

                href = link['href'].strip()
                if href.startswith('//'):
                    # Protocol-relative URL: it already names its own host, so
                    # it must not be glued onto the bukbuk.pl root.
                    href = 'https:' + href
                elif href.startswith('/'):
                    href = 'https://bukbuk.pl' + href

                # Compare the host, not the whole string: a foreign URL that
                # merely mentions "bukbuk.pl" (e.g. a share link carrying the
                # article address in its query string) is not an article.
                host = urlparse(href).hostname or ''
                if host == 'bukbuk.pl' or host.endswith('.bukbuk.pl'):
                    article_links.append(href)
                    break  # one link per <article>

        return article_links
    except Exception as e:
        print(f"Błąd dla {page_url}: {e}")
        return []


def get_all_pages(base_url, max_pages=100, delay=0.5):
    """
    Discover paginated listing pages by probing common pagination URL schemes.

    Starting with page 2, four URL patterns are tried for every page number.
    The first pattern that answers with HTTP 200 and does not end up back on
    the home page is recorded. Probing stops at the first page number for
    which no pattern works, or once ``max_pages`` has been reached.

    Args:
        base_url: Root URL of the site; expected to end with ``/`` because the
            patterns are appended to it directly.
        max_pages: Highest page number that is probed (default 100).
        delay: Seconds to wait after each page that was found (default 0.5).

    Returns:
        list[str]: ``base_url`` followed by the URL of every page found, in
        ascending page order.
    """
    pages = [base_url]
    # Compare without trailing slashes so a redirect to the home page is
    # recognised whether or not the final URL carries the slash.
    home = base_url.rstrip('/')

    try:
        for page_num in range(2, max_pages + 1):
            # Pagination schemes that are probed, in order of preference.
            candidates = (
                f"{base_url}page/{page_num}/",
                f"{base_url}strona/{page_num}/",
                f"{base_url}?page={page_num}",
                f"{base_url}?paged={page_num}",
            )

            found = False
            for next_page in candidates:
                try:
                    r = requests.get(next_page, timeout=10)
                except Exception:
                    # A failing probe only rules out this scheme. Unlike a
                    # bare ``except:``, Ctrl+C / SystemExit still propagate.
                    continue

                if r.status_code == 200 and r.url.rstrip('/') != home:
                    pages.append(next_page)
                    print(f"  Znaleziono stronę {page_num}")
                    found = True
                    break

            if not found:
                break

            time.sleep(delay)  # be polite between consecutive pages
    except Exception as e:
        print(f"Błąd paginacji: {e}")

    return pages


def get_all_article_links(base_url):
    """
    Gather candidate links for a site: sitemap first, pagination crawl second.

    When ``try_sitemap`` yields links they are returned untouched together
    with an empty error list. Otherwise every page reported by
    ``get_all_pages`` is scanned with ``get_article_links_from_page`` and the
    de-duplicated union of the results is returned.

    Args:
        base_url: Root URL of the site.

    Returns:
        tuple[list[str], list[str]]: ``(links, errors)`` where ``errors``
        holds the listing pages whose processing raised an exception.
    """
    # Step 1: the sitemap, when present, already lists everything.
    print("Krok 1: Próba pobrania sitemap...")
    from_sitemap = try_sitemap(base_url)
    if from_sitemap:
        print(f"Znaleziono {len(from_sitemap)} linków w sitemap")
        return from_sitemap, []

    print("Brak sitemap, używam paginacji...")

    # Step 2: enumerate the paginated listing pages.
    print("\nKrok 2: Szukanie stron...")
    listing_pages = get_all_pages(base_url)
    print(f"Znaleziono {len(listing_pages)} stron")

    collected, failed_pages = [], []

    # Step 3: pull the article links out of every listing page.
    print("\nKrok 3: Pobieranie artykułów...")
    for listing_url in tqdm(listing_pages, desc="Przetwarzanie"):
        try:
            time.sleep(0.5)
            collected += get_article_links_from_page(listing_url)
        except Exception:
            failed_pages.append(listing_url)

    # Drop duplicates (the resulting order is whatever the set yields).
    return list(set(collected)), failed_pages


def filter_article_links(all_links):
    """
    Keep only the links that look like article pages and print filter stats.

    Each link is stripped of surrounding whitespace, its ``#fragment`` and
    trailing slashes, then rejected when it is:

    * a media/static file (by extension, ignoring any query string) or lives
      under ``/wp-content/``, ``/uploads/`` or ``/files/``;
    * a category, tag, author or pagination URL;
    * the site root (any scheme, with or without ``www.``);
    * a path made only of digits (optionally ``digits-digits``);
    * a path shorter than 8 characters without a run of 3+ letters.

    Args:
        all_links: Iterable of URL strings.

    Returns:
        list[str]: Accepted links without duplicates, in first-seen order.
    """
    # Built once instead of on every loop iteration; a tuple can be handed to
    # str.endswith directly.
    media_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg',
                        '.pdf', '.zip', '.mp4', '.mp3', '.avi',
                        '.css', '.js', '.xml', '.json')
    # (path markers, statistics bucket), checked in this order.
    section_rules = (
        (('/wp-content/', '/uploads/', '/files/'), 'media'),
        (('/category/', '/kategoria/'), 'category'),
        (('/tag/',), 'tag'),
        (('/author/', '/autor/'), 'author'),
        (('/page/', '/strona/'), 'page'),
    )
    excluded = {'category': 0, 'tag': 0, 'page': 0, 'author': 0,
                'short': 0, 'digits': 0, 'media': 0, 'other': 0}
    accepted = {}  # dict used as an insertion-ordered set

    for raw_link in all_links:
        # Drop the fragment and trailing slashes so variants of one URL merge.
        link = raw_link.strip().split('#')[0].rstrip('/')
        if not link:
            continue

        # Media and static files; the query string must not hide the extension.
        if link.lower().split('?', 1)[0].endswith(media_extensions):
            excluded['media'] += 1
            continue

        # The trailing slash was stripped above; add it back for the marker
        # test so that a section root such as ".../tag" is recognised too.
        probe = link + '/'
        bucket = next((name for markers, name in section_rules
                       if any(marker in probe for marker in markers)), None)
        if bucket is not None:
            excluded[bucket] += 1
            continue

        # Everything after "scheme://host/", independent of scheme and "www.".
        path = link.split('://', 1)[-1].partition('/')[2]

        # Site root.
        if not path:
            excluded['other'] += 1
            continue

        # Path consisting only of digits, e.g. "2016" or "2016-02".
        if re.match(r'^\d+(-\d+)?$', path):
            excluded['digits'] += 1
            continue

        # Very short path (< 8 chars) without anything resembling a word.
        if len(path) < 8 and not re.search(r'[a-z]{3,}', path, re.IGNORECASE):
            excluded['short'] += 1
            continue

        accepted[link] = None

    article_links = list(accepted)

    print("\nStatystyki filtrowania:")
    print(f"  Kategorie: {excluded['category']}")
    print(f"  Tagi: {excluded['tag']}")
    print(f"  Autorzy: {excluded['author']}")
    print(f"  Paginacja: {excluded['page']}")
    print(f"  Same cyfry: {excluded['digits']}")
    print(f"  Za krótkie: {excluded['short']}")
    print(f"  Media/pliki: {excluded['media']}")
    print(f"  Inne: {excluded['other']}")
    print(f"  ✅ ZAAKCEPTOWANO: {len(article_links)}")

    return article_links


#%% main execution

if __name__ == "__main__":
    # Script entry point: collect the site's links, keep the article pages
    # and store them as a plain-text list and as a JSON document.
    base_url = "https://bukbuk.pl/"

    print("="*60)
    print("Pobieranie linków z bukbuk.pl")
    print("="*60 + "\n")

    # Fetch every candidate link (sitemap first, pagination as fallback).
    all_links, errors = get_all_article_links(base_url)

    print(f"\nZnaleziono {len(all_links)} linków")

    # Keep only the links that look like articles.
    article_links = filter_article_links(all_links)

    if not article_links:
        print("\n⚠️  Nie znaleziono artykułów!")
        # ``exit()`` is an interactive helper installed by the ``site``
        # module; raising SystemExit works in scripts and notebooks alike.
        raise SystemExit(1)

    # Sort once; the preview and both output files share this order.
    article_links.sort()

    # Preview of the first entries.
    print("\nPrzykładowe linki (pierwsze 10):")
    for i, link in enumerate(article_links[:10], 1):
        print(f"  {i}. {link}")

    if len(article_links) > 10:
        print(f"  ... i {len(article_links) - 10} więcej")

    # Plain-text output: one link per line.
    with open('bukbuk_linki.txt', 'w', encoding='utf-8') as f:
        f.writelines(link + '\n' for link in article_links)

    # JSON output: the links plus the source and the pages that failed.
    output_data = {
        'source': base_url,
        'total_links': len(article_links),
        'links': article_links,
        'errors': errors
    }

    with open('bukbuk_linki.json', 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    # Final report.
    print("\n" + "="*60)
    print("RAPORT")
    print("="*60)
    print(f"Znaleziono artykułów: {len(article_links)}")
    print(f"Błędów: {len(errors)}")
    print("\nZapisano do:")
    print("  - bukbuk_linki.txt")
    print("  - bukbuk_linki.json")
    print("="*60)

Pobieranie linków z bukbuk.pl

Krok 1: Próba pobrania sitemap...
  Próbuję: https://bukbuk.pl/post-sitemap.xml
  ✓ Znaleziono sitemap: https://bukbuk.pl/post-sitemap.xml
  ✓ Linków: 757
Znaleziono 757 linków w sitemap

Znaleziono 757 linków

Statystyki filtrowania:
  Kategorie: 0
  Tagi: 0
  Autorzy: 0
  Paginacja: 0
  Same cyfry: 0
  Za krótkie: 0
  Media/pliki: 549
  Inne: 1
  ✅ ZAAKCEPTOWANO: 207

Przykładowe linki (pierwsze 10):
  1. https://bukbuk.pl/2016/02/bialystok-biala-sila
  2. https://bukbuk.pl/2016/02/bukbuk-live
  3. https://bukbuk.pl/2016/02/jak-napisac-kryminal
  4. https://bukbuk.pl/2016/02/kazik-staszewski-o-swojej-biografii
  5. https://bukbuk.pl/2016/02/kazik-staszewski-poleca
  6. https://bukbuk.pl/2016/02/kinga-debska-poleca-ksiazki
  7. https://bukbuk.pl/2016/02/kuba-zulczyk-o-bialym-proszku-i-swiatlach-wielkiego-miasta
  8. https://bukbuk.pl/2016/02/magda-molek-poleca-ksiazki
  9. https://bukbuk.pl/2016/02/malgorzata-halber-i-bohater
  10. https://bukbuk.pl/2016

In [7]:
#%% import
from __future__ import unicode_literals
import re
import time
from datetime import datetime
from time import mktime
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import json
import xlsxwriter


#%% functions

def date_change_format(date_string):
    """
    Convert a date written in one of several formats to "YYYY-MM-DD".

    Recognised inputs (surrounding text such as a time of day is ignored):

    * ISO dates and timestamps: ``2016-02-15``, ``2016-02-15T10:30:00+01:00``
    * day-first numeric dates: ``15.02.2016``, ``5.2.2016``, ``15 02 2016``
    * Polish month names in any letter case: ``15 lutego 2016``

    Args:
        date_string: Text that contains the date.

    Returns:
        str: The date as "YYYY-MM-DD", or "no date" when the input is not a
        string, contains no recognisable date, or names an impossible day.
    """
    # Genitive forms ("15 lutego") and nominative forms of the month names.
    month_numbers = {
        "stycznia": "01", "lutego": "02", "marca": "03", "kwietnia": "04",
        "maja": "05", "czerwca": "06", "lipca": "07", "sierpnia": "08",
        "września": "09", "października": "10", "listopada": "11", "grudnia": "12",
        "styczeń": "01", "luty": "02", "marzec": "03", "kwiecień": "04",
        "maj": "05", "czerwiec": "06", "lipiec": "07", "sierpień": "08",
        "wrzesień": "09", "październik": "10", "listopad": "11", "grudzień": "12"
    }

    try:
        # Collapse every run of whitespace into a single space.
        cleaned = ' '.join(date_string.strip().split())

        # ISO date, optionally followed by a time part.
        iso = re.search(r'(?<!\d)(\d{4})-(\d{1,2})-(\d{1,2})(?!\d)', cleaned)
        if iso:
            year, month, day = (int(part) for part in iso.groups())
            # Building the date directly validates it and avoids a round-trip
            # through local-time timestamps (mktime/fromtimestamp).
            return datetime(year, month, day).date().isoformat()

        # Replace month names with their numbers. Longer names come first and
        # whole words are required, so "maja" is never consumed as "maj" + "a".
        names = sorted(month_numbers, key=len, reverse=True)
        name_pattern = r'\b(' + '|'.join(re.escape(name) for name in names) + r')\b'
        numeric = re.sub(name_pattern,
                         lambda found: month_numbers[found.group(1)],
                         cleaned.lower())

        # Day-first date separated by dots and/or spaces.
        day_first = re.search(
            r'(?<!\d)(\d{1,2})[.\s]\s*(\d{1,2})[.\s]\s*(\d{4})(?!\d)', numeric)
        if day_first:
            day, month, year = (int(part) for part in day_first.groups())
            return datetime(year, month, day).date().isoformat()

        return "no date"
    except Exception:
        # Non-string input or an impossible calendar date.
        return "no date"


def _article_date(soup):
    """Return the publication date as "YYYY-MM-DD", or "no date"."""
    try:
        date_element = (
            soup.find('span', class_='post__date')  # bukbuk.pl markup
            or soup.find('time')
            or soup.find('span', class_=lambda x: x and 'date' in str(x).lower())
            or soup.find(class_='entry-date')
            # A <meta> tag exposes its value through .get('content') below,
            # so it is used directly; no wrapper object is needed.
            or soup.find('meta', property='article:published_time')
        )
        if date_element is None:
            return "no date"

        date_text = (date_element.get('datetime')
                     or date_element.get('content')
                     or date_element.get_text(strip=True))
        return date_change_format(date_text)
    except Exception:
        return "no date"


def _article_title(soup):
    """Return the article title without the site suffix, or "no title"."""
    try:
        title_element = (
            soup.find('h1', class_=lambda x: x and 'entry-title' in str(x).lower())
            or soup.find('h1')
            or soup.find('title')
        )
        if title_element is None:
            return "no title"

        title = title_element.get_text(strip=True)
        # Drop a trailing "| Bukbuk.pl" / "- bukbuk.pl ..." suffix.
        title = re.sub(r'\s*[-|]\s*[Bb]ukbuk\.pl.*$', '', title).strip()
        return title or "no title"
    except Exception:
        return "no title"


def _article_author(soup):
    """Return the author name without a leading label, or "no author"."""
    try:
        author_element = (
            soup.find('span', class_='post__author')  # bukbuk.pl markup
            or soup.find('a', rel='author')
            or soup.find('span', class_=lambda x: x and 'author' in str(x).lower())
            or soup.find(class_='author-name')
        )
        if author_element is None:
            return "no author"

        author = author_element.get_text(strip=True)
        return re.sub(r'^(Autor|By|Opublikował):\s*', '', author, flags=re.IGNORECASE)
    except Exception:
        return "no author"


def _article_text(soup, body_parts):
    """Return the article body as one whitespace-normalised line, or "no text"."""
    try:
        # A separator is passed to get_text so that text from adjacent
        # elements (e.g. consecutive paragraphs) is not glued together.
        text_parts = [part.get_text(' ', strip=True) for part in body_parts]

        # Fallbacks for pages without the bukbuk.pl lead/content containers.
        if not text_parts:
            for marker in ('entry-content', 'post-content'):
                article_body = soup.find(
                    'div', class_=lambda x, m=marker: x and m in str(x).lower())
                if article_body is not None:
                    text_parts.append(article_body.get_text(' ', strip=True))
                    break

        if not text_parts:
            article_elem = soup.find('article')
            if article_elem is not None:
                content_div = article_elem.find(
                    'div', class_=lambda x: x and 'content' in str(x).lower())
                if content_div is not None:
                    text_parts.append(content_div.get_text(' ', strip=True))

        if not text_parts:
            return "no text"

        text = ' '.join(text_parts).replace('\xa0', ' ')
        return re.sub(r'\s+', ' ', text)
    except Exception:
        return "no text"


def _article_categories(soup):
    """Return the category names joined with " | ", or "no category"."""
    try:
        category_links = (soup.find_all('a', rel='category tag')
                          or soup.find_all('a', rel='category'))
        if not category_links:
            return "no category"
        return ' | '.join(cat.get_text(strip=True) for cat in category_links)
    except Exception:
        return "no category"


def _article_tags(soup):
    """Return the tag names joined with " | ", or None when there are none."""
    try:
        # ``rel`` is a multi-valued attribute, so rel='tag' also matches
        # rel="category tag"; those links are categories and are skipped.
        tag_links = [a for a in soup.find_all('a', rel='tag')
                     if 'category' not in (a.get('rel') or [])]
        if not tag_links:
            return None
        return ' | '.join(tag.get_text(strip=True) for tag in tag_links)
    except Exception:
        return None


def _article_external_links(body_parts):
    """Return off-site http(s) links of the body joined with " | ", or None."""
    try:
        hrefs = [a['href'] for part in body_parts
                 for a in part.find_all('a', href=True)]
        # Only absolute web URLs can point off-site; relative paths, "#anchor"
        # and "mailto:" hrefs are not external links.
        external = [
            href for href in hrefs
            if href.lower().startswith(('http://', 'https://', '//'))
            and not re.search(r'bukbuk\.pl', href)
        ]
        return ' | '.join(external) if external else None
    except Exception:
        return None


def _article_images(soup, body_parts):
    """Return the unique image URLs of the thumbnail and the body, in page order."""
    try:
        candidates = []

        # Featured image: only the first <img> of the thumbnail container.
        thumbnail_div = soup.find(
            'div', class_=lambda x: x and 'post-thumbnail' in str(x).lower())
        if thumbnail_div is not None:
            thumb_img = thumbnail_div.find('img')
            if thumb_img is not None:
                candidates.append(thumb_img)

        for part in body_parts:
            candidates.extend(part.find_all('img'))

        images = []
        for img in candidates:
            img_url = img.get('src') or img.get('data-src')
            if img_url and img_url not in images:
                images.append(img_url)
        return images
    except Exception:
        return []


def _article_has_videos(body_parts):
    """Return True when the body embeds at least one <iframe> with a src."""
    try:
        return any(part.find_all('iframe', src=True) for part in body_parts)
    except Exception:
        return False


def dictionary_of_article(article_link):
    """
    Download one bukbuk.pl article and record its details.

    On success a dictionary describing the article is appended to the
    module-level ``all_results`` list. When the page cannot be fetched, does
    not answer with HTTP 200, or looks like an error page, ``article_link``
    is appended to the module-level ``errors`` list instead. Both lists must
    exist before this function is called.

    Args:
        article_link: URL of the article page.

    Returns:
        dict | None: The record that was appended to ``all_results``, or
        ``None`` when the link was recorded in ``errors``.
    """
    try:
        r = requests.get(article_link, timeout=15)
        r.encoding = 'utf-8'

        # Make sure the page exists.
        if r.status_code != 200:
            print(f"⚠️  HTTP {r.status_code} dla {article_link}")
            errors.append(article_link)
            return None

        html_text = r.text

        # Some error pages are served with status 200; detect them by content.
        lowered_html = html_text.lower()
        if 'error 404' in lowered_html or 'page not found' in lowered_html:
            print(f"⚠️  404 dla {article_link}")
            errors.append(article_link)
            return None

        soup = BeautifulSoup(html_text, 'lxml')

        # bukbuk.pl keeps the article body in post__lead + post__content;
        # look both up once and share them between the extractors.
        lead = soup.find('div', class_='post__lead')
        content = soup.find('div', class_='post__content')
        body_parts = [part for part in (lead, content) if part is not None]

        images = _article_images(soup, body_parts)

        record = {
            "Link": article_link,
            "Data publikacji": _article_date(soup),
            "Tytuł artykułu": _article_title(soup).replace('\xa0', ' '),
            "Tekst artykułu": _article_text(soup, body_parts),
            "Autor": _article_author(soup),
            "Kategoria": _article_categories(soup),
            "Tagi": _article_tags(soup),
            "Linki zewnętrzne": _article_external_links(body_parts),
            "Zdjęcia/Grafika": len(images) > 0,
            "Filmy": _article_has_videos(body_parts),
            "Linki do zdjęć": ' | '.join(images) if images else None
        }

        all_results.append(record)
        return record

    except AttributeError as e:
        errors.append(article_link)
        print(f"Błąd dla {article_link}: {e}")
        return None
    except Exception as e:
        errors.append(article_link)
        print(f"Nieoczekiwany błąd dla {article_link}: {e}")
        return None


#%% main execution

if __name__ == "__main__":
    # Script entry point: scrape every article listed in bukbuk_linki.txt and
    # store the results as JSON and as an Excel workbook.

    # Load the links, one per line.
    try:
        with open('bukbuk_linki.txt', 'r', encoding='utf-8') as f:
            article_links = [line.strip() for line in f if line.strip()]
        print(f"Wczytano {len(article_links)} linków z pliku")
    except FileNotFoundError:
        print("Nie znaleziono pliku bukbuk_linki.txt")
        print("Użyj najpierw get_bukbuk_links.py!")
        article_links = []

    if not article_links:
        print("Brak linków do przetworzenia!")
        # ``exit()`` is an interactive helper installed by the ``site``
        # module; raising SystemExit works in scripts and notebooks alike.
        raise SystemExit(1)

    # Module-level collectors that dictionary_of_article() appends to.
    all_results = []
    errors = []

    print("\n" + "="*60)
    print("Rozpoczynam scraping artykułów z bukbuk.pl")
    print("="*60 + "\n")

    # Scrape with several worker threads.
    max_workers = 10
    print(f"Używam {max_workers} równoległych wątków")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # list() drains the lazy map so the progress bar advances and the
        # ``with`` block only exits after every article has been processed.
        list(tqdm(executor.map(dictionary_of_article, article_links), total=len(article_links)))

    # Workers append in completion order; sort so the output files are
    # reproducible between runs.
    all_results.sort(key=lambda record: record["Link"])

    # The run date is part of both output file names.
    timestamp = datetime.today().date()

    # JSON
    with open(f'bukbuk_{timestamp}.json', 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2, default=str)

    # Excel
    df = pd.DataFrame(all_results)
    with pd.ExcelWriter(f"bukbuk_{timestamp}.xlsx",
                       engine='xlsxwriter',
                       engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
        # sheet_name is passed by keyword; the positional form is deprecated
        # in pandas.
        df.to_excel(writer, sheet_name='Posts', index=False)

    # Report
    print(f"\n{'='*60}")
    print("Scraping zakończony!")
    print(f"Przetworzono artykułów: {len(all_results)}")
    print(f"Błędów: {len(errors)}")
    if errors:
        print("\nLinki z błędami (pierwsze 10):")
        for error_link in errors[:10]:
            print(f"  - {error_link}")
        if len(errors) > 10:
            print(f"  ... i {len(errors) - 10} więcej")
    print("\nPliki wyjściowe:")
    print(f"  - bukbuk_{timestamp}.json")
    print(f"  - bukbuk_{timestamp}.xlsx")
    print(f"{'='*60}\n")

Wczytano 207 linków z pliku

Rozpoczynam scraping artykułów z bukbuk.pl

Używam 10 równoległych wątków


100%|██████████| 207/207 [00:26<00:00,  7.76it/s]
  df.to_excel(writer, 'Posts', index=False)



Scraping zakończony!
Przetworzono artykułów: 207
Błędów: 0

Pliki wyjściowe:
  - bukbuk_2026-01-12.json
  - bukbuk_2026-01-12.xlsx

