In [None]:
!pip install xlsxwriter pydrive

In [3]:
import requests
from bs4 import BeautifulSoup

def get_all_links(first_article_link, base_url=None, visited=None, timeout=30):
    """
    Collect the URLs of all articles reachable through the 'next article' link.

    Starting at ``first_article_link``, each page is downloaded and its
    ``<a class="next">`` anchor is followed. The walk stops when a page has no
    such anchor (or the anchor has no ``href``), when a URL was already
    visited, when the server answers with a status other than 200, or when
    the request itself fails.

    The walk is iterative, so the length of the article chain is not limited
    by Python's recursion depth.

    Args:
        first_article_link (str): absolute or relative URL of the first article.
        base_url (str, optional): site root used to resolve relative links
            (e.g. 'https://example.com'). Defaults to the part of
            ``first_article_link`` that precedes '/blog/'.
        visited (set, optional): URLs that were already visited; it is updated
            in place and used to avoid loops.
        timeout (float, optional): timeout in seconds passed to every request.

    Returns:
        list[str]: article URLs in the order in which they were visited.
        Pages that failed to download are not included.
    """
    from urllib.parse import urljoin

    if visited is None:
        visited = set()
    if base_url is None:
        base_url = first_article_link.split('/blog/')[0]

    collected = []
    current_link = first_article_link

    while current_link:
        full_url = urljoin(base_url, current_link)

        # A link that points back to an already seen page would loop forever.
        if full_url in visited:
            break

        visited.add(full_url)
        print(f"Pobieram: {full_url}")

        try:
            response = requests.get(full_url, timeout=timeout)
        except requests.RequestException as exc:
            # Keep what was collected so far instead of losing the whole crawl.
            print(f"Błąd połączenia przy pobieraniu {full_url}: {exc}")
            break

        if response.status_code != 200:
            print(f"Błąd {response.status_code} przy pobieraniu {full_url}")
            break

        collected.append(full_url)

        soup = BeautifulSoup(response.text, "html.parser")
        next_link = soup.find("a", class_="next")
        # An empty or missing href ends the walk.
        current_link = next_link.get("href") if next_link is not None else None

    return collected


In [None]:
# Crawl every blog section, starting from one known article in each of them.
links = get_all_links("https://jacekwakar.pl/blog/varia/antygona-z-bejrutu")
links_recenzje = get_all_links("https://jacekwakar.pl/blog/recenzje/krzyk-we-mnie-wielki-wzbiera")
links_eseje = get_all_links("https://jacekwakar.pl/blog/eseje/bardzo-dobrze-dostateczny")
links_rozmowy = get_all_links("https://jacekwakar.pl/blog/rozmowy/nie-bedziemy-pisac-manifestow")
links_sylwetki = get_all_links("https://jacekwakar.pl/blog/sylwetki/blisko-coraz-blizej")

# Each crawl tracks visited pages on its own, so the same article can show up
# in more than one list. dict.fromkeys drops the repeats while keeping the
# first-seen order, which prevents scraping and exporting an article twice.
all_links = list(dict.fromkeys(
    links + links_recenzje + links_eseje + links_rozmowy + links_sylwetki
))

In [None]:
#%% import
from __future__ import unicode_literals
import re
import time
from datetime import datetime
from time import mktime
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm  #licznik
from concurrent.futures import ThreadPoolExecutor
import json
import xlsxwriter


#%% functions

# Polish month names (genitive and nominative forms) mapped to month numbers.
_POLISH_MONTHS = {
    "stycznia": "01", "lutego": "02", "marca": "03", "kwietnia": "04",
    "maja": "05", "czerwca": "06", "lipca": "07", "sierpnia": "08",
    "września": "09", "października": "10", "listopada": "11", "grudnia": "12",

    "styczeń": "01", "luty": "02", "marzec": "03", "kwiecień": "04",
    "maj": "05", "czerwiec": "06", "lipiec": "07", "sierpień": "08",
    "wrzesień": "09", "październik": "10", "listopad": "11", "grudzień": "12",
}


def date_change_format(date_string):
    """
    Convert a Polish textual date such as "09 marzec 2023" to "2023-03-09".

    Both the nominative ("marzec") and the genitive ("marca") month forms are
    accepted, in any letter case. Surrounding and repeated whitespace is
    ignored. A numeric month ("09 03 2023") is accepted as well.

    Args:
        date_string (str): date written as "<day> <month> <year>".

    Returns:
        str: the date in ISO format (YYYY-MM-DD), or "no date" when the input
        cannot be parsed (the problem is printed, not raised).
    """
    try:
        # Whole tokens are translated, so "maj" can never match inside "maja".
        tokens = [
            _POLISH_MONTHS.get(token.lower(), token)
            for token in date_string.split()
        ]
        # Parsing straight to a date avoids the mktime/fromtimestamp round
        # trip, which depends on the local timezone and platform limits.
        parsed = datetime.strptime(' '.join(tokens), "%d %m %Y")
        return parsed.date().isoformat()
    except Exception as e:
        print(f"Błąd konwersji daty '{date_string}': {e}")
        return "no date"


def _is_external_link(href, own_domain='jacekwakar.pl'):
    """Return True when ``href`` is an absolute URL whose host lies outside ``own_domain``."""
    from urllib.parse import urlparse

    try:
        host = urlparse(href).hostname or ''
    except ValueError:
        # Malformed URL: cannot be attributed to any host.
        return False
    # Relative links, anchors and mailto: links have no host and are not external.
    return bool(host) and host != own_domain and not host.endswith('.' + own_domain)


def dictionary_of_article(article_link, timeout=30, max_retries=10, retry_delay=5):
    """
    Download one article from jacekwakar.pl and record its details.

    On success a dictionary describing the article is appended to the
    module-level list ``all_results``. On any failure the link is appended to
    the module-level list ``errors`` and the problem is printed. Both lists
    must exist before the function is called. Nothing is returned.

    Every field is extracted on a best-effort basis: when a single field
    cannot be read, a placeholder ("no date", "no title", "no author",
    "no text", "no category", None or False) is stored and the remaining
    fields are still processed.

    Args:
        article_link (str): URL of the article.
        timeout (float, optional): timeout in seconds for every HTTP request.
        max_retries (int, optional): how many times the download is repeated
            when the server answers with '429 Too Many Requests'.
        retry_delay (float, optional): pause in seconds before each retry.
    """
    try:
        # Repeat the download while the server is rate limiting, but give up
        # after max_retries so that a worker thread can never hang forever.
        retries_left = max_retries
        while True:
            r = requests.get(article_link, timeout=timeout)
            r.encoding = 'utf-8'
            html_text = r.text
            rate_limited = r.status_code == 429 or '429 Too Many Requests' in html_text
            if not rate_limited:
                break
            if retries_left <= 0:
                raise RuntimeError(
                    f"429 Too Many Requests - przerwano po {max_retries} ponownych próbach"
                )
            retries_left -= 1
            time.sleep(retry_delay)

        soup = BeautifulSoup(html_text, 'lxml')

        # Publication date: <span class="published"><time itemprop="datePublished">
        date_of_publication = "no date"
        try:
            published_span = soup.find('span', class_='published')
            date_element = (
                published_span.find('time', itemprop='datePublished')
                if published_span is not None else None
            )
            if date_element is not None:
                date_of_publication = date_change_format(date_element.text.strip())
        except Exception as e:
            print(f"Błąd parsowania daty: {e}")

        # Title: headline <h1>, falling back to <h1 class="entry-title">
        title = "no title"
        try:
            title_element = soup.find('h1', itemprop='headline')
            if title_element is None:
                title_element = soup.find('h1', class_='entry-title')
            if title_element is not None:
                title = title_element.text.strip()
        except Exception as e:
            print(f"Błąd parsowania tytułu: {e}")

        # Author: itemprop="name" inside <span class="createdby">
        author = "no author"
        try:
            author_element = soup.find('span', class_='createdby')
            author_name = (
                author_element.find(itemprop='name')
                if author_element is not None else None
            )
            if author_name is not None:
                author = author_name.text.strip()
        except Exception as e:
            print(f"Błąd parsowania autora: {e}")

        # Article body; it stays None when missing so that the link, image
        # and video sections below can always test it safely.
        article_body = None
        text = "no text"
        try:
            article_body = soup.find('div', itemprop='articleBody')
            if article_body is not None:
                text = article_body.text.strip().replace('\n', ' ').replace('\xa0', ' ')
        except Exception as e:
            print(f"Błąd parsowania tekstu: {e}")

        # Category: link inside <span class="category-name">
        category = "no category"
        try:
            category_element = soup.find('span', class_='category-name')
            category_link = (
                category_element.find('a')
                if category_element is not None else None
            )
            if category_link is not None:
                category = category_link.text.strip()
        except Exception as e:
            print(f"Błąd parsowania kategorii: {e}")

        # External links found in the article body, joined with ' | '
        external_links = None
        try:
            if article_body is not None:
                hrefs = [a['href'] for a in article_body.find_all('a', href=True)]
                outside = [href for href in hrefs if _is_external_link(href)]
                external_links = ' | '.join(outside) if outside else None
        except Exception as e:
            print(f"Błąd parsowania linków: {e}")
            external_links = None

        # Images embedded in the article body
        images = []
        try:
            if article_body is not None:
                images = [img['src'] for img in article_body.find_all('img', src=True)]
        except Exception as e:
            print(f"Błąd parsowania zdjęć: {e}")
            images = []
        has_images = len(images) > 0

        # Lead image shown above the article; listed first when not already present
        try:
            main_image = soup.find('div', class_='article-full-image')
            if main_image is not None:
                main_img = main_image.find('img')
                if main_img is not None and 'src' in main_img.attrs:
                    main_image_link = main_img['src']
                    if main_image_link not in images:
                        images.insert(0, main_image_link)
                    has_images = True
        except Exception as e:
            print(f"Błąd parsowania głównego zdjęcia: {e}")

        photos_links = ' | '.join(images) if images else None

        # Any <iframe src=...> in the body is counted as an embedded video
        has_videos = False
        try:
            if article_body is not None:
                has_videos = len(article_body.find_all('iframe', src=True)) > 0
        except Exception as e:
            print(f"Błąd parsowania filmów: {e}")
            has_videos = False

        article_record = {
            "Link": article_link,
            "Data publikacji": date_of_publication,
            "Tytuł artykułu": title.replace('\xa0', ' '),
            "Tekst artykułu": text,
            "Autor": author,
            "Kategoria": category,
            "Linki zewnętrzne": external_links,
            "Zdjęcia/Grafika": has_images,
            "Filmy": has_videos,
            "Linki do zdjęć": photos_links
        }

        all_results.append(article_record)

    except AttributeError as e:
        errors.append(article_link)
        print(f"Błąd dla {article_link}: {e}")
    except Exception as e:
        errors.append(article_link)
        print(f"Nieoczekiwany błąd dla {article_link}: {e}")


#%% main execution

if __name__ == "__main__":
    # all_results and errors are module-level lists filled by dictionary_of_article.
    article_links = all_links
    all_results = []
    errors = []

    # Download the articles in parallel; list() drains the lazy map so that
    # every task has finished (and tqdm has advanced) before continuing.
    with ThreadPoolExecutor(max_workers=5) as executor:
        list(tqdm(executor.map(dictionary_of_article, article_links), total=len(article_links)))

    # Worker threads append their results in completion order. Restore the
    # order of the input links so that repeated runs give identical files.
    link_position = {}
    for position, link in enumerate(article_links):
        link_position.setdefault(link, position)
    all_results.sort(key=lambda record: link_position.get(record["Link"], len(link_position)))

    timestamp = datetime.today().date()

    with open(f'jacekwakar_{timestamp}.json', 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2, default=str)

    df = pd.DataFrame(all_results)
    # strings_to_urls=False stops xlsxwriter from turning link text into
    # hyperlinks. sheet_name is passed by keyword because positional use is
    # deprecated in pandas 2.x.
    with pd.ExcelWriter(f"jacekwakar_{timestamp}.xlsx",
                        engine='xlsxwriter',
                        engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
        df.to_excel(writer, sheet_name='Posts', index=False)

    # Summary of the run
    print(f"\n{'='*50}")
    print("Scraping zakończony!")
    print(f"Przetworzono artykułów: {len(all_results)}")
    print(f"Błędów: {len(errors)}")
    if errors:
        print("\nLinki z błędami:")
        for error_link in errors:
            print(f"  - {error_link}")
    print(f"{'='*50}\n")