In [3]:
!pip install xlsxwriter pydrive

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Collecting pydrive
  Downloading PyDrive-1.3.1.tar.gz (987 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/987.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m983.0/987.4 kB[0m [31m30.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.4/987.4 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pydrive
  Building wheel for pydrive (setup.py) ... [?25l[?25hdone
  Created wheel for pydrive: filename=PyDrive-1.3.1-py3-none-any.whl size=27433 sha256=d3c6d156395be8b6df193715c2f6497cf3cdfe062ab387

In [None]:
#%% import
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import json

#%% functions

def get_archive_page_links(archive_url, first_year=2010, last_year=None, timeout=30):
    """
    Collect links to the per-month / per-year archive pages listed on the
    main archive page.

    A link is kept when its ``href`` contains ``praktykiczytania.pl`` and any
    year between ``first_year`` and ``last_year`` (inclusive) as a substring.

    Parameters
    ----------
    archive_url : str
        URL of the main archive page.
    first_year : int, optional
        First year to look for in the links (default 2010).
    last_year : int or None, optional
        Last year to look for. ``None`` means "the current year, but never
        earlier than 2025", so archives published after this code was written
        are still picked up.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    list of str
        Unique archive links in the order they appear on the page. An empty
        list is returned (and the error printed) when anything goes wrong.
    """
    try:
        if last_year is None:
            from datetime import date
            last_year = max(2025, date.today().year)
        years = [str(year) for year in range(first_year, last_year + 1)]

        # Without a timeout a stalled connection would block the run forever.
        r = requests.get(archive_url, timeout=timeout)
        # Do not parse an HTTP error page as if it were the archive.
        r.raise_for_status()
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'lxml')

        archive_links = []
        seen = set()  # set lookup keeps de-duplication linear
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href in seen:
                continue
            if 'praktykiczytania.pl' in href and any(year in href for year in years):
                seen.add(href)
                archive_links.append(href)

        return archive_links
    except Exception as e:
        print(f"Błąd pobierania strony archiwum: {e}")
        return []


def get_article_links_from_month(month_url, timeout=30):
    """
    Collect article links from a single archive (month) page.

    Three passes are made over the page, in this order:
    1. the first link inside every ``<article>`` element,
    2. links inside ``<h2>``/``<h3>`` headings whose class contains
       ``entry-title`` or ``post-title``,
    3. every link inside ``<main>`` (or, failing that, the first ``<div>``
       whose class contains ``content``), skipping tag / category / author /
       archive / pagination URLs.

    Only links containing ``praktykiczytania.pl`` are kept.

    Parameters
    ----------
    month_url : str
        URL of the archive page to scan.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    list of str
        Unique links in first-seen order; an empty list (with the error
        printed) if the page could not be fetched or parsed.
    """
    excluded_fragments = ('/tag/', '/category/', '/author/', '/archiwum/', '/page/')
    article_links = []
    try:
        # Without a timeout a stalled connection would block the run forever.
        r = requests.get(month_url, timeout=timeout)
        # Do not harvest links from an HTTP error page.
        r.raise_for_status()
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'lxml')

        for article in soup.find_all('article'):
            title_link = article.find('a', href=True)
            if title_link and 'praktykiczytania.pl' in title_link['href']:
                article_links.append(title_link['href'])

        for heading in soup.find_all(['h2', 'h3'], class_=lambda x: x and ('entry-title' in x or 'post-title' in x)):
            link = heading.find('a', href=True)
            if link and 'praktykiczytania.pl' in link['href']:
                article_links.append(link['href'])

        content_area = soup.find('main') or soup.find('div', class_=lambda x: x and 'content' in x)
        if content_area:
            for link in content_area.find_all('a', href=True):
                href = link['href']
                if 'praktykiczytania.pl' in href and not any(x in href for x in excluded_fragments):
                    article_links.append(href)

        # dict preserves insertion order, so this is an order-preserving dedupe.
        return list(dict.fromkeys(article_links))

    except Exception as e:
        print(f"Błąd pobierania artykułów z {month_url}: {e}")
        return []


def check_pagination(month_url, max_pages=200, timeout=30):
    """
    Return the URLs of all pages belonging to one archive (month) listing.

    The first entry is always ``month_url`` itself. Further pages come from
    two sources:
    * links found inside a ``<nav>``/``<div>`` whose class contains
      ``pagination``,
    * when the page has a link whose class contains ``next``: probing
      ``<month_url>/page/N/`` for N = 2, 3, ... until a request does not
      answer with HTTP 200.

    Parameters
    ----------
    month_url : str
        URL of the first page of the listing (with or without trailing slash).
    max_pages : int, optional
        Highest page number that will be probed. Guards against sites that
        answer 200 for every page number, which would otherwise loop forever.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    list of str
        Page URLs without duplicates. On any error the pages collected so far
        are returned (at minimum ``[month_url]``) and the error is printed.
    """
    pages = [month_url]
    try:
        r = requests.get(month_url, timeout=timeout)
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'lxml')

        pagination = soup.find('nav', class_=lambda x: x and 'pagination' in x) or \
                    soup.find('div', class_=lambda x: x and 'pagination' in x)

        if pagination:
            for link in pagination.find_all('a', href=True):
                page_url = link['href']
                if page_url not in pages and 'praktykiczytania.pl' in page_url:
                    pages.append(page_url)

        next_link = soup.find('a', class_=lambda x: x and 'next' in x)
        if next_link:
            # Normalise the base so the probe URL is well-formed whether or not
            # month_url ends with a slash.
            base_url = month_url.rstrip('/')
            for page_num in range(2, max_pages + 1):
                next_page = f"{base_url}/page/{page_num}/"
                if next_page in pages:
                    # Already discovered through the pagination links; no need
                    # to request it again or to list it twice.
                    continue
                r = requests.get(next_page, timeout=timeout)
                if r.status_code != 200:
                    break
                pages.append(next_page)
                time.sleep(0.5)  # be polite to the server between probes

        return pages
    except Exception as e:
        # Best effort: keep whatever was collected, but do not hide the reason.
        print(f"Błąd sprawdzania paginacji dla {month_url}: {e}")
        return pages


def get_all_article_links(archive_url):
    """
    Entry point of the link collector: walk the whole archive and gather
    every article link.

    Step 1 asks ``get_archive_page_links`` for the archive (month) pages.
    Step 2 expands each of them with ``check_pagination`` and harvests the
    links of every resulting page with ``get_article_links_from_month``,
    sleeping 0.5 s before each page request.

    Returns a tuple ``(links, errors)``: ``links`` is the de-duplicated list
    of article URLs (order not defined, it comes from a ``set``), ``errors``
    is the list of month URLs whose processing raised an exception.
    """
    print("Krok 1: Pobieranie listy miesięcy z archiwum...")
    month_urls = get_archive_page_links(archive_url)
    print(f"Znaleziono {len(month_urls)} stron archiwum")

    collected = []
    failed_months = []

    print("\nKrok 2: Pobieranie artykułów z każdego miesiąca...")

    progress = tqdm(month_urls, desc="Przetwarzanie miesięcy")
    for month_url in progress:
        try:
            for page_url in check_pagination(month_url):
                time.sleep(0.5)
                collected += get_article_links_from_month(page_url)
        except Exception as exc:
            print(f"\nBłąd dla {month_url}: {exc}")
            failed_months.append(month_url)

    # A set drops the duplicates gathered across months and pages.
    return list(set(collected)), failed_months


#%% main execution

if __name__ == "__main__":
    # Script entry point: collect every article link and store the result
    # both as a plain text list and as a JSON document.
    archive_url = "https://praktykiczytania.pl/archiwum/"

    for banner_line in ("="*60,
                        "Pobieranie wszystkich linków z praktykiczytania.pl",
                        "="*60):
        print(banner_line)

    article_links, errors = get_all_article_links(archive_url)

    # Sorted in place so both output files list the links alphabetically.
    article_links.sort()

    # One link per line.
    with open('praktykiczytania_linki.txt', 'w', encoding='utf-8') as f:
        for link in article_links:
            print(link, file=f)

    # Same links plus source URL, count and the failed month URLs.
    output_data = {
        'source': archive_url,
        'total_links': len(article_links),
        'links': article_links,
        'errors': errors
    }
    with open('praktykiczytania_linki.json', 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    # Summary report
    print("\n" + "="*60)
    print("RAPORT")
    print("="*60)
    print(f"Znaleziono artykułów: {len(article_links)}")
    print(f"Błędów: {len(errors)}")
    if errors:
        print(f"\nProblematyczne URLe:")
        for error_url in errors:
            print(f"  - {error_url}")
    print(f"\nZapisano do:")
    for written_file in (f"  - praktykiczytania_linki.txt",
                         f"  - praktykiczytania_linki.json"):
        print(written_file)
    print("="*60)

In [None]:
#%% import
from __future__ import unicode_literals
import re
import time
from datetime import datetime
from time import mktime
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import json
import xlsxwriter


#%% functions

def date_change_format(date_string):
    """
    Convert a date written in one of several formats to ``"YYYY-MM-DD"``.

    Supported inputs (surrounding whitespace is ignored):
    - ISO dates / datetimes at the start of the string:
      ``"2023-03-09"``, ``"2023-03-09T12:30:00+01:00"``
    - dotted dates: ``"09.03.2023"``
    - day, month, year separated by whitespace, where the month is a number
      or a Polish month name in nominative or genitive form, in any letter
      case: ``"09 marzec 2023"``, ``"9 Marca 2023"``, ``"09 03 2023"``

    For the dotted and the day-month-year forms the date may be embedded in
    other text (``"Opublikowano: 9 listopada 2021"``).

    Parameters
    ----------
    date_string : str
        Raw date text.

    Returns
    -------
    str
        The date as ``"YYYY-MM-DD"``, or the literal ``"no date"`` when the
        input cannot be interpreted as a valid calendar date (the reason is
        printed).
    """
    month_numbers = {
        # Genitive
        "stycznia": 1, "lutego": 2, "marca": 3, "kwietnia": 4,
        "maja": 5, "czerwca": 6, "lipca": 7, "sierpnia": 8,
        "września": 9, "października": 10, "listopada": 11, "grudnia": 12,
        # Nominative
        "styczeń": 1, "luty": 2, "marzec": 3, "kwiecień": 4,
        "maj": 5, "czerwiec": 6, "lipiec": 7, "sierpień": 8,
        "wrzesień": 9, "październik": 10, "listopad": 11, "grudzień": 12
    }
    try:
        # Collapse runs of whitespace so the patterns below stay simple.
        date_string = ' '.join(date_string.strip().split())

        # ISO date, optionally followed by a time part.
        iso = re.match(r'(\d{4})-(\d{1,2})-(\d{1,2})', date_string)
        if iso:
            year, month, day = (int(part) for part in iso.groups())
        else:
            dotted = re.search(r'(?<!\d)(\d{1,2})\.(\d{1,2})\.(\d{4})(?!\d)', date_string)
            if dotted:
                day, month, year = (int(part) for part in dotted.groups())
            else:
                # "DD <month> YYYY" with a numeric or a named month.
                spaced = re.search(
                    r'(?<!\d)(\d{1,2})\s+(\d{1,2}|[^\W\d_]+)\s+(\d{4})(?!\d)',
                    date_string)
                if spaced is None:
                    raise ValueError("unrecognised date format")
                day_text, month_text, year_text = spaced.groups()
                if month_text.isdigit():
                    month = int(month_text)
                else:
                    month = month_numbers.get(month_text.lower())
                    if month is None:
                        raise ValueError(f"unknown month name '{month_text}'")
                day, year = int(day_text), int(year_text)

        # Building the date directly validates it (e.g. rejects 31 February)
        # and avoids a round trip through local-time timestamps, whose result
        # depends on the machine's time zone.
        return datetime(year, month, day).date().isoformat()
    except Exception as e:
        print(f"Błąd konwersji daty '{date_string}': {e}")
        return "no date"


def _class_contains(fragment):
    """Build a bs4 ``class_`` filter that matches class values containing ``fragment`` (case-insensitive)."""
    def matcher(value):
        return bool(value) and fragment in str(value).lower()
    return matcher


def _fetch_article_html(article_link, timeout, max_retries, retry_delay=5):
    """Download the page as UTF-8 text, waiting out HTTP 429 a bounded number of times."""
    for attempt in range(max_retries + 1):
        r = requests.get(article_link, timeout=timeout)
        if r.status_code != 429:
            # Any other HTTP error is a failure, not an article to scrape.
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        if attempt < max_retries:
            time.sleep(retry_delay)
    raise RuntimeError(f"wciąż 429 Too Many Requests po {max_retries} ponownych próbach")


def _extract_publication_date(soup, article_link):
    """Read the date from ``<time>`` (attribute or text) or from a span/div whose class contains 'date'."""
    try:
        time_tag = soup.find('time')
        if time_tag is not None:
            # Prefer the machine-readable attribute over the visible text.
            raw_date = time_tag.get('datetime') or time_tag.get_text(strip=True)
            return date_change_format(raw_date)
        date_tag = soup.find(['span', 'div'], class_=_class_contains('date'))
        if date_tag is not None:
            return date_change_format(date_tag.get_text(strip=True))
        return "no date"
    except Exception as e:
        print(f"Błąd parsowania daty dla {article_link}: {e}")
        return "no date"


def _extract_title(soup):
    """Return the text of the first ``<h1>`` or "no title"."""
    try:
        heading = soup.find('h1')
        return heading.get_text(strip=True) if heading is not None else "no title"
    except Exception:
        return "no title"


def _extract_author(soup):
    """Return the author from ``a[rel=author]`` or an element whose class contains 'author'."""
    try:
        author_tag = soup.find('a', rel='author')
        if author_tag is None:
            author_tag = soup.find(['span', 'div', 'a'], class_=_class_contains('author'))
        if author_tag is None:
            return "no author"
        author = author_tag.get_text(strip=True)
        # Drop prefixes such as "Autor:" or "By:".
        return re.sub(r'^(Autor|By|Opublikował|Posted by):\s*', '', author, flags=re.IGNORECASE)
    except Exception:
        return "no author"


def _find_article_body(soup):
    """Locate the element holding the article content, or return None."""
    try:
        for fragment in ('entry-content', 'post-content', 'article-content'):
            body = soup.find('div', class_=_class_contains(fragment))
            if body is not None:
                return body
        return soup.find('article')
    except Exception:
        return None


def _extract_text(article_body):
    """Return the article text on one line, or "no text"."""
    try:
        if article_body is None:
            return "no text"
        # The ' ' separator keeps text from adjacent elements from being
        # glued together into one word.
        return article_body.get_text(' ', strip=True).replace('\n', ' ').replace('\xa0', ' ')
    except Exception:
        return "no text"


def _join_anchor_texts(soup, rel, class_fragment, default):
    """Join with ' | ' the texts of ``a[rel=...]`` links, falling back to links whose class contains ``class_fragment``."""
    try:
        anchors = soup.find_all('a', rel=rel)
        if not anchors:
            anchors = soup.find_all('a', class_=_class_contains(class_fragment))
        if not anchors:
            return default
        return ' | '.join(anchor.get_text(strip=True) for anchor in anchors)
    except Exception:
        return default


def _extract_external_links(article_body):
    """Return ' | '-joined hrefs from the body that do not mention praktykiczytania.pl, or None."""
    try:
        if article_body is None:
            return None
        hrefs = [a['href'] for a in article_body.find_all('a', href=True)]
        external = [href for href in hrefs if not re.search(r'praktykiczytania\.pl', href)]
        return ' | '.join(external) if external else None
    except (AttributeError, KeyError, IndexError):
        return None


def _collect_image_urls(soup, article_body):
    """Return unique image URLs: thumbnail/featured image first, then body images, then header images."""
    images = []

    # 1. Thumbnail / post-thumbnail (main article image)
    thumbnail_div = soup.find('div', class_=_class_contains('post-thumbnail'))
    if thumbnail_div is not None:
        thumb_img = thumbnail_div.find('img', src=True)
        if thumb_img is not None:
            images.append(thumb_img['src'])

    # 2. Featured image (alternative naming), only when no thumbnail was found
    if not images:
        featured_img = soup.find('img', class_=_class_contains('featured'))
        if featured_img is None:
            featured_div = soup.find('div', class_=_class_contains('featured'))
            if featured_div is not None:
                featured_img = featured_div.find('img')
        if featured_img is not None and featured_img.get('src'):
            images.append(featured_img['src'])

    # 3. Images in the article body, 4. images in header containers
    candidates = []
    if article_body is not None:
        candidates += [img['src'] for img in article_body.find_all('img', src=True)]
    for container_class in ['entry-header', 'article-header', 'post-header']:
        header = soup.find('div', class_=container_class)
        if header is not None:
            candidates += [img['src'] for img in header.find_all('img', src=True)]

    for img_src in candidates:
        if img_src not in images:
            images.append(img_src)
    return images


def _has_embedded_videos(article_body):
    """Return True when the body contains at least one ``<iframe>`` with a ``src``."""
    try:
        if article_body is None:
            return False
        return len(article_body.find_all('iframe', src=True)) > 0
    except Exception:
        return False


def dictionary_of_article(article_link, timeout=30, max_retries=5):
    """
    Scrape one article from praktykiczytania.pl and record the result.

    The output has the same layout as the scraper for jacekwakar.pl, plus a
    "Tagi" field. Each field is extracted on a best-effort basis: when a
    field cannot be found a placeholder is stored ("no date", "no title",
    "no author", "no text", "no category", or None / False).

    Side effects: on success the record is appended to the module-level list
    ``all_results``; on failure ``article_link`` is appended to the
    module-level list ``errors``. Both lists must exist before this function
    is called.

    Parameters
    ----------
    article_link : str
        URL of the article.
    timeout : float, optional
        Timeout in seconds for each HTTP request.
    max_retries : int, optional
        How many times a request answered with HTTP 429 is repeated (with a
        5 second pause) before the article is reported as an error.

    Returns
    -------
    dict or None
        The record that was appended to ``all_results``, or None on failure.
    """
    try:
        html_text = _fetch_article_html(article_link, timeout, max_retries)
        soup = BeautifulSoup(html_text, 'lxml')

        date_of_publication = _extract_publication_date(soup, article_link)
        title = _extract_title(soup)
        author = _extract_author(soup)

        # Located once and reused for text, links, images and videos.
        article_body = _find_article_body(soup)
        text = _extract_text(article_body)

        category = _join_anchor_texts(soup, 'category', 'category', "no category")
        tags_str = _join_anchor_texts(soup, 'tag', 'tag', None)
        external_links = _extract_external_links(article_body)

        try:
            images = _collect_image_urls(soup, article_body)
            has_images = len(images) > 0
            photos_links = ' | '.join(images) if images else None
        except (AttributeError, KeyError, IndexError):
            has_images = False
            photos_links = None

        has_videos = _has_embedded_videos(article_body)

        article_record = {
            "Link": article_link,
            "Data publikacji": date_of_publication,
            "Tytuł artykułu": title.replace('\xa0', ' '),
            "Tekst artykułu": text,
            "Autor": author,
            "Kategoria": category,
            "Tagi": tags_str,  # extra field, not present in the jacekwakar.pl scraper
            "Linki zewnętrzne": external_links,
            "Zdjęcia/Grafika": has_images,
            "Filmy": has_videos,
            "Linki do zdjęć": photos_links
        }

        all_results.append(article_record)
        return article_record

    except AttributeError as e:
        errors.append(article_link)
        print(f"Błąd dla {article_link}: {e}")
        return None
    except Exception as e:
        errors.append(article_link)
        print(f"Nieoczekiwany błąd dla {article_link}: {e}")
        return None


#%% main execution

if __name__ == "__main__":
    # Script entry point: read the link list produced by the link collector,
    # scrape every article concurrently and save the records as JSON and XLSX.

    # Load links from file
    try:
        with open('praktykiczytania_linki.txt', 'r', encoding='utf-8') as f:
            article_links = [line.strip() for line in f if line.strip()]
        print(f"Wczytano {len(article_links)} linków z pliku")
    except FileNotFoundError:
        print("Nie znaleziono pliku praktykiczytania_linki.txt")
        print("Użyj najpierw get_praktyki_links.py aby pobrać linki!")
        print("\nLub podaj linki ręcznie:")
        article_links = [
            # Put article links here, e.g.
            # "https://praktykiczytania.pl/artykul1/",
            # "https://praktykiczytania.pl/artykul2/",
        ]

    if not article_links:
        print("Brak linków do przetworzenia!")
        # `exit` is an interactive helper (and shuts the kernel down when
        # called inside a notebook); raising SystemExit is the portable way
        # to stop with exit status 1.
        raise SystemExit(1)

    # Shared accumulators filled by dictionary_of_article from worker threads.
    all_results = []
    errors = []

    print("\n" + "="*60)
    print("Rozpoczynam scraping artykułów z praktykiczytania.pl")
    print("="*60 + "\n")

    # Scraping with a progress bar; list() drains the lazy map so the bar
    # advances and the executor finishes all work before results are saved.
    with ThreadPoolExecutor(max_workers=5) as executor:
        list(tqdm(executor.map(dictionary_of_article, article_links), total=len(article_links)))

    # Output file names carry today's date.
    timestamp = datetime.today().date()
    json_path = f'praktykiczytania_{timestamp}.json'
    xlsx_path = f"praktykiczytania_{timestamp}.xlsx"

    # JSON
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2, default=str)

    # Excel
    # NOTE(review): Excel cells are limited to 32,767 characters, so very long
    # article texts may not survive in the workbook; treat the JSON file as
    # the complete copy. TODO confirm how xlsxwriter handles over-long strings.
    df = pd.DataFrame(all_results)
    with pd.ExcelWriter(xlsx_path,
                       engine='xlsxwriter',
                       engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
        # sheet_name must be passed by keyword: positional use is deprecated
        # in pandas 2.x and rejected by later versions.
        df.to_excel(writer, sheet_name='Posts', index=False)

    # Report
    print(f"\n{'='*60}")
    print(f"Scraping zakończony!")
    print(f"Przetworzono artykułów: {len(all_results)}")
    print(f"Błędów: {len(errors)}")
    if errors:
        print(f"\nLinki z błędami (pierwsze 10):")
        for error_link in errors[:10]:
            print(f"  - {error_link}")
        if len(errors) > 10:
            print(f"  ... i {len(errors) - 10} więcej")
    print(f"\nPliki wyjściowe:")
    print(f"  - {json_path}")
    print(f"  - {xlsx_path}")
    print(f"{'='*60}\n")