In [1]:
!pip install xlsxwriter pydrive

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Collecting pydrive
  Downloading PyDrive-1.3.1.tar.gz (987 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/987.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.4/987.4 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pydrive
  Building wheel for pydrive (setup.py) ... [?25l[?25hdone
  Created wheel for pydrive: filename=PyDrive-1.3.1-py3-none-any.whl size=27433 sha256=a092562e7c52225b951b70765b52db06e3bc18e9de7af3a2d37eacbb0f9b571c
  Stored in directory: /root/.cache/pip/wheels/6c/10/da/a5b513f5b3916fc391c20ee7b4633e5cf3396d570cdd74970f
Succ

In [3]:
#%% import
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import json
import re


#%% functions

def try_sitemap(base_url):
    """
    Próbuje pobrać linki z sitemap.xml
    """
    sitemap_urls = [
        f"{base_url}sitemap-1.xml",  # Sitemap z postami
        f"{base_url}sitemap.xml",
        f"{base_url}sitemap_index.xml",
        f"{base_url}wp-sitemap.xml",
        f"{base_url}sitemap-posts.xml"
    ]

    for sitemap_url in sitemap_urls:
        try:
            print(f"  Próbuję: {sitemap_url}")
            r = requests.get(sitemap_url, timeout=10)
            if r.status_code == 200:
                soup = BeautifulSoup(r.text, 'xml')
                links = [loc.text.strip() for loc in soup.find_all('loc')]
                if links:
                    print(f"  ✓ Znaleziono sitemap: {sitemap_url}")
                    print(f"  ✓ Linków w sitemap: {len(links)}")
                    return links
        except Exception as e:
            print(f"    Błąd: {e}")
            continue

    return None


def get_article_links_from_page(page_url):
    """
    Pobiera linki do artykułów z pojedynczej strony
    """
    try:
        r = requests.get(page_url)
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'lxml')

        article_links = []
        for article in soup.find_all('article'):
            title = article.find(['h1', 'h2', 'h3'], class_=lambda x: x and 'entry-title' in str(x).lower())
            if title:
                link = title.find('a', href=True)
                if link:
                    href = link['href']
                    if 'nameste.litglog.org' in href or href.startswith('/'):
                        if href.startswith('/'):
                            href = 'https://nameste.litglog.org' + href
                        article_links.append(href)

        return article_links
    except Exception as e:
        print(f"Błąd dla {page_url}: {e}")
        return []


def get_all_pages(blog_url):
    """
    Znajduje wszystkie strony paginacji
    """
    pages = [blog_url]
    try:
        page_num = 2
        while True:
            # WordPress paginacja: /page/2/, /page/3/
            next_page = blog_url.rstrip('/') + f'/page/{page_num}/'
            r = requests.get(next_page)

            if r.status_code == 200 and r.url != blog_url:
                pages.append(next_page)
                print(f"  Znaleziono stronę {page_num}")
                page_num += 1
                time.sleep(0.5)
            else:
                break

        return pages
    except Exception as e:
        print(f"Błąd paginacji: {e}")
        return pages


def get_all_article_links(blog_url):
    """
    Główna funkcja - pobiera wszystkie linki
    """
    # Najpierw próbuj sitemap
    print("Krok 1: Próba pobrania sitemap...")
    sitemap_links = try_sitemap(blog_url)

    if sitemap_links:
        print(f"Znaleziono {len(sitemap_links)} linków w sitemap")
        return sitemap_links, []

    print("Brak sitemap, używam paginacji...")

    # Paginacja
    print("\nKrok 2: Szukanie stron...")
    all_pages = get_all_pages(blog_url)
    print(f"Znaleziono {len(all_pages)} stron")

    all_article_links = []
    errors = []

    print("\nKrok 3: Pobieranie artykułów...")
    for page_url in tqdm(all_pages, desc="Przetwarzanie"):
        try:
            time.sleep(0.5)
            article_links = get_article_links_from_page(page_url)
            all_article_links.extend(article_links)
        except Exception as e:
            errors.append(page_url)

    # Usuń duplikaty
    all_article_links = list(set(all_article_links))

    return all_article_links, errors


def filter_article_links(all_links):
    """
    Filtruje linki - zostawia tylko artykuły
    """
    article_links = []
    excluded = {'category': 0, 'tag': 0, 'page': 0, 'author': 0, 'other': 0}

    for link in all_links:
        # Usuń fragmenty (#)
        link = link.split('#')[0].rstrip('/')

        if not link:
            continue

        # Wykluczamy
        if '/category/' in link or '/tag/' in link:
            excluded['category'] += 1
            continue
        if '/author/' in link:
            excluded['author'] += 1
            continue
        if '/page/' in link:
            excluded['page'] += 1
            continue

        # Główna strona
        if link == 'https://nameste.litglog.org' or link.endswith('.org/'):
            excluded['other'] += 1
            continue

        article_links.append(link)

    # Usuń duplikaty
    article_links = list(set(article_links))

    print(f"\nStatystyki filtrowania:")
    print(f"  Kategorie/tagi: {excluded['category'] + excluded['tag']}")
    print(f"  Autorzy: {excluded['author']}")
    print(f"  Paginacja: {excluded['page']}")
    print(f"  Inne: {excluded['other']}")
    print(f"  ✅ ZAAKCEPTOWANO: {len(article_links)}")

    return article_links


#%% main execution

if __name__ == "__main__":
    blog_url = "https://nameste.litglog.org/"

    print("="*60)
    print("Pobieranie linków z nameste.litglog.org")
    print("="*60 + "\n")

    # Pobierz wszystkie linki
    all_links, errors = get_all_article_links(blog_url)

    print(f"\nZnaleziono {len(all_links)} linków")

    # Filtruj
    article_links = filter_article_links(all_links)

    if not article_links:
        print("\n⚠️  Nie znaleziono artykułów!")
        exit(1)

    # Pokaż przykłady
    if article_links:
        print("\nPrzykładowe linki (pierwsze 10):")
        for i, link in enumerate(sorted(article_links)[:10], 1):
            print(f"  {i}. {link}")

        if len(article_links) > 10:
            print(f"  ... i {len(article_links) - 10} więcej")

    # Sortuj
    article_links.sort()

    # Zapisz
    with open('nameste_linki.txt', 'w', encoding='utf-8') as f:
        for link in article_links:
            f.write(link + '\n')

    output_data = {
        'source': blog_url,
        'total_links': len(article_links),
        'links': article_links,
        'errors': errors
    }

    with open('nameste_linki.json', 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    # Raport
    print("\n" + "="*60)
    print("RAPORT")
    print("="*60)
    print(f"Znaleziono artykułów: {len(article_links)}")
    print(f"Błędów: {len(errors)}")
    print(f"\nZapisano do:")
    print(f"  - nameste_linki.txt")
    print(f"  - nameste_linki.json")
    print("="*60)

Pobieranie linków z nameste.litglog.org

Krok 1: Próba pobrania sitemap...
  Próbuję: https://nameste.litglog.org/sitemap-1.xml
  ✓ Znaleziono sitemap: https://nameste.litglog.org/sitemap-1.xml
  ✓ Linków w sitemap: 696
Znaleziono 696 linków w sitemap

Znaleziono 696 linków

Statystyki filtrowania:
  Kategorie/tagi: 0
  Autorzy: 0
  Paginacja: 0
  Inne: 1
  ✅ ZAAKCEPTOWANO: 695

Przykładowe linki (pierwsze 10):
  1. https://nameste.litglog.org/2008/08/rebetiko
  2. https://nameste.litglog.org/2008/08/saudade
  3. https://nameste.litglog.org/2009/01/lekcja-limeryczna
  4. https://nameste.litglog.org/2009/03/chazarski-smutek
  5. https://nameste.litglog.org/2009/04/konsekwencja-leksykalna
  6. https://nameste.litglog.org/2009/06/babel
  7. https://nameste.litglog.org/2009/06/tlingit
  8. https://nameste.litglog.org/2009/07/drugie-wyrojenie-budyniow
  9. https://nameste.litglog.org/2009/10/c-j-cz-cz
  10. https://nameste.litglog.org/2009/10/p
  ... i 685 więcej

RAPORT
Znaleziono artykułó

In [12]:
#%% import
from __future__ import unicode_literals
import re
import time
from datetime import datetime
from time import mktime
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import json
import xlsxwriter


#%% functions

def date_change_format(date_string):
    """
    Konwertuje datę z różnych formatów na "YYYY-MM-DD"
    """
    try:
        date_string = ' '.join(date_string.strip().split())

        if re.match(r'\d{4}-\d{2}-\d{2}', date_string):
            return date_string[:10]

        if 'T' in date_string:
            return date_string.split('T')[0]

        lookup_table = {
            "stycznia": "01", "lutego": "02", "marca": "03", "kwietnia": "04",
            "maja": "05", "czerwca": "06", "lipca": "07", "sierpnia": "08",
            "września": "09", "października": "10", "listopada": "11", "grudnia": "12",
            "styczeń": "01", "luty": "02", "marzec": "03", "kwiecień": "04",
            "maj": "05", "czerwiec": "06", "lipiec": "07", "sierpień": "08",
            "wrzesień": "09", "październik": "10", "listopad": "11", "grudzień": "12"
        }

        for k, v in lookup_table.items():
            date_string = date_string.replace(k, v)

        if re.match(r'\d{1,2}\.\d{1,2}\.\d{4}', date_string):
            result = time.strptime(date_string, "%d.%m.%Y")
        else:
            result = time.strptime(date_string, "%d %m %Y")

        changed_date = datetime.fromtimestamp(mktime(result))
        return format(changed_date.date())
    except Exception as e:
        return "no date"


def dictionary_of_article(article_link):
    """
    Pobiera szczegóły artykułu z nameste.litglog.org (WordPress)
    """
    try:
        r = requests.get(article_link)
        r.encoding = 'utf-8'
        html_text = r.text

        while '429 Too Many Requests' in html_text:
            time.sleep(5)
            r = requests.get(article_link)
            r.encoding = 'utf-8'
            html_text = r.text

        soup = BeautifulSoup(html_text, 'lxml')

        # Data publikacji
        try:
            # Opcja 1: span.date.updated
            date_element = soup.find('span', class_='date')
            if not date_element:
                date_element = soup.find('span', class_='updated')

            # Opcja 2: .entry-date
            if not date_element:
                date_element = soup.find(class_='entry-date')

            # Opcja 3: <time>
            if not date_element:
                date_element = soup.find('time')

            # Opcja 4: Meta tag
            if not date_element:
                meta_date = soup.find('meta', property='article:published_time')
                if meta_date:
                    date_element = type('obj', (object,), {
                        'get_text': lambda: meta_date.get('content', ''),
                        'get': lambda x: meta_date.get('content', '')
                    })()

            if date_element:
                date_text = date_element.get('datetime') or date_element.get('content') or date_element.get_text(strip=True)
                date_of_publication = date_change_format(date_text)
            else:
                date_of_publication = "no date"
        except Exception as e:
            date_of_publication = "no date"

        # Tytuł
        try:
            title_element = soup.find('h1', class_=lambda x: x and 'entry-title' in str(x).lower())
            if not title_element:
                title_element = soup.find('h1')

            title = title_element.get_text(strip=True) if title_element else "no title"
        except:
            title = "no title"

        # Autor
        try:
            author_element = soup.find('a', rel='author')
            if not author_element:
                author_element = soup.find(['span', 'div'], class_=lambda x: x and 'author' in str(x).lower())

            if author_element:
                author = author_element.get_text(strip=True)
                author = re.sub(r'^(Autor|By|Opublikował):\s*', '', author, flags=re.IGNORECASE)
            else:
                author = "no author"
        except:
            author = "no author"

        # Treść artykułu
        try:
            # Opcja 1: .entry-content
            article_body = soup.find('div', class_=lambda x: x and 'entry-content' in str(x).lower())

            # Opcja 2: .post-content
            if not article_body:
                article_body = soup.find('div', class_=lambda x: x and 'post-content' in str(x).lower())

            # Opcja 3: <article> - zbierz wszystkie <p> w środku
            if not article_body:
                article_elem = soup.find('article')
                if article_elem:
                    # Zbierz wszystkie paragrafy
                    paragraphs = article_elem.find_all('p')
                    if paragraphs:
                        # Stwórz sztuczny kontener
                        article_body = BeautifulSoup('<div></div>', 'lxml').div
                        for p in paragraphs:
                            article_body.append(p)

            # Opcja 4: Wszystkie <p> na stronie (ostateczność)
            if not article_body:
                paragraphs = soup.find_all('p')
                if paragraphs:
                    article_body = BeautifulSoup('<div></div>', 'lxml').div
                    for p in paragraphs:
                        article_body.append(p)

            if article_body:
                text = article_body.get_text(strip=True).replace('\n', ' ').replace('\xa0', ' ')
                # Usuń bardzo długie spacje
                text = re.sub(r'\s+', ' ', text)
            else:
                text = "no text"
        except Exception as e:
            print(f"Błąd pobierania tekstu dla {article_link}: {e}")
            text = "no text"

        # Kategoria
        try:
            # Opcja 1: span.categories z linkami
            categories_span = soup.find('span', class_='categories')
            if categories_span:
                category_links = categories_span.find_all('a', rel='category tag')
                if category_links:
                    categories = [cat.get_text(strip=True) for cat in category_links]
                    category = ' | '.join(categories)
                else:
                    category = "no category"
            else:
                # Opcja 2: Standardowe WordPress
                category_links = soup.find_all('a', rel='category tag')
                if not category_links:
                    category_links = soup.find_all('a', rel='category')

                if category_links:
                    categories = [cat.get_text(strip=True) for cat in category_links]
                    category = ' | '.join(categories)
                else:
                    category = "no category"
        except:
            category = "no category"

        # Tagi
        try:
            tag_links = soup.find_all('a', rel='tag')
            if tag_links:
                tags = [tag.get_text(strip=True) for tag in tag_links]
                tags_str = ' | '.join(tags)
            else:
                tags_str = None
        except:
            tags_str = None

        # Linki zewnętrzne
        try:
            if article_body:
                links = [a['href'] for a in article_body.find_all('a', href=True)]
                external_links = [link for link in links if not re.search(r'nameste\.litglog\.org', link)]
                external_links = ' | '.join(external_links) if external_links else None
            else:
                external_links = None
        except (AttributeError, KeyError, IndexError):
            external_links = None

        # Zdjęcia
        try:
            images = []

            # Thumbnail
            thumbnail_div = soup.find('div', class_=lambda x: x and 'post-thumbnail' in str(x).lower())
            if thumbnail_div:
                thumb_img = thumbnail_div.find('img', src=True)
                if thumb_img:
                    images.append(thumb_img['src'])

            # Featured image
            if not images:
                featured_img = soup.find('img', class_=lambda x: x and 'wp-post-image' in str(x).lower())
                if featured_img and featured_img.get('src'):
                    images.append(featured_img['src'])

            # Zdjęcia w treści
            if article_body:
                content_images = [img['src'] for img in article_body.find_all('img', src=True) if img.get('src')]
                for img_src in content_images:
                    if img_src not in images:
                        images.append(img_src)

            has_images = len(images) > 0
            photos_links = ' | '.join(images) if images else None
        except (AttributeError, KeyError, IndexError):
            has_images = False
            photos_links = None

        # Filmy
        try:
            if article_body:
                iframes = [iframe['src'] for iframe in article_body.find_all('iframe', src=True)]
                has_videos = len(iframes) > 0
            else:
                has_videos = False
        except:
            has_videos = False

        dictionary_of_article = {
            "Link": article_link,
            "Data publikacji": date_of_publication,
            "Tytuł artykułu": title.replace('\xa0', ' '),
            "Tekst artykułu": text,
            "Autor": author,
            "Kategoria": category,
            "Tagi": tags_str,
            "Linki zewnętrzne": external_links,
            "Zdjęcia/Grafika": has_images,
            "Filmy": has_videos,
            "Linki do zdjęć": photos_links
        }

        all_results.append(dictionary_of_article)

    except AttributeError as e:
        errors.append(article_link)
        print(f"Błąd dla {article_link}: {e}")
    except Exception as e:
        errors.append(article_link)
        print(f"Nieoczekiwany błąd dla {article_link}: {e}")


#%% main execution

if __name__ == "__main__":
    # Wczytaj linki
    try:
        with open('nameste_linki.txt', 'r', encoding='utf-8') as f:
            article_links = [line.strip() for line in f if line.strip()]

        print(f"Wczytano {len(article_links)} linków z pliku")
    except FileNotFoundError:
        print("Nie znaleziono pliku nameste_linki.txt")
        print("Użyj najpierw get_nameste_links.py!")
        article_links = []

    if not article_links:
        print("Brak linków do przetworzenia!")
        exit(1)

    all_results = []
    errors = []

    print("\n" + "="*60)
    print("Rozpoczynam scraping artykułów z nameste.litglog.org")
    print("="*60 + "\n")

    # Scraping
    with ThreadPoolExecutor(max_workers=5) as executor:
        list(tqdm(executor.map(dictionary_of_article, article_links), total=len(article_links)))

    # Zapisywanie
    timestamp = datetime.today().date()

    # JSON
    with open(f'nameste_{timestamp}.json', 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2, default=str)

    # Excel
    df = pd.DataFrame(all_results)
    with pd.ExcelWriter(f"nameste_{timestamp}.xlsx",
                       engine='xlsxwriter',
                       engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
        df.to_excel(writer, 'Posts', index=False)

    # Raport
    print(f"\n{'='*60}")
    print(f"Scraping zakończony!")
    print(f"Przetworzono artykułów: {len(all_results)}")
    print(f"Błędów: {len(errors)}")
    if errors:
        print(f"\nLinki z błędami (pierwsze 10):")
        for error_link in errors[:10]:
            print(f"  - {error_link}")
        if len(errors) > 10:
            print(f"  ... i {len(errors) - 10} więcej")
    print(f"\nPliki wyjściowe:")
    print(f"  - nameste_{timestamp}.json")
    print(f"  - nameste_{timestamp}.xlsx")
    print(f"{'='*60}\n")

Wczytano 695 linków z pliku

Rozpoczynam scraping artykułów z nameste.litglog.org



100%|██████████| 695/695 [02:55<00:00,  3.97it/s]
  df.to_excel(writer, 'Posts', index=False)



Scraping zakończony!
Przetworzono artykułów: 695
Błędów: 0

Pliki wyjściowe:
  - nameste_2026-01-12.json
  - nameste_2026-01-12.xlsx



In [13]:
df.head()

Unnamed: 0,Link,Data publikacji,Tytuł artykułu,Tekst artykułu,Autor,Kategoria,Tagi,Linki zewnętrzne,Zdjęcia/Grafika,Filmy,Linki do zdjęć
0,https://nameste.litglog.org/2008/08/saudade,2008-08-10,saudade,rebetiko ›2008-08-10inz-archiwum|1 commentSaud...,andsol,no category,,https://andsol.wordpress.com/ | https://andsol...,False,False,
1,https://nameste.litglog.org/2008/08/rebetiko,2008-08-30,rebetiko,‹ saudade•lekcja limeryczna ›2008-08-30inz-arc...,nameste,no category,,http://worldmusic.nationalgeographic.com/world...,False,False,
2,https://nameste.litglog.org/2009/03/chazarski-...,2009-03-15,chazarski smutek,‹ lekcja limeryczna•konsekwencja leksykalna ›2...,nameste,no category,,http://wordpress.org/ | https://likebtn.com,False,False,
3,https://nameste.litglog.org/2009/01/lekcja-lim...,2009-01-23,lekcja limeryczna,‹ rebetiko•chazarski smutek ›2009-01-23inz-arc...,nameste,no category,,http://pocztowkizlaputy.blogspot.com/2009/01/k...,False,False,
4,https://nameste.litglog.org/2009/04/konsekwenc...,2009-04-09,konsekwencja leksykalna,‹ chazarski smutek•Babel ›2009-04-09inz-archiw...,nameste,no category,,http://wordpress.org/ | https://likebtn.com,False,False,
