In [1]:
!pip install xlsxwriter pydrive

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Collecting pydrive
  Downloading PyDrive-1.3.1.tar.gz (987 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.4/987.4 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pydrive
  Building wheel for pydrive (setup.py) ... [?25l[?25hdone
  Created wheel for pydrive: filename=PyDrive-1.3.1-py3-none-any.whl size=27433 sha256=eceeca79b3eda6c281a46e7507596a632106423e3d4f8971c6103d73cf7668ff
  Stored in directory: /root/.cache/pip/wheels/6c/10/da/a5b513f5b3916fc391c20ee7b4633e5cf3396d570cdd74970f
Successfully built pydrive
Installing collected packages: xlsxwriter, pydrive
Successfully installed pydrive-1.3.1 xlsxw

In [2]:
#%% import
from __future__ import unicode_literals
import re
import time
from datetime import datetime
from time import mktime
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import json
import xlsxwriter


#%% functions - data conversion

def date_change_format(date_string):
    """
    Konwertuje datę z różnych formatów na "YYYY-MM-DD"
    """
    try:
        date_string = ' '.join(date_string.strip().split())

        if re.match(r'\d{4}-\d{2}-\d{2}', date_string):
            return date_string[:10]

        if 'T' in date_string:
            return date_string.split('T')[0]

        if re.match(r'\d{2}\.\d{2}\.\d{4}', date_string):
            result = time.strptime(date_string, "%d.%m.%Y")
            changed_date = datetime.fromtimestamp(mktime(result))
            return format(changed_date.date())

        # Blogger format: "2024-12-15T10:30:00+01:00"
        if re.match(r'\d{4}-\d{2}-\d{2}T', date_string):
            return date_string.split('T')[0]

        return "no date"
    except Exception as e:
        return "no date"


#%% functions - link extraction

def get_all_post_links():
    """
    Pobiera wszystkie linki do postów z Blogspot
    Używa Atom feed API
    """
    base_url = "https://cuda-cudanakiju.blogspot.com"
    all_links = []

    print("="*60)
    print("KROK 1: Pobieranie linków z Blogspot")
    print("="*60 + "\n")

    # Blogspot Atom feed - automatyczna paginacja
    start_index = 1
    max_results = 500  # Max na stronę
    page = 1

    while True:
        feed_url = f"{base_url}/feeds/posts/default?start-index={start_index}&max-results={max_results}&alt=json"

        try:
            print(f"Strona {page} (start-index={start_index})...")
            r = requests.get(feed_url, timeout=10)

            if r.status_code == 200:
                data = r.json()

                # Sprawdź czy są wpisy
                if 'feed' in data and 'entry' in data['feed']:
                    entries = data['feed']['entry']

                    # Wyciągnij linki
                    for entry in entries:
                        if 'link' in entry:
                            for link in entry['link']:
                                if link.get('rel') == 'alternate' and link.get('type') == 'text/html':
                                    post_url = link['href']
                                    all_links.append(post_url)

                    print(f"  ✓ Znaleziono {len(entries)} postów")

                    # Sprawdź czy są kolejne strony
                    total_results = int(data['feed'].get('openSearch$totalResults', {}).get('$t', 0))
                    items_per_page = int(data['feed'].get('openSearch$itemsPerPage', {}).get('$t', 0))

                    if start_index + items_per_page >= total_results:
                        print(f"  → Koniec (łącznie {total_results} postów)")
                        break

                    start_index += max_results
                    page += 1
                    time.sleep(0.5)
                else:
                    print("  → Brak więcej postów")
                    break
            else:
                print(f"  ✗ Status {r.status_code}")
                break

        except Exception as e:
            print(f"  ✗ Błąd: {e}")
            break

    # Deduplikacja
    all_links = list(set(all_links))

    print(f"\n{'='*60}")
    print(f"Łącznie znaleziono: {len(all_links)} unikalnych postów")
    print(f"{'='*60}\n")

    return all_links


#%% functions - scraping

def dictionary_of_article(article_link):
    """
    Pobiera szczegóły artykułu z Blogspot
    """
    try:
        r = requests.get(article_link, timeout=15)
        r.encoding = 'utf-8'

        if r.status_code != 200:
            errors.append(article_link)
            return

        soup = BeautifulSoup(r.text, 'lxml')

        # Data publikacji
        try:
            # Blogspot używa <abbr> lub <time>
            date_element = soup.find('abbr', class_='published')
            if not date_element:
                date_element = soup.find('time', class_='published')
            if not date_element:
                date_element = soup.find('span', class_='post-timestamp')
            if not date_element:
                # Meta tag
                meta_date = soup.find('meta', property='article:published_time')
                if meta_date:
                    date_element = type('obj', (object,), {
                        'get_text': lambda: meta_date.get('content', ''),
                        'get': lambda x: meta_date.get('content', '')
                    })()

            if date_element:
                date_text = date_element.get('title') or date_element.get('datetime') or date_element.get('content') or date_element.get_text(strip=True)
                date_of_publication = date_change_format(date_text)
            else:
                date_of_publication = "no date"
        except:
            date_of_publication = "no date"

        # Tytuł
        try:
            title_element = soup.find('h3', class_='post-title')
            if not title_element:
                title_element = soup.find('h1', class_='post-title')
            if not title_element:
                title_element = soup.find('h2', class_='post-title')
            if not title_element:
                title_element = soup.find('h1')
            if not title_element:
                title_element = soup.find('title')

            if title_element:
                title = title_element.get_text(strip=True)
                # Usuń "Cuda na kiju: " lub podobne prefiksy
                title = re.sub(r'^.*?:\s*', '', title, count=1)
                title = title.strip()
            else:
                title = "no title"
        except:
            title = "no title"

        # Autor
        try:
            author_element = soup.find('span', class_='author')
            if not author_element:
                author_element = soup.find('span', class_='post-author')
            if not author_element:
                author_element = soup.find('a', rel='author')
            if not author_element:
                # Meta tag
                meta_author = soup.find('meta', {'name': 'author'})
                if meta_author:
                    author_element = type('obj', (object,), {
                        'get_text': lambda: meta_author.get('content', '')
                    })()

            if author_element:
                author = author_element.get_text(strip=True)
            else:
                author = "no author"
        except:
            author = "no author"

        # Treść artykułu
        try:
            # Blogspot używa różnych klas w zależności od szablonu
            article_body = soup.find('div', class_='post-body')
            if not article_body:
                article_body = soup.find('div', class_='entry-content')
            if not article_body:
                article_body = soup.find('div', class_='post-content')
            if not article_body:
                # Fallback - szukamy article
                article_elem = soup.find('article')
                if article_elem:
                    article_body = article_elem

            if article_body:
                text = article_body.get_text(separator=' ', strip=True)
                text = text.replace('\n', ' ').replace('\xa0', ' ')
                text = re.sub(r'\s+', ' ', text)
            else:
                text = "no text"
        except:
            text = "no text"

        # Kategoria / Labels
        try:
            # Blogspot używa "labels" zamiast kategorii
            label_elements = soup.find_all('a', rel='tag')
            if not label_elements:
                label_elements = soup.find_all('span', class_='post-labels')

            if label_elements:
                categories = []
                for elem in label_elements:
                    cat_text = elem.get_text(strip=True)
                    if cat_text and cat_text not in categories:
                        categories.append(cat_text)
                category = ' | '.join(categories)
            else:
                category = "no category"
        except:
            category = "no category"

        # Tagi - na Blogspot labels = kategorie = tagi
        try:
            label_elements = soup.find_all('a', rel='tag')
            if label_elements:
                tags = [tag.get_text(strip=True) for tag in label_elements]
                tags_str = ' | '.join(tags)
            else:
                tags_str = None
        except:
            tags_str = None

        # Linki zewnętrzne
        try:
            if article_body:
                links = [a['href'] for a in article_body.find_all('a', href=True)]
                external_links = [link for link in links if not re.search(r'blogspot\.com', link) and link.startswith('http')]
                external_links = ' | '.join(external_links) if external_links else None
            else:
                external_links = None
        except:
            external_links = None

        # Zdjęcia
        try:
            images = []

            if article_body:
                for img in article_body.find_all('img'):
                    img_url = img.get('src') or img.get('data-src')
                    if img_url:
                        # Pomijamy małe ikony
                        if 'icon' not in img_url.lower() and 'emoji' not in img_url.lower():
                            if img_url not in images:
                                images.append(img_url)

            has_images = len(images) > 0
            photos_links = ' | '.join(images) if images else None
        except:
            has_images = False
            photos_links = None

        # Filmy
        try:
            if article_body:
                iframes = [iframe['src'] for iframe in article_body.find_all('iframe', src=True)]
                has_videos = len(iframes) > 0
            else:
                has_videos = False
        except:
            has_videos = False

        result = {
            "Link": article_link,
            "Data publikacji": date_of_publication,
            "Tytuł artykułu": title.replace('\xa0', ' '),
            "Tekst artykułu": text,
            "Autor": author,
            "Kategoria": category,
            "Tagi": tags_str,
            "Linki zewnętrzne": external_links,
            "Zdjęcia/Grafika": has_images,
            "Filmy": has_videos,
            "Linki do zdjęć": photos_links
        }

        all_results.append(result)

    except Exception as e:
        errors.append(article_link)


#%% main execution

if __name__ == "__main__":
    print("\n" + "="*60)
    print("SCRAPER CUDA-CUDANAKIJU.BLOGSPOT.COM")
    print("="*60 + "\n")

    # KROK 1: Pobierz linki
    article_links = get_all_post_links()

    if not article_links:
        print("Nie znaleziono postów!")
        exit(1)

    # KROK 2: Scrapuj artykuły
    all_results = []
    errors = []

    print("="*60)
    print("KROK 2: Scraping artykułów")
    print("="*60 + "\n")

    max_workers = 10
    print(f"Używam {max_workers} równoległych wątków\n")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        list(tqdm(executor.map(dictionary_of_article, article_links), total=len(article_links)))

    # KROK 3: Zapisz wyniki
    timestamp = datetime.today().date()

    print(f"\n{'='*60}")
    print("KROK 3: Zapisywanie wyników")
    print("="*60)

    # JSON
    json_file = f'cuda_cudanakiju_{timestamp}.json'
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2, default=str)
    print(f"  ✓ {json_file}")

    # Excel
    excel_file = f"cuda_cudanakiju_{timestamp}.xlsx"
    df = pd.DataFrame(all_results)
    with pd.ExcelWriter(excel_file,
                       engine='xlsxwriter',
                       engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
        df.to_excel(writer, 'Posts', index=False)
    print(f"  ✓ {excel_file}")

    # RAPORT KOŃCOWY
    print(f"\n{'='*60}")
    print("RAPORT KOŃCOWY")
    print("="*60)
    print(f"Pobranych artykułów: {len(all_results)}")
    print(f"Błędów: {len(errors)}")

    if errors and len(errors) <= 10:
        print(f"\nLinki z błędami:")
        for error_link in errors:
            print(f"  - {error_link}")
    elif errors:
        print(f"\nLinki z błędami (pierwsze 10):")
        for error_link in errors[:10]:
            print(f"  - {error_link}")
        print(f"  ... i {len(errors) - 10} więcej")

    print(f"\n{'='*60}")
    print("GOTOWE!")
    print("="*60 + "\n")


SCRAPER CUDA-CUDANAKIJU.BLOGSPOT.COM

KROK 1: Pobieranie linków z Blogspot

Strona 1 (start-index=1)...
  ✓ Znaleziono 150 postów
Strona 2 (start-index=501)...
  ✓ Znaleziono 150 postów
  → Koniec (łącznie 790 postów)

Łącznie znaleziono: 300 unikalnych postów

KROK 2: Scraping artykułów

Używam 10 równoległych wątków



100%|██████████| 300/300 [00:47<00:00,  6.38it/s]
  df.to_excel(writer, 'Posts', index=False)



KROK 3: Zapisywanie wyników
  ✓ cuda_cudanakiju_2026-01-13.json
  ✓ cuda_cudanakiju_2026-01-13.xlsx

RAPORT KOŃCOWY
Pobranych artykułów: 300
Błędów: 0

GOTOWE!

