In [1]:
!pip install xlsxwriter pydrive

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Collecting pydrive
  Downloading PyDrive-1.3.1.tar.gz (987 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.4/987.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pydrive
  Building wheel for pydrive (setup.py) ... [?25l[?25hdone
  Created wheel for pydrive: filename=PyDrive-1.3.1-py3-none-any.whl size=27433 sha256=4b980195dd96437ce8d5a1972844f253b09f964596c5d7c6fdfb18d4192616bf
  Stored in directory: /root/.cache/pip/wheels/6c/10/da/a5b513f5b3916fc391c20ee7b4633e5cf3396d570cdd74970f
Successfully built pydrive
Installing collected packages: xlsxwriter, pydrive
Successfully installed pydrive-1.3.1 xlsxw

In [2]:
#%% import
from __future__ import unicode_literals
import re
import time
from datetime import datetime
from time import mktime
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import json
import xlsxwriter


#%% functions - data conversion

def date_change_format(date_string):
    """
    Konwertuje datę z różnych formatów na "YYYY-MM-DD"
    """
    try:
        date_string = ' '.join(date_string.strip().split())

        if re.match(r'\d{4}-\d{2}-\d{2}', date_string):
            return date_string[:10]

        if 'T' in date_string:
            return date_string.split('T')[0]

        if re.match(r'\d{2}\.\d{2}\.\d{4}', date_string):
            result = time.strptime(date_string, "%d.%m.%Y")
            changed_date = datetime.fromtimestamp(mktime(result))
            return format(changed_date.date())

        # Polski słownik miesięcy
        lookup_table = {
            "stycznia": "01", "lutego": "02", "marca": "03", "kwietnia": "04",
            "maja": "05", "czerwca": "06", "lipca": "07", "sierpnia": "08",
            "września": "09", "października": "10", "listopada": "11", "grudnia": "12",
            "styczeń": "01", "luty": "02", "marzec": "03", "kwiecień": "04",
            "maj": "05", "czerwiec": "06", "lipiec": "07", "sierpień": "08",
            "wrzesień": "09", "październik": "10", "listopad": "11", "grudzień": "12"
        }

        for k, v in lookup_table.items():
            date_string = date_string.replace(k, v)

        if re.match(r'\d{1,2}\.\d{1,2}\.\d{4}', date_string):
            result = time.strptime(date_string, "%d.%m.%Y")
        else:
            result = time.strptime(date_string, "%d %m %Y")

        changed_date = datetime.fromtimestamp(mktime(result))
        return format(changed_date.date())
    except Exception as e:
        return "no date"


#%% functions - issue extraction

def get_all_issues():
    """
    Pobiera wszystkie numery czasopisma Didaskalia z paginowanych stron
    """
    base_url = "https://didaskalia.pl"
    issues_url = f"{base_url}/en/issues"

    print("="*60)
    print("KROK 1: Pobieranie numerów czasopisma")
    print("="*60 + "\n")

    all_issues = []
    page = 0

    while True:
        try:
            if page == 0:
                url = issues_url
            else:
                url = f"{issues_url}?page={page}"

            print(f"Pobieranie strony {page + 1}: {url}")
            r = requests.get(url, timeout=10)
            r.encoding = 'utf-8'

            if r.status_code != 200:
                print(f"  ✗ Status {r.status_code}")
                break

            soup = BeautifulSoup(r.text, 'lxml')

            # Szukamy numerów w bloku "Previous issues"
            # <div class="issue-archive clearfix">
            issue_blocks = soup.find_all('div', class_='issue-archive')

            if not issue_blocks:
                print(f"  ✗ Brak numerów na stronie {page + 1}")
                break

            page_issues = 0
            for block in issue_blocks:
                # Tytuł numeru: <h3 class="issue-archive__title">
                title_elem = block.find('h3', class_='issue-archive__title')
                if not title_elem:
                    continue

                issue_title = title_elem.get_text(strip=True)

                # Link do numeru: <a href="/en/issue/english-issue-2025">
                link_elem = block.find('a', class_='issue-cover__link')
                if not link_elem:
                    continue

                issue_url = link_elem['href']
                if not issue_url.startswith('http'):
                    issue_url = base_url + issue_url

                # Data wydania: <div class="issue-archive__release-date">
                date_elem = block.find('div', class_='issue-archive__release-date')
                release_date = date_elem.get_text(strip=True) if date_elem else "no date"

                all_issues.append({
                    'issue_title': issue_title,
                    'issue_url': issue_url,
                    'release_date': release_date
                })
                page_issues += 1

            print(f"  ✓ Znaleziono {page_issues} numerów")

            # Sprawdzamy, czy jest następna strona
            # <li class="pager__item pager__item--next">
            next_page = soup.find('li', class_='pager__item--next')
            if not next_page:
                break

            page += 1
            time.sleep(0.5)

        except Exception as e:
            print(f"  ✗ Błąd: {e}")
            break

    print(f"\n{'='*60}")
    print(f"Łącznie znaleziono: {len(all_issues)} numerów")
    print(f"{'='*60}\n")

    return all_issues


def get_articles_from_issue(issue_data):
    """
    Pobiera listę artykułów z danego numeru czasopisma
    """
    issue_url = issue_data['issue_url']
    issue_title = issue_data['issue_title']

    try:
        r = requests.get(issue_url, timeout=10)
        r.encoding = 'utf-8'

        if r.status_code != 200:
            return []

        soup = BeautifulSoup(r.text, 'lxml')

        articles = []

        # Artykuły są w spisie treści: <div class="view-issue-toc">
        # Każdy artykuł: <div class="article-toc">
        article_blocks = soup.find_all('div', class_='article-toc')

        for block in article_blocks:
            # Tytuł i link: <h4 class="article-toc__title"> <a href="/pl/artykul/...">
            title_elem = block.find('h4', class_='article-toc__title')
            if not title_elem:
                continue

            link = title_elem.find('a')
            if not link:
                continue

            article_url = link['href']
            if not article_url.startswith('http'):
                article_url = 'https://didaskalia.pl' + article_url

            article_title = link.get_text(strip=True)

            # Autorzy: <div class="article-toc__authors">
            authors = []
            authors_div = block.find('div', class_='article-toc__authors')
            if authors_div:
                author_spans = authors_div.find_all('span', class_='article-toc__author')
                for span in author_spans:
                    author_text = span.get_text(strip=True)
                    if author_text:
                        authors.append(author_text)

            # Podtytuł: <div class="article-toc__subtitle">
            subtitle = ""
            subtitle_div = block.find('div', class_='article-toc__subtitle')
            if subtitle_div:
                subtitle = subtitle_div.get_text(strip=True)

            articles.append({
                'article_url': article_url,
                'article_title': article_title,
                'authors_preview': ', '.join(authors) if authors else "",
                'subtitle': subtitle,
                'issue_title': issue_title,
                'issue_url': issue_url,
                'release_date': issue_data['release_date']
            })

        return articles

    except Exception as e:
        print(f"  ✗ Błąd pobierania artykułów z {issue_title}: {e}")
        return []


def get_all_article_links():
    """
    Pobiera wszystkie linki do artykułów ze wszystkich numerów
    """
    print("="*60)
    print("KROK 2: Pobieranie artykułów z numerów")
    print("="*60 + "\n")

    issues = get_all_issues()
    all_articles = []

    for issue in tqdm(issues, desc="Pobieranie artykułów"):
        articles = get_articles_from_issue(issue)
        all_articles.extend(articles)
        time.sleep(0.5)

    print(f"\n{'='*60}")
    print(f"Łącznie znaleziono: {len(all_articles)} artykułów")
    print(f"{'='*60}\n")

    return all_articles


#%% functions - article scraping

def scrape_article(article_data):
    """
    Pobiera szczegóły pojedynczego artykułu
    """
    try:
        article_url = article_data['article_url']

        r = requests.get(article_url, timeout=15)
        r.encoding = 'utf-8'

        if r.status_code != 200:
            errors.append(article_url)
            return

        soup = BeautifulSoup(r.text, 'lxml')

        # Kategoria/Keyword: <div class="article__keyword">
        category = "no category"
        keyword_div = soup.find('div', class_='article__keyword')
        if keyword_div:
            category = keyword_div.get_text(strip=True)

        # Tytuł: <h1 class="article__title">
        title = "no title"
        title_elem = soup.find('h1', class_='article__title')
        if title_elem:
            title = title_elem.get_text(strip=True)

        # Podtytuł: <h2 class="article__subtitle">
        subtitle = ""
        subtitle_elem = soup.find('h2', class_='article__subtitle')
        if subtitle_elem:
            subtitle = subtitle_elem.get_text(strip=True)

        # Autorzy: <div class="article__authors">
        authors = []
        authors_div = soup.find('div', class_='article__authors')
        if authors_div:
            # Szukamy linków do biografii autorów
            author_links = authors_div.find_all('a', href=True)
            for link in author_links:
                author_name = link.get_text(strip=True)
                if author_name:
                    authors.append(author_name)

        author = ', '.join(authors) if authors else "no author"

        # Treść artykułu: <div class="article__content">
        text = "no text"
        content_div = soup.find('div', class_='article__content')
        if content_div:
            # Zbieramy tekst z paragrafów
            paragraphs = []

            # Szukamy w <div class="paragraph-text__content">
            text_blocks = content_div.find_all('div', class_='paragraph-text__content')
            for block in text_blocks:
                # Wyciągamy tekst z <div class="text-content">
                text_content = block.find('div', class_='text-content')
                if text_content:
                    # Pobieramy wszystkie paragrafy
                    for p in text_content.find_all('p'):
                        p_text = p.get_text(strip=True)
                        if p_text:
                            paragraphs.append(p_text)

            if paragraphs:
                text = ' '.join(paragraphs)
                text = text.replace('\n', ' ').replace('\xa0', ' ')
                text = re.sub(r'\s+', ' ', text).strip()

        # Data publikacji - z metadanych numeru
        date_of_publication = date_change_format(article_data['release_date'])

        # Linki zewnętrzne
        external_links = []
        if content_div:
            links = content_div.find_all('a', href=True)
            for link in links:
                href = link['href']
                if href.startswith('http') and 'didaskalia.pl' not in href:
                    external_links.append(href)

        external_links_str = ' | '.join(external_links) if external_links else None

        # Zdjęcia
        images = []

        # Szukamy zdjęć w <div class="paragraph-image">
        image_blocks = soup.find_all('div', class_='paragraph-image')
        for block in image_blocks:
            img = block.find('img')
            if img and img.get('src'):
                img_url = img['src']
                if not img_url.startswith('http'):
                    img_url = 'https://didaskalia.pl' + img_url
                if img_url not in images:
                    images.append(img_url)

        has_images = len(images) > 0
        photos_links = ' | '.join(images) if images else None

        # Filmy - sprawdzamy czy są iframe'y w treści
        has_videos = False
        if content_div:
            iframes = content_div.find_all('iframe', src=True)
            has_videos = len(iframes) > 0

        # Numer czasopisma - z metadanych
        issue_number = article_data['issue_title']

        result = {
            "Link": article_url,
            "Data publikacji": date_of_publication,
            "Numer czasopisma": issue_number,
            "Kategoria": category,
            "Tytuł artykułu": title,
            "Podtytuł": subtitle,
            "Autor": author,
            "Tekst artykułu": text,
            "Linki zewnętrzne": external_links_str,
            "Zdjęcia/Grafika": has_images,
            "Filmy": has_videos,
            "Linki do zdjęć": photos_links
        }

        all_results.append(result)

    except Exception as e:
        errors.append(article_data['article_url'])
        print(f"✗ Błąd: {e}")


#%% main execution

if __name__ == "__main__":
    print("\n" + "="*60)
    print("SCRAPER DIDASKALIA.PL")
    print("Gazeta Teatralna")
    print("="*60 + "\n")

    # KROK 1 i 2: Pobierz artykuły ze wszystkich numerów
    article_links = get_all_article_links()

    if not article_links:
        print("Nie znaleziono artykułów!")
        exit(1)

    # KROK 3: Scrapuj artykuły
    all_results = []
    errors = []

    print("="*60)
    print("KROK 3: Scraping artykułów")
    print("="*60 + "\n")

    max_workers = 10
    print(f"Używam {max_workers} równoległych wątków\n")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        list(tqdm(executor.map(scrape_article, article_links), total=len(article_links)))

    # KROK 4: Zapisz wyniki
    timestamp = datetime.today().date()

    print(f"\n{'='*60}")
    print("KROK 4: Zapisywanie wyników")
    print("="*60)

    # JSON
    json_file = f'didaskalia_{timestamp}.json'
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2, default=str)
    print(f"  ✓ {json_file}")

    # Excel
    excel_file = f"didaskalia_{timestamp}.xlsx"
    df = pd.DataFrame(all_results)
    with pd.ExcelWriter(excel_file,
                       engine='xlsxwriter',
                       engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
        df.to_excel(writer, 'Articles', index=False)
    print(f"  ✓ {excel_file}")

    # RAPORT KOŃCOWY
    print(f"\n{'='*60}")
    print("RAPORT KOŃCOWY")
    print("="*60)
    print(f"Pobranych artykułów: {len(all_results)}")
    print(f"Błędów: {len(errors)}")

    if errors and len(errors) <= 10:
        print(f"\nLinki z błędami:")
        for error_link in errors:
            print(f"  - {error_link}")
    elif errors:
        print(f"\nLinki z błędami (pierwsze 10):")
        for error_link in errors[:10]:
            print(f"  - {error_link}")
        print(f"  ... i {len(errors) - 10} więcej")

    # Statystyki numerów
    if all_results:
        print(f"\n{'='*60}")
        print("STATYSTYKI NUMERÓW")
        print("="*60)
        issue_counts = {}
        for result in all_results:
            issue = result['Numer czasopisma']
            issue_counts[issue] = issue_counts.get(issue, 0) + 1

        for issue, count in sorted(issue_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {issue}: {count} artykułów")

    print(f"\n{'='*60}")
    print("GOTOWE!")
    print("="*60 + "\n")


SCRAPER DIDASKALIA.PL
Gazeta Teatralna

KROK 2: Pobieranie artykułów z numerów

KROK 1: Pobieranie numerów czasopisma

Pobieranie strony 1: https://didaskalia.pl/en/issues
  ✓ Znaleziono 8 numerów
Pobieranie strony 2: https://didaskalia.pl/en/issues?page=1
  ✓ Znaleziono 8 numerów
Pobieranie strony 3: https://didaskalia.pl/en/issues?page=2
  ✓ Znaleziono 8 numerów
Pobieranie strony 4: https://didaskalia.pl/en/issues?page=3
  ✓ Znaleziono 8 numerów
Pobieranie strony 5: https://didaskalia.pl/en/issues?page=4
  ✓ Znaleziono 3 numerów
Pobieranie strony 6: https://didaskalia.pl/en/issues?page=5
  ✓ Znaleziono 0 numerów

Łącznie znaleziono: 35 numerów



Pobieranie artykułów: 100%|██████████| 35/35 [00:41<00:00,  1.17s/it]



Łącznie znaleziono: 853 artykułów

KROK 3: Scraping artykułów

Używam 10 równoległych wątków



100%|██████████| 853/853 [00:54<00:00, 15.53it/s]



KROK 4: Zapisywanie wyników
  ✓ didaskalia_2026-01-14.json


  df.to_excel(writer, 'Articles', index=False)


  ✓ didaskalia_2026-01-14.xlsx

RAPORT KOŃCOWY
Pobranych artykułów: 853
Błędów: 0

STATYSTYKI NUMERÓW
  Didaskalia 165: 36 artykułów
  Didaskalia 187/188: 33 artykułów
  Didaskalia 175/176: 33 artykułów
  Didaskalia 161: 33 artykułów
  Didaskalia 155: 32 artykułów
  Didaskalia 178: 30 artykułów
  Didaskalia 157/158: 30 artykułów
  Didaskalia 177: 29 artykułów
  Didaskalia 169/170: 29 artykułów
  Didaskalia 166: 29 artykułów

GOTOWE!



In [3]:
df.head()

Unnamed: 0,Link,Data publikacji,Numer czasopisma,Kategoria,Tytuł artykułu,Podtytuł,Autor,Tekst artykułu,Linki zewnętrzne,Zdjęcia/Grafika,Filmy,Linki do zdjęć
0,https://didaskalia.pl/en/article/opera-poland-...,no date,English Issue 2025,HISTORIES,Opera in Poland: At the Intersection of Class ...,,Marcin Bogucki| \n University of Warsaw,Stanisław Moniuszko’sHalkawas first performed ...,,True,False,https://didaskalia.pl/sites/default/files/styl...
1,https://didaskalia.pl/en/article/choreopolitic...,no date,English Issue 2025,HISTORIES,Choreopolitics and Haunted Bodies,,Alicja Müller| \n Jagiellonian University in ...,"On the back wall of a smoke-filled stage, an a...",,True,False,https://didaskalia.pl/sites/default/files/styl...
2,https://didaskalia.pl/en/article/cultivating-r...,no date,English Issue 2025,POST-ANTHROPOCENE,Cultivating Resistance: An Analysis of Exhibit...,,Martyna Dziadek| \n Doctoral School in the Hu...,"The Earth eventually began to revolt, but it w...",,True,False,https://didaskalia.pl/sites/default/files/styl...
3,https://didaskalia.pl/en/article/utopia-embodi...,no date,English Issue 2025,POST-ANTHROPOCENE,Utopia Embodied: Utopian Impulses in Choreogra...,,Maria Treit| \n Jagiellonian University in Kr...,The desire for a better world appears to be in...,,True,False,https://didaskalia.pl/sites/default/files/styl...
4,https://didaskalia.pl/en/article/non-instituti...,no date,Didaskalia 189,CHOREOGRAFIA A INSTYTUCJE,"(Non-)Institution(alization), Emotional Econom...",,Pia Brezavšček| \n University of Ljubljana,"In this article, I thematize how the delayed a...",,True,False,https://didaskalia.pl/sites/default/files/styl...
