In [23]:
import os
from abc import ABC, abstractmethod
import pathlib

from wikipediaapi import Wikipedia, WikipediaPage

In [24]:
# Title of the seed article; also reused below as the relevance keyword.
STATE = 'فارس'
# Relative directory (as a plain string) that receives the crawled article text.
DATA_PATH = str(pathlib.Path('..') / 'fars' / 'data')

In [8]:
# Create the output directory and any missing parents; nothing happens if it already exists.
pathlib.Path(DATA_PATH).mkdir(parents=True, exist_ok=True)

In [None]:
# Wikipedia client configured for language code 'fa'.
# NOTE(review): 'Mozilla' is a generic user agent; Wikimedia's User-Agent policy
# asks for a descriptive one with contact details — confirm and replace.
wiki = Wikipedia(language='fa', user_agent='Mozilla')

In [10]:
# Look up the seed article by title; its links drive the crawl below.
page = wiki.page(title=STATE)

In [11]:
# Save the seed article's text under DATA_PATH, using the article title as the file name.
seed_path = os.path.join(DATA_PATH, page.title)
with open(seed_path, mode='w+', encoding='utf-8') as seed_file:
    seed_file.write(page.text)

In [12]:
class CrawlingLogic(ABC):
    """Strategy interface deciding whether a linked page should be saved.

    Concrete subclasses implement `do_crawling`; `wiki_crawler` consults it
    for each candidate link.
    """

    @abstractmethod
    def do_crawling(self, title: str, page: WikipediaPage) -> bool:
        """Decide whether the linked page should be crawled.

        Args:
            title: Title of the linked page.
            page: The linked page object.

        Returns:
            True to save the page, False to skip it.
        """
        raise NotImplementedError

In [13]:
class PageContainsKeyword(CrawlingLogic):
    """Accept a linked page when its text contains a given keyword."""

    def __init__(self, keyword: str):
        """Remember the keyword to look for in page text."""
        super().__init__()
        self.keyword = keyword

    def do_crawling(self, title, page):
        """Return True if `self.keyword` occurs in `page.text`; `title` is unused."""
        page_text = page.text
        return self.keyword in page_text

In [14]:
class LinkContainsKeyword(CrawlingLogic):
    """Accept a linked page when its title contains a given keyword."""

    def __init__(self, keyword: str):
        """Remember the keyword to look for in link titles."""
        super().__init__()
        self.keyword = keyword

    def do_crawling(self, title, page):
        """Return True if `self.keyword` occurs in `title`; `page` is unused."""
        needle = self.keyword
        return needle in title

In [15]:
class CrawlAll(CrawlingLogic):
    """Crawling logic that accepts every linked page."""

    def do_crawling(self, title, page):
        """Always return True; both arguments are ignored."""
        return True

In [18]:
def wiki_crawler(
    page: "WikipediaPage",
    base_path: str,
    crawling_logic: "CrawlingLogic",
):
    """
    Save the text of every page linked from `page` that `crawling_logic` accepts.

    Iterates over `page.links` (title -> linked page). The text of each
    accepted page is written as UTF-8 to `<base_path>/<title>`. Links whose
    file already exists are skipped, so an interrupted crawl can be resumed.

    Args:
        page: Page whose outgoing links are examined.
        base_path: Directory that receives one file per saved page; it is
            not created here.
        crawling_logic: Strategy whose `do_crawling(title, linked_page)`
            returns True for pages that should be saved.

    Returns:
        None. Progress and per-link failures are reported with `print`;
        a failure on one link does not stop the crawl.
    """
    for title, info in page.links.items():
        path = os.path.join(base_path, title)
        if os.path.exists(path):
            print(title, 'already exists')
            continue
        try:
            if not crawling_logic.do_crawling(title, info):
                print('irrelevant page:', title)
                continue
            # Read the text BEFORE creating the file: if reading fails, no
            # empty file is left behind to be mistaken for "already exists"
            # on the next run.
            text = info.text
            # Explicit UTF-8 so non-ASCII article text is written correctly
            # regardless of the platform's default encoding.
            with open(path, 'w', encoding='utf-8') as f:
                f.write(text)
        except Exception as e:
            # Best-effort crawl: report which link failed and carry on.
            print('failed to crawl:', title, '-', e)


In [None]:
# Crawl the seed page's links, keeping only linked pages whose text mentions STATE.
wiki_crawler(
    crawling_logic=PageContainsKeyword(keyword=STATE),
    base_path=DATA_PATH,
    page=page,
)

Wayback Machine already exists
آب باریک already exists
آب پیازک already exists
آباده already exists
آباده طشک already exists
آبرُفتی already exists
آبشار مارگون already exists
آبلیمو already exists
آب‌های زیرزمینی already exists
آتشکده already exists
آرامگاه حافظ already exists
آرامگاه کوروش بزرگ already exists
آریایی already exists
آزادراه شیراز-اصفهان already exists
آزادراه شیراز–بوشهر already exists
آزادراه لامرد–پارسیان already exists
آزادراه کنارگذر شمال‌غربی شیراز already exists
آسیای میانه already exists
آش already exists
آش سبزی already exists
آش کارده already exists
آشوربنیپال already exists
آشوریان already exists
آفات already exists
آفریقا already exists
آقا محمد خان قاجار already exists
آل بویه already exists
آل مظفر already exists
آنغوزه already exists
آهو already exists
آهک already exists
آهک (سنگ) already exists
آینه‌کاری already exists
ابن بطوطه already exists
اتابکان فارس already exists
ادبیات already exists
ارار already exists
ارجان already exists
ارخالق already exists