In [None]:
from datetime import datetime, timedelta
from urllib.parse import quote, urlencode

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from IPython import display

In [None]:
class rbc_parser:
    """Collect search results from the rbc.ru AJAX search endpoint into DataFrames.

    All HTTP traffic goes through a single ``requests.Session`` kept on the
    instance as ``self.session``.
    """

    # Endpoint queried by ``_get_url``.
    SEARCH_URL = "https://www.rbc.ru/search/ajax/"
    # Query-string fields, in the order they are written into the URL.
    URL_FIELDS = ('project', 'category', 'dateFrom', 'dateTo',
                  'page', 'query', 'material')
    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # <p> tags nested inside a <div> with one of these classes are left out
    # of the article text.
    SKIPPED_CONTAINERS = (
        'article__special_container',
        'showcase-collection__subtitle',
        'showcase-collection-card__text',
        'showcase-collection__footer',
    )

    def __init__(self):
        self.session = requests.Session()


    def _get_url(self, param_dict: dict) -> str:
        """Build the search URL for ``param_dict``.

        Every key listed in ``URL_FIELDS`` must be present (``KeyError``
        otherwise). Values are percent-encoded so that characters such as
        ``&`` or ``#`` in the query cannot break the query string.
        """
        pairs = [(field, param_dict[field]) for field in self.URL_FIELDS]
        # quote_via=quote encodes spaces as %20 rather than '+'.
        return f"{self.SEARCH_URL}?{urlencode(pairs, quote_via=quote)}"


    def _get_search_table(self, param_dict: dict,
                          include_text: bool = True) -> pd.DataFrame:
        """Fetch one page of search results as a DataFrame.

        Args:
            param_dict: search parameters accepted by ``_get_url``.
            include_text: when True, download every article referenced by the
                ``fronturl`` column and add ``overview`` and ``text`` columns.

        Returns:
            One row per returned item, sorted by ``publish_date_t`` when that
            column is present. Empty when the page has no items.

        Raises:
            requests.HTTPError: the search endpoint answered with an error status.
        """
        response = self.session.get(self._get_url(param_dict),
                                    timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        search_table = pd.DataFrame(response.json()['items'])

        if include_text and not search_table.empty:
            article_data = [self._get_article_data(url)
                            for url in search_table['fronturl']]
            search_table['overview'] = [overview for overview, _ in article_data]
            search_table['text'] = [text for _, text in article_data]

        if 'publish_date_t' in search_table.columns:
            # sort_values returns a new frame; the result has to be kept.
            search_table = search_table.sort_values('publish_date_t',
                                                    ignore_index=True)

        return search_table


    def _iterable_load_by_page(self, param_dict):
        """Load consecutive result pages, starting at ``param_dict['page']``.

        Pages are requested one after another until a page comes back empty.
        ``param_dict`` itself is not modified.

        Returns:
            All non-empty pages concatenated into one DataFrame (empty when
            the very first page has no items).
        """
        param_copy = param_dict.copy()
        pages = []

        while True:
            page_table = self._get_search_table(param_copy)
            if page_table.empty:
                break
            pages.append(page_table)
            param_copy['page'] = str(int(param_copy['page']) + 1)

        return self._concat_tables(pages)


    @staticmethod
    def _concat_tables(tables: list) -> pd.DataFrame:
        """Concatenate ``tables`` row-wise; an empty list gives an empty DataFrame."""
        if not tables:
            return pd.DataFrame()
        return pd.concat(tables, axis=0, ignore_index=True)


    @staticmethod
    def _tag_text(tag) -> str:
        """Return the stripped text of a parsed tag, with <br> rendered as a newline."""
        # The text of a parsed tag never contains markup, so line breaks have
        # to be converted on the tree itself before the text is extracted.
        for br in tag.find_all('br'):
            br.replace_with('\n')
        return tag.get_text().strip()


    def _get_article_data(self, url: str):
        """Download an article page and extract its overview and body text.

        Returns:
            ``(overview, text)``. ``overview`` is the text of the
            ``div.article__text__overview`` element, or None when the page has
            none. ``text`` is the space-joined text of every ``<p>`` that is
            not nested in one of ``SKIPPED_CONTAINERS``, or None when no such
            paragraph exists.
        """
        response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)

        soup = bs(response.text, features="lxml")

        div_overview = soup.find('div', {'class': 'article__text__overview'})
        overview = self._tag_text(div_overview) if div_overview else None

        filtered_paragraphs = [
            p for p in soup.find_all('p')
            if not any(p.find_parent('div', class_=css_class)
                       for css_class in self.SKIPPED_CONTAINERS)
        ]

        if filtered_paragraphs:
            text = ' '.join(self._tag_text(p) for p in filtered_paragraphs)
        else:
            text = None

        return overview, text


    def get_articles(self,
                     param_dict,
                     time_step = 1,
                     save_every = 5,
                     save_excel = True) -> pd.DataFrame:
        """Collect all articles between ``dateFrom`` and ``dateTo`` window by window.

        The date range is split into consecutive windows. Each window spans
        from its start date to ``time_step`` days later (clamped to
        ``dateTo``), and the next window starts one day after the previous
        one ends.

        Args:
            param_dict: search parameters; ``dateFrom`` and ``dateTo`` are
                strings in ``DD.MM.YYYY`` format. The dict is not modified.
            time_step: non-negative number of days added to a window's start
                date to get its end date.
            save_every: write a checkpoint file under ``/tmp`` after this many
                windows.
            save_excel: when True, write the final table to an ``.xlsx`` file
                in the working directory.

        Returns:
            All collected rows in one DataFrame.

        Raises:
            ValueError: ``dateFrom`` is later than ``dateTo``, or ``time_step``
                is negative (the window start would never advance).
        """
        if time_step < 0:
            raise ValueError('time_step should be non-negative')

        param_copy = param_dict.copy()
        time_step = timedelta(days=time_step)

        dateFrom = datetime.strptime(param_copy['dateFrom'], '%d.%m.%Y')
        dateTo = datetime.strptime(param_copy['dateTo'], '%d.%m.%Y')

        if dateFrom > dateTo:
            raise ValueError('dateFrom should be less than dateTo')

        # Per-window tables are gathered in a list and concatenated only when
        # a table is actually needed, instead of re-copying the result on
        # every iteration.
        chunks = []
        save_counter = 0

        while dateFrom <= dateTo:
            window_end = min(dateFrom + time_step, dateTo)
            param_copy['dateTo'] = window_end.strftime("%d.%m.%Y")

            print('Parsing articles from ' + param_copy['dateFrom'] +  ' to ' + param_copy['dateTo'])

            chunk = self._iterable_load_by_page(param_copy)
            if not chunk.empty:
                chunks.append(chunk)

            dateFrom += time_step + timedelta(days=1)
            param_copy['dateFrom'] = dateFrom.strftime("%d.%m.%Y")

            save_counter += 1
            if save_counter == save_every:
                display.clear_output(wait=True)
                # File names use the dates of the original request, not of the window.
                self._concat_tables(chunks).to_excel(
                    f"/tmp/rbc_{param_dict['dateFrom']}_{param_dict['dateTo']}_checkpoint_table.xlsx")
                print('Checkpoint saved!')
                save_counter = 0

        out = self._concat_tables(chunks)

        if save_excel:
            out.to_excel(f"rbc_{param_dict['dateFrom']}_{param_dict['dateTo']}.xlsx")
        print("Finish")

        return out

### РБК

* __project__ - проекты РБК. Возможные значения: ["rbcnews", "rbctv", "rbcstyle", "sport", "realty", "crypto", "autonews", "quote", "bc3", "trends"]

* __category__ - рубрики: ["TopRbcRu_economics", "TopRbcRu_auto", "TopRbcRu_business", "TopRbcRu_money", "TopRbcRu_realty", "TopRbcRu_society", "TopRbcRu_politics", "TopRbcRu_own_business", "TopRbcRu_specials", "TopRbcRu_technology_and_media", "TopRbcRu_finances"]

* __material__ - материалы: ["video", "quiz", "interview", "research", "card", "opinion", "multimedia", "short_news", "olympics_online", "online", "investigation", "rating", "article_specproject", "article", "story"]

* __dateFrom__ - с даты

* __dateTo__ - по дату

* __page__ - номер страницы поисковой выдачи (нумерация с 0, на странице до 20 материалов)

* __Deprecated__:

    * __offset__ - смещение поисковой выдачи

    * __limit__ - лимит запроса, максимум 20

_Чтобы не задавать параметр, оставьте поле пустым_

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from datetime import datetime, timedelta
from IPython import display

class rbc_parser:
    """Collect rbc.ru search results into DataFrames, dropping unwanted categories.

    Items whose ``category`` is missing or listed in ``blocked_categories``
    are removed from every page. All HTTP traffic goes through a single
    ``requests.Session`` kept on the instance as ``self.session``.
    """

    # Endpoint queried by ``_get_url``.
    SEARCH_URL = "https://www.rbc.ru/search/ajax/"
    # Query-string fields, in the order they are written into the URL.
    URL_FIELDS = ('project', 'category', 'dateFrom', 'dateTo',
                  'page', 'query', 'material')
    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # Default values of the ``category`` field that are filtered out.
    blocked_categories = ("Политика", "Экономика", "Общество", "Спорт", "Полистать")
    # <p> tags nested inside a <div> with one of these classes are left out
    # of the article text.
    SKIPPED_CONTAINERS = (
        'article__special_container',
        'showcase-collection__subtitle',
        'showcase-collection-card__text',
        'showcase-collection__footer',
    )

    def __init__(self, blocked_categories=None):
        """Create the HTTP session.

        Args:
            blocked_categories: optional iterable of category names that
                replaces the class-level default list for this instance.
        """
        self.session = requests.Session()
        if blocked_categories is not None:
            self.blocked_categories = tuple(blocked_categories)

    def _get_url(self, param_dict: dict) -> str:
        """Build the search URL for ``param_dict``.

        Every key listed in ``URL_FIELDS`` must be present (``KeyError``
        otherwise). Values are percent-encoded so that characters such as
        ``&`` or ``#`` in the query cannot break the query string.
        """
        pairs = [(field, param_dict[field]) for field in self.URL_FIELDS]
        # quote_via=quote encodes spaces as %20 rather than '+'.
        return f"{self.SEARCH_URL}?{urlencode(pairs, quote_via=quote)}"

    def _fetch_items(self, param_dict: dict):
        """Request one search page and return the raw ``items`` value of the JSON reply.

        Raises:
            requests.HTTPError: the search endpoint answered with an error status.
        """
        response = self.session.get(self._get_url(param_dict),
                                    timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()['items']

    def _build_search_table(self, raw_items, include_text: bool = True) -> pd.DataFrame:
        """Turn raw search items into a filtered, optionally enriched, sorted table."""
        print("🔍 До фильтра:", len(raw_items))

        search_table = pd.DataFrame(raw_items)

        if 'category' in search_table.columns:
            categories = search_table['category']
            keep = categories.notna() & ~categories.isin(list(self.blocked_categories))
            # reset_index gives an independent frame with a clean 0..n-1 index.
            search_table = search_table.loc[keep].reset_index(drop=True)
        else:
            # No category information at all (including an empty page):
            # same outcome as every category being missing.
            search_table = search_table.iloc[0:0]

        print("После фильтра:", len(search_table))

        if include_text and not search_table.empty:
            article_data = [self._get_article_data(url)
                            for url in search_table['fronturl']]
            search_table['overview'] = [overview for overview, _ in article_data]
            search_table['text'] = [text for _, text in article_data]

        if 'publish_date_t' in search_table.columns:
            # sort_values returns a new frame; the result has to be kept.
            search_table = search_table.sort_values('publish_date_t',
                                                    ignore_index=True)

        return search_table

    def _get_search_table(self, param_dict: dict, include_text: bool = True) -> pd.DataFrame:
        """Fetch one page of search results as a filtered DataFrame.

        Args:
            param_dict: search parameters accepted by ``_get_url``.
            include_text: when True, download every remaining article
                referenced by ``fronturl`` and add ``overview`` and ``text``
                columns.

        Returns:
            The items that pass the category filter, sorted by
            ``publish_date_t`` when that column is present. Empty when the
            page has no items or all of them are filtered out.
        """
        return self._build_search_table(self._fetch_items(param_dict), include_text)

    def _iterable_load_by_page(self, param_dict):
        """Load consecutive result pages, starting at ``param_dict['page']``.

        Paging stops when the endpoint returns a page without items. A page
        whose items were all removed by the category filter does not stop
        the paging, because later pages may still hold allowed articles.
        ``param_dict`` itself is not modified.

        Returns:
            All non-empty filtered pages concatenated into one DataFrame.
        """
        param_copy = param_dict.copy()
        pages = []

        while True:
            raw_items = self._fetch_items(param_copy)
            if not raw_items:
                break
            page_table = self._build_search_table(raw_items, include_text=True)
            if not page_table.empty:
                pages.append(page_table)
            param_copy['page'] = str(int(param_copy['page']) + 1)

        return self._concat_tables(pages)

    @staticmethod
    def _concat_tables(tables: list) -> pd.DataFrame:
        """Concatenate ``tables`` row-wise; an empty list gives an empty DataFrame."""
        if not tables:
            return pd.DataFrame()
        return pd.concat(tables, axis=0, ignore_index=True)

    @staticmethod
    def _tag_text(tag) -> str:
        """Return the stripped text of a parsed tag, with <br> rendered as a newline."""
        # The text of a parsed tag never contains markup, so line breaks have
        # to be converted on the tree itself before the text is extracted.
        for br in tag.find_all('br'):
            br.replace_with('\n')
        return tag.get_text().strip()

    def _get_article_data(self, url: str):
        """Download an article page and extract its overview and body text.

        Returns:
            ``(overview, text)``. ``overview`` is the text of the
            ``div.article__text__overview`` element, or None when the page has
            none. ``text`` is the space-joined text of every ``<p>`` that is
            not nested in one of ``SKIPPED_CONTAINERS``, or None when no such
            paragraph exists.
        """
        response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = bs(response.text, features="lxml")

        div_overview = soup.find('div', {'class': 'article__text__overview'})
        overview = self._tag_text(div_overview) if div_overview else None

        filtered_paragraphs = [
            p for p in soup.find_all('p')
            if not any(p.find_parent('div', class_=css_class)
                       for css_class in self.SKIPPED_CONTAINERS)
        ]

        text = ' '.join(self._tag_text(p) for p in filtered_paragraphs) if filtered_paragraphs else None
        return overview, text

    def get_articles(self, param_dict, time_step=1, save_every=5, save_excel=True) -> pd.DataFrame:
        """Collect all articles between ``dateFrom`` and ``dateTo`` window by window.

        The date range is split into consecutive windows. Each window spans
        from its start date to ``time_step`` days later (clamped to
        ``dateTo``), and the next window starts one day after the previous
        one ends.

        Args:
            param_dict: search parameters; ``dateFrom`` and ``dateTo`` are
                strings in ``DD.MM.YYYY`` format. The dict is not modified.
            time_step: non-negative number of days added to a window's start
                date to get its end date.
            save_every: write a checkpoint file under ``/tmp`` after this many
                windows.
            save_excel: when True, write the final table to an ``.xlsx`` file
                in the working directory.

        Returns:
            All collected rows in one DataFrame.

        Raises:
            ValueError: ``dateFrom`` is later than ``dateTo``, or ``time_step``
                is negative (the window start would never advance).
        """
        if time_step < 0:
            raise ValueError('time_step should be non-negative')

        param_copy = param_dict.copy()
        time_step = timedelta(days=time_step)

        dateFrom = datetime.strptime(param_copy['dateFrom'], '%d.%m.%Y')
        dateTo = datetime.strptime(param_copy['dateTo'], '%d.%m.%Y')

        if dateFrom > dateTo:
            raise ValueError('dateFrom should be less than dateTo')

        # Per-window tables are gathered in a list and concatenated only when
        # a table is actually needed, instead of re-copying the result on
        # every iteration.
        chunks = []
        save_counter = 0

        while dateFrom <= dateTo:
            window_end = min(dateFrom + time_step, dateTo)
            param_copy['dateTo'] = window_end.strftime("%d.%m.%Y")

            print('Parsing articles from ' + param_copy['dateFrom'] + ' to ' + param_copy['dateTo'])

            chunk = self._iterable_load_by_page(param_copy)
            if not chunk.empty:
                chunks.append(chunk)

            dateFrom += time_step + timedelta(days=1)
            param_copy['dateFrom'] = dateFrom.strftime("%d.%m.%Y")

            save_counter += 1
            if save_counter == save_every:
                display.clear_output(wait=True)
                # File names use the dates of the original request, not of the window.
                self._concat_tables(chunks).to_excel(
                    f"/tmp/rbc_{param_dict['dateFrom']}_{param_dict['dateTo']}_checkpoint_table.xlsx")
                print('Checkpoint saved!')
                save_counter = 0

        out = self._concat_tables(chunks)

        if save_excel:
            out.to_excel(f"rbc_{param_dict['dateFrom']}_{param_dict['dateTo']}.xlsx")
        print("Finish")

        return out

In [None]:
# Search configuration for this run. Dates are entered as YYYY-MM-DD;
# an empty string leaves the corresponding search filter unset.
use_parser = "РБК"

query = "РБК"
project = ""
category = ""
material = ""
dateFrom = "2025-04-11"
dateTo = "2025-04-12"
page = 0

if use_parser == "РБК":
    # The parser reads its dates as DD.MM.YYYY strings and the page number as a string.
    param_dict = dict(
        query=query,
        project=project,
        category=category,
        dateFrom=datetime.strptime(dateFrom, "%Y-%m-%d").strftime("%d.%m.%Y"),
        dateTo=datetime.strptime(dateTo, "%Y-%m-%d").strftime("%d.%m.%Y"),
        page=str(page),
        material=material,
    )

print(use_parser, "- param_dict:", param_dict)

РБК - param_dict: {'query': 'РБК', 'project': '', 'category': '', 'dateFrom': '11.04.2025', 'dateTo': '12.04.2025', 'page': '0', 'material': ''}


In [None]:
# Fetch a single search page (including article texts) as a quick check
# of the current parameters.
assert use_parser == "РБК"

parser = rbc_parser()
tbl = parser._get_search_table(param_dict, include_text=True)

print(len(tbl))
tbl.head()

🔍 До фильтра: 20
✅ После фильтра: 0
0


Unnamed: 0,id,project,project_nick,type,category,title,body,publish_date,publish_date_t,fronturl,picture,badge,pay_option,data,_score


In [None]:
%%time

# Crawl the configured date range with the parser created in the previous cell.
table = parser.get_articles(
    param_dict=param_dict,
    # Window step in days. NOTE(review): the original comment warned that large
    # steps risk losing articles in periods with more than 100 results - confirm.
    time_step=1,
    # Write a checkpoint file after every N windows.
    save_every=1,
    # Also write the final table to an .xlsx file.
    save_excel=True,
)
print(len(table))
table.head()

Checkpoint saved!
Finish
0
CPU times: user 58 ms, sys: 2 ms, total: 60 ms
Wall time: 940 ms


Unnamed: 0,id,project,project_nick,type,category,title,body,publish_date,publish_date_t,fronturl,picture,badge,pay_option,data,_score
