In [1]:
import datetime
import json
import os
import pickle
import time

import dateparser
import html5lib
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm

In [2]:
# Chromedriver binary is expected next to the notebook.
service = Service('./chromedriver')

# Chrome flags: tolerate certificate errors, use an incognito profile, run without a window.
options = webdriver.ChromeOptions()
for chrome_flag in ('--ignore-certificate-errors', '--incognito', 'headless'):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options, service=service)

In [3]:
class InterfaxParser:
    '''Scraper for the Ural news feed of interfax-russia.ru driven by a Selenium browser.

    Typical use: build the instance around a ready driver, optionally call
    ``create_dates`` to choose the period, then call ``download_news``.

    Attributes:
        driver: Selenium driver used to open every page.
        dates: list of 'dd.mm.YYYY' strings; each pair of neighbours is one
            "from/to" range requested from the site. ``None`` until set.
    '''

    # Format of the dates used in the site's query string and in ``self.dates``.
    _DATE_FORMAT = '%d.%m.%Y'
    # Earliest date ``create_dates`` will produce; earlier inputs are clamped to it.
    _MIN_DATE = datetime.date(2000, 1, 1)

    def __init__(self, driver: 'webdriver.Remote', dates=None):
        '''Store the driver and an optional pre-built list of range boundaries.'''
        self.driver = driver
        self.dates = dates

    def create_dates(self, start_date=datetime.date(2019, 1, 1), end_date=None):
        '''Build the list of range boundaries between two dates.

        The period is cut into evenly spaced points, two per calendar-year
        difference between the (sanitised) start and end, but never fewer than
        two points, so a period inside a single year yields ``[start, end]``.

        Args:
            start_date: first day of the period (``datetime.date``).
            end_date: last day of the period (``datetime.date``); ``None``
                means today, evaluated at call time.

        Returns:
            List of 'dd.mm.YYYY' strings; also stored in ``self.dates``.
        '''
        today = datetime.date.today()
        if end_date is None:
            end_date = today

        # Sanity checks on the input: tolerate swapped arguments ...
        if start_date > end_date:
            start_date, end_date = end_date, start_date
        # ... and clamp both ends into [_MIN_DATE, today]. Clamping both ends
        # keeps start <= end even when the whole period lies outside that window.
        start_date = min(max(start_date, self._MIN_DATE), today)
        end_date = min(max(end_date, self._MIN_DATE), today)

        # Two boundaries per year of difference; at least two so that a
        # same-year period does not produce an empty list.
        periods = max(2, 2 * (end_date.year - start_date.year))
        boundaries = pd.date_range(start=start_date, end=end_date, periods=periods)
        dates = boundaries.strftime(self._DATE_FORMAT).tolist()

        # Make sure the list finishes exactly on the requested end date.
        last_day = boundaries[-1].date()
        if last_day < end_date:
            dates.append(end_date.strftime(self._DATE_FORMAT))
        elif last_day > end_date:
            dates[-1] = end_date.strftime(self._DATE_FORMAT)

        self.dates = dates
        return dates

    def __get_page_content(self, url, sleep_time=1) -> 'BeautifulSoup':
        '''Open ``url`` in the browser and return its HTML parsed with html5lib.

        Args:
            url: address to open.
            sleep_time: seconds to wait after loading so the page can render.

        Returns:
            BeautifulSoup tree of the driver's current page source.

        Note:
            A ``TimeoutException`` from the driver is retried after a
            one-second pause with no upper bound on the number of attempts.
        '''
        while True:
            try:
                self.driver.get(url)
                time.sleep(sleep_time)  # give the page time to finish loading
                break
            except TimeoutException:
                time.sleep(1)
        return BeautifulSoup(self.driver.page_source, 'html5lib')

    def __get_article(self, url):
        '''Download one article and extract its publication date and text.

        Args:
            url: address of the article page.

        Returns:
            ``(date, text)`` where ``date`` is the part before 'T' of the last
            ``<meta>`` content inside the header block and ``text`` is the
            article paragraphs joined with spaces. ``(None, None)`` when the
            page does not have the expected markup, so that one broken page
            does not abort a long scraping run.
        '''
        soup = self.__get_page_content(url, 1)

        try:
            time_div = soup.find('div', class_="d-flex align-items-sm-start align-items-center mb-20 flex-sm-column flex-md-row")
            text_div = soup.find('div', class_='editor-content')
            paragraphs = text_div.find_all('p')
            date = time_div.find_all('meta')[-1]['content'].split('T')[0]
        except (AttributeError, IndexError, KeyError) as e:
            # find() returned None, or there was no <meta>/content in the block.
            print(f'Could not parse article {url}: {e.__class__.__name__}')
            return None, None

        text = ' '.join(p.text.strip('\t') for p in paragraphs).strip()
        return date, text

    def __parse_page(self, soup: 'BeautifulSoup', news, start, end) -> list:
        '''Collect every article linked from one listing page into ``news``.

        Args:
            soup: parsed listing page.
            news: list that receives one dict per article with the keys
                'url', 'title', 'date' and 'text'; it is extended in place.
            start: left boundary of the range, used only in messages.
            end: right boundary of the range, used only in messages.

        Returns:
            The same ``news`` list.
        '''
        ul = soup.find('ul', class_='list-unstyled lenta-all-news')
        if ul is None:
            print(f'No news list found for {start} - {end}')
            return news

        # Keep the links under their own name: reusing ``news`` here would
        # hide the caller's list and silently drop every scraped article.
        links = ul.find_all('a', class_='d-block mb-0')
        for item in tqdm(links):
            url = item['href']
            date, text = self.__get_article(url)
            news.append({'url': url, 'title': item.text, 'date': date, 'text': text})
        return news

    def download_news(self, news=None) -> list:
        '''Scrape all ranges in ``self.dates`` and checkpoint the result after each.

        When ``self.dates`` is empty, ``create_dates()`` is called with its
        defaults first. After every range the whole list collected so far is
        pickled to ``./interfax_news/interfax_ural_{start}_{end}``, followed by
        a 20-second pause.

        Args:
            news: list to extend in place; a new list is created when omitted.

        Returns:
            The list of article dicts (the same object as ``news`` if given).
        '''
        if news is None:
            news = []

        if not self.dates:
            self.create_dates()

        # Create the checkpoint directory up front so the first dump cannot
        # fail after a whole range has already been scraped.
        os.makedirs('./interfax_news', exist_ok=True)

        for start, end in zip(self.dates, self.dates[1:]):
            url = f'https://www.interfax-russia.ru/ural/news?from={start}&to={end}&per-page=20000'
            soup = self.__get_page_content(url, 5)
            self.__parse_page(soup=soup, news=news, start=start, end=end)
            with open(f'./interfax_news/interfax_ural_{start}_{end}', 'wb') as f:
                pickle.dump(news, f)
            time.sleep(20)
            print(f'Updated news from {start} up to {end}!')

        return news

In [4]:
# Build the parser around the driver created above.
parser = InterfaxParser(driver)
# Generate the range boundaries starting from 1 Jan 2009 (end date left at its default);
# as the last expression of the cell, the resulting list is displayed below.
parser.create_dates(start_date=datetime.date(2009,1,1))

['01.01.2009',
 '16.07.2009',
 '29.01.2010',
 '14.08.2010',
 '27.02.2011',
 '12.09.2011',
 '27.03.2012',
 '10.10.2012',
 '25.04.2013',
 '08.11.2013',
 '24.05.2014',
 '07.12.2014',
 '22.06.2015',
 '04.01.2016',
 '19.07.2016',
 '01.02.2017',
 '17.08.2017',
 '02.03.2018',
 '15.09.2018',
 '31.03.2019',
 '14.10.2019',
 '28.04.2020',
 '11.11.2020',
 '27.05.2021',
 '10.12.2021',
 '25.06.2022']

In [5]:
# Accumulator for scraped articles; defined in its own cell so it survives
# an interrupted download cell.
news = list()

In [None]:
# Long-running cell: scrapes every date range held by the parser.
# NOTE(review): download_news returns the list passed as `news`, so `dates`
# ends up bound to the news list rather than to dates; the name is kept
# because other cells may refer to it -- confirm before renaming.
dates = parser.download_news(news=news)

 40%|█████████████████████████████▋                                            | 2526/6286 [1:08:55<1:43:47,  1.66s/it]