In [None]:
import json
import re
import warnings

import requests

from selenium.common.exceptions import WebDriverException
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

from bs4 import BeautifulSoup
from html2text import html2text

from tqdm import tqdm

warnings.filterwarnings('ignore')

### 1. Парсинг URL-адресов с новостями (Selenium)

In [None]:
# --- Crawl target -----------------------------------------------------------
# Base URL that hub (category) names are appended to when building page URLs.
WEBSITE_LINK = "https://habr.com/ru/hubs/"
# Number of listing pages requested per category.
PAGES = 50

# --- Local tooling ----------------------------------------------------------
# Path to the ChromeDriver executable handed to Selenium's Service.
CHROME_DRIVER_PATH = "tools/chromedriver.exe"

# --- Output files -----------------------------------------------------------
# JSON-lines file: one {"link": ..., "category": ...} record per article link.
LINKS_FILE_PATH = "data/habr_small/habr_links.txt"
# JSON file receiving the parsed article corpus.
ARTICLES_PATH = "data/habr_small/habr_articles.json"

In [None]:
def parse_website_category(category: str, pages_cnt: int, links_storage: set):
    """Collect article links of one hub and append them to the links file.

    Opens ``pages_cnt`` listing pages of ``category`` in headless Chrome,
    extracts the title link of every ``tm-articles-list__item`` element and
    appends each previously unseen link to ``LINKS_FILE_PATH`` as one JSON
    line of the form ``{"link": ..., "category": ...}``.

    Args:
        category: Hub name appended to ``WEBSITE_LINK`` to build page URLs.
        pages_cnt: Number of listing pages to visit (pages 1..pages_cnt).
        links_storage: Set of links already recorded; mutated in place so
            that links are de-duplicated across calls.

    Raises:
        Errors from page navigation or from writing the links file propagate
        to the caller; the browser is shut down first in every case.
    """
    options = Options()
    options.add_argument('headless')
    driver = Chrome(service=Service(executable_path=CHROME_DRIVER_PATH), options=options)

    # try/finally guarantees the Chrome process is terminated even when a page
    # load or a file write fails part-way through the crawl.
    try:
        with open(LINKS_FILE_PATH, 'a', encoding='utf-8') as file:
            for page in range(1, pages_cnt + 1):
                driver.get(WEBSITE_LINK + category + "/articles/page" + str(page))
                for element in driver.find_elements(By.CLASS_NAME, "tm-articles-list__item"):
                    try:
                        link = (
                            element.find_element(By.CLASS_NAME, "tm-title_h2")
                            .find_element(By.TAG_NAME, 'a')
                            .get_attribute('href')
                        )
                    except WebDriverException:
                        # Best effort: list items without a title link
                        # (or that went stale) are skipped, not fatal.
                        continue

                    # get_attribute returns None when the anchor has no href.
                    if not link or link in links_storage:
                        continue

                    links_storage.add(link)
                    record = {'link': link, 'category': category}
                    file.write(json.dumps(record, ensure_ascii=False) + '\n')
    finally:
        driver.quit()

In [None]:
# Hubs to crawl; each name is appended to WEBSITE_LINK by parse_website_category.
categories = ["artificial_intelligence", "hr_management", "physics", "gadgets", "biotech"]

# The links file is opened in append mode by the crawler, so seed the
# de-duplication set with links recorded by earlier runs. Without this,
# re-running the cell would append every link a second time.
links_storage = set()
try:
    with open(LINKS_FILE_PATH, encoding='utf-8') as known_links_file:
        links_storage.update(
            json.loads(line)['link'] for line in known_links_file if line.strip()
        )
except FileNotFoundError:
    # First run: nothing has been collected yet.
    pass

for cat in tqdm(categories, desc="Parsed categories"):
    parse_website_category(category=cat, pages_cnt=PAGES, links_storage=links_storage)

### 2. Парсинг новостей (BeautifulSoup + html2text)

In [None]:
def remove_bad_symbols(text):
    """Return ``text`` with characters from four emoji code-point ranges removed.

    Only the ranges listed below are stripped; every other character is kept
    unchanged.
    """
    emoji_ranges = "".join((
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    ))
    # One character class matching a run of any of the ranges above.
    return re.sub("[" + emoji_ranges + "]+", "", text, flags=re.UNICODE)

In [None]:
def parse_article(article: dict, timeout: float = 30):
    """Download one article page and extract its title, tags and body text.

    Args:
        article: Record with the keys ``'link'`` (page URL) and ``'category'``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``article_id``, ``title``, ``category``, ``tags``
        and ``text``, or ``None`` when the page cannot be fetched, lacks the
        expected title/body elements, or has no text left after cleaning.
    """
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(article['link'], timeout=timeout)
    except requests.RequestException:
        return None
    if not response.ok:
        # Error pages do not contain the article markup looked up below.
        return None

    soup = BeautifulSoup(response.text, "lxml")

    title_tag = soup.find("h1", {"class": "tm-title_h1"})
    title_span = title_tag.select_one("span") if title_tag is not None else None
    body_tag = soup.find('div', {"class": "article-formatted-body"})
    if title_span is None or body_tag is None:
        return None

    # str() detaches each tag text from the parse tree; a NavigableString
    # would keep a reference to the entire soup for as long as it is stored.
    tags = [
        str(tag.contents[0])
        for tag in soup.find_all("a", {"class": "tm-tags-list__link"})
        if tag.contents
    ]

    # Drop emoji and line breaks first, then collapse spaces, so that neither
    # step can reintroduce runs of spaces.
    text = remove_bad_symbols(body_tag.get_text())
    text = text.replace('\r', '').replace('\n', ' ')
    text = re.sub(' +', ' ', text).strip()
    if not text:
        return None

    return {
        'article_id': article['link'],
        'title': title_span.text,
        'category': article['category'],
        'tags': tags,
        'text': text,
    }

In [None]:
# Corpus layout: 'catalog' holds the parsed articles, 'meta' the per-category
# article counts.
articles_corpus = {'catalog': []}

cat_dict = {i: 0 for i in categories}
with open(LINKS_FILE_PATH, encoding='utf-8') as file:
    for article in tqdm(file.readlines()):
        if not article.strip():
            # Tolerate blank lines in the links file.
            continue
        article_dict = json.loads(article)
        try:
            article_info = parse_article(article_dict)
        except (requests.RequestException, AttributeError) as error:
            # One unreachable or unexpectedly structured page must not discard
            # the articles that were already parsed.
            tqdm.write(f"Skipped {article_dict.get('link')}: {error!r}")
            continue
        if article_info:
            category = article_dict['category']
            # The links file may hold categories that are not in `categories`.
            cat_dict[category] = cat_dict.get(category, 0) + 1
            articles_corpus['catalog'].append(article_info)

articles_corpus['meta'] = cat_dict

# 'w' rather than 'a': appending a second JSON document to an existing file
# would make the file unparseable.
with open(ARTICLES_PATH, 'w', encoding='UTF-8') as file:
    json.dump(articles_corpus, fp=file, ensure_ascii=False)