In [1]:
import re
import json
import requests

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

from bs4 import BeautifulSoup
from html2text import html2text

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

## 1. Парсинг URL-адресов с новостями (Selenium)

In [None]:
# Location of the ChromeDriver binary handed to Selenium's Service.
CHROME_DRIVER_PATH = "tools/chromedriver.exe"
# JSON-lines file: the link scraper appends to it, the article parser reads it.
LINKS_FILE_PATH = "data/habr_links.txt"
# Destination of the final JSON corpus of parsed articles.
ARTICLES_PATH = "data/habr_articles.json"
# Base URL of Habr hubs; the hub slug and page suffix are appended to it.
WEBSITE_LINK = "https://habr.com/ru/hubs/"
# Number of listing pages scraped per hub.
PAGES = 50

In [None]:
# Start Chrome with the 'headless' argument, driven by the ChromeDriver binary
# configured in CHROME_DRIVER_PATH.
options = Options()
options.add_argument('headless')

chrome_service = Service(executable_path=CHROME_DRIVER_PATH)
driver = Chrome(service=chrome_service, options=options)
print(f"Current session is {driver.session_id}")

In [5]:
def parse_website_category(category, pages_cnt):
    """Collect article links for one Habr hub and append them to LINKS_FILE_PATH.

    Visits listing pages 1..pages_cnt of the hub with the shared module-level
    ``driver`` and writes one JSON object per article, one per line:
    ``{"link": <article URL>, "category": <category>}``.

    The driver is deliberately NOT closed here. The function is called once
    per category, and quitting the session after the first category makes
    every later ``driver.get`` fail. The owner of ``driver`` should call
    ``driver.quit()`` once, after the last category has been processed.

    Args:
        category: hub slug appended to WEBSITE_LINK, e.g. "machine_learning".
        pages_cnt: number of listing pages to visit, starting from page 1.
    """
    # Open the output once per call instead of once per article; the explicit
    # encoding keeps the file identical regardless of the platform default.
    with open(LINKS_FILE_PATH, 'a', encoding='utf-8') as file:
        for page in range(1, pages_cnt + 1):
            driver.get(WEBSITE_LINK + category + "/articles/page" + str(page))
            for element in driver.find_elements(By.CLASS_NAME, "tm-articles-list__item"):
                link = (
                    element
                    .find_element(By.CLASS_NAME, "tm-title_h2")
                    .find_element(By.TAG_NAME, 'a')
                    .get_attribute('href')
                )
                record = {'link': link, 'category': category}
                file.write(json.dumps(record, ensure_ascii=False) + '\n')

In [6]:
categories = ["machine_learning", "infosecurity", "gamedev", "maths", "webdev", "algorithms"]

# Scrape every hub with the one shared browser session, then always shut the
# browser down -- even if a category fails part-way -- so no headless Chrome
# process is left running behind the notebook.
try:
    for cat in tqdm(categories, desc="Parsed categories"):
        parse_website_category(category=cat, pages_cnt=PAGES)
finally:
    driver.quit()

Current session is 831d8a8e4e96cffe34bd7439ceac0d47


Parsed categories: 100%|██████████| 6/6 [39:12<00:00, 392.02s/it]


## 2. Парсинг новостей (BeautifulSoup + html2text)

In [3]:
def remove_bad_symbols(text):
    """Return ``text`` with every character from the listed emoji blocks removed.

    Only the four Unicode ranges below are stripped; all other characters,
    including whitespace around a removed emoji, are left as they are.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
    )
    # One character class covering all ranges; '+' removes a run in one match.
    matcher = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

In [4]:
def parse_article(article, timeout=30):
    """Download one article page and extract its title, tags and body text.

    Args:
        article: dict with a 'link' key (the article URL) and a 'category' key.
        timeout: seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the whole crawl indefinitely.

    Returns:
        dict with keys 'article_id' (the URL), 'title', 'category', 'tags'
        and 'text' (emoji-stripped, whitespace-trimmed body text).

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        ValueError: the page has no <span> inside an ``h1.tm-title_h1``.
    """
    response = requests.get(article['link'], timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    heading = soup.find("h1", {"class": "tm-title_h1"})
    title_span = heading.select_one("span") if heading is not None else None
    if title_span is None:
        # Name the offending URL rather than dying with an AttributeError on None.
        raise ValueError("No article title found at {}".format(article['link']))
    title = title_span.text
    tags = [tag.contents[0] for tag in soup.find_all("a", {"class": "tm-tags-list__link"})]

    # Body text: <p>/<h3>/<h4> elements matched by the empty-class filter,
    # each flattened to a single line and joined with spaces.
    findall_tags = ['p', 'h3', 'h4']
    chunks = [
        html2text(paragraph.text).replace('\n', ' ')
        for paragraph in soup.find_all(name=findall_tags, attrs={'class': ''})
    ]
    text = remove_bad_symbols(' '.join(chunks))

    return {
        'article_id': article['link'],
        'title': title,
        'category': article['category'],
        'tags': tags,
        'text': text.strip(),
    }

In [5]:
articles_corpus = {'catalog': []}
with open(LINKS_FILE_PATH, encoding='utf-8') as file:
    # Read the non-blank lines up front: blank lines would crash json.loads,
    # and a list lets tqdm display the total number of articles.
    link_records = [line for line in file if line.strip()]

for record in tqdm(link_records, desc="Parsed articles"):
    articles_corpus['catalog'].append(parse_article(json.loads(record)))

# 'w', not 'a': appending a second JSON document on a re-run would leave a
# file that json.load can no longer parse.
with open(ARTICLES_PATH, 'w', encoding='UTF-8') as file:
    json.dump(articles_corpus, fp=file, ensure_ascii=False)