In [1]:
import csv
import re
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import bs4.element
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
# Site root; article links found on listing pages are resolved against it.
ROOT_URL = 'https://habr.com/'
# Listing root of the crawled hub; page URLs are built by joining 'page<N>/' onto it.
BASE_URL = 'https://habr.com/ru/hubs/webdev/articles/'
# Number of listing pages to request (pages 1..MAX_PAGES).
MAX_PAGES = 50

In [3]:
@dataclass
class Article:
    """One scraped article, as built by ``parse_article``.

    The field names are also used as the column names of the articles CSV
    (the CSV writer takes them from ``Article.__annotations__``), so renaming
    or reordering fields changes the output file layout.
    """
    id: int  # numeric id taken from the '/articles/<id>/' part of the URL
    author: str  # username link text; None when the link is not found
    published_datetime: datetime  # parsed from the 'datetime' attribute of the <time> tag
    title: str  # text of the 'h1.tm-title' heading; None when it is not found
    url: str  # article URL exactly as passed to parse_article
    complexity: str  # complexity label text; None when the label is not found
    reading_time: int  # first integer in the reading-time label (presumably minutes -- TODO confirm)
    views: int  # view counter with a K/M suffix expanded to a plain integer
    tags: list[str]  # texts of the 'tm-publication-hub__link' links (these look like hub names -- confirm)
    votes: int  # integer extracted from the votes meter text
    bookmarks: int  # integer extracted from the bookmarks counter text
    comments: int  # integer extracted from the comments counter text


@dataclass
class Comment:
    """One scraped comment, as built by ``parse_comment``.

    The field names are also used as the column names of the comments CSV
    (the CSV writer takes them from ``Comment.__annotations__``).
    """
    article_id: int  # id of the article the comment was scraped from
    author: str  # username link text; None when the link is not found
    published_datetime: datetime  # parsed from the 'datetime' attribute of the <time> tag
    text: str  # comment body text, pieces joined with '\n'; None when the body is not found
    votes: int  # integer extracted from the votes meter text

In [4]:
def convert_str_to_datetime(iso_str: str) -> datetime:
    """Parse an ISO-8601 timestamp string into a ``datetime``.

    Every ``'Z'`` in the input is rewritten to the explicit ``'+00:00'``
    offset before the string is handed to ``datetime.fromisoformat``.
    """
    utc_offset = '+00:00'
    normalized = utc_offset.join(iso_str.split('Z'))
    return datetime.fromisoformat(normalized)


def convert_views_to_int(views: str) -> int:
    """Convert an abbreviated counter such as ``'12K'`` or ``'1.5M'`` to an int.

    A trailing ``K`` multiplies the number by 1 000 and a trailing ``M`` by
    1 000 000; the suffix is matched case-insensitively. A string without a
    suffix is converted as-is. Surrounding whitespace is ignored.

    Args:
        views: counter text, e.g. ``'523'``, ``'12K'``, ``'1.5M'``.

    Returns:
        The counter as an integer.

    Raises:
        ValueError: if the numeric part cannot be parsed as a number.
    """
    suffix_multipliers = {'K': 1_000, 'M': 1_000_000}

    number = views.strip()
    multiplier = suffix_multipliers.get(number[-1:].upper(), 1)
    if multiplier != 1:
        number = number[:-1]

    # round() rather than int(): the float product can land a hair below the
    # intended integer, and int() would truncate it to one less.
    return round(float(number) * multiplier)


def extract_int(int_str: str) -> int:
    """Return the first integer found in ``int_str``, keeping its sign.

    A ``+`` or ``-`` (ASCII hyphen-minus or the Unicode minus sign U+2212)
    placed directly before the first run of digits is treated as the sign,
    so a vote score like ``'-3'`` yields ``-3`` instead of ``3``. Any text
    around the number (e.g. a unit after a reading time) is ignored.

    Args:
        int_str: text that contains a number.

    Returns:
        The first integer in the text.

    Raises:
        ValueError: if ``int_str`` contains no digits.
    """
    match = re.search(r'[-+\u2212]?\d+', int_str)
    if match is None:
        raise ValueError(f'no integer found in {int_str!r}')

    # int() only understands the ASCII minus, so normalise the Unicode one.
    return int(match.group().replace('\u2212', '-'))


def get_text_or_none(tag: bs4.element.Tag | None, *args, **kwargs) -> str | None:
    """Return ``tag.get_text(...)`` with stripping forced on, or ``None``.

    Extra positional and keyword arguments are forwarded to ``get_text``;
    ``strip`` is always overridden to ``True``. A falsy ``tag`` (such as the
    ``None`` produced by a failed lookup) yields ``None``.
    """
    if not tag:
        return None

    forwarded = {**kwargs, 'strip': True}
    return tag.get_text(*args, **forwarded)


def get_id_from_url(url: str) -> int:
    """Return the numeric id from the ``/articles/<id>/`` segment of ``url``."""
    id_match = re.search(r'/articles/(\d+)/', url)
    article_id = id_match.group(1)
    return int(article_id)

In [5]:
def generate_pages_urls(base_url: str, max_pages: int):
    """Lazily yield the listing page URLs ``page1/`` .. ``page<max_pages>/``.

    Each URL is produced by joining the page segment onto ``base_url`` with
    ``urljoin``.
    """
    page_numbers = range(1, max_pages + 1)
    yield from (urljoin(base_url, f'page{number}/') for number in page_numbers)


def parse_article(article: bs4.element.Tag, url: str) -> Article:
    """Assemble an ``Article`` record from an article's snippet markup.

    ``article`` is the tag that holds both the article snippet and its
    counter icons; in this notebook it is the
    ``div.tm-article-comments__article-snippet`` element of the article's
    comments page. ``url`` is the article URL: it is stored as-is and the
    numeric id is taken from it.

    Text fields whose element is absent come out as ``None``; a missing
    ``<time>`` tag or a missing numeric counter raises instead.
    """

    def text_of(tag_name: str, css_class: str) -> str | None:
        # Stripped text of the first matching descendant, or None.
        return get_text_or_none(article.find(tag_name, class_=css_class))

    # Keyword arguments are evaluated top to bottom, i.e. in field order.
    return Article(
        id=get_id_from_url(url),
        author=text_of('a', 'tm-user-info__username'),
        published_datetime=convert_str_to_datetime(article.find('time').get('datetime')),
        title=text_of('h1', 'tm-title'),
        url=url,
        complexity=text_of('span', 'tm-article-complexity__label'),
        reading_time=extract_int(text_of('span', 'tm-article-reading-time__label')),
        views=convert_views_to_int(text_of('span', 'tm-icon-counter__value')),
        tags=[
            hub_link.find('span').get_text(strip=True)
            for hub_link in article.find_all('a', class_='tm-publication-hub__link')
        ],
        votes=extract_int(text_of('span', 'tm-votes-meter__value')),
        bookmarks=extract_int(text_of('span', 'bookmarks-button__counter')),
        comments=extract_int(text_of('span', 'tm-article-comments-counter-link__value')),
    )


def parse_comment(comment: bs4.element.Tag, article_id: int) -> Comment:
    """Assemble a ``Comment`` record from one comment's markup.

    ``comment`` is an ``article.tm-comment-thread__comment`` element and
    ``article_id`` is the id of the article it belongs to. The body text is
    extracted with ``'\\n'`` as the separator between text pieces.
    """

    def first(tag_name: str, css_class: str):
        # First matching descendant of the comment, or None.
        return comment.find(tag_name, class_=css_class)

    # Keyword arguments are evaluated top to bottom, i.e. in field order.
    return Comment(
        article_id=article_id,
        author=get_text_or_none(first('a', 'tm-user-info__username')),
        published_datetime=convert_str_to_datetime(comment.find('time').get('datetime')),
        text=get_text_or_none(first('div', 'tm-comment__body-content'), '\n'),
        votes=extract_int(get_text_or_none(first('span', 'tm-votes-meter__value'))),
    )

In [6]:
# Per-request timeout in seconds, so a stalled connection fails instead of
# hanging the whole crawl.
REQUEST_TIMEOUT = 30

# Make sure the output directory exists before the CSV files are opened.
Path('data').mkdir(parents=True, exist_ok=True)

# Crawl the hub listing pages, then each article's comments page, streaming
# one CSV row per article and per non-empty comment.
with open('data/articles.csv', 'w', newline='', encoding='utf-8') as articles_f, \
        open('data/comments.csv', 'w', newline='', encoding='utf-8') as comments_f, \
        requests.Session() as session:
    # The dataclass field names double as the CSV column names.
    articles_writer = csv.DictWriter(articles_f, Article.__annotations__.keys())
    comment_writer = csv.DictWriter(comments_f, Comment.__annotations__.keys())

    articles_writer.writeheader()
    comment_writer.writeheader()

    for page_url in tqdm(generate_pages_urls(BASE_URL, MAX_PAGES), desc='Processing pages', total=MAX_PAGES):
        page_response = session.get(page_url, timeout=REQUEST_TIMEOUT)
        if not page_response.ok:
            # Do not parse an error page as if it were a listing.
            print(f'Skipping listing page {page_url}: HTTP {page_response.status_code}')
            continue
        page_soup = BeautifulSoup(page_response.text, 'html.parser')

        article_links = page_soup.find_all('a', class_='tm-title__link')

        for link in tqdm(article_links, desc='Processing articles', leave=False):
            article_url = urljoin(ROOT_URL, link.get('href'))

            # Filter out non-article links before spending a request on them.
            if 'articles' not in article_url:
                continue

            # The comments page carries both the article snippet and the comments.
            article_response = session.get(urljoin(article_url, 'comments/'), timeout=REQUEST_TIMEOUT)
            if not article_response.ok:
                print(f'Skipping article {article_url}: HTTP {article_response.status_code}')
                continue
            article_soup = BeautifulSoup(article_response.text, 'html.parser')

            snippet = article_soup.find('div', class_='tm-article-comments__article-snippet')
            if snippet is None:
                # Without the snippet there is nothing to parse.
                print(f'Skipping article {article_url}: article snippet not found')
                continue

            article = parse_article(snippet, article_url)
            articles_writer.writerow(asdict(article))  # CSV

            for comment_tag in article_soup.find_all('article', class_='tm-comment-thread__comment'):
                # Comments whose body is marked empty have no content to record.
                if comment_tag.find('div', class_='tm-comment__body-content_empty') is not None:
                    continue
                comment = parse_comment(comment_tag, article.id)
                comment_writer.writerow(asdict(comment))  # CSV

Processing pages:   0%|          | 0/50 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]

Processing articles:   0%|          | 0/20 [00:00<?, ?it/s]