<h3>Parsing iteration</h3>
This notebook implements the parsing of saved pages. The HTML pages are opened one by one, and the article tags are extracted with the BeautifulSoup library. Each field is parsed from a manually chosen location in the DOM. Every article becomes an object of the Article class, and the resulting data is written to a CSV file.

In [15]:
from Article import Article
from Utils import parse_text, parse_tags, parse_saves, parse_views, is_ads
from bs4 import BeautifulSoup
from tqdm import tqdm
import os
import csv

<i>HTML attribute names and CSS class names used to locate information on the pages.</i>

In [5]:
# --- Attributes looked up on the story element's own attribute mapping ---
DATA_ID = 'data-story-id'                # story id
DATA_AUTHOR_ID = 'data-author-id'        # author id
DATA_COMMENTS_COUNT = 'data-comments'    # comments count
DATA_RATING = 'data-rating'              # rating info
DATA_META_RATING = 'data-meta-rating'    # meta rating info
DATA_TIME = 'data-timestamp'             # time

# --- Title ---
CLASS_TITLE = 'story__title-link'        # class of the title element

# --- Author ---
CLASS_AUTHOR = 'story__user-link user__nick'  # class of the author element
DATA_AUTHOR_NAME = 'data-name'           # attribute with the author name

# --- Tags ---
CLASS_TAG = 'tags__tag'                  # class of a single tag element
DATA_TAG = 'data-tag'                    # attribute with the tag name

# --- Saves ---
CLASS_SAVES = 'story__save'              # class of the saves element
DATA_SAVES = 'aria-label'                # attribute with the saves info

# --- Views ---
CLASS_VIEWS = 'story__views-count'       # class of the views element
DATA_VIEWS = 'aria-label'                # attribute with the views info

# --- Text ---
CLASS_TEXT = 'story-block_type_text'     # class of a text block

In [4]:
def parse_story(story, atr):
    """Build the output record for a single story.

    Args:
        story: BeautifulSoup tag of one story element (an item of
            ``soup.find_all('article', class_='story')``).
        atr: Attribute mapping of that element (``story.attrs``).

    Returns:
        The value of ``Article.get_json()`` for the parsed story; the caller
        passes it to ``csv.DictWriter.writerow``.

    Raises:
        AttributeError: If the title, author or saves element is not found
            (``story.find`` returns ``None`` in that case).
        KeyError: If one of the mandatory attributes is absent.
    """
    text = parse_text(story.find_all('div', class_=CLASS_TEXT))
    tags = parse_tags(story.find_all('a', class_=CLASS_TAG), DATA_TAG)
    saves = parse_saves(story.find(class_=CLASS_SAVES).attrs[DATA_SAVES])

    # Read the views info from the DATA_VIEWS attribute of the views element,
    # the same way saves are read from DATA_SAVES. Looking the value up under
    # the CSS class name (CLASS_VIEWS) asks for an attribute that does not
    # exist, which made views come out as 0 for every story.
    views_info = story.find(class_=CLASS_VIEWS)
    views_label = None if views_info is None else views_info.get(DATA_VIEWS)
    if views_label is None:
        views = 0
    else:
        # NOTE(review): the second argument is kept exactly as the original
        # call had it; parse_views lives in Utils and its signature is not
        # visible here - confirm that DATA_AUTHOR_ID is what it expects.
        views = parse_views(views_label, DATA_AUTHOR_ID)

    # The rating attribute is optional; fall back to 0 when it is absent.
    rating = atr.get(DATA_RATING)
    if rating is None:
        rating = 0

    article = Article(
        atr[DATA_ID],
        story.find(class_=CLASS_TITLE).text,
        atr[DATA_AUTHOR_ID],
        story.find(class_=CLASS_AUTHOR).attrs[DATA_AUTHOR_NAME],
        atr[DATA_COMMENTS_COUNT],
        rating,
        atr[DATA_META_RATING],
        atr[DATA_TIME],
        tags,
        views,
        saves,
        text)

    return article.get_json()

In [3]:
# Directory (relative to the working directory) that holds the pages to parse.
dir_name = 'pages'
# Names of all entries in that directory, read once when this cell runs;
# data_to_csv() iterates over this list.
files = os.listdir(dir_name)

In [2]:
def data_to_csv():
    """Parse every page listed in ``files`` and write the stories to CSV.

    Each file from ``dir_name`` is read as UTF-8 and parsed with
    BeautifulSoup. Every ``<article class="story">`` element for which
    ``is_ads`` is falsy is converted with ``parse_story`` and written as one
    row of ``result.csv`` (created or overwritten in the working directory).

    Returns:
        None. The result is the ``result.csv`` file.
    """
    fieldnames = ['id', 'title', 'author_id', 'author_name', 'comments', 'rating', 'rating_full', 'data', 'tags', 'views', 'saves', 'text']

    # newline='' is required by the csv module so that it controls line
    # endings itself (otherwise rows get extra blank lines on Windows and
    # embedded newlines can be mangled). The explicit UTF-8 encoding matches
    # how the pages are read, instead of relying on the locale default.
    with open('result.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for file_name in tqdm(files):
            page_path = os.path.join(dir_name, file_name)
            with open(page_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # The page file is already closed here; parsing only needs the text.
            soup = BeautifulSoup(content, 'lxml')
            for story in soup.find_all('article', class_='story'):
                atr = story.attrs
                if is_ads(atr, DATA_AUTHOR_ID):
                    continue
                writer.writerow(parse_story(story, atr))

<h3>Running data_to_csv</h3>
<ul>
<li>60 495 files, last is 3186, time: 2:07:00</li>
</ul>

In [None]:
# Run the full parse over `files`; the output is written to result.csv.
data_to_csv()