In [1]:
import os
from html.parser import HTMLParser
from xml.sax.saxutils import escape

import requests
from pymystem3 import Mystem

In [2]:
# Download raw article pages news-20000 .. news-29999 and store each one as
# data/<article id>. Pages that cannot be fetched, are not served with
# HTTP 200, or are not valid UTF-8 are skipped.
page_url = "http://ulpravda.ru/ulpravda-newspaper/news/news-"

directory = "data/"

# Create the output directory up front so the first write cannot fail on a
# fresh checkout.
os.makedirs(directory, exist_ok=True)

for i in range(20000, 30000):
    try:
        # Without a timeout a single stalled connection would hang the crawl.
        page = requests.get(page_url + str(i), timeout=30)
    except requests.RequestException as error:
        # One failed request should not abort the remaining downloads.
        print("skipping article", i, "-", error)
        continue

    # Only successful responses are articles; error pages (404, 5xx, ...)
    # must not end up in the data directory.
    if page.status_code != 200:
        continue

    try:
        content = page.content.decode("utf-8")
    except UnicodeDecodeError:
        continue

    # The context manager closes the file even if the write fails; the
    # explicit encoding keeps the stored bytes independent of the locale.
    with open(directory + str(i), "w", encoding="utf-8") as page_file:
        page_file.write(content)

In [3]:
class Parser(HTMLParser):
    """Extract title, body text, author and date from a saved news page.

    Only markup inside ``<section class="main-column">`` is inspected:

    * the title is the last text chunk seen inside an ``<h1>``;
    * body text is collected from the second ``<p>`` of the section onward,
      until a ``<div class="news-item-additional">`` opens;
    * the date is the last text chunk seen inside that ``<div>``.

    Use :meth:`parse`; it resets all state, so one instance can be reused
    for several pages.
    """

    def __init__(self):
        HTMLParser.__init__(self)

        # Tag names / class values that delimit the interesting regions.
        self._SECTION_TAG = "section"
        self._SECTION_CLASS = "main-column"
        self._P_TAG = "p"
        self._H1_TAG = "h1"
        self._DIV_TAG = "div"
        self._DIV_CLASS = "news-item-additional"

        self._reset_state()

    def _reset_state(self):
        """Forget everything collected from a previously parsed page."""
        self._in_section_tag = False
        self._in_h1_tag = False
        self._in_p_tag = False
        self._in_div_tag = False

        # Set once the "news-item-additional" div opens; no body text is
        # collected after that point.
        self._text_end = False

        # Number of <p> tags opened inside the section so far.
        self._p_tag_count = 0

        self._title = None
        self._text = []
        self._author = None
        self._date = None

    def handle_starttag(self, tag, attrs):
        """Track entry into the section, heading, paragraph and date div."""
        css_class = dict(attrs).get("class")

        if tag == self._SECTION_TAG and css_class == self._SECTION_CLASS:
            self._in_section_tag = True

        if tag == self._DIV_TAG and css_class == self._DIV_CLASS:
            self._in_div_tag = True
            self._text_end = True

        if tag == self._H1_TAG and self._in_section_tag:
            self._in_h1_tag = True

        if tag == self._P_TAG and self._in_section_tag:
            self._in_p_tag = True
            self._p_tag_count += 1

    def handle_endtag(self, tag):
        """Clear the matching "inside tag" flag when a tracked tag closes."""
        if tag == self._SECTION_TAG and self._in_section_tag:
            self._in_section_tag = False

        if tag == self._H1_TAG and self._in_h1_tag:
            self._in_h1_tag = False

        if tag == self._P_TAG and self._in_p_tag:
            self._in_p_tag = False

        if tag == self._DIV_TAG and self._in_div_tag:
            self._in_div_tag = False

    def handle_data(self, data):
        """Route a text chunk to title, body text or date by current state."""
        if self._in_h1_tag:
            self._title = data

        # The first <p> of the section is skipped; text stops at the date div.
        if self._in_p_tag and self._p_tag_count >= 2 and not self._text_end:
            if data:
                self._text.append(data)

        if self._in_div_tag:
            self._date = data

    def parse(self, page_text):
        """Parse one page and return ``(title, text, author, date)``.

        ``title`` and ``date`` are ``None`` when the page has no matching
        markup. The last collected text chunk is treated as the author when
        it is at most 50 characters long and consists of exactly two
        whitespace-separated words; otherwise the author is "Неизвестно"
        and that chunk stays part of the text.
        """
        # Start from a clean slate so results of an earlier parse() call on
        # the same instance cannot leak into this one.
        self.reset()
        self._reset_state()

        page_text = page_text.replace("&laquo;", '"')
        page_text = page_text.replace("&raquo;", '"')
        for removable in ("\r", "\t", "\n", "&nbsp;", "<br />"):
            page_text = page_text.replace(removable, "")
        self.feed(page_text)

        chunks = self._text
        candidate = chunks[-1] if chunks else ""
        if len(candidate) <= 50 and len(candidate.split()) == 2:
            author = candidate
            body = chunks[:-1]
        else:
            # Not an author signature: keep the chunk in the article text
            # instead of silently discarding it.
            author = "Неизвестно"
            body = chunks
        return self._title, "".join(body), author, self._date

In [4]:
def get_month(month_text):
    """Return the month number (1-12) for a Russian genitive month name.

    Raises ``KeyError`` when ``month_text`` is not one of the twelve names
    listed below.
    """
    genitive_names = (
        "января",
        "февраля",
        "марта",
        "апреля",
        "мая",
        "июня",
        "июля",
        "августа",
        "сентября",
        "октября",
        "ноября",
        "декабря",
    )
    # Position in the tuple (counted from 1) is the calendar month number.
    number_by_name = {
        name: number for number, name in enumerate(genitive_names, start=1)
    }
    return number_by_name[month_text]

In [5]:
def mystem_to_xml(obj):
    """Render analysed tokens as an XML document string.

    ``obj`` is an iterable of dicts; entries without an ``"analysis"`` key
    are skipped. Every remaining entry becomes one ``<w>`` element holding
    the token text followed by one ``<ana .../>`` element per analysis,
    whose attributes are the key/value pairs of that analysis dict.

    Token text and attribute values are XML-escaped so that characters such
    as ``&``, ``<`` or ``"`` cannot produce a malformed document.
    """
    parts = ['<?xml version="1.0" encoding="utf-8"?>\n<html>\n\t<body>\n\t\t<se>\n']
    for word in obj:
        if "analysis" not in word:
            continue
        parts.append("\t\t\t<w>")
        parts.append(escape(word["text"]))
        for ana in word["analysis"]:
            parts.append("<ana ")
            for key, value in ana.items():
                # Values sit inside double quotes, so quotes are escaped too.
                parts.append(key + '="' + escape(str(value), {'"': "&quot;"}) + '" ')
            parts.append("/>")
        parts.append("</w>\n")

    parts.append("\t\t</se>\n\t</body>\n</html>")
    # Joining once avoids repeated string re-allocation on long texts.
    return "".join(parts)

In [6]:
def mystem_to_text(obj):
    """Render analysed tokens as plain text, one token per line.

    Entries of ``obj`` without an ``"analysis"`` key are skipped. Each
    remaining entry produces ``text{key=value key=value }`` followed by a
    newline, where the pairs come from all analysis dicts of the token in
    order (every pair is followed by a single space).
    """
    lines = []
    for token in obj:
        if "analysis" not in token:
            continue
        pairs = "".join(
            key + "=" + str(value) + " "
            for variant in token["analysis"]
            for key, value in variant.items()
        )
        lines.append(token["text"] + "{" + pairs + "}\n")
    return "".join(lines)

In [7]:
def save_to(text, path, year, month, file_name):
    """Write ``text`` to ``<path>/<year>/<month>/<file_name>`` as UTF-8.

    Missing directories are created. ``year``, ``month`` and ``file_name``
    must be strings. An existing file with the same name is overwritten.
    """
    target_dir = os.path.join(path, year, month)
    # exist_ok makes repeated calls for the same year/month safe.
    os.makedirs(target_dir, exist_ok=True)

    # The context manager closes the file even if the write fails; the
    # explicit encoding keeps the output independent of the platform locale.
    with open(os.path.join(target_dir, file_name), "w", encoding="utf-8") as out_file:
        out_file.write(text)

In [8]:
def get_row(article_id, mystem):
    """Process one downloaded article and return its metadata row.

    Reads ``data/<article_id>``, parses it with ``Parser``, analyses the
    body text with ``mystem.analyze`` and writes three files via
    ``save_to``: the plain text with an ``@au/@ti/@da/@url`` header
    (``plain``), the XML rendering (``mystem-xml``) and the text rendering
    (``mystem-plain``) of the analysis.

    Parameters:
        article_id: numeric id of the article; also the file name.
        mystem: object exposing ``analyze(text)`` (a ``Mystem`` instance).

    Returns:
        A list of 23 values in the column order of the ``metadata.csv``
        header (path, author, ..., language).

    Raises:
        ValueError: if the page has no date of the form
            ``<day> <month name> <year> ...``.
        KeyError: if the month name is not known to ``get_month``.
    """
    article_url = "http://ulpravda.ru/ulpravda-newspaper/news/news-" + str(article_id)

    with open("data/" + str(article_id), encoding="utf-8") as page_file:
        page_html = page_file.read()

    page_title, page_text, page_author, page_date = Parser().parse(page_html)
    # A page without an <h1> yields no title; use an empty one rather than
    # failing on string concatenation below.
    page_title = page_title or ""

    # Splitting on whitespace tolerates repeated spaces and a date string
    # that ends right after the year.
    date_parts = (page_date or "").split()
    if len(date_parts) < 3:
        raise ValueError(
            "article %s: cannot read a date from %r" % (article_id, page_date)
        )
    day, month_name, year = date_parts[:3]

    # The unpadded month number names the output directory; the date string
    # uses zero-padded day and month (dd.mm.yyyy).
    month = str(get_month(month_name))
    date = day.zfill(2) + "." + month.zfill(2) + "." + year

    result = [
        "plain/" + year + "/" + month + "/" + str(article_id),  # path
        page_author,             # author
        "",                      # sex
        "",                      # birthday
        page_title,              # header
        date,                    # created
        "публицистика",          # sphere
        "",                      # genre_fi
        "",                      # type
        "",                      # topic
        "",                      # chronotop
        "нейтральный",           # style
        "н-возраст",             # audience_age
        "н-уровень",             # audience_level
        "городская",             # audience_size
        article_url,             # source
        "Ульяновская правда",    # publication
        "",                      # publisher
        int(year),               # publ_year
        "газета",                # medium
        "Россия",                # country
        "Ульяновская область",   # region
        "ru",                    # language
    ]

    # The analysis runs on the bare article text, before the header is added.
    analyzed = mystem.analyze(page_text)
    xml_text = mystem_to_xml(analyzed)
    plain_text = mystem_to_text(analyzed)

    header = "@au " + page_author + "\n"
    header += "@ti " + page_title + "\n"
    header += "@da " + date + "\n"
    header += "@url " + article_url + "\n"
    page_text = header + page_text

    save_to(page_text, "plain", year, month, str(article_id))
    save_to(xml_text, "mystem-xml", year, month, str(article_id))
    save_to(plain_text, "mystem-plain", year, month, str(article_id))

    return result

In [9]:
# Build one metadata row per downloaded article.
table = []
mystem = Mystem()

# Only purely numeric names are article files; anything else in the folder
# (editor or notebook housekeeping files) is ignored. Sorting numerically
# makes the row order reproducible, since os.listdir order is arbitrary.
article_ids = sorted(
    int(file_name) for file_name in os.listdir("data") if file_name.isdecimal()
)
for article_id in article_ids:
    row = get_row(article_id, mystem)
    table.append(row)

  app.launch_new_instance()


In [10]:
# Write the collected rows to metadata.csv as tab-separated values.
columns = [
    "path", "author", "sex", "birthday", "header", "created", "sphere",
    "genre_fi", "type", "topic", "chronotop", "style", "audience_age",
    "audience_level", "audience_size", "source", "publication", "publisher",
    "publ_year", "medium", "country", "region", "language",
]
header = "\t".join(columns)

with open("metadata.csv", "w", encoding="utf-8") as metadata_file:
    metadata_file.write(header + "\n")
    for row in table:
        # Joining all values writes every column exactly once, including
        # the last one.
        metadata_file.write("\t".join(str(value) for value in row) + "\n")
