In [60]:
import torch
import os
from pathlib import Path
import warnings
from tqdm import tqdm

from razdel import sentenize
import fitz
import re

import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv("data/habr/raw_texts.csv")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166320 entries, 0 to 166319
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  166320 non-null  int64 
 1   id          166320 non-null  int64 
 2   title       166320 non-null  object
 3   text        165728 non-null  object
 4   tags        166320 non-null  object
dtypes: int64(2), object(3)
memory usage: 6.3+ MB


In [11]:
def custom_str_to_lst(string):
    return string[1:-1].replace("'", "").split(", ")

In [12]:
df['tags'] = df.tags.apply(custom_str_to_lst)

In [35]:
(df['tags']
 .explode()
 .apply(lambda x: x.lower())
 .value_counts()
 .index[:200]
 .tolist())

['google',
 'android',
 'javascript',
 'microsoft',
 'linux',
 'php',
 'apple',
 'java',
 'python',
 '–ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ',
 '—Å—Ç–∞—Ä—Ç–∞–ø—ã',
 '—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞',
 'ios',
 '—Å—Ç–∞—Ä—Ç–∞–ø',
 '—Å–æ—Ü–∏–∞–ª—å–Ω—ã–µ —Å–µ—Ç–∏',
 'iphone',
 '.net',
 '—è–Ω–¥–µ–∫—Å',
 'windows',
 'css',
 '–∏–≥—Ä—ã',
 '–¥–∏–∑–∞–π–Ω',
 'c++',
 'open source',
 '—Ö–∞–±—Ä–∞—Ö–∞–±—Ä',
 '–±–µ–∑–æ–ø–∞—Å–Ω–æ—Å—Ç—å',
 '–∏–Ω—Ç–µ—Ä–Ω–µ—Ç',
 '',
 'facebook',
 '—Ä–µ–∫–ª–∞–º–∞',
 '–∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–æ–Ω–Ω–∞—è –±–µ–∑–æ–ø–∞—Å–Ω–æ—Å—Ç—å',
 '–≤–∏–¥–µ–æ',
 '—é–º–æ—Ä',
 'firefox',
 '–∫–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏—è',
 'c#',
 'html',
 'html5',
 'ubuntu',
 'opera',
 '—Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞',
 '—é–∑–∞–±–∏–ª–∏—Ç–∏',
 '—Ç–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ',
 '–æ–±–∑–æ—Ä',
 '–º–∞—Ä–∫–µ—Ç–∏–Ω–≥',
 '—É–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –ø—Ä–æ–µ–∫—Ç–∞–º–∏',
 '–ø–µ—Ä–µ–≤–æ–¥',
 'ruby',
 'jquery',
 '—Ä–∞–±–æ—Ç–∞',
 '—Ö–æ—Å—Ç–∏–Ω–≥',
 'twitter',
 '–±–∏–∑–Ω–µ—Å',
 'nokia',
 'web-—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞',
 '–∫–æ–Ω–∫—É—Ä—Å',
 'api',
 '–±—Ä–∞—É–∑–µ—Ä—ã',
 'mail.ru',
 

In [36]:
tags_to_save = set([
 'javascript',
 'linux',
 'php',
 'java',
 'python',
 '–ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ',
 '—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞',
 '.net',
 'c++',
 'c#',
 'ubuntu',
 'ruby',
 'web-—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞',
 'api',
 'mysql',
 'c',
 'node.js',
 '–≤–µ–±-—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞',
 '–∞–ª–≥–æ—Ä–∏—Ç–º—ã',
 '–≤–∏—Ä—Ç—É–∞–ª–∏–∑–∞—Ü–∏—è',
 'qt',
 '–∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å—ã',
 'sql','–º–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ', 'ml', 'dl', 'machine learning','docker'])

In [37]:
mask = df.tags.apply(lambda x: bool(list(set(x) & tags_to_save)))

In [45]:
res = df[mask]

In [46]:
res["text"] = res.text.apply(str)
res["text_len"] = res.text.apply(len)
res.text_len.mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res["text"] = res.text.apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res["text_len"] = res.text.apply(len)


8207.561475581639

In [69]:
res.text

0        –≠—Ç–æ –ø–µ—Ä–≤–∞—è –∑–∞–ø–∏—Å—å –≤ –±–ª–æ–≥, —á—Ç–æ–±—ã –ø—Ä–æ–≤–µ—Ä–∏—Ç—å —Ä–∞–±–æ...
1        –°–µ–≥–æ–¥–Ω—è –Ω–æ—á—å—é –Ω–∞—à –°–∞–º—ã–π –ì–ª–∞–≤–Ω—ã–π –ø–æ –º–∞—à–∏–Ω–Ω–æ–º—É –∫...
2        –ö–∞–∫ —è –ø–æ–Ω–∏–º–∞—é, —ç—Ç–æ—Ç –±–ª–æ–≥ –ø–æ—Å–≤—è—â–µ–Ω linux. –ù–µ –∑–Ω...
3        –í –ñ–ñ –∏, —á–∞—Å—Ç–∏—á–Ω–æ, –Ω–∞ Linux.org.ru –∏–¥—ë—Ç –∞–∫—Ç–∏–≤–Ω–∞...
4        –î—Ä–∞–π–≤–µ—Ä–∞ –ø–æ–¥ Linux –¥–ª—è –∫–ª–∞–≤–∏–∞—Ç—É—Ä—ã –û–ø—Ç–∏–º—É—Å –æ—Ç —Å...
                               ...                        
23850    \n–ó–¥—Ä–∞—Å—Ç–µ!\r\n–í –ø–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä–µ–º—è –≤—Å–µ —á–∞—â–µ –∑–∞–¥—É–º...
23851    –°–æ–≤—Ä–µ–º–µ–Ω–Ω—ã–µ —Ñ—Ä–µ–π–º–≤–æ—Ä–∫–∏ –¥–ª—è —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏ –≤–µ–±-–ø—Ä–∏–ª...
23852    \n\r\n–ù–µ—Å–º–æ—Ç—Ä—è –Ω–∞ –Ω–æ–≤–æ–≥–æ–¥–Ω–∏–µ –∫–∞–Ω–∏–∫—É–ª—ã, –≤ –∑–∞–ø–∞–¥...
23853    –ö–æ–º–∞–Ω–¥–∞ Rust —Ä–∞–¥–∞ —Å–æ–æ–±—â–∏—Ç—å –æ –Ω–æ–≤–æ–π –≤–µ—Ä—Å–∏–∏ Rust...
23854    \r\n–°–µ–π—á–∞—Å —è –∏–∑—É—á–∞—é –æ—Ç—á—ë—Ç –æ—á–µ—Ä–µ–¥–Ω–æ–π –ø—Ä–æ–≤–µ—Ä–∫–∏ –ø...
Name: te

In [47]:
res.reset_index(drop=True, inplace=True)

In [48]:
res.to_csv("data/habr/clean_habr.csv")

In [52]:
res.text[5]

'–í—á–µ—Ä–∞ —Å—Ç–∞–ª–æ –∏–∑–≤–µ—Å—Ç–Ω–æ, —á—Ç–æ –ì–∞–Ω—Å –†–µ–π–∑–µ—Ä, —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫ —Ñ–∞–π–ª–æ–≤–æ–π —Å–∏—Å—Ç–µ–º—ã ReiserFS, –∞—Ä–µ—Å—Ç–æ–≤–∞–Ω –ø–æ –ø–æ–¥–æ–∑—Ä–µ–Ω–∏—é –≤ —É–±–∏–π—Å—Ç–≤–µ —Å–≤–æ–µ–π –∂–µ–Ω—ã –ù–∏–Ω—ã –†–µ–π–∑–µ—Ä. \n\r\nReiserFS –ø—Ä–µ—Å–ª–µ–¥—É—é—Ç —Å–ø–ª–æ—à–Ω—ã–µ –Ω–µ—É–¥–∞—á–∏. –ù–µ—Å–º–æ—Ç—Ä—è –Ω–∞ –≤—Å–µ —É—Å–∏–ª–∏—è —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–æ–≤ —ç—Ç—É –§–° —Ç–∞–∫ –∏ –Ω–µ –≤–∫–ª—é—á–∏–ª–∏ –≤ —è–¥—Ä–æ, –≤ –∞–≤–≥—É—Å—Ç–µ –º–µ—Å—è—Ü–µ –∫–æ–º–∞–Ω–¥–∞ –±—ã–ª–∞ –æ–∑–∞–±–æ—á–µ–Ω–∞ –ø–æ–∏—Å–∫–æ–º —Ä–∞–±–æ—Ç—ã, –∞ —Å–æ–≤—Å–µ–º –Ω–µ–¥–∞–≤–Ω–æ Suse Labs –æ—Ç–∫–∞–∑–∞–ª–∏—Å—å  –æ—Ç –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è ReiserFS –≤ –∫–∞—á–µ—Å—Ç–≤–µ –æ—Å–Ω–æ–≤–Ω–æ–π —Ñ–∞–π–ª–æ–≤–æ–π —Å–∏—Å—Ç–µ–º—ã –¥–ª—è —Å–≤–æ–µ–π –û–°. –¢–µ–ø–µ—Ä—å –≤–æ—Ç –µ—â—ë –∏ –∞—Ä–µ—Å—Ç —Ä—É–∫–æ–≤–æ–¥–∏—Ç–µ–ª—è.\n\r\n–í —Å–≤—è–∑–∏ —Å —ç—Ç–∏–º–∏ —Å–æ–±—ã—Ç–∏—è–º–∏ –¥–∞–ª—å–Ω–µ–π—à–∏–µ –ø–µ—Ä—Å–ø–µ–∫—Ç–∏–≤—ã —Ä–∞–∑–≤–∏—Ç–∏—è ReiserFS –≤—ã–≥–ª—è–¥—è—Ç —Ç—É–º–∞–Ω–Ω—ã–º–∏.'

In [54]:
ruABC = "—ë–π—Ü—É–∫–µ–Ω–≥—à—â–∑—Ö—ä—Ñ—ã–≤–∞–ø—Ä–æ–ª–¥–∂—ç—è—á—Å–º–∏—Ç—å–±—é–Å–ô–¶–£–ö–ï–ù–ì–®–©–ó–•–™–§–´–í–ê–ü–†–û–õ–î–ñ–≠–Ø–ß–°–ú–ò–¢–¨–ë–Æ"
enABC = "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"
nums =  r'1234567890,.-)([]:@%‚Ññ$" '
acceptable_chars = ruABC + enABC + nums

In [71]:
def convert_case(match_obj):
        if match_obj.group(1) is not None:
            return match_obj.group(1)

class PDFParser:
    
    SPECIAL_CHARS = "#^&*+_=<‚úìŒ±ùëé>/\‚â°‚â°Œ£‚àë‚àà‚óè}{‚â§‚â•ÔøΩ√•√∞√ø√¶œÄ"
    NUMBERS = ("1","2", "3","4" ,"5" ,"6" ,"7" ,"8" ,"9")
    ACCEPTABLE_CHARS = acceptable_chars
    REPLACEMENT_DICT = {
        "¬ª": r'"',
        "¬´": r'"',
        "‚Äù": r'"',
        "‚Äú": r'"',
        "‚Äî": r'-',
        "‚Äì": r'-'
    }
    
    def __init__(self):
        return
    
    @staticmethod
    def convert_case(match_obj):
        if match_obj.group(1) is not None:
            return match_obj.group(1)
    
    @staticmethod
    def delete_repeating_whitespaces(sent):
        return re.sub(' +', ' ', sent)
    
    @staticmethod
    def delete_unicode(sent):
        sent = re.sub('\xad', ' ', sent)
        return sent.encode("utf-8", "ignore").decode()

    @staticmethod
    def replace_hyphenation(sent):
        return re.sub("(\S)- ", convert_case, sent)
    
    
    def replace_chars(self, sent):
        for key in self.REPLACEMENT_DICT:
            sent = sent.replace(key, self.REPLACEMENT_DICT[key])
        return sent
    
    
    def mark_blocks(self):
        for block in self.blocks:
            if 84 < block['bbox'][0] < 86:
                block['type'] = "text"
            if block['lines'][0]['spans'][0]['font'] == 'CMUSerif-Bold':
                block['type'] = 'title'
            
    def blocks_to_text(self):
        textlines = [] 
        for block in self.blocks:
            block_textlines = []
            for line in block['lines']:
                for span in line['spans']:
                    block_textlines.append(span['text'])
            if not block_textlines[0].startswith("["):
                textlines += block_textlines
        return " ".join(textlines)
    
    def text_to_sents(self, text):
        sents = [sent.text for sent in list(sentenize(text))]        
        
        sents = list(filter(lambda x: not any (c in self.SPECIAL_CHARS for c in x), sents))
        sents = list(filter(lambda x: not x.startswith(self.NUMBERS), sents))
        sents = list(filter(lambda x: not "https:" in x, sents))
        sents = list(filter(lambda x: not re.search(r"[1-9]\.", x), sents))
        
        sents = [self.replace_chars(sent) for sent in sents] 
        sents = [self.delete_repeating_whitespaces(sent) for sent in sents]
        sents = [self.replace_hyphenation(sent) for sent in sents]
        sents = [self.delete_unicode(sent) for sent in sents]
        
        sents = list(filter(lambda x: all (c in self.ACCEPTABLE_CHARS for c in x), sents))
        sents = list(filter(lambda x: len(x) > 21, sents))
        sents = list(filter(lambda x: len(x) < 512, sents))
        
        return sents 
    
    def get_sentences(self, doc_path):
        try:
            doc = fitz.open(doc_path)
        except:
            return []
        
        self.blocks = []
        
        for page in doc:
            self.blocks += page.get_text("dict", flags=16)['blocks']

        self.mark_blocks()
        self.blocks = list(filter(lambda x: x['type'] == 'text', self.blocks))
        
        return self.text_to_sents(self.blocks_to_text())

In [72]:
parser = PDFParser()

sents = []
for text in tqdm(res.text):
    sents += parser.text_to_sents(text)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23855/23855 [02:01<00:00, 196.24it/s]


In [73]:
len(sents)

772987

In [76]:
for sent in sents[100000:110000]:
    print(sent)

–ü–æ –±–æ–ª—å—à–æ–º—É —Å—á–µ—Ç—É, Logback –Ω–∞ —Å–µ–≥–æ–¥–Ω—è—à–Ω–∏–π –¥–µ–Ω—å - –≤–µ—Ä—à–∏–Ω–∞ —ç–≤–æ–ª—é—Ü–∏–∏.
–ü–æ–º–∏–º–æ Logback –ø–æ—è–≤–∏–ª–æ—Å—å —É–∂–µ —Å –¥–µ—Å—è—Ç–æ–∫ –Ω–æ–≤—ã—Ö logging-–±–∏–±–ª–∏–æ—Ç–µ–∫, –Ω–æ —Å –±–æ–ª—å—à–æ–π –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å—é –Ω–∏ –æ–¥–Ω–∞ –∏–∑ –Ω–∏—Ö –Ω–µ –≤—ã–∂–∏–≤–µ—Ç.
JUL - —Ç–∏—Ö–æ —É–º–∏—Ä–∞—é—â–∏–π —Å—Ç–∞–Ω–¥–∞—Ä—Ç.
SLF4J - –æ—á–µ–Ω—å –ø–æ–ø—É–ª—è—Ä–µ–Ω –≤ –±–∏–±–ª–∏–æ—Ç–µ–∫–∞—Ö.
–Ø —É–∂–µ –≥–æ–≤–æ—Ä–∏–ª, —á—Ç–æ Open Source —Å–æ–æ–±—â–µ—Å—Ç–≤–æ –∏–º–µ–µ—Ç —Ç–µ–Ω–¥–µ–Ω—Ü–∏—é —Å—Ç–µ–∫–∞—Ç—å—Å—è –∫ "—Ü–µ–Ω—Ç—Ä–∞–º —Ç—è–∂–µ—Å—Ç–∏".
–°–µ–π—á–∞—Å —Ç–∞–∫–∏–º —Ü–µ–Ω—Ç—Ä–æ–º —Ç—è–∂–µ—Å—Ç–∏ –≤—ã—Å—Ç—É–ø–∞–µ—Ç —Å–∫–æ—Ä–µ–µ SLF4J –≤ —Å–∏–ª—É "—É–Ω–∏–≤–µ—Ä—Å–∞–ª—å–Ω–æ—Å—Ç–∏".
–û—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω–∞—è –ø–æ–ø—É–ª—è—Ä–Ω–æ—Å—Ç—å SLF4J –≤ –∫–∞–∫–æ–π-—Ç–æ —Å—Ç–µ–ø–µ–Ω–∏ –≥–∞—Ä–∞–Ω—Ç–∏—Ä—É–µ—Ç –æ—Ç –ø–æ—è–≤–ª–µ–Ω–∏—è –Ω–æ–≤—ã—Ö –æ–±–µ—Ä—Ç–æ–∫.
–ß–∏—Å–ª–æ –ø—Ä–æ–µ–∫—Ç–æ–≤, –∏—Å–ø–æ–ª—å–∑—É—é—â–∏—Ö SLF4J, —É–∂–µ —è–≤–ª—è–µ—Ç—Å—è –¥–æ—Å—Ç–∞—Ç–æ—á–Ω—ã–º –¥–ª—è 

–ò–º–µ–Ω–∞ –º–µ—Ç–æ–¥–æ–≤ –º–æ–∂–Ω–æ –∏–∑–º–µ–Ω–∏—Ç—å, —É–∫–∞–∑–∞–≤ —ç—Ç–æ —è–≤–Ω–æ.
–ú–æ–¥—É–ª–∏ –≤—Å–µ –µ—â–µ –≤–∫–ª—é—á–∞—é—Ç—Å—è –≤ —Ü–µ–ø–æ—á–∫—É –Ω–∞—Å–ª–µ–¥–æ–≤–∞–Ω–∏—è —Å—Ç—Ä–∞–Ω–Ω—ã–º –æ–±—Ä–∞–∑–æ–º.
(–≠—Ç–∞ –ø—Ä–æ–±–ª–µ–º–∞ —Ç–∞–∫ –∏ –Ω–µ —Ä–µ—à–µ–Ω–∞, —É–¥–∞—á–∏.
–ü–æ—Ç–æ–º—É —á—Ç–æ –±–æ–ª–µ–µ –∫–æ—Ä–æ—Ç–∫–æ–µ –Ω–∞–∑–≤–∞–Ω–∏–µ –º–µ—Ç–æ–¥–∞ –æ–∑–Ω–∞—á–∞–µ—Ç, —á—Ç–æ –æ–Ω –ø—Ä–µ–¥–ø–æ—á—Ç–∏—Ç–µ–ª—å–Ω–µ–µ –∫ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—é.
Nakada [–≤–µ—Ä–æ—è—Ç–Ω–æ, Nobuyoshi Nakada, –æ–¥–∏–Ω –∏–∑ —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–æ–≤ —è–¥—Ä–∞ Ruby] —É–∂–µ –Ω–∞—á–∞–ª —Ä–∞–∑—Ä–∞–±–æ—Ç–∫—É —Ç–∏–ø–∞–∂–µ–π, –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã—Ö –¥–ª—è —Ä–µ–∞–ª–∏–∑–∞—Ü–∏–∏ –º–µ—Ç–æ–¥–∞ mix, –≤ –º–æ–º–µ–Ω—Ç –∞–Ω–æ–Ω—Å–∏—Ä–æ–≤–∞–Ω–∏—è —ç—Ç–æ–π —Ñ—É–Ω–∫—Ü–∏–∏ –Ω–∞ RubyKaigi [—è–ø–æ–Ω—Å–∫–∞—è Ruby-–∫–æ–Ω—Ñ–µ—Ä–µ–Ω—Ü–∏—è], —Ç–∞–∫ —á—Ç–æ –ø–∞—Ç—á —É–∂–µ –≥–æ—Ç–æ–≤.
–ù–µ –ø—Ä–µ–¥—É—Å–º–æ—Ç—Ä–µ–Ω–æ –Ω–∏–∫–∞–∫–∏—Ö —Å–ø–æ—Å–æ–±–æ–≤ —Ä–∞–∑—Ä–µ—à–µ–Ω–∏—è –∫–æ–Ω—Ñ–ª–∏–∫—Ç–æ–≤ —Å—Ä–µ–¥–∏ –ø–µ—Ä–µ–º–µ–Ω–Ω—ã—Ö-—á–ª–µ–Ω–æ–≤ –∫–ª–∞

–û–¥–Ω–æ –∏–∑ –µ–≥–æ –ø—Ä–∏–º–µ–Ω–µ–Ω–∏–π –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è –≤ —Ç–æ–º, —á—Ç–æ –æ–Ω —Ä–∞—Å–ø–æ–∑–Ω–∞—ë—Ç –≤ —Ü–µ–ø–æ—á–∫–µ –ø–æ—Å—Ç—É–ø–∞—é—â–∏—Ö —Å–∏–º–≤–æ–ª–æ–≤ —Ç–µ –∏–ª–∏ –∏–Ω—ã–µ –∫–æ–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏: —Å–ª–æ–≤–∞, —Å–æ—Å—Ç–∞–≤–Ω—ã–µ –∑–Ω–∞–∫–∏ –ø—Ä–µ–ø–∏–Ω–∞–Ω–∏—è, —Ñ—É–Ω–∫—Ü–∏–∏, —Å—Ç—Ä—É–∫—Ç—É—Ä—ã, —Ü–µ–ª—ã–µ –∫–ª–∞—Å—Å—ã —Å –∏—Ö –º–µ—Ç–æ–¥–∞–º–∏ –∏ –ø–æ–ª—è–º–∏.
–¢–∞–∫ –¥–µ–π—Å—Ç–≤—É—é—Ç —Å–ø–µ–ª–ª—á–µ–∫–µ—Ä—ã, –∞–Ω–∞–ª–∏–∑–∞—Ç–æ—Ä—ã –∏—Å—Ö–æ–¥–Ω–æ–≥–æ –∫–æ–¥–∞, –∫–æ–º–ø–∏–ª—è—Ç–æ—Ä—ã, –∏–Ω—Å—Ç—Ä—É–º–µ–Ω—Ç—ã –¥–ª—è –ø–æ–¥—Å–≤–µ—Ç–∫–∏ —Å–∏–Ω—Ç–∞–∫—Å–∏—Å–∞, —Ç–∞–∫ –¥–µ–π—Å—Ç–≤—É–µ—Ç –≤–∞—à –∫–æ–º–ø—å—é—Ç–µ—Ä, –∏ –ø—Ä–æ—á–µ–µ, –∏ –ø—Ä–æ—á–µ–µ.
–ü—Ä–∏–º–µ–Ω–∏–º–æ—Å—Ç—å —É –ö–ê –æ–≥—Ä–æ–º–Ω–∞—è.
–ù–∞–ø—Ä–∏–º–µ—Ä, —Å–≤–æ–∏–º —Å—Ç—É–¥–µ–Ω—Ç–∞–º —è –∑–∞–¥–∞–≤–∞–ª —Å–¥–µ–ª–∞—Ç—å –≥–µ–Ω–µ—Ä–∞—Ü–∏—é —Å–∫–∞–∑–∫–∏ —Å –ø–æ–º–æ—â—å—é –Ω–µ–¥–µ—Ç–µ—Ä–º–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω—ã—Ö –ö–ê, - –∏ —Ç–∞–∫–æ–µ –≤–æ–∑–º–æ–∂–Ω–æ.
–ó–¥–µ—Å—å –ø—Ä–∏–º–µ–Ω—è–µ—Ç—Å—è –ö–ê –Ω–∞ —Å—Ç—Ä–∞—Ç–µ–≥–∏—è—Ö ([1]), –Ω–∞–

–ó–¥–µ—Å—å –∏ –∑–¥–µ—Å—å –º–æ–∂–Ω–æ –Ω–∞–π—Ç–∏ –±–æ–ª–µ–µ –ø–æ–¥—Ä–æ–±–Ω—É—é –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é.
–¢.–∫. –ª—é–±–∞—è –æ—à–∏–±–∫–∞ –≥–µ–Ω–µ—Ä–∏—Ä—É–µ—Ç –∏—Å–∫–ª—é—á–µ–Ω–∏–µ, —Ç–∞–∫–æ–π –∫–æ–¥ –º–æ–∂–µ—Ç —Å–¥–µ–ª–∞—Ç—å –º–Ω–æ–≥–∏–µ –æ—à–∏–±–∫–∏ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏—è –ø–æ—Ö–æ–∂–∏–º–∏ –Ω–∞ –æ—à–∏–±–∫–∏ –≤—Ä–µ–º–µ–Ω–∏ –∏—Å–ø–æ–ª–Ω–µ–Ω–∏—è, –∏ –∑–∞—Ç—Ä—É–¥–Ω—è—Ç –æ—Ç–ª–∞–¥–∫—É –ø—Ä–æ–≥—Ä–∞–º–º—ã.
–ü–æ—Å–∫–æ–ª—å–∫—É "except" –æ—Ç–ª–∞–≤–ª–∏–≤–∞–µ—Ç –≤—Å–µ –∏—Å–∫–ª—é—á–µ–Ω–∏—è, –≤–∫–ª—é—á–∞—è "SystemExit", "KeyboardInterrupt", –∏ "GeneratorExit" (–∫–æ—Ç–æ—Ä—ã–µ –ø–æ —Å—É—Ç–∏ –Ω–µ —è–≤–ª—è—é—Ç—Å—è –æ—à–∏–±–∫–∞–º–∏ –∏ –Ω–µ –¥–æ–ª–∂–Ω—ã –æ—Ç–ª–∞–≤–ª–∏–≤–∞—Ç—å—Å—è –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏–º –∫–æ–¥–æ–º), –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ –≥–æ–ª–æ–≥–æ "except" –≤ –ª—é–±–æ–º —Å–ª—É—á–∞–µ –ø–ª–æ—Ö–∞—è –∏–¥–µ—è.
–í —Å–∏—Ç—É–∞—Ü–∏—è—Ö, –∫–æ–≥–¥–∞ –Ω–∞–º –Ω—É–∂–Ω–æ –≤—Å–µ-—Ç–∞–∫–∏ –ø–æ–∫—Ä—ã—Ç—å –≤—Å–µ –≤–æ–∑–º–æ–∂–Ω—ã–µ –∏—Å–∫–ª—é—á–∏—Ç–µ–ª—å–Ω—ã–µ —Å–∏—Ç—É–∞—Ü–∏–∏, –º—ã –º–æ–∂–µ–º –∏—Å–ø–æ–ª—å

–ú–æ–π –∫–ª–∏–µ–Ω—Ç - —ç—Ç–æ –∫–æ–º–∞–Ω–¥–∞ .NET —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–æ–≤, –∞ —ç—Ç–æ –∑–Ω–∞—á–∏—Ç, —á—Ç–æ –¥–≤–∏–≥–∞—Ç—å—Å—è –º–æ–∂–Ω–æ –≤ –¥–≤—É—Ö –æ—Å–Ω–æ–≤–Ω—ã—Ö –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏—è—Ö: —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞ —Å –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ–º —Å—Ç–∞–Ω–¥–∞—Ä—Ç–Ω—ã—Ö –≤–µ–±-—Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π –∏–ª–∏ Silverlight.
–ö–æ–≥–¥–∞ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ —Ä–µ–∫–æ–º–µ–Ω–¥–æ–≤–∞—Ç—å –æ–¥–Ω—É —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—é –≤ –ø—Ä–æ—Ç–∏–≤–æ–≤–µ—Å –¥—Ä—É–≥–æ–π, —Ç–æ –≤ –∏–¥–µ–∞–ª–µ –≤—ã–±–æ—Ä –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å –æ—Å–Ω–æ–≤–∞–Ω –Ω–∞ —á–µ–º-—Ç–æ –±–æ–ª—å—à–µ–º, —á–µ–º –Ω–∞ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –º–Ω–µ–Ω–∏—è—Ö.
–ú—ã —Å–æ–∑–¥–∞–ª–∏ —Å–ø–∏—Å–æ–∫ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π-–∫–∞–Ω–¥–∏–¥–∞—Ç–æ–≤.
–ü–æ—Å–ª–µ —á–µ–≥–æ –º—ã —Å–æ–∑–¥–∞–ª–∏ –ø–µ—Ä–µ—á–µ–Ω—å –∫—Ä–∏—Ç–µ—Ä–∏–µ–≤, —Å–≤—è–∑–∞–Ω–Ω—ã—Ö –≤ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏.
–í—Å–µ–º –∫—Ä–∏—Ç–µ—Ä–∏—è–º –±—ã–ª –Ω–∞–∑–Ω–∞—á–µ–Ω –Ω–µ–∫–æ—Ç–æ—Ä—ã–π –≤–µ—Å, –∏ –º—ã –æ—Ü–µ–Ω–∏–ª–∏ –∫–∞–∂–¥—ã–π –∏–∑ –Ω–∏—Ö –¥–ª—è –≤—Å–µ—Ö —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π-–∫–∞–Ω–¥–∏–¥–∞—Ç–æ–≤.
–í —ç—Ç–æ–π —Å—Ç–∞—Ç—å–µ

–ü—Ä–µ–¥–≤–∞—Ä–∏—Ç–µ–ª—å–Ω–æ, –≤—ã–ø–æ–ª–Ω—è–µ—Ç—Å—è –ø—Ä–µ–¥—ã–¥—É—â–∏–π —Å–∫—Ä–∏–ø—Ç, —Ä–∞–∑–±–∏—Ä–∞—é—â–∏–π —Ö–∞—Ä–¥-–ª–∏–Ω–∫–∏.
–ö–∞–∫ –≤–∏–¥–∏—Ç–µ, —Å–∫—Ä–∏–ø—Ç –ø—Ä–æ—Å—Ç–µ–π—à–∏–π –∏ –ø—Ä–∏ –∂–µ–ª–∞–Ω–∏–∏ –µ–≥–æ –º–æ–∂–Ω–æ –ø–µ—Ä–µ–ø–∏—Å–∞—Ç—å –ø–æ–¥ —Ä–µ–∞–∫—Ü–∏—é –Ω–∞ –ª—é–±–æ–µ –¥—Ä—É–≥–æ–µ –∏–∑–º–µ–Ω–µ–Ω–∏–µ.
–ú–æ–∂–Ω–æ –ø–æ–≤–µ—Å–∏—Ç—å –µ–≥–æ cron –∏ –æ–Ω –±—É–¥–µ—Ç —Å–∞–º —Å–ª–µ–¥–∏—Ç—å –∑–∞ –ø—Ä–æ—Ü–µ—Å—Å–æ–º, —Å–æ–±–∏—Ä–∞—è –ø–æ –º–µ—Ä–µ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ—Å—Ç–∏ –æ–±—Ä–∞–∑—ã.
–û–±—Ä–∞—Ç–Ω–∞—è —Å—Ç–æ—Ä–æ–Ω–∞ - –ø–æ–ª—É—á–∞–µ–º—ã–π —Ñ—É–Ω–∫—Ü–∏–æ–Ω–∞–ª: –æ–¥–∏–Ω —Ä–∞–∑ —Å–æ–±—Ä–∞–≤ –æ–±—Ä–∞–∑, –≤—ã –ø–æ–ª—É—á–∏—Ç–µ –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –≤–≤–µ—Å—Ç–∏ –≤ —Å—Ç—Ä–æ–π –Ω–æ–≤—ã–π —Å–µ—Ä–≤–µ—Ä –∑–∞ —Å—á–∏—Ç–∞–Ω–Ω—ã–µ –º–∏–Ω—É—Ç—ã, –∞ –∫–∞–∂–¥—ã–π —Å–æ–±—Ä–∞–Ω–Ω—ã–π –æ–±—Ä–∞–∑ –º–æ–∂–µ—Ç —Å–ª—É–∂–∏—Ç—å —Ç–æ—á–∫–æ–π –æ—Ç–∫–∞—Ç–∞ –≤ —Å–ª—É—á–∞–µ –Ω–µ—É–¥–∞—á–Ω–æ–≥–æ –æ–±–Ω–æ–≤–ª–µ–Ω–∏—è.
–†–∞–∑—É–º–µ–µ—Ç—Å—è –∑–¥–µ—Å—å –æ–ø–∏—Å–∞–Ω—ã —Ç–æ–ª—å–∫–æ –æ–±—â–∏–µ –ø—Ä–∏–Ω—Ü–∏–ø—ã –∏ –º

–í —ç—Ç–æ–π —Å—Ç–∞—Ç—å–µ —Ä–µ—á—å –ø–æ–π–¥–µ—Ç –æ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–∏ JBFD, —á—Ç–æ –æ–∑–Ω–∞—á–∞–µ—Ç Java BrainFuck Decompiler.
–¢–µ—Ö–Ω–æ–ª–æ–≥–∏—è –µ—â–µ –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –º–æ–ª–æ–¥–∞—è (–æ—Ç —Å–∏–ª—ã 3 —á–∞—Å–∞), —Ç–∞–∫ —á—Ç–æ –Ω–µ —Å—É–¥–∏—Ç–µ —Å—Ç—Ä–æ–≥–æ.
–ò–¥–µ—è —Å–æ–∑–¥–∞–Ω–∏—è –¥–µ–∫–æ–º–ø–∏–ª—è—Ç–æ—Ä–∞ –≤–æ–∑–Ω–∏–∫–ª–∞ –Ω–µ —Å–ª—É—á–∞–π–Ω–æ.
–í—Å–µ–º—É –≤–∏–Ω–æ–π –±–æ–ª—å—à–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å—Ç–∞—Ç–µ–π –ø–æ BrainFuck –≤ –ò–Ω—Ç–µ—Ä–Ω–µ—Ç–µ –≤–æ–æ–±—â–µ –∏ –Ω–∞ –•–∞–±—Ä–µ –≤ —á–∞—Å—Ç–Ω–æ—Å—Ç–∏.
–ò–Ω—Ç–µ—Ä–ø—Ä–µ—Ç–∞—Ç–æ—Ä–æ–≤ —ç—Ç–æ–≥–æ –∑–∞–º–µ—á–∞—Ç–µ–ª—å–Ω–æ–≥–æ —è–∑—ã–∫–∞ —Å—É—â–µ—Å—Ç–≤—É–µ—Ç –æ–≥—Ä–æ–º–Ω–æ–µ –º–Ω–æ–∂–µ—Å—Ç–≤–æ, –Ω–æ –≤–æ—Ç —Å—Ä–µ–¥—Å—Ç–≤ –¥–ª—è –æ—Ç–ª–∞–¥–∫–∏ BF –∫–æ–¥–∞ –º–Ω–µ —É–¥–∞–ª–æ—Å—å –Ω–∞–π—Ç–∏ –∫—Ä–∞–π–Ω–µ –º–∞–ª–æ.
–í—Å–µ —ç—Ç–æ –∏ –º–Ω–æ–≥–æ –¥—Ä—É–≥–æ–µ —Å—Ç–∞–Ω–æ–≤–∏—Ç—Å—è –≤–æ–∑–º–æ–∂–Ω—ã–º —Å JBFD.
–î–ª—è –Ω–µ—Ç–µ—Ä–ø–µ–ª–∏–≤—ã—Ö, —Å—Å—ã–ª–∫–∞ –Ω–∞ —Å–∫–∞—á–∏–≤–∞–Ω–∏–µ –∏—Å—Ö–æ–¥–Ω–æ–≥–æ –∫–æ–¥–∞ –¥–µ–∫–æ–º–ø–∏–ª—è—Ç–æ—Ä—è –Ω–∞

–î–ª—è MyISAM —Ä–∞–∑–Ω–∏—Ü–∞ –º–µ–∂–¥—É –∏–º–ø–æ—Ä—Ç–æ–º –∏ —ç–∫—Å–ø–æ—Ä—Ç–æ–º –º–∏–Ω–∏–º–∞–ª—å–Ω–∞.
–≠—Ç–∞ "—Å—Ç–∞—Ç—å—è" –≤–æ–æ–±—â–µ –≥–æ–≤–æ—Ä—è –≤ —á–µ—Ä–Ω–æ–≤–æ–º –≤–∞—Ä–∏–∞–Ω—Ç–µ –≤–æ–∑–Ω–∏–∫–ª–∞ –Ω–µ—Å–∫–æ–ª—å–∫–æ –ª–µ—Ç –Ω–∞–∑–∞–¥.
–°–µ–π—á–∞—Å —Å—é–¥–∞ –¥–æ–±–∞–≤–ª–µ–Ω —Ç–æ–ª—å–∫–æ —Å—Ñ–∏–Ω–∫—Å –∏ –∑–∞–Ω–æ–≤–æ –ø—Ä–æ–≤–µ–¥–µ–Ω—ã —Ç–µ—Å—Ç—ã.
–ê –≤–æ–∑–Ω–∏–∫–ª–∞ –æ–Ω–∞ –≤ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–µ —Å–ø–æ—Ä–∞ –Ω–∞ –∫–∞–∫–æ–º-—Ç–æ —Ñ–æ—Ä—É–º–µ, –ø–æ –ø–æ–≤–æ–¥—É –ø–µ—Ä—Å–ø–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç–∏ utf –∏ —á—Ç–æ –º–æ–ª –¥—Ä—É–≥–∏–µ –∫–æ–¥–∏—Ä–æ–≤–∫–∏ —á–µ—Ä–µ–∑ –≥–æ–¥ —É–º—Ä—É—Ç.
–õ–æ–∫–∞–ª—å –≤ —Å–ª—É—á–∞–µ —É—Å—Ç–∞–Ω–æ–≤–∫–∏ php –∫–∞–∫ –º–æ–¥—É–ª—è –µ–¥–∏–Ω–∞ –Ω–∞ –≤–µ—Å—å —Å–µ—Ä–≤–µ—Ä —Å–æ –≤—Å–µ–º–∏ –≤—ã—Ç–µ–∫–∞—é—â–∏–º–∏, –∏ –¥–∞–∂–µ –ø—Ä–∏ –ø—Ä–∞–≤–∏–ª—å–Ω–æ–π –ª–æ–∫–∞–ª–∏ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å –æ–±—ã—á–Ω—ã–µ —Ñ—É–Ω–∫—Ü–∏–∏ –¥–ª—è —Ä–∞–±–æ—Ç—ã —Å–æ —Å—Ç—Ä–æ–∫–∞–º–∏ –Ω–µ–ª—å–∑—è, –Ω–∞–¥–æ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å –∏—Ö –∞–Ω–∞–ª–æ–≥–∏ –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞—é—â–∏–µ —ç—Ç—É —Ä–∞–±–æ—Ç—É.


–ò—Å—Ç–æ—Ä–∏—è –±—É–¥–µ—Ç –∏ –ø—Ä–æ —Å–µ—Ä–≤–µ—Ä–∞, –∏ –ø—Ä–æ Windows –∏ Mac OS, –ø—Ä–æ –∞–¥–º–∏–Ω—Å–∫–∏–µ –±—É–¥–Ω–∏ –∏ –ø—Ä–∞–∑–¥–Ω–∏–∫–∏, –∏ –¥–∞–∂–µ (–Ω–µ–º–Ω–æ–≥–æ) –ø—Ä–æ Java.
–°–ø–µ—Ä–≤–∞ –Ω–µ–º–Ω–æ–≥–æ –æ –±–∞–Ω–∞–ª—å–Ω–æ–º.
–°–µ—Ä–≤–µ—Ä–∞ –≤ –Ω–∞—à–µ–º –º–∏—Ä–µ —á–∞—â–µ –≤—Å–µ–≥–æ —Å—Ç–æ—è—Ç –≤ –¥–∞—Ç–∞-—Ü–µ–Ω—Ç—Ä–∞—Ö, –¥–æ—Å—Ç—É–ø –∫—É–¥–∞ –Ω–∞ "–ø–æ—Å–º–æ—Ç—Ä–µ—Ç—å" –µ—Å–ª–∏ –∏ –Ω–µ —Å–∏–ª—å–Ω–æ –æ–≥—Ä–∞–Ω–∏—á–µ–Ω –≤—Å—è—á–µ—Å–∫–∏–º–∏ –ø—Ä–æ–ø—É—Å–∫–Ω—ã–º–∏ —Å–∏—Å—Ç–µ–º–∞–º–∏, —Ç–æ —Ö–æ—Ç—è –±—ã –Ω–µ—É–¥–æ–±–µ–Ω –∏–∑-–∑–∞ —Ç–æ–≥–æ, —á—Ç–æ –¥–∞—Ç–∞—Ü–µ–Ω—Ç—Ä –æ–±—ã—á–Ω–æ –∫–∞–∫ –Ω–∞–∑–ª–æ –Ω–∞ –¥—Ä—É–≥–æ–º –∫–æ–Ω—Ü–µ –≥–æ—Ä–æ–¥–∞.
–ù–µ –Ω–∞–µ–∑–¥–∏—à—å—Å—è –Ω–∞ –∫–∞–∂–¥—ã–π —Å–±–æ–π.
–ù–æ, —Ü–∏–≤–∏–ª–∏–∑–∞—Ü–∏—è –∏–∑–æ–±—Ä–µ–ª–∞ –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–ø–æ—Å–æ–±–æ–≤ –∫–∞–∫-—Ç–æ –¥–æ—Å—Ç—É—á–∞—Ç—å—Å—è –¥–æ –Ω–µ–±–ª–∏–∑–∫–∏—Ö —Å–µ—Ä–≤–µ—Ä–æ–≤.
–ê —É –Ω–∞—Å –∂–µ –≥–æ–ª–æ—Å–æ–≤–∞–Ω–∏–µ –∏–¥–µ—Ç, –ø–æ–º–æ–≥–∏—Ç–µ, –Ω—É...".
–ù—É, —Å–æ–±—Å—Ç–≤–µ–Ω–Ω–æ, –∏ —Ç

- –¢–æ—Ä–º–æ–∑–Ω—É—Ç—ã–π –¥–æ–º–∞—à–Ω–∏–π —Å–∞–π—Ç.
- –ù–µ—Å—Ç–∞–±–∏–ª—å–Ω–æ–µ –≤—Ä–µ–º—è –æ—Ç–∫–ª–∏–∫–∞.
–ß—Ç–æ-–∂ –∏–∑ –≤—Å–µ–≥–æ –≤—ã—à–µ—Å–∫–∞–∑–∞–Ω–æ–≥–æ, –∫–∞–∂–¥—ã–π —Å–º–æ–∂–µ—Ç —Å–¥–µ–ª–∞—Ç—å –≤—ã–≤–æ–¥—ã –¥–ª—è —Å–µ–±—è —Å–∞–º.
–Ø –¥–ª—è —Å–µ–±—è –∏ —Å–≤–æ–∏—Ö –∑–∞–¥–∞—á –≤—ã–±–µ—Ä—É gevent.
UPD. –Ω–µ–º–Ω–æ–≥–æ –æ—à–∏–±—Å—è —Å–æ —Å–∫—Ä–∏–ø—Ç–æ–º –Ω–∞ gevent.
–ù–∞ –Ω–µ–≥–æ –±—ã–ª–æ –º–µ–Ω—å—à–µ –Ω–∞–≥—Ä—É–∑–∫–∏.
–ü—Ä–æ–¥–∞–∂–∏ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–æ–≤ –∏ –ø–ª–∞–Ω—à–µ—Ç–æ–≤ —Ä–∞—Å—Ç—É—Ç —Å –∫–∞–∂–¥—ã–º –¥–Ω–µ–º, –∏ —ç—Ç–æ —É–∂–µ –≥–æ–≤–æ—Ä–∏—Ç –æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ—Å—Ç–∏ –ø–æ–≤—ã—à–µ–Ω–Ω–æ–≥–æ –≤–Ω–∏–º–∞–Ω–∏—è –∫ –∏–Ω—Ç–µ—Ä—Ñ–µ–π—Å–∞–º –¥–ª—è –º–æ–±–∏–ª—å–Ω—ã—Ö —É—Å—Ç—Ä–æ–π—Å—Ç–≤.
–ü–æ –∏–º–µ—é—â–∏–º—Å—è –ø—Ä–æ–≥–Ω–æ–∑–∞–º, –≤ —Ç–µ—á–µ–Ω–∏–µ 2011 –≥–æ–¥–∞ –æ–±—ä–µ–º –ø—Ä–æ–¥–∞–∂ –ø–ª–∞–Ω—à–µ—Ç–æ–≤ —Å—É—â–µ—Å—Ç–≤–µ–Ω–Ω–æ –≤–æ–∑—Ä–∞—Å—Ç–µ—Ç, –∞ –æ–±—ä–µ–º –ø—Ä–æ–¥–∞–∂ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–æ–≤ —Å—É—â–µ—Å—Ç–≤–µ–Ω–Ω–æ –ø—Ä–µ–≤—ã—Å–∏—Ç –æ–±—ä–µ–º –ø—Ä–æ–¥–∞–∂ —Ç–µ–ª–µ—Ñ–æ–Ω–æ–≤ —Ç—Ä–∞–¥–∏—

In [78]:
df = pd.DataFrame({"text": sents})
df.to_csv("data/sents/habr.csv", encoding='utf-8')