In [58]:
import codecs
import collections
import gc
import json
import multiprocessing
import os
import re
import subprocess

import nltk
import numpy
import sklearn
import sklearn.preprocessing

In [5]:
# One or more consecutive Cyrillic letters (including the letter "ё" in both cases).
letters_regex = re.compile(r'[а-яА-ЯёЁ]+')


def has_letter(s):
    """Tell whether ``s`` contains at least one Cyrillic letter.

    Args:
        s: the string to inspect.

    Returns:
        The ``re`` match object for the first run of Cyrillic letters found
        anywhere in ``s`` (truthy), or ``None`` when there is none.
    """
    # search() scans the whole string; match() would only look at position 0
    # and report "no letter" for tokens such as "123абв".
    return letters_regex.search(s)

In [6]:
def mystem_analyze(text):
    """Run the external ``mystem`` tool on ``text`` and parse its JSON output.

    The text is UTF-8 encoded and piped to ``mystem --format=json -nigfcsd``.
    Every non-blank line of the tool's standard output is parsed as one JSON
    document.

    Args:
        text: the string to analyse.

    Returns:
        A list with one parsed JSON value per non-blank output line, in
        output order.

    Raises:
        RuntimeError: if ``mystem`` exits with a non-zero status; the message
            carries the tool's stderr.
        OSError: if the ``mystem`` executable cannot be started.
        ValueError: if an output line is not valid JSON.
    """
    command = ["mystem", "--format=json", "-nigfcsd"]
    p = subprocess.Popen(command,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    stdout, stderr = p.communicate(text.encode('utf-8'))
    if p.returncode != 0:
        # Surface tool failures instead of silently returning an empty result.
        raise RuntimeError('mystem exited with status %s: %s'
                           % (p.returncode, stderr.decode('utf-8', 'replace').strip()))
    # Split on "\n" only: str.splitlines() also breaks on U+2028, U+0085 and
    # similar characters, which may appear unescaped inside JSON strings.
    lines = stdout.decode('utf-8').split('\n')
    # Blank lines carry no document and would otherwise yield invalid JSON.
    return [json.loads(line) for line in lines if line.strip()]


In [7]:
def split_by_sentence_marker(analysis):
    """Lazily cut ``analysis`` into sentences at sentence-marker items.

    An item is a marker when its ``'text'`` value is a backslash followed by
    the letter ``s``. Markers themselves are dropped from the output.

    Args:
        analysis: a sliceable sequence of dicts, each with a ``'text'`` key.

    Yields:
        The slice of items preceding each marker (possibly empty), then the
        items after the last marker if there are any.
    """
    sentence_start = 0
    for position, item in enumerate(analysis):
        if item['text'] != '\\s':
            continue
        yield analysis[sentence_start:position]
        sentence_start = position + 1
    remainder = analysis[sentence_start:]
    if remainder:
        yield remainder


In [44]:
# Record type for the features extracted from one text. get_features() fills
# the first four fields with scalars and the last three with lists.
TextFeatures = collections.namedtuple(
    'TextFeatures',
    [
        'avg_words_in_sentence',
        'avg_word_length',
        'rel_voc_size',
        'rel_hapax_legomena_count',
        'rel_pos_count',
        'rel_char_count',
        'rel_common_word_count',
    ],
)

def char_range(start, end):
    """Return the characters from ``start`` to ``end`` inclusive, by code point.

    Args:
        start: single-character string giving the first character.
        end: single-character string giving the last character.

    Returns:
        A list of one-character strings; empty when ``end`` precedes ``start``.
    """
    first_code, last_code = ord(start), ord(end)
    return list(map(chr, range(first_code, last_code + 1)))

# Part-of-speech tags whose relative frequencies get_features() reports,
# in the order they appear in the feature vector.
mystem_pos = [
    'A', 'ADV', 'ADVPRO', 'ANUM', 'APRO', 'COM', 'CONJ',
    'INTJ', 'NUM', 'PART', 'PR', 'S', 'SPRO', 'V',
]

# Characters whose relative frequencies are reported: Cyrillic letters
# ('ё'/'Ё' lie outside the а-я / А-Я code point ranges, so they are appended),
# decimal digits and a fixed set of punctuation marks.
alpha_chars = (
    char_range('а', 'я')
    + char_range('А', 'Я')
    + ['ё', 'Ё']
)
num_chars = [str(digit) for digit in range(10)]
punct_chars = [',', '.', '?', '!', '-', ':', ';', '"', "'"]

# Words looked up by lemma in get_features(); list order defines feature order.
# NOTE(review): 'тута' and 'свое' look unusual as lemmas — confirm against the
# frequency list this table was taken from.
most_freq_words = [
    'и', 'в', 'не', 'он', 'на', 'я', 'что', 'тот', 'быть', 'с',
    'а', 'весь', 'это', 'как', 'она', 'по', 'но', 'они', 'к', 'у',
    'ты', 'из', 'мы', 'за', 'вы', 'так', 'же', 'от', 'сказать', 'этот',
    'который', 'мочь', 'человек', 'о', 'один', 'еще', 'бы', 'такой', 'только', 'себя',
    'свое', 'какой', 'когда', 'уже', 'для', 'вот', 'кто', 'да', 'говорить', 'год',
    'знать', 'мой', 'до', 'или', 'если', 'время', 'рука', 'нет', 'самый', 'ни',
    'стать', 'большой', 'даже', 'другой', 'наш', 'свой', 'ну', 'под', 'где', 'дело',
    'есть', 'сам', 'раз', 'чтобы', 'два', 'там', 'чем', 'глаз', 'жизнь', 'первый',
    'день', 'тута', 'во', 'ничто', 'потом', 'очень', 'со', 'хотеть', 'ли', 'при',
    'голова', 'надо', 'без', 'видеть', 'идти', 'теперь', 'тоже', 'стоять', 'друг', 'дом',
]

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def get_features(analysis):
    """Compute the TextFeatures record for one analysed text.

    Args:
        analysis: list of token dicts. Every item has a ``'text'`` key; items
            with a non-empty ``'analysis'`` list are treated as words, and the
            first entry of that list supplies the lemma (``'lex'``) and the
            grammar string (``'gr'``).

    Returns:
        A TextFeatures tuple with:
            avg_words_in_sentence: mean number of words per sentence, where
                sentences come from split_by_sentence_marker().
            avg_word_length: mean length of the words' ``'text'``.
            rel_voc_size: number of distinct lemmas / number of words.
            rel_hapax_legomena_count: lemmas occurring once / number of words.
            rel_pos_count: per-tag share of words, one entry per ``mystem_pos``.
            rel_char_count: per-character share of the text, one entry per
                character in ``alpha_chars + num_chars + punct_chars``.
            rel_common_word_count: per-lemma share of words, one entry per
                ``most_freq_words``.
        Every ratio whose denominator would be zero (no words, empty text,
        no sentences) is reported as 0.0.
    """
    words = [w for w in analysis if w.get('analysis')]
    # NOTE(review): sentence-marker items are part of `analysis`, so their text
    # is joined in here and counts toward len(text) — confirm this is intended.
    text = "".join(w["text"] for w in analysis)
    sents = list(split_by_sentence_marker(analysis))
    words_of_sents = [[w for w in s if w.get('analysis')] for s in sents]
    lemme_count = collections.Counter(w['analysis'][0]['lex'] for w in words)
    char_count = collections.Counter(text)

    word_count = len(words)
    text_length = len(text)

    # numpy.average([]) yields NaN with a warning, so guard the no-sentence case.
    if words_of_sents:
        avg_words_in_sentence = numpy.average([len(sent) for sent in words_of_sents])
    else:
        avg_words_in_sentence = 0.0
    # Mean surface length of a word (previously word_count / len(words), i.e. always 1).
    avg_word_length = _safe_ratio(sum(len(w['text']) for w in words), word_count)
    rel_voc_size = _safe_ratio(len(lemme_count), word_count)
    hapax_count = sum(1 for lemma in lemme_count if lemme_count[lemma] == 1)
    rel_hapax_legomena_count = _safe_ratio(hapax_count, word_count)
    # The part of speech is the leading tag of the grammar string, i.e. everything
    # before the first ',', '=', '(' or ')'.
    pos_count = collections.Counter(re.split('[,=()]', w['analysis'][0]['gr'])[0] for w in words)
    rel_pos_count = [_safe_ratio(pos_count[p], word_count) for p in mystem_pos]
    rel_char_count = [_safe_ratio(char_count[c], text_length)
                      for c in alpha_chars + num_chars + punct_chars]
    rel_common_word_count = [_safe_ratio(lemme_count[w], word_count) for w in most_freq_words]
    return TextFeatures(
        avg_words_in_sentence=avg_words_in_sentence,
        avg_word_length=avg_word_length,
        rel_voc_size=rel_voc_size,
        rel_hapax_legomena_count=rel_hapax_legomena_count,
        rel_pos_count=rel_pos_count,
        rel_char_count=rel_char_count,
        rel_common_word_count=rel_common_word_count
    )
    

In [41]:
def get_vector(f):
    """Flatten a feature record into a single 1-D numpy array.

    Args:
        f: object exposing the TextFeatures fields; the three ``rel_*_count``
            group fields must be lists.

    Returns:
        numpy array holding the four scalar features followed by the
        part-of-speech, character and common-word ratios, in that order.
    """
    values = [
        f.avg_words_in_sentence,
        f.avg_word_length,
        f.rel_voc_size,
        f.rel_hapax_legomena_count,
    ]
    for group in (f.rel_pos_count, f.rel_char_count, f.rel_common_word_count):
        # Plain list concatenation, so a non-list group fails loudly.
        values = values + group
    return numpy.array(values)

In [None]:
def get_features_from_file(file):
    """Read a file of per-line JSON tokens and compute its text features.

    Args:
        file: path to a UTF-8 text file holding one JSON document per line.

    Returns:
        The TextFeatures record produced by get_features() for the parsed lines.

    Raises:
        ValueError: if a non-blank line is not valid JSON.
    """
    print(file)  # progress trace: one line per processed file
    with codecs.open(file, encoding='utf-8') as f:
        content = f.read()
    # Split on "\n" only: str.splitlines() also breaks on U+2028, U+0085 and
    # similar characters, which may appear unescaped inside JSON strings.
    # Blank lines (e.g. a trailing newline) carry no document and are skipped.
    analysis = [json.loads(line) for line in content.split('\n') if line.strip()]
    return get_features(analysis)

# One feature record per entry of the "json" directory, in os.listdir() order.
# NOTE(review): os.listdir() order is not guaranteed to be stable — confirm that
# anything aligned with `features` by index derives its order the same way.
features = [
    get_features_from_file(os.path.join("json", entry_name))
    for entry_name in os.listdir("json")
]

In [46]:
# Show the entries of the "json" directory that the feature-extraction cell iterates over.
os.listdir("json")

['Aleksievich_Golosa-Utopii_1_U-voyny-ne-zhenskoe-lico-.VNQLMg.70938.fb2.txt.json',
 'Aleksievich_Golosa-Utopii_3_Cinkovye-malchiki.NVa1uw.427115.fb2.txt.json',
 'Aleksievich_Golosa-Utopii_5_Vremya-sekond-hend.9FJziw.426019.fb2.txt.json',
 'Aleksin_Anatoliy-Aleksin-Sobranie-sochineniy-v-treh-tomah_1_Bezumnaya-Evdokiya.gkQ47w.347944.fb2.txt.json',
 'Aleksin_Moy-brat-igraet-na-klarnete.oGHedw.1061.fb2.txt.json',
 'Aleksin_Pozdniy-rebenok.pCeYeQ.1068.fb2.txt.json',
 'Aleksin_Razdel-imushchestva.qgz_cw.1070.fb2.txt.json',
 'Aleksin_Zdorovye-i-bolnye.FJTSAw.122412.fb2.txt.json',
 'Andreev_Angelochek.b_uQkA.125257.fb2.txt.json',
 'Andreev_Daniil-Andreev-Sobranie-sochineniy-v-4-tomah_2_Zheleznaya-misteriya.r42e2Q.187978.fb2.txt.json',
 'Andreev_Iuda-Iskariot.xLnnCQ.288557.fb2.txt.json',
 'Andreev_Krasnyy-smeh.X1Y_9Q.65864.fb2.txt.json',
 'Andreev_Rasskaz-o-semi-poveshennyh.1L2XlQ.65888.fb2.txt.json',
 'Andreev_Roza-Mira.QoAqJg.245158.fb2.txt.json',
 'Andreev_Zhizn-Vasiliya-Fiveyskogo.w95XsA.7

In [69]:
# Inspect the field names carried by the first extracted feature record.
features[0]._asdict().keys()

odict_keys(['avg_words_in_sentence', 'avg_word_length', 'rel_voc_size', 'rel_hapax_legomena_count', 'rel_pos_count', 'rel_char_count', 'rel_common_word_count'])

In [59]:
# Stack one flattened feature vector per text into a 2-D matrix (one row per text).
feature_matrix = numpy.array([get_vector(f) for f in features])
# Standardize the columns with scikit-learn's scale(). This needs the
# sklearn.preprocessing submodule to be imported explicitly: a bare
# `import sklearn` does not guarantee the attribute exists.
scaled_feature_matrix = sklearn.preprocessing.scale(feature_matrix)

In [70]:
# Dimensions of the scaled matrix as (rows, columns).
scaled_feature_matrix.shape

(133, 203)