In [26]:
import codecs
import collections
import gc
import json
import multiprocessing
import os
import re
import subprocess

import networkx as nx
import nltk
import numpy
import sklearn
import sklearn.metrics.pairwise
import sklearn.preprocessing

In [4]:
# One or more consecutive Cyrillic letters (including ё/Ё, which lie outside а-я).
letters_regex = re.compile(r'[а-яА-ЯёЁ]+')
def has_letter(s):
    """Tell whether *s* contains at least one Cyrillic letter.

    Returns the match object for the first run of Cyrillic letters found
    anywhere in *s* (truthy), or ``None`` when there is none.
    """
    # `search` scans the whole string; `match` would only test its beginning
    # and report e.g. "1а" as having no letter.
    return letters_regex.search(s)

In [5]:
def mystem_analyze(text):
    """Run the external ``mystem`` program on *text* and return its parsed output.

    *text* is sent to the process as UTF-8 on stdin. The process is started
    with ``--format=json -nigfcsd``; its stdout is expected to contain one
    JSON value per line, and the values are returned as a list in output
    order.

    Raises ``RuntimeError`` (carrying the captured stderr) when ``mystem``
    exits with a non-zero status, instead of silently returning whatever
    could be parsed from an incomplete stdout.
    """
    command = ["mystem", "--format=json", "-nigfcsd"]
    p = subprocess.Popen(command,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    stdout, stderr = p.communicate(text.encode('utf-8'))
    if p.returncode != 0:
        raise RuntimeError("mystem exited with status %d: %s"
                           % (p.returncode, stderr.decode('utf-8', 'replace').strip()))
    res_str = stdout.decode('utf-8')
    # Split on '\n' only: str.splitlines() also breaks on U+2028, U+0085 and
    # similar characters, which may occur unescaped inside JSON strings.
    # Blank lines are dropped so they cannot produce empty array items.
    lines = [line for line in res_str.split('\n') if line.strip()]
    return json.loads('[%s]' % (",".join(lines)))


In [6]:
def split_by_sentence_marker(analysis):
    """Lazily split a token list into sentences at sentence-marker tokens.

    A marker is a token whose ``'text'`` equals backslash + ``s``. Markers are
    not included in the yielded slices. A slice is yielded for every marker
    (it may be empty), and the tokens after the last marker are yielded only
    when there are any.
    """
    sentence_start = 0
    for position in range(len(analysis)):
        if analysis[position]['text'] != '\\s':
            continue
        yield analysis[sentence_start:position]
        sentence_start = position + 1
    remainder = analysis[sentence_start:]
    if remainder:
        yield remainder


In [20]:
# Record holding the features of one text, in the order get_vector flattens them.
TextFeatures = collections.namedtuple(
    'TextFeatures',
    'avg_words_in_sentence avg_word_length rel_voc_size '
    'rel_hapax_legomena_count rel_pos_count rel_char_count '
    'rel_common_word_count',
)

def char_range(start, end):
    """Return the characters from *start* through *end* (inclusive) as a list,
    ordered by code point. The list is empty when *end* precedes *start*."""
    first_code, last_code = ord(start), ord(end)
    return list(map(chr, range(first_code, last_code + 1)))

# Part-of-speech tags whose relative frequencies get_features reports
# (compared against the leading tag of each word's 'gr' string).
mystem_pos = [
    'A', 'ADV', 'ADVPRO', 'ANUM', 'APRO', 'COM', 'CONJ',
    'INTJ', 'NUM', 'PART', 'PR', 'S', 'SPRO', 'V',
]

# Characters whose relative frequencies get_features reports, in three groups:
# Cyrillic letters (ё/Ё added explicitly, they lie outside а-я / А-Я),
# decimal digits, and a fixed set of punctuation marks.
alpha_chars = char_range('а', 'я') + char_range('А', 'Я') + ['ё', 'Ё']
num_chars = char_range('0', '9')
punct_chars = list(',.?!-:;"\'')

# Lemmas whose relative frequencies get_features reports (looked up in the
# per-text lemma counter).
most_freq_words = [
    'и', 'в', 'не', 'он', 'на', 'я', 'что', 'тот', 'быть', 'с',
    'а', 'весь', 'это', 'как', 'она', 'по', 'но', 'они', 'к', 'у',
    'ты', 'из', 'мы', 'за', 'вы', 'так', 'же', 'от', 'сказать', 'этот',
    'который', 'мочь', 'человек', 'о', 'один', 'еще', 'бы', 'такой', 'только', 'себя',
    'свое', 'какой', 'когда', 'уже', 'для', 'вот', 'кто', 'да', 'говорить', 'год',
    'знать', 'мой', 'до', 'или', 'если', 'время', 'рука', 'нет', 'самый', 'ни',
    'стать', 'большой', 'даже', 'другой', 'наш', 'свой', 'ну', 'под', 'где', 'дело',
    'есть', 'сам', 'раз', 'чтобы', 'два', 'там', 'чем', 'глаз', 'жизнь', 'первый',
    'день', 'тута', 'во', 'ничто', 'потом', 'очень', 'со', 'хотеть', 'ли', 'при',
    'голова', 'надо', 'без', 'видеть', 'идти', 'теперь', 'тоже', 'стоять', 'друг', 'дом',
]

def get_features(analysis):
    """Compute the ``TextFeatures`` record for one analysed text.

    *analysis* is a list of token dicts. Every token must have a ``'text'``
    string. Tokens with a non-empty ``'analysis'`` list count as words; the
    first entry of that list supplies ``'lex'`` (the lemma) and ``'gr'``
    (a grammar string whose leading tag, up to the first of ``, = ( )``,
    is used as the part of speech).

    Returned fields:

    * ``avg_words_in_sentence`` - mean number of words per sentence, with
      sentences as produced by ``split_by_sentence_marker``;
    * ``avg_word_length`` - mean length of the words' ``'text'``;
    * ``rel_voc_size`` - distinct lemmas / words;
    * ``rel_hapax_legomena_count`` - lemmas occurring exactly once / words;
    * ``rel_pos_count`` - list, one share of words per tag in ``mystem_pos``;
    * ``rel_char_count`` - list, one share of characters per entry of
      ``alpha_chars + num_chars + punct_chars``;
    * ``rel_common_word_count`` - list, one share of words per lemma in
      ``most_freq_words``.

    A text without words or characters yields ``0.0`` for the affected
    values instead of raising ``ZeroDivisionError`` or producing ``nan``.
    """
    def ratio(count, total):
        # Nothing to normalise by: report 0.0 rather than divide by zero.
        return count / total if total else 0.0

    def mean(values):
        # numpy.average([]) would warn and return nan.
        return numpy.average(values) if values else 0.0

    words = [w for w in analysis if w.get('analysis')]
    # NOTE(review): sentence-marker tokens (text '\\s') are joined in as well,
    # so they contribute to len(text) below - confirm this is intended.
    text = "".join(w['text'] for w in analysis)
    words_of_sents = [[w for w in sent if w.get('analysis')]
                      for sent in split_by_sentence_marker(analysis)]

    lemma_count = collections.Counter(w['analysis'][0]['lex'] for w in words)
    char_count = collections.Counter(text)
    pos_count = collections.Counter(re.split('[,=()]', w['analysis'][0]['gr'])[0] for w in words)

    word_count = len(words)
    text_length = len(text)
    hapax_count = sum(1 for occurrences in lemma_count.values() if occurrences == 1)

    # The rel_* groups stay plain lists: get_vector concatenates them with `+`.
    return TextFeatures(
        avg_words_in_sentence=mean([len(sent) for sent in words_of_sents]),
        avg_word_length=mean([len(w['text']) for w in words]),
        rel_voc_size=ratio(len(lemma_count), word_count),
        rel_hapax_legomena_count=ratio(hapax_count, word_count),
        rel_pos_count=[ratio(pos_count[p], word_count) for p in mystem_pos],
        rel_char_count=[ratio(char_count[c], text_length)
                        for c in alpha_chars + num_chars + punct_chars],
        rel_common_word_count=[ratio(lemma_count[w], word_count) for w in most_freq_words]
    )
    

In [8]:
def get_vector(f):
    """Flatten a feature record into one 1-D numpy array.

    The four scalar features come first, followed by the entries of
    ``rel_pos_count``, ``rel_char_count`` and ``rel_common_word_count``
    (each expected to be a list), in that order.
    """
    scalar_part = [
        f.avg_words_in_sentence,
        f.avg_word_length,
        f.rel_voc_size,
        f.rel_hapax_legomena_count,
    ]
    list_part = f.rel_pos_count + f.rel_char_count + f.rel_common_word_count
    return numpy.array(scalar_part + list_part)

In [21]:
def get_features_from_file(file):
    """Read a UTF-8 file holding one JSON token record per line and return
    the ``TextFeatures`` computed from it by ``get_features``.

    The path is printed first, as a progress trace.
    """
    print(file)
    with codecs.open(file, encoding='utf-8') as f:
        content = f.read()
    # Split on '\n' only: str.splitlines() also breaks on U+2028, U+0085 and
    # similar characters, which may occur unescaped inside JSON strings.
    # Blank lines are skipped instead of being handed to json.loads.
    analysis = [json.loads(line) for line in content.split('\n') if line.strip()]
    return get_features(analysis)

# One TextFeatures record per entry of the "json" directory, in the order
# os.listdir returns the entries.
features = [
    get_features_from_file(os.path.join("json", file_name))
    for file_name in os.listdir("json")
]

json\Aleksievich_Golosa-Utopii_1_U-voyny-ne-zhenskoe-lico-.VNQLMg.70938.fb2.txt.json
json\Aleksievich_Golosa-Utopii_3_Cinkovye-malchiki.NVa1uw.427115.fb2.txt.json
json\Aleksievich_Golosa-Utopii_5_Vremya-sekond-hend.9FJziw.426019.fb2.txt.json
json\Aleksin_Anatoliy-Aleksin-Sobranie-sochineniy-v-treh-tomah_1_Bezumnaya-Evdokiya.gkQ47w.347944.fb2.txt.json
json\Aleksin_Moy-brat-igraet-na-klarnete.oGHedw.1061.fb2.txt.json
json\Aleksin_Pozdniy-rebenok.pCeYeQ.1068.fb2.txt.json
json\Aleksin_Razdel-imushchestva.qgz_cw.1070.fb2.txt.json
json\Aleksin_Zdorovye-i-bolnye.FJTSAw.122412.fb2.txt.json
json\Andreev_Angelochek.b_uQkA.125257.fb2.txt.json
json\Andreev_Daniil-Andreev-Sobranie-sochineniy-v-4-tomah_2_Zheleznaya-misteriya.r42e2Q.187978.fb2.txt.json
json\Andreev_Iuda-Iskariot.xLnnCQ.288557.fb2.txt.json
json\Andreev_Krasnyy-smeh.X1Y_9Q.65864.fb2.txt.json
json\Andreev_Rasskaz-o-semi-poveshennyh.1L2XlQ.65888.fb2.txt.json
json\Andreev_Roza-Mira.QoAqJg.245158.fb2.txt.json
json\Andreev_Zhizn-Vasiliya-Fi

In [24]:
# File names in the "json" directory; used further down as the node names
# for the distance graph.
# NOTE(review): `features` is built from a separate os.listdir("json") call,
# so index i of book_files matches row i of the feature matrix only if both
# calls return the entries in the same order - confirm, or derive both from
# one listing.
book_files = os.listdir("json")
book_files

['Aleksievich_Golosa-Utopii_1_U-voyny-ne-zhenskoe-lico-.VNQLMg.70938.fb2.txt.json',
 'Aleksievich_Golosa-Utopii_3_Cinkovye-malchiki.NVa1uw.427115.fb2.txt.json',
 'Aleksievich_Golosa-Utopii_5_Vremya-sekond-hend.9FJziw.426019.fb2.txt.json',
 'Aleksin_Anatoliy-Aleksin-Sobranie-sochineniy-v-treh-tomah_1_Bezumnaya-Evdokiya.gkQ47w.347944.fb2.txt.json',
 'Aleksin_Moy-brat-igraet-na-klarnete.oGHedw.1061.fb2.txt.json',
 'Aleksin_Pozdniy-rebenok.pCeYeQ.1068.fb2.txt.json',
 'Aleksin_Razdel-imushchestva.qgz_cw.1070.fb2.txt.json',
 'Aleksin_Zdorovye-i-bolnye.FJTSAw.122412.fb2.txt.json',
 'Andreev_Angelochek.b_uQkA.125257.fb2.txt.json',
 'Andreev_Daniil-Andreev-Sobranie-sochineniy-v-4-tomah_2_Zheleznaya-misteriya.r42e2Q.187978.fb2.txt.json',
 'Andreev_Iuda-Iskariot.xLnnCQ.288557.fb2.txt.json',
 'Andreev_Krasnyy-smeh.X1Y_9Q.65864.fb2.txt.json',
 'Andreev_Rasskaz-o-semi-poveshennyh.1L2XlQ.65888.fb2.txt.json',
 'Andreev_Roza-Mira.QoAqJg.245158.fb2.txt.json',
 'Andreev_Zhizn-Vasiliya-Fiveyskogo.w95XsA.7

In [11]:
# Show the field names of the first feature record.
features[0]._asdict().keys()

odict_keys(['avg_words_in_sentence', 'avg_word_length', 'rel_voc_size', 'rel_hapax_legomena_count', 'rel_pos_count', 'rel_char_count', 'rel_common_word_count'])

In [12]:
# Stack one flattened feature vector per text into a 2-D matrix
# (rows follow the order of `features`), then rescale it with
# sklearn.preprocessing.scale using that function's defaults.
# sklearn.preprocessing is a submodule and is imported explicitly in the
# imports cell: a bare `import sklearn` does not guarantee it is loaded.
feature_matrix = numpy.array([get_vector(f) for f in features])
# Fail early with a clear message if the vectors do not form a rectangular matrix.
assert feature_matrix.ndim == 2, "feature vectors must be non-empty and of equal length"
scaled_feature_matrix = sklearn.preprocessing.scale(feature_matrix)

In [13]:
# Dimensions of the scaled matrix: (number of texts, number of features).
scaled_feature_matrix.shape

(133, 203)

In [30]:
# Distance between every pair of rows of the scaled feature matrix
# (pairwise_distances is called with its default metric).
eucl_dists = sklearn.metrics.pairwise.pairwise_distances(scaled_feature_matrix)
eucl_dists.shape

(133, 133)

In [83]:
def dist_graph(dists, names, threshold):
    """Build an undirected graph linking items that lie close to each other.

    Node ``i`` stands for ``names[i]`` and carries an ``author`` attribute:
    the part of the name before the first underscore. For every entry of
    *dists* at index ``(i, j)`` with ``0 < dist <= threshold`` an edge
    ``(i, j)`` with weight ``1 / dist`` is added; zero distances (such as
    the diagonal) are skipped, which also avoids dividing by zero.
    """
    graph = nx.Graph()
    for index, book in enumerate(names):
        graph.add_node(index, author=book.split('_')[0])
    for (row, col), dist in numpy.ndenumerate(dists):
        if 0 < dist <= threshold:
            graph.add_edge(row, col, weight=1/dist)
    return graph

# Build the graph for a distance threshold of 15 and write it to graph.gml.
nx.write_gml(dist_graph(eucl_dists, book_files, 15), 'graph.gml')

In [77]:
# Look up the file name stored at index 120 of book_files.
book_files[120]

'Makarenko_Pedagogicheskaya-poema.E2ambg.69167.fb2.txt.json'

In [82]:
# Expand the distance matrix into ((row, col), distance) pairs.
# NOTE(review): this materialises and displays every entry of the matrix;
# consider inspecting a slice instead.
list(numpy.ndenumerate(eucl_dists))

[((0, 0), 0.0),
 ((0, 1), 11.804704841105524),
 ((0, 2), 12.546320778143993),
 ((0, 3), 17.897781373390561),
 ((0, 4), 16.257392027183212),
 ((0, 5), 16.426748739977626),
 ((0, 6), 15.925285885950636),
 ((0, 7), 21.488016717685809),
 ((0, 8), 24.039626945346281),
 ((0, 9), 43.500296288745538),
 ((0, 10), 19.632860610393998),
 ((0, 11), 16.217419819531106),
 ((0, 12), 17.480871234758418),
 ((0, 13), 22.348796837125477),
 ((0, 14), 20.468044202184622),
 ((0, 15), 22.404092528467977),
 ((0, 16), 21.760456822853865),
 ((0, 17), 16.54251372053055),
 ((0, 18), 17.066444326096942),
 ((0, 19), 35.681097679937039),
 ((0, 20), 16.142553485547094),
 ((0, 21), 31.548295722601836),
 ((0, 22), 28.504843348223311),
 ((0, 23), 16.608903933868827),
 ((0, 24), 14.900916616568795),
 ((0, 25), 14.983907146957817),
 ((0, 26), 15.733467314235924),
 ((0, 27), 16.220747432175351),
 ((0, 28), 16.970090762376543),
 ((0, 29), 18.706151743523815),
 ((0, 30), 19.273588348990209),
 ((0, 31), 17.476834244384104),
 (

In [85]:
# Show every book file name next to its index in book_files.
list(enumerate(book_files))

[(0,
  'Aleksievich_Golosa-Utopii_1_U-voyny-ne-zhenskoe-lico-.VNQLMg.70938.fb2.txt.json'),
 (1,
  'Aleksievich_Golosa-Utopii_3_Cinkovye-malchiki.NVa1uw.427115.fb2.txt.json'),
 (2,
  'Aleksievich_Golosa-Utopii_5_Vremya-sekond-hend.9FJziw.426019.fb2.txt.json'),
 (3,
  'Aleksin_Anatoliy-Aleksin-Sobranie-sochineniy-v-treh-tomah_1_Bezumnaya-Evdokiya.gkQ47w.347944.fb2.txt.json'),
 (4, 'Aleksin_Moy-brat-igraet-na-klarnete.oGHedw.1061.fb2.txt.json'),
 (5, 'Aleksin_Pozdniy-rebenok.pCeYeQ.1068.fb2.txt.json'),
 (6, 'Aleksin_Razdel-imushchestva.qgz_cw.1070.fb2.txt.json'),
 (7, 'Aleksin_Zdorovye-i-bolnye.FJTSAw.122412.fb2.txt.json'),
 (8, 'Andreev_Angelochek.b_uQkA.125257.fb2.txt.json'),
 (9,
  'Andreev_Daniil-Andreev-Sobranie-sochineniy-v-4-tomah_2_Zheleznaya-misteriya.r42e2Q.187978.fb2.txt.json'),
 (10, 'Andreev_Iuda-Iskariot.xLnnCQ.288557.fb2.txt.json'),
 (11, 'Andreev_Krasnyy-smeh.X1Y_9Q.65864.fb2.txt.json'),
 (12, 'Andreev_Rasskaz-o-semi-poveshennyh.1L2XlQ.65888.fb2.txt.json'),
 (13, 'Andreev_

In [87]:
# Distance between rows 100 and 116 of the scaled feature matrix.
eucl_dists[100][116]

12.916021486427113