In [9]:
import codecs
import re
import nltk
import collections
import numpy
import subprocess
import json
import multiprocessing
import os
import gc
import sklearn
import networkx as nx

In [10]:
letters_regex = re.compile(r'[а-яА-ЯёЁ]+')
def has_letter(s):
    return letters_regex.match(s)

In [11]:
def mystem_analyze(text):
    p = subprocess.Popen(["mystem", "--format=json", "-nigfcsd"],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    stdout, stderr = p.communicate(text.encode('utf-8'))
    res_str = stdout.decode('utf-8')
    return json.loads('[%s]' % (",".join(res_str.splitlines())))


In [12]:
def split_by_sentence_marker(analysis):
    last = 0
    for i, a in enumerate(analysis):
        if a['text'] == '\\s':
            yield analysis[last:i]
            last = i + 1
    if analysis[last:]:
        yield analysis[last:]


In [13]:
TextFeatures = collections.namedtuple('TextFeatures', 'avg_words_in_sentence avg_word_length rel_voc_size rel_hapax_legomena_count rel_pos_count rel_char_count rel_common_word_count')

def char_range(start, end):
    return [chr(char) for char in range(ord(start), ord(end) + 1)]

mystem_pos = ['A', 'ADV', 'ADVPRO', 'ANUM', 'APRO', 'COM', 'CONJ', 'INTJ', 'NUM', 'PART', 'PR', 'S', 'SPRO', 'V']
alpha_chars = char_range('а', 'я') + char_range('А', 'Я') + ['ё', 'Ё']
num_chars = char_range('0', '9')
punct_chars = list(',.?!-:;"\'')
most_freq_words = ['и', 'в', 'не', 'он', 'на', 'я', 'что', 'тот', 'быть', 'с', 'а', 'весь', 'это', 'как', 'она', 'по', 'но', 'они', 'к', 'у', 'ты', 'из', 'мы', 'за', 'вы', 'так', 'же', 'от', 'сказать', 'этот', 'который', 'мочь', 'человек', 'о', 'один', 'еще', 'бы', 'такой', 'только', 'себя', 'свое', 'какой', 'когда', 'уже', 'для', 'вот', 'кто', 'да', 'говорить', 'год', 'знать', 'мой', 'до', 'или', 'если', 'время', 'рука', 'нет', 'самый', 'ни', 'стать', 'большой', 'даже', 'другой', 'наш', 'свой', 'ну', 'под', 'где', 'дело', 'есть', 'сам', 'раз', 'чтобы', 'два', 'там', 'чем', 'глаз', 'жизнь', 'первый', 'день', 'тута', 'во', 'ничто', 'потом', 'очень', 'со', 'хотеть', 'ли', 'при', 'голова', 'надо', 'без', 'видеть', 'идти', 'теперь', 'тоже', 'стоять', 'друг', 'дом']

def get_features(analysis):
    words = [w for w in analysis if w.get('analysis')]
    text = "".join([w['text'] for w in analysis])
    sents = list(split_by_sentence_marker(analysis))
    words_of_sents = [[w for w in s if w.get('analysis')] for s in sents]
    lemma_count = collections.Counter([w['analysis'][0]['lex'] for w in words])
    char_count = collections.Counter(text)
    
    avg_words_in_sentence = numpy.average([len(sent) for sent in words_of_sents])
    word_count = len(words)
    avg_word_length = numpy.average([len(w['text']) for w in words])
    rel_voc_size = len(lemma_count) / word_count
    rel_hapax_legomena_count = len([w for w in lemma_count if lemma_count[w] == 1]) / word_count
    pos_count = collections.Counter(re.split('[,=()]', w['analysis'][0]['gr'])[0] for w in words)
    rel_pos_count = [pos_count[p] / word_count for p in mystem_pos]
    rel_char_count = [char_count[c] / len(text) for c in alpha_chars + num_chars + punct_chars]
    rel_common_word_count = [lemma_count[w] / word_count for w in most_freq_words]
    return TextFeatures(
        avg_words_in_sentence=avg_words_in_sentence,
        avg_word_length=avg_word_length,
        rel_voc_size=rel_voc_size,
        rel_hapax_legomena_count=rel_hapax_legomena_count,
        rel_pos_count=rel_pos_count,
        rel_char_count=rel_char_count,
        rel_common_word_count=rel_common_word_count
    )
    

In [14]:
def get_vector(f):
    return numpy.array([f.avg_words_in_sentence, f.avg_word_length, f.rel_voc_size, f.rel_hapax_legomena_count] +
        f.rel_pos_count + f.rel_char_count + f.rel_common_word_count)

In [15]:
def get_features_from_file(file):
    print(file)
    with codecs.open(file, encoding='utf-8') as f:
        content = f.read()
        analysis = [json.loads(l) for l in content.splitlines()]
        return get_features(analysis)

features = [get_features_from_file(os.path.join("json", file)) for file in os.listdir("json")]

json\Abramov_Pryasliny_1_Bratya-i-sestry.edbdyw.158030.fb2.txt.json
json\Abramov_Pryasliny_2_Dve-zimy-i-tri-leta.hU9IrQ.158033.fb2.txt.json
json\Abramov_Pryasliny_3_Puti-pereputya.ktaB0Q.158036.fb2.txt.json
json\Abramov_Pryasliny_4_Dom.FWiheg.158032.fb2.txt.json
json\Aksenov_Zvezdnyy-bilet.HMAp3Q.677.fb2.txt.json
json\Akunin_Priklyucheniya-Erasta-Fandorina_2_Tureckiy-gambit.xK5KVg.305713.fb2.txt.json
json\Akunin_Priklyucheniya-Erasta-Fandorina_7_Statskiy-sovetnik.5irVkw.461230.fb2.txt.json
json\Akunin_Priklyucheniya-Nikolasa-Fandorina_1_Altyn-Tolobas.Uz6tpg.131819.fb2.txt.json
json\Aleksievich_Golosa-Utopii_1_U-voyny-ne-zhenskoe-lico-.VNQLMg.70938.fb2.txt.json
json\Aleksievich_Golosa-Utopii_3_Cinkovye-malchiki.NVa1uw.427115.fb2.txt.json
json\Aleksievich_Golosa-Utopii_5_Vremya-sekond-hend.9FJziw.426019.fb2.txt.json
json\Aleksin_Anatoliy-Aleksin-Sobranie-sochineniy-v-treh-tomah_1_Bezumnaya-Evdokiya.gkQ47w.347944.fb2.txt.json
json\Aleksin_Moy-brat-igraet-na-klarnete.oGHedw.1061.fb2.txt.js

json\Gorkiy_Delo-Artamonovyh.O2tuGQ.225353.fb2.txt.json
json\Gorkiy_Mat.bDeL0g.19772.fb2.txt.json
json\Gorkiy_Trilogiya-Maksim-Gorkiy-_3_Moi-universitety.A4iIZw.19780.fb2.txt.json
json\Granin_Eshche-zameten-sled.qk5PRw.149025.fb2.txt.json
json\Granin_Idu-na-grozu.rC7RHg.499588.fb2.txt.json
json\Granin_Iskateli.bhevgw.498584.fb2.txt.json
json\Granin_Kartina.pp67Hg.20477.fb2.txt.json
json\Granin_Nash-dorogoy-Roman-Avdeevich.mi15NA.490942.fb2.txt.json
json\Grin_Alye-parusa.LSYj6A.151346.fb2.txt.json
json\Grin_Blistayushchiy-mir.Kd-LQg.74681.fb2.txt.json
json\Grishkovec_A-a._jyHkg.189626.fb2.txt.json
json\Grishkovec_Asfalt.C9Ar2g.183415.fb2.txt.json
json\Grishkovec_Reki.xC3f6g.183416.fb2.txt.json
json\Grishkovec_Rubashka.Go3d8Q.186275.fb2.txt.json
json\Grossman_Za-pravoe-delo.W2HG_Q.366727.fb2.txt.json
json\Grossman_Zhizn-i-sudba.kkKMgw.312635.fb2.txt.json
json\Iskander_Kroliki-i-udavy.zk4xWA.70498.fb2.txt.json
json\Iskander_Morskoy-skorpion.UiuNBg.153215.fb2.txt.json
json\Iskander_Sandro-

json\Pristavkin_Soldat-i-malchik.5Y_pgw.77143.fb2.txt.json
json\Prohanov_Gospodin-Geksogen.nkwiOg.44510.fb2.txt.json
json\Prohanov_Mesto-deystviya.5SybzQ.185965.fb2.txt.json
json\Prohanov_Shestsot-let-posle-bitvy.hZezLw.234281.fb2.txt.json
json\Prokofeva_Ostrov-kapitanov.6caUVQ.183317.fb2.txt.json
json\Prokofeva_Uchenik-volshebnika.9DYzFQ.183311.fb2.txt.json
json\Pushkin_Dubrovskiy.0ZmHPw.253788.fb2.txt.json
json\Pushkin_Pikovaya-Dama.WF0dzQ.77039.fb2.txt.json
json\Pushkin_Povesti-pokoynogo-Ivana-Petrovicha-Belkina_4_Stancionnyy-smotritel.6OB8pQ.348954.fb2.txt.json
json\Rasputin_Posledniy-srok.xGsIOw.330813.fb2.txt.json
json\Rasputin_Pozhar.vCw3EQ.97467.fb2.txt.json
json\Rasputin_Proshchanie-s-Materoy.FOOsHA.98447.fb2.txt.json
json\Rasputin_Sovremennaya-russkaya-i-zarubezhnaya-proza_1_Uroki-francuzskogo.9Knb7w.397901.fb2.txt.json
json\Rasputin_Zhivi-i-pomni.GiHLwg.149476.fb2.txt.json
json\Reshetnikov_Podlipovcy._pL2fw.137488.fb2.txt.json
json\Rubina_Belaya-golubka-Kordovy.J9cWKg.173678

In [16]:
book_files = os.listdir("json")
book_files

['Abramov_Pryasliny_1_Bratya-i-sestry.edbdyw.158030.fb2.txt.json',
 'Abramov_Pryasliny_2_Dve-zimy-i-tri-leta.hU9IrQ.158033.fb2.txt.json',
 'Abramov_Pryasliny_3_Puti-pereputya.ktaB0Q.158036.fb2.txt.json',
 'Abramov_Pryasliny_4_Dom.FWiheg.158032.fb2.txt.json',
 'Aksenov_Zvezdnyy-bilet.HMAp3Q.677.fb2.txt.json',
 'Akunin_Priklyucheniya-Erasta-Fandorina_2_Tureckiy-gambit.xK5KVg.305713.fb2.txt.json',
 'Akunin_Priklyucheniya-Erasta-Fandorina_7_Statskiy-sovetnik.5irVkw.461230.fb2.txt.json',
 'Akunin_Priklyucheniya-Nikolasa-Fandorina_1_Altyn-Tolobas.Uz6tpg.131819.fb2.txt.json',
 'Aleksievich_Golosa-Utopii_1_U-voyny-ne-zhenskoe-lico-.VNQLMg.70938.fb2.txt.json',
 'Aleksievich_Golosa-Utopii_3_Cinkovye-malchiki.NVa1uw.427115.fb2.txt.json',
 'Aleksievich_Golosa-Utopii_5_Vremya-sekond-hend.9FJziw.426019.fb2.txt.json',
 'Aleksin_Anatoliy-Aleksin-Sobranie-sochineniy-v-treh-tomah_1_Bezumnaya-Evdokiya.gkQ47w.347944.fb2.txt.json',
 'Aleksin_Moy-brat-igraet-na-klarnete.oGHedw.1061.fb2.txt.json',
 'Aleksin_

In [17]:
features[0]._asdict().keys()

odict_keys(['avg_words_in_sentence', 'avg_word_length', 'rel_voc_size', 'rel_hapax_legomena_count', 'rel_pos_count', 'rel_char_count', 'rel_common_word_count'])

In [18]:
feature_matrix = numpy.array([get_vector(f) for f in features])
scaled_feature_matrix = sklearn.preprocessing.scale(feature_matrix)

In [19]:
numpy.shape(scaled_feature_matrix)

(388, 203)

In [20]:
eucl_dists = sklearn.metrics.pairwise.pairwise_distances(scaled_feature_matrix)
numpy.shape(eucl_dists)

(388, 388)

In [21]:
def dist_graph(dists, names, threshold):
    
    def author(book):
        return book.split('_')[0]
    
    g = nx.Graph()
    g.add_nodes_from((i, dict(author=author(book))) for i, book in enumerate(names))
    g.add_weighted_edges_from((i, j, 1/dist) for (i, j), dist in numpy.ndenumerate(dists) if dist <= threshold and dist > 0)
    return g

nx.write_gml(dist_graph(eucl_dists, book_files, 15), 'graph.gml')

In [22]:
book_files[120]

'Gaydar_Golubaya-chashka.-XXvtA.17600.fb2.txt.json'

In [23]:
list(numpy.ndenumerate(eucl_dists))

[((0, 0), 0.0),
 ((0, 1), 8.1511541894100894),
 ((0, 2), 10.049049587639816),
 ((0, 3), 10.527583449393106),
 ((0, 4), 15.452332678185435),
 ((0, 5), 18.722178360352416),
 ((0, 6), 14.828094189844037),
 ((0, 7), 15.090709102459892),
 ((0, 8), 17.604406263541286),
 ((0, 9), 17.828024621825058),
 ((0, 10), 18.017877143066269),
 ((0, 11), 19.587688534540622),
 ((0, 12), 18.974276220347779),
 ((0, 13), 20.323803701925126),
 ((0, 14), 18.445985579295957),
 ((0, 15), 22.297764993516807),
 ((0, 16), 11.29272612343822),
 ((0, 17), 13.132218845699789),
 ((0, 18), 12.976383954227504),
 ((0, 19), 16.735783092575272),
 ((0, 20), 20.72829133272305),
 ((0, 21), 50.586826449735284),
 ((0, 22), 19.521651574765826),
 ((0, 23), 18.187537521039342),
 ((0, 24), 16.183602448400613),
 ((0, 25), 23.024032290381172),
 ((0, 26), 17.530141604164974),
 ((0, 27), 16.674532163998251),
 ((0, 28), 20.02899792527786),
 ((0, 29), 13.201000684020016),
 ((0, 30), 14.522423052187627),
 ((0, 31), 39.825016072790916),
 ((0

In [24]:
list(enumerate(book_files))

[(0, 'Abramov_Pryasliny_1_Bratya-i-sestry.edbdyw.158030.fb2.txt.json'),
 (1, 'Abramov_Pryasliny_2_Dve-zimy-i-tri-leta.hU9IrQ.158033.fb2.txt.json'),
 (2, 'Abramov_Pryasliny_3_Puti-pereputya.ktaB0Q.158036.fb2.txt.json'),
 (3, 'Abramov_Pryasliny_4_Dom.FWiheg.158032.fb2.txt.json'),
 (4, 'Aksenov_Zvezdnyy-bilet.HMAp3Q.677.fb2.txt.json'),
 (5,
  'Akunin_Priklyucheniya-Erasta-Fandorina_2_Tureckiy-gambit.xK5KVg.305713.fb2.txt.json'),
 (6,
  'Akunin_Priklyucheniya-Erasta-Fandorina_7_Statskiy-sovetnik.5irVkw.461230.fb2.txt.json'),
 (7,
  'Akunin_Priklyucheniya-Nikolasa-Fandorina_1_Altyn-Tolobas.Uz6tpg.131819.fb2.txt.json'),
 (8,
  'Aleksievich_Golosa-Utopii_1_U-voyny-ne-zhenskoe-lico-.VNQLMg.70938.fb2.txt.json'),
 (9,
  'Aleksievich_Golosa-Utopii_3_Cinkovye-malchiki.NVa1uw.427115.fb2.txt.json'),
 (10,
  'Aleksievich_Golosa-Utopii_5_Vremya-sekond-hend.9FJziw.426019.fb2.txt.json'),
 (11,
  'Aleksin_Anatoliy-Aleksin-Sobranie-sochineniy-v-treh-tomah_1_Bezumnaya-Evdokiya.gkQ47w.347944.fb2.txt.json'),

In [25]:
eucl_dists[100][116]

18.993671562182588