In [1]:
import os.path
from gensim import corpora
from gensim.models import LsiModel
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import json
from pprint import pprint

In [2]:
os.chdir('C:/projects/itmo/text-anal/')

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avdosev\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def load_data():
    res = []
    pps = ['AA']
    for pp in pps:
        data_path = f'data/out/{pp}'

        for file_name in os.listdir(data_path):
            print('preparing: {}'.format(file_name))
            with open(os.path.join(data_path,file_name)) as f:
                lines = f.readlines()
                for line in lines:
                    text = json.loads(line)['text']
                    res.append(text)
    return res

In [5]:
documents = load_data()

preparing: wiki_00
preparing: wiki_01
preparing: wiki_02
preparing: wiki_03
preparing: wiki_04
preparing: wiki_05
preparing: wiki_06
preparing: wiki_07
preparing: wiki_08
preparing: wiki_09
preparing: wiki_10
preparing: wiki_11
preparing: wiki_12
preparing: wiki_13
preparing: wiki_14
preparing: wiki_15
preparing: wiki_16
preparing: wiki_17
preparing: wiki_18
preparing: wiki_19
preparing: wiki_20
preparing: wiki_21
preparing: wiki_22
preparing: wiki_23
preparing: wiki_24
preparing: wiki_25
preparing: wiki_26
preparing: wiki_27
preparing: wiki_28
preparing: wiki_29
preparing: wiki_30
preparing: wiki_31
preparing: wiki_32
preparing: wiki_33
preparing: wiki_34
preparing: wiki_35
preparing: wiki_36
preparing: wiki_37
preparing: wiki_38
preparing: wiki_39
preparing: wiki_40
preparing: wiki_41
preparing: wiki_42
preparing: wiki_43
preparing: wiki_44
preparing: wiki_45
preparing: wiki_46
preparing: wiki_47
preparing: wiki_48
preparing: wiki_49
preparing: wiki_50
preparing: wiki_51
preparing: w

In [6]:
def preprocess_data(docs):
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = set(stopwords.words('russian'))
    p_stemmer = PorterStemmer()
    texts = []
    index = 0
    for i in docs:
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    return texts

In [7]:
clean_text = preprocess_data(documents)

In [8]:
dictionary = corpora.Dictionary(clean_text)

In [9]:
def prepare_corpus(doc_clean):
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return dictionary, doc_term_matrix

In [10]:
def create_model(doc_clean, num_topics):
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    lsamodel = LsiModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary)
    return lsamodel

In [11]:
len(clean_text)

1422

In [12]:
topics_count = 300
model = create_model(clean_text, topics_count)

In [13]:
pprint(model.print_topics(num_topics=topics_count, num_words=10))

[(0,
  '0.470*"года" + 0.411*"году" + 0.189*"также" + 0.129*"1" + 0.117*"время" + '
  '0.109*"является" + 0.108*"города" + 0.107*"россии" + 0.107*"населения" + '
  '0.106*"страны"'),
 (1,
  '-0.487*"ссср" + -0.362*"года" + 0.250*"году" + 0.152*"является" + '
  '0.142*"страны" + -0.122*"войска" + 0.115*"населения" + 0.114*"новой" + '
  '0.111*"зеландии" + -0.105*"войны"'),
 (2,
  '0.259*"города" + -0.228*"это" + 0.176*"году" + -0.175*"также" + '
  '0.173*"года" + -0.147*"которые" + -0.120*"время" + -0.112*"например" + '
  '-0.111*"её" + 0.107*"город"'),
 (3,
  '0.372*"города" + -0.361*"ссср" + -0.292*"страны" + -0.185*"новой" + '
  '-0.169*"зеландии" + 0.165*"город" + 0.148*"городе" + -0.131*"населения" + '
  '0.111*"м" + -0.111*"стране"'),
 (4,
  '0.364*"the" + -0.356*"ссср" + 0.292*"beatl" + 0.266*"группы" + 0.251*"года" '
  '+ -0.177*"россии" + 0.115*"новой" + -0.108*"города" + 0.107*"зеландии" + '
  '-0.085*"области"'),
 (5,
  '0.370*"ссср" + 0.316*"the" + 0.254*"beatl" + 0.233*"гру

In [14]:
wrds = ['россии']

In [15]:
corp = dictionary.doc2bow(wrds)
print(corp)

[(1831, 1)]


In [74]:
vecs = model[corp]
print(vecs)

[(0, 0.10715395812853824), (1, -0.03992654657684746), (2, 0.08436165047637628), (3, 0.11112979146223086), (4, -0.17658827207194114), (5, -0.14721350081459986), (6, -0.17880093944102426), (7, -0.2372511931415115), (8, -0.4037384792620463), (9, 0.10985127383027139), (10, -0.03347382233707849), (11, -0.26026608703172577), (12, -0.08211878736447452), (13, -0.0878320341636793), (14, 0.0703076495665071), (15, 0.01343635900455778), (16, 0.169646647040389), (17, 0.11273469682054518), (18, -0.06612872138970587), (19, 0.20538064933098132), (20, -0.03510022306625668), (21, -0.06076642028489548), (22, -0.07775721399264619), (23, 0.07144636545493847), (24, 0.04547120474095263), (25, -0.01609028256643281), (26, 0.033490435419369725), (27, 0.04031304160052343), (28, -0.03451898250030135), (29, -0.03543669645301455), (30, 0.01585002515524006), (31, -0.029598314097071938), (32, -0.00557538804957816), (33, 0.02152299953811722), (34, 0.06298139490905635), (35, 0.04046533430086225), (36, 0.034810033756286

In [48]:
import numpy as np

In [106]:
def vectorize(words):
    return [np.array([v for i, v in model[dictionary.doc2bow([word])]]) for word in words]
print(len(vectorize(['россии', 'золота', 'москве'])))
print(vectorize(['россии', 'золота', 'москве']))

3
[array([ 1.07153958e-01, -3.99265466e-02,  8.43616505e-02,  1.11129791e-01,
       -1.76588272e-01, -1.47213501e-01, -1.78800939e-01, -2.37251193e-01,
       -4.03738479e-01,  1.09851274e-01, -3.34738223e-02, -2.60266087e-01,
       -8.21187874e-02, -8.78320342e-02,  7.03076496e-02,  1.34363590e-02,
        1.69646647e-01,  1.12734697e-01, -6.61287214e-02,  2.05380649e-01,
       -3.51002231e-02, -6.07664203e-02, -7.77572140e-02,  7.14463655e-02,
        4.54712047e-02, -1.60902826e-02,  3.34904354e-02,  4.03130416e-02,
       -3.45189825e-02, -3.54366965e-02,  1.58500252e-02, -2.95983141e-02,
       -5.57538805e-03,  2.15229995e-02,  6.29813949e-02,  4.04653343e-02,
        3.48100338e-02, -1.11792061e-01, -4.35530902e-03,  6.46014156e-02,
        1.01644383e-02,  8.68027339e-03, -1.21404514e-02, -8.77230758e-03,
        1.40940738e-02,  2.73670565e-02,  4.12197305e-02, -5.33825876e-03,
       -3.88629302e-02, -9.85242521e-03, -5.52300867e-02,  8.67283666e-03,
        8.53163352e-03

In [64]:
from common import *
from collections import Counter

In [113]:
words = load_words()[:100000]
print(len(words))
words = [word for word in words if word in dictionary.token2id]
print(len(words))

100000
28011


In [114]:
vecs = vectorize(words)
print(Counter(len(v) for v in vecs))

l = [item for item in zip(vecs, words) if len(item[0]) == 300]
words = [w for v, w in l]
vecs = [v for v, w in l]

Counter({300: 27989, 299: 22})


In [44]:
from sklearn.cluster import DBSCAN as dbscan, KMeans, MeanShift

In [46]:
def groupBy(iterable, limit=10):
    res = {}
    for key, value in iterable:
        if key not in res:
            res[key] = []
        if len(res[key]) > limit and limit > 0:
            continue
        res[key].append(value)
    return res


In [123]:
clusters = dbscan(eps=0.01, n_jobs=-1, min_samples=3)
clusters.fit(vecs)
labels = clusters.labels_
print(Counter(labels))

Counter({0: 14237, -1: 12224, 2: 78, 15: 65, 9: 63, 4: 58, 1: 54, 31: 51, 3: 49, 22: 47, 28: 44, 46: 44, 27: 40, 39: 40, 54: 35, 29: 34, 38: 34, 30: 33, 12: 29, 18: 29, 26: 28, 60: 27, 5: 26, 8: 26, 24: 26, 49: 26, 32: 25, 11: 24, 17: 22, 43: 22, 35: 21, 52: 21, 16: 20, 42: 20, 45: 20, 25: 19, 51: 19, 47: 18, 6: 17, 41: 17, 40: 16, 48: 15, 50: 14, 58: 14, 61: 14, 55: 10, 19: 9, 37: 9, 20: 8, 68: 8, 13: 7, 65: 7, 36: 6, 57: 6, 66: 6, 44: 5, 56: 5, 73: 5, 81: 5, 7: 4, 10: 4, 14: 4, 21: 4, 79: 4, 23: 4, 34: 4, 63: 4, 69: 4, 70: 4, 71: 4, 74: 4, 80: 4, 59: 3, 33: 3, 77: 3, 53: 3, 72: 3, 62: 3, 64: 3, 82: 3, 67: 3, 75: 3, 76: 3, 78: 3})


In [124]:
from pprint import pprint

In [126]:
pprint(groupBy(zip(labels, words)))

{-1: ['мужем',
      'отойти',
      'предметам',
      'вино',
      'вднх',
      'сожалению',
      'сессии',
      'решило',
      'механическое',
      'происходящего',
      'орнамент'],
 0: ['ходовая',
     'инстинктивное',
     'сиротский',
     'аня',
     'алмазами',
     'фонограмму',
     'сжигание',
     'парится',
     'sach',
     'соглашались',
     'грозное'],
 1: ['страстью',
     'окончен',
     'замужней',
     'синьор',
     'тьме',
     'дьявольской',
     'изображено',
     'девятилетний',
     'благородстве',
     'восторженной',
     'херувим'],
 2: ['придут',
     'имеющуюся',
     'бедняка',
     'прибегает',
     'смелых',
     'злодеем',
     'братии',
     'бояться',
     'окрестил',
     'выслан',
     'несправедливость'],
 3: ['печального',
     'следа',
     'австрийскую',
     'настроению',
     'бурными',
     'образчик',
     'противоположным',
     'наивную',
     'забывая',
     'похвалы',
     'публику'],
 4: ['попробовал',
     'знала',
     'дос