In [1]:
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

import sqlite3
from db_api import get_records_by_field

db = sqlite3.connect('db/users.db')
cursor = db.cursor()

In [2]:
cluster_fields = ['about', 'activities', 'books', 'communities',
                  'games', 'interests', 'personal_inspired_by', 'movies',
                  'music', 'status']

In [3]:
cluster_field = cluster_fields[3]
data = get_records_by_field(cursor, cluster_field)
ids, words = map(list, zip(*data))
flattened = [' '.join(sublist) for sublist in words]

total: 84995
0.0 0.05
0.0%
1.1765397964586153%
2.3530795929172306%
3.5296193893758456%
4.706159185834461%
5.882698982293076%
7.059238778751691%
8.235778575210306%
9.412318371668922%
10.588858168127537%
11.765397964586152%
12.941937761044768%
14.118477557503383%
15.295017353961997%
16.47155715042061%
17.648096946879228%
18.824636743337845%
20.001176539796457%
21.177716336255074%
22.35425613271369%
23.530795929172303%
24.70733572563092%
25.883875522089536%
27.06041531854815%
28.236955115006765%
29.41349491146538%
30.590034707923994%
31.76657450438261%
32.94311430084122%
34.11965409729984%
35.296193893758456%
36.47273369021707%
37.64927348667569%
38.8258132831343%
40.002353079592915%
41.178892876051535%
42.35543267251015%
43.53197246896876%
44.70851226542738%
45.88505206188599%
47.061591858344606%
48.238131654803226%
49.41467145126184%
50.59121124772045%
51.76775104417907%
52.944290840637684%
54.1208306370963%
55.29737043355492%
56.47391023001353%
57.65045002647214%
58.82698982293076%
60.

In [4]:
vectorizer = TfidfVectorizer(max_df=0.3, min_df=0.005)
X = vectorizer.fit_transform(flattened)

In [5]:
true_k = 20

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=1)
model.fit(X)
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print(i)
    st = ''
    arr = []
    for ind in order_centroids[i, :25]:
        arr.append(terms[ind])
    print(' '.join(arr))
    print()

Top terms per cluster:
0
–∂–∏–≤–æ—Ç–Ω—ã–µ –∂–∏–≤–æ—Ç–Ω—ã–º –∂–∏–≤–æ—Ç–Ω—ã—Ö –±–µ–∑–¥–æ–º–Ω—ã—Ö –∏–º —Å–æ–±–∞–∫ —Ç—Ä–µ–±—É–µ—Ç—Å—è –æ—Ç–ª–æ–≤ —Å—Ç—Ä–∞–¥–∞—é—Ç –∂–∏–≤—É—â–∏—Ö –ø—Ä–∏–≤–æ–¥–∏—Ç —Å—Ç–∞–Ω—å—Ç–µ –∂–∏–≤–æ—Ç–Ω—ã–º–∏ –¥–µ–ª–∞—Ç—å —Å–µ–º—å—é —Å–ø–æ—Å–æ–±–æ–º –ø–æ–º–æ–≥–∞—é—Ç –∫–æ—à–µ–∫ –¥—Ä—É–≥ —É–ª–∏—Ü–µ —Å—Ç–æ—Ä–æ–Ω—ã –ø–æ–º–æ—â—å –Ω–∞—Ö–æ–¥—è—Ç –ø–æ–º–æ—á—å —Å–ª—É—á–∞–µ

1
–∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–∏ —Ä–∞–∑–º–µ—â–µ–Ω–∏—é vkcomekbrepost –∫–æ–º–º–µ—Ä—á–µ—Å–∫–æ–º—É —É–¥–∞–ª—è—é—Ç—Å—è –æ—Å—Ç–∞–≤–ª—è—Ç—å –ø—Ä–µ–¥–ª–∞–≥–∞—Ç—å vkcompage –º–µ–Ω–µ–¥–∂–µ—Ä—É –ø–æ—á–µ–º—É –ø–æ–¥–ø–∏—à–∏—Ç–µ—Å—å —Å—Ç—Ä–∞–Ω–∏—Ü—É —É—Ä–∞–ª–∞ –Ω–∞—à—É –ø—Ä–æ–∏—Å—à–µ—Å—Ç–≤–∏—è –ø–∞–±–ª–∏–∫–∞ –Ω–µ—Å—á–∞—Å—Ç–Ω–æ–≥–æ –ø–æ–∂–∞—Ä–∞ —Å–≤–∏–¥–µ—Ç–µ–ª–µ–º –∞–≤–∞—Ä–∏–∏ vkcomkalininamarinka incekb —Ä–∞–∑–±–∞–Ω—É —Å–ª—É—á–∞—è –ø–æ–∂–∞–ª–æ–≤–∞—Ç—å—Å—è

2
–Ω–æ–≤–æ—Å—Ç–µ–π –º–µ–º—ã –ø—Ä–∏–∫–æ–ª—ã —Ç–∏–ø–∏—á–Ω—ã–π —ç–∫—Å–∫–ª—é–∑–∏–≤–∞–º —Å–µ—Ä—å—ë–∑–Ω–æ–µ –∫–æ—Ç–∏–∫–æ–≤ —Ç–∏–ø–∏—á–Ω–æ–≥–æ –æ–ø–µ—Ä–∞—Ç–∏–≤–Ω–æ—Å—Ç–∏ –µ–∂–µ–¥–Ω–µ–

In [6]:
true_k = 30

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=1)
model.fit(X)
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print(i)
    st = ''
    arr = []
    for ind in order_centroids[i, :25]:
        arr.append(terms[ind])
    print(' '.join(arr))
    print()

Initialization complete
Iteration  0, inertia 83970.798
Iteration  1, inertia 78347.516
Iteration  2, inertia 77612.692
Iteration  3, inertia 77331.816
Iteration  4, inertia 77207.202
Iteration  5, inertia 76993.982
Iteration  6, inertia 76881.870
Iteration  7, inertia 76751.210
Iteration  8, inertia 76669.362
Iteration  9, inertia 76636.819
Iteration 10, inertia 76615.952
Iteration 11, inertia 76604.985
Iteration 12, inertia 76600.870
Iteration 13, inertia 76599.200
Iteration 14, inertia 76597.775
Iteration 15, inertia 76596.072
Iteration 16, inertia 76593.955
Iteration 17, inertia 76590.684
Iteration 18, inertia 76584.563
Iteration 19, inertia 76571.111
Iteration 20, inertia 76552.341
Iteration 21, inertia 76546.025
Iteration 22, inertia 76540.088
Iteration 23, inertia 76526.661
Iteration 24, inertia 76485.249
Iteration 25, inertia 76345.609
Iteration 26, inertia 76282.050
Iteration 27, inertia 76233.486
Iteration 28, inertia 76186.550
Iteration 29, inertia 76158.232
Iteration 30, in

In [1]:
from clustering import load_model
model = load_model('communities.pkl')
vec = load_model('communities_vec.pkl')

In [5]:
X = vec.transform(['–∫–∏–Ω–æ'])
model.predict(X)

array([13], dtype=int32)

In [2]:
true_k = 40
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vec.get_feature_names()
for i in range(true_k):
    print(i)
    st = ''
    arr = []
    for ind in order_centroids[i, :25]:
        arr.append(terms[ind])
    print(' '.join(arr))
    print()

Top terms per cluster:
0
–∏–¥–µ–∞–ª—å–Ω–∞—è –¥–∞–Ω–æ –º–æ—Ç–∏–≤–∞—Ü–∏–∏ —Ñ–∏–≥—É—Ä—É –ø–∞–±–ª–∏–∫–æ–≤ ya —Å–ø–æ—Ä—Ç–µ –±–æ–ª—å—à–∏—Ö —Ç–µ–ª–æ –ø—Ä–∞–≤–∏–ª—å–Ω–æ–º –ø–∏—Ç–∞–Ω–∏–∏ —Ä–æ–∂–¥–µ–Ω–∏—è –æ–¥–∏–Ω sport –∏–Ω—Ç–µ—Ä–≤—å—é –∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤ –∏–Ω—Ç–µ—Ä–µ—Å –∏–Ω—Ç–µ—Ä–µ—Å–∞–º –∏–Ω—Ç–µ—Ä–µ—Å–µ–Ω –∏–Ω—Ç–µ—Ä–µ—Å–Ω–∞ –∏–Ω—Ç–µ—Ä–µ—Å–Ω–∞—è —è—Å–Ω–æ –∏–Ω—Ç–µ—Ä–µ—Å–Ω–µ–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ –∏–Ω—Ç–µ–ª–ª–∏–≥–µ–Ω—Ü–∏—è

1
federation been content has –æ—Å–Ω–æ–≤–∞–Ω–∏–∏ russian this —Å—É–¥ —Ä–µ—à–µ–Ω–∏—è the in –æ–±–ª–∞—Å—Ç—å —Ç–æ–ª—å—è—Ç—Ç–∏ –∫—Ä–∞–π –≥–æ—Ä–æ–¥—Å–∫–æ–π –º–∞—Å—Å–æ–≤—ã—Ö porn –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–æ–Ω–Ω—ã—Ö rus –≤–ª–∞—Å—Ç–∏ at —Å–ª—É–∂–±—ã from —Ñ–µ–¥–µ—Ä–∞–ª—å–Ω–æ–≥–æ —Ç–µ—Ö–Ω–æ–ª–æ–≥–∏–π

2
ifun —è—Å–Ω–æ –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç—É–∞–ª—å–Ω–æ–π –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç—É–∞–ª—å–Ω—ã–π –∏–Ω—Ç–µ–ª–ª–∏–≥–µ–Ω—Ü–∏—è –∏–Ω—Ç–µ—Ä–∞–∫—Ç–∏–≤ –∏–Ω—Ç–µ—Ä–≤—å—é –∏–Ω—Ç–µ—Ä–µ—Å –∏–Ω—Ç–µ—Ä–µ—Å–∞–º –∏–Ω—Ç–µ—Ä–µ—Å–µ–Ω –∏–Ω—Ç–µ—Ä–µ—Å–Ω–∞ –∏–Ω—Ç–µ—Ä–µ—Å–Ω–∞—è –∏–Ω—Ç–µ—Ä–µ—Å–Ω–µ–µ –∏–Ω—Ç–µ—Ä–µ—Å–

In [2]:
model.get_params()

{'algorithm': 'auto',
 'copy_x': True,
 'init': 'random',
 'max_iter': 300,
 'n_clusters': 40,
 'n_init': 10,
 'n_jobs': None,
 'precompute_distances': 'auto',
 'random_state': None,
 'tol': 0.0001,
 'verbose': True}

In [3]:
print(len(list(vec.stop_words_)))
vec.stop_words_

952809


{'–∑–∞–≤–∏–¥–æ–≤–∞–ª',
 '–∫–æ—Å—Ç—é–º—á–∏–∫–∞',
 'qq',
 '—Ä–∞—à–∞—é—Ç—Å—è',
 '–≤–∑—Ä—ã–≤—á–∞—Ç–∫–∞',
 '–∫—Ä–∞—Å–∏–ª–æ–≤–∞',
 '–≤–∑—è—Ç–æ',
 'surahn',
 'umperia',
 'mirlovez',
 'rona',
 '–ø–Ω–∞—è',
 'oliverhuntemann',
 '–∑–∞–ø–∞—Ö–∞',
 'culinar',
 '–≤–ø–∏—Å—ã–≤–∞—Ç—å',
 '–∞–≤—Ç–æ—Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç–Ω–æ–≥–æ',
 'alumiar',
 'serbyaz',
 'vangay',
 '–æ–¥–Ω–æ–∫–ª–∞—Å—Å–Ω–∏–∫–∞–º–∏',
 'solikamskeparh',
 '–≤—ã—Ç–∫–∞–ª—Å—è',
 '–Ω–µ–ø—Ä–æ–º–æ–∫–∞–µ–º—ã–π',
 '–ø—Ä–æ–ø–∞–≥–∞–Ω–¥–∏—Å—Ç—Å–∫–∏–π',
 '–Ω–∞–ø–µ—Ä—Å–Ω–∏–∫',
 '–∫—É—Ç—É–µ–≤',
 '—Å–∞–π–ø–∞–Ω—É',
 'holles',
 '—Ç–µ–ª–µ—Ñ–æ–Ω–µ',
 '–¥–æ–±—ã–≤–∞–µ–º–∞—è',
 '–∂–µ—Ä–µ–±—Ü–æ–≤–∞',
 'griboedovbasement',
 'weddinglocation',
 '–ª–∞–¥–∫–∏–π',
 'trudbud',
 '—Å—É—Å–∞—Ä–µ–Ω–∫–æ',
 '–∫–∏—Ç—É–∞',
 'abcfinance',
 '–ø—Ä–æ–¥–ª–µ–≤–∞–µ—à—å',
 '–∞–Ω–∞–ª–∏–∑–∞—Ç–æ—Ä–æ–º',
 '–∫–∞–º–∞–Ω',
 'autolaser',
 '–ø–µ—Ä—Å–µ–π',
 'jimmypoy',
 'electrocar',
 '–≥–æ—Å–ø–æ–¥–∞—Ä—Å—å–∫–æ—ó',
 '–∫–æ–º–∏–∏–Ω—Ñ–æ—Ä–º',
 '–≥–∞–∑–æ–¥–æ–±—ã–≤–∞—é—â–∏—Ö',
 'fbid',
 '–≤–µ–∑–µ–Ω–∏—è',
 '–∏

In [4]:
print(len(list(vec.vocabulary_)))
vec.vocabulary_

3143


{'girls': 85,
 '–º–∏–ª–ª–∏–æ–Ω–∞–º–∏': 1367,
 'na': 157,
 '–ø–æ—Ä—Ç–∞–ª': 1974,
 '–ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏': 2034,
 '–∏—Å–∫–ª—é—á–∏—Ç–µ–ª—å–Ω–æ': 998,
 '—Å–æ—Ç—Ä—É–¥–Ω–∏—á–µ—Å—Ç–≤–µ': 2601,
 '—Å–Ω—è—Ç—å': 2541,
 '–∏—Å—Ç–æ—á–Ω–∏–∫': 1021,
 '–∫–∞—Ñ–µ': 1061,
 '–Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏—è–º': 1474,
 '—Ç—Ä–∞–¥–∏—Ü–∏–∏': 2823,
 '—Å–ª–æ–∂–Ω—ã–µ': 2514,
 '—Å–≤–æ–±–æ–¥–∞': 2424,
 'it': 121,
 '—Ç–≤–æ–∏—Ö': 2744,
 '—ç–∫—Å–∫–ª—é–∑–∏–≤–Ω—ã–µ': 3103,
 '–∫—Ä—É–ø–Ω—ã–π': 1192,
 '–æ—Å—Ç–∞–≤–ª—è—Ç—å': 1721,
 '–≥—Ä–∞–∂–¥–∞–Ω': 605,
 'info': 114,
 '—Ç–µ–º–∞—Ö': 2782,
 '—Å—Ç–∞—Ä–∞–µ–º—Å—è': 2653,
 '–±—Ä–æ–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ': 388,
 '–Ω—Ä–∞–≤–∏—Ç—Å—è': 1586,
 '—Ö–æ—á–µ—Ç': 3011,
 '—Å–ª–∞–¥–∫–∏–µ': 2503,
 'rem': 197,
 '–º–∞–≥–∞–∑–∏–Ω–µ': 1296,
 'art': 16,
 '—É—Å–ª—É–≥–∏': 2923,
 '—É–ø–æ—Ç—Ä–µ–±–ª–µ–Ω–∏–µ': 2907,
 '–º–µ–±–µ–ª–∏': 1334,
 '–¥–µ–ª–∞–π—Ç–µ': 658,
 '–ø—Ä–∞–≤–¥–∞': 2020,
 '—Å–æ—Ö—Ä–∞–Ω–∏—Ç—å': 2604,
 '–ø–æ–∏—Å–∫–∞—Ö': 1889,
 '–Ω–∞–ø—Ä—è–º—É—é': 1476,
 '–∫–∞–∂–¥—ã–º': 1040,
 '—Ç–≤–æ–∏': 2743,
 '–≤–∞—à–µ': 

In [5]:
vec.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 0.35,
 'max_features': None,
 'min_df': 0.025,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [6]:
'–∫–∏–Ω–æ' in vec.vocabulary_

True

In [7]:
'–º—É–∑—ã–∫–∞' in vec.vocabulary_

True