In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json
import os
from collections import Counter

from time import time

%matplotlib inline

import scipy.sparse

In [2]:
def id_counter(id_list):
    """Build a Counter mapping every identifier to its recorded count.

    Each entry of ``id_list`` is a mapping with the keys ``u'element'`` and
    ``u'count'``.  Counts are assigned, not summed: if an element appears
    more than once, the count of its last occurrence is kept.
    """
    return Counter({entry[u'element']: entry[u'count'] for entry in id_list})

# Definitions that are too generic to say anything about an identifier.
# Entries are lower-case because valid_def() lower-cases before the lookup.
# Entries of three characters or fewer (e.g. 'mu', 'pi') are already rejected
# by the length check in valid_def().
def_black_list = {
    # generic nouns, singular and plural
    'unit', 'units',
    'value', 'values',
    'axis', 'axes',
    'factor', 'factors',
    'line', 'lines',
    'point', 'points',
    'number', 'numbers',
    'variable', 'variables',
    'case', 'cases',
    'vector', 'vectors',
    'element', 'elements',
    'integer', 'integers',
    'term', 'terms',
    'parameter', 'parameters',
    'coefficient', 'coefficients',
    # other generic words and phrases
    'respect', 'example', 'formula', 'times', 'product', 'matrices',
    'expression', 'complex', 'real', 'zeros', 'bits', 'sign',
    'if and only if',
    # names of Greek letters
    'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta',
    'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi',
    'rho', 'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega',
}


def valid_def(definition):
    """Tell whether ``definition`` is specific enough to be kept.

    Returns False for definitions of three characters or fewer and for
    definitions whose lower-cased form is listed in ``def_black_list``;
    returns True otherwise.
    """
    too_short = len(definition) <= 3
    return (not too_short) and definition.lower() not in def_black_list

def rel_to_dict(rels):
    """Group the usable definitions of one document by identifier.

    ``rels`` is an iterable of mappings with the keys ``'identifier'``,
    ``'definition'`` and ``'score'``.  Relations whose definition is rejected
    by ``valid_def`` are dropped.

    Returns a ``defaultdict(list)`` that maps each identifier to a list of
    ``(definition, score)`` tuples, in input order.
    """
    # Imported here because the notebook only imports defaultdict in a later
    # cell; relying on the global name made this function depend on the order
    # in which the cells were executed.
    from collections import defaultdict

    res = defaultdict(list)
    for r in rels:
        definition = r['definition']
        if valid_def(definition):
            res[r['identifier']].append((definition, r['score']))
    return res

In [3]:
from collections import defaultdict

# title -> set of categories, and the reverse mapping category -> set of titles
doc_categories = defaultdict(set)
category_docs = defaultdict(set)

# Every line of the file is "<title>\t<category>", encoded as UTF-8.
# The `with` block closes the file handle as soon as it has been read; the
# previous bare file(...) iteration never closed it explicitly.
with open('C:/tmp/mlp/category_info_refined.txt') as category_file:
    for line in category_file:
        title, cat = line.strip().split('\t')
        title = title.decode('utf-8')
        cat = cat.decode('utf-8')

        # let's also remove all documents from "OTHER" category
        if cat == u'OTHER':
            continue

        doc_categories[title].add(cat)
        category_docs[cat].add(title)

In [4]:
root = 'C:/tmp/mlp/mlp-output/'

docs = []
titles = []
ids = []
rels = []

empty = 0
small = 0
uncategorized = 0

for f in os.listdir(root): 
    for line in file(root + f):
        doc = json.loads(line)

        title = doc['title']        
        if title not in doc_categories:
            uncategorized = uncategorized + 1
            continue

        if '(disambiguation)' in title:
            continue

        id_bag = id_counter(doc['identifiers'])
        if len(id_bag) <= 1:
            if len(id_bag) == 0:
                empty = empty + 1
            else:
                small = small + 1
            continue

        docs.append(doc)
        titles.append(title)
        ids.append(id_bag)

        id_rels = rel_to_dict(doc['relations'])
        rels.append(id_rels)

print empty, small, uncategorized

N_doc = len(ids)
print N_doc

0 0 7501
22512


In [5]:
title_idx = {title: idx for (idx, title) in enumerate(titles)}

for doc, cats in doc_categories.items():
    if doc in title_idx:
        continue

    for cat in cats: 
        category_docs[cat].remove(doc)
    
    del doc_categories[doc]

print len(doc_categories)

22512


In [6]:
# Category sets aligned with `titles`: entry i holds the categories of titles[i].
doc_categories_list = [doc_categories[doc] for doc in titles]

Let's rebuild the identifier counts from the raw documents, so that we work on a fresh copy.

In [236]:
# Recompute the per-document identifier counters from the stored documents.
ids = [id_counter(d['identifiers']) for d in docs]

Remove the least common identifiers

In [12]:
# Mean number of distinct keys per document.
np.mean([len(doc_ids) for doc_ids in ids])

13.736540511727078

In [10]:
from nltk.stem import SnowballStemmer
# English Snowball stemmer; used below to stem the words of the definitions.
snowball_stemmer = SnowballStemmer('english')

In [14]:
# Extend every document's counter with "<identifier>_<stem>" keys, one
# increment per word of every definition attached to the identifier.
# NOTE: the counters in `ids` are modified in place, so running this cell a
# second time adds the same increments again.
for idx in xrange(N_doc):
    doc_counter = ids[idx]

    # `identifier` rather than `id`: the old loop variable rebound the builtin
    # id() for the rest of the notebook session.
    for identifier, definitions in rels[idx].items():
        for definition, _score in definitions:
            for unigram in definition.lower().split():
                stem = snowball_stemmer.stem(unigram)
                doc_counter[u'%s_%s' % (identifier, stem)] += 1

In [15]:
all_ids = Counter()

for id_cnt in ids:
    all_ids.update(id_cnt)

print len(all_ids)

infrequent = set()
min_count = 2

for (el, cnt) in all_ids.items():
    if cnt <= min_count:
        infrequent.add(el)

print len(infrequent)

for id_cnt in ids:
    for id in (set(id_cnt) & infrequent):
        del id_cnt[id]

        
all_ids = Counter()

for id_cnt in ids:
    all_ids.update(id_cnt)

print len(all_ids)
        
del all_ids
del infrequent

148783
115734
33049


In [16]:
# Mean number of distinct keys per document after the stem keys were added
# and the infrequent keys removed.
np.mean([len(doc_ids) for doc_ids in ids])

22.786202914001422

In [17]:
df = Counter()
for cnt in ids:
    df.update(list(cnt))

top = 50
mc = [id for (id, cnt) in df.most_common(top) if cnt > 3000]
print ' '.join(mc)

mc = set(mc)

for id_cnt in ids:
    for id in list(id_cnt):
        if id in mc:
            del id_cnt[id]

del mc
del df

n t x m p d g k f R l y c r T π C P b S s N B E X F j


In [18]:
# Mean number of distinct keys per document after the most common keys were removed.
np.mean([len(doc_ids) for doc_ids in ids])

16.980188343994314

Inverted index

In [19]:
# Inverted index: key -> list of positions (in `ids`) of the documents that
# contain the key.  It stays a plain dict, so looking up an unknown key
# raises KeyError.
inv_idx = {}

for (idx, id_list) in enumerate(ids):
    # `key` rather than `id`, so that the builtin id() is not rebound
    for key in id_list:
        inv_idx.setdefault(key, []).append(idx)

In [20]:
def docs_to_compare(doc_id):
    """Return the positions of all documents sharing a key with ``doc_id``.

    The candidates are collected through the inverted index ``inv_idx``;
    ``doc_id`` itself is never part of the returned set.
    """
    candidates = set()
    for key in ids[doc_id]:
        candidates.update(inv_idx[key])
    # discard() is a no-op when doc_id is absent
    candidates.discard(doc_id)
    return candidates

In [21]:
# Average size of the candidate set, estimated on the first 200 documents.
np.mean([len(docs_to_compare(doc_id)) for doc_id in xrange(200)])

4862.1899999999996

## Jaccard 

In [22]:
# number of most similar candidates selected per document
k_matrix = 25
# number of those neighbours kept in each shared-nearest-neighbour set
k_graph = 15
#sim_threshold = 0.1

In [23]:
# Key set of every document (counts are dropped); input to calc_jaccard.
ids_sets = [set(id_list) for id_list in ids]

def calc_jaccard(set1, set2):
    """Jaccard similarity |set1 & set2| / |set1 | set2| as a float.

    Returns 0.0 when the union is empty (i.e. both sets are empty).
    """
    union_size = len(set1 | set2)
    if union_size == 0:
        return 0.0
    return float(len(set1 & set2)) / union_size

In [24]:
t0 = time()
shared_nn = []

for i in xrange(N_doc):
    if i % 1000 == 0:
        print "iteration %d" % i

    doc_ids = np.array(list(docs_to_compare(i)))
    sim = np.zeros(len(doc_ids))

    for (idx, j) in enumerate(doc_ids):
        sim[idx] = calc_jaccard(ids_sets[i], ids_sets[j])
    
    sim_idx = sim.argsort()[-1:-k_matrix-1:-1]
    doc_ids_to_add = doc_ids[sim_idx]

    shared_nn.append(set(doc_ids_to_add[0:k_graph]))

print "done in %0.3fs." % (time() - t0)

iteration 0
iteration 1000
iteration 2000
iteration 3000
iteration 4000
iteration 5000
iteration 6000
iteration 7000
iteration 8000
iteration 9000
iteration 10000
iteration 11000
iteration 12000
iteration 13000
iteration 14000
iteration 15000
iteration 16000
iteration 17000
iteration 18000
iteration 19000
iteration 20000
iteration 21000
iteration 22000
done in 931.305s.


In [193]:
import snn_dbscan

Let's search for the best parameters (`eps` and `min_pts`).

In [17]:
import cluster_evaluation
import snn_dbscan

In [18]:
# Reload the module so that edits to cluster_evaluation are picked up, then
# build the evaluator from the current titles, keys, definitions and categories.
reload(cluster_evaluation)
evaluate = cluster_evaluation.Evaluator(doc_titles=titles, doc_ids=ids, 
                                        doc_ids_definitions=rels, doc_categories=doc_categories_list)

In [28]:
# Number of documents kept in the collection.
N_doc

22512

In [32]:
eps_list = [3, 4]
min_pts_list = [3, 4, 5, 6, 7, 8]

hyperparam_purity = {}
hyperparam_res = {}

for eps in eps_list:
    for min_pts in min_pts_list:
        res = np.array(snn_dbscan.dbscan(shared_nn, eps=eps, min_pts=min_pts))
        res[res == 'noise'] = 0
        res = res.astype(int)

        hyperparam_res[(eps, min_pts)] = res
        cluster_purity = evaluate.overall_purity(res)
        hyperparam_purity[(eps, min_pts)] = cluster_purity

        no_clusters = len(np.unique(res))
        no_pure_clusters = len(evaluate.high_purity_clusters(res, threshold=0.8))

        print 'purity for eps=%d, min_pts=%d is %0.4f, number of clusters: %d, number of >0.8 clusters: %d' % \
              (eps, min_pts, cluster_purity, no_clusters, no_pure_clusters)

purity for eps=3, min_pts=3 is 0.3325, number of clusters: 3694, number of >0.8 clusters: 53
purity for eps=3, min_pts=4 is 0.3264, number of clusters: 3558, number of >0.8 clusters: 53
purity for eps=3, min_pts=5 is 0.3214, number of clusters: 3427, number of >0.8 clusters: 54
purity for eps=3, min_pts=6 is 0.3167, number of clusters: 3318, number of >0.8 clusters: 47
purity for eps=3, min_pts=7 is 0.3126, number of clusters: 3217, number of >0.8 clusters: 45
purity for eps=3, min_pts=8 is 0.3080, number of clusters: 3131, number of >0.8 clusters: 41
purity for eps=4, min_pts=3 is 0.3343, number of clusters: 4014, number of >0.8 clusters: 50
purity for eps=4, min_pts=4 is 0.3189, number of clusters: 3668, number of >0.8 clusters: 49
purity for eps=4, min_pts=5 is 0.3095, number of clusters: 3443, number of >0.8 clusters: 47
purity for eps=4, min_pts=6 is 0.3016, number of clusters: 3265, number of >0.8 clusters: 42
purity for eps=4, min_pts=7 is 0.2935, number of clusters: 3111, numbe

Results for untruncated data

In [52]:
eps_list = [3, 4, 5, 6, 7, 8, 9, 10]
min_pts_list = [3, 4, 5, 6, 7]

hyperparam_purity = {}
hyperparam_res = {}

for eps in eps_list:
    for min_pts in min_pts_list:
        res = np.array(snn_dbscan.dbscan(jaccard_ssn_15, eps=eps, min_pts=min_pts))
        res[res == 'noise'] = 0
        res = res.astype(int)

        hyperparam_res[(eps, min_pts)] = res
        cluster_purity = evaluate.overall_purity(res)
        hyperparam_purity[(eps, min_pts)] = cluster_purity

        no_clusters = len(np.unique(res))
        no_pure_clusters = len(evaluate.high_purity_clusters(res, threshold=0.8))

        print 'purity for eps=%d, min_pts=%d is %0.4f, number of clusters: %d, number of >0.8 clusters: %d' % \
              (eps, min_pts, cluster_purity, no_clusters, no_pure_clusters)

purity for eps=3, min_pts=3 is 0.4807, number of clusters: 7028, number of >0.8 clusters: 79
purity for eps=3, min_pts=4 is 0.4665, number of clusters: 6687, number of >0.8 clusters: 82
purity for eps=3, min_pts=5 is 0.4541, number of clusters: 6394, number of >0.8 clusters: 78
purity for eps=3, min_pts=6 is 0.4412, number of clusters: 6117, number of >0.8 clusters: 82
purity for eps=3, min_pts=7 is 0.4315, number of clusters: 5909, number of >0.8 clusters: 80
purity for eps=4, min_pts=3 is 0.4446, number of clusters: 6497, number of >0.8 clusters: 61
purity for eps=4, min_pts=4 is 0.4148, number of clusters: 5820, number of >0.8 clusters: 73
purity for eps=4, min_pts=5 is 0.3888, number of clusters: 5276, number of >0.8 clusters: 72
purity for eps=4, min_pts=6 is 0.3703, number of clusters: 4844, number of >0.8 clusters: 66
purity for eps=4, min_pts=7 is 0.3530, number of clusters: 4491, number of >0.8 clusters: 58
purity for eps=5, min_pts=3 is 0.3678, number of clusters: 4999, numbe

In [53]:
# Report for the clustering obtained with eps=3, min_pts=4.
evaluate.report_overall(hyperparam_res[(3, 4)], purity_threshold=0.8, sort_by='size')

overall purity 0.4665
number of high purity clusters of size at least 5 is 82

- Group theory (id=6038) size=21, purity=0.9524
- Propositional calculus (id=6705) size=21, purity=0.8095
- Astronomical catalogues (id=5951) size=20, purity=1.0000
- National Basketball Association seasons (id=3121) size=13, purity=0.9231
- Thermodynamics (id=5793) size=11, purity=0.9091
- Enzymes (id=58) size=10, purity=0.9000
- Linear algebra (id=6095) size=10, purity=0.8000
- Statistics (id=6867) size=10, purity=0.8000
- Group theory (id=6292) size=9, purity=1.0000
- Lie groups (id=531) size=8, purity=0.8750
- Differential geometry (id=1709) size=8, purity=0.8750
- Topology (id=3509) size=8, purity=0.8750
- Graph theory (id=5042) size=8, purity=1.0000
- Thermodynamics (id=6003) size=8, purity=1.0000
- Mathematical optimization (id=150) size=7, purity=1.0000
- Quantum mechanics (id=413) size=7, purity=0.8571
- Combinatorics (id=1663) size=7, purity=0.8571
- Materials science (id=3661) size=7, purity=0.857

In [54]:
# Inspect cluster 6127 of the eps=3, min_pts=4 clustering.
evaluate.print_cluster(hyperparam_res[(3, 4)], 6127, collection_weighting=1)

size: 5

- Minor (linear algebra) (categories: Linear algebra, Matrix theory, Multilinear algebra, Determinants) k_matrix k_size C_11 p_matrix n C_21 C_22 M_23_c_23 n_matrix M_23_cofactor M_23 k_determinant M_23_entry C m_matrix K K_subsets T n_columns k_rows m_rows k j m C_12 j_cofactor p k_columns n_rows
- Centering matrix (categories: Statistics, Linear algebra, Statistical data types, Mathematical objects, Linear operators, ...) p_n X_sample m v_column-vector C_n O_size v_size X_data v_projection n_matrix C_1 n_identity C_3 C_2 n_size X_matrix μ C_n_semi-definite C n_ones O_matrix O S X_m-by-n C_n_property X p_2 p_1 X_multiplication O_n-by-n m_rows k C_m n p v n_column-vector v_components C_n_positive
- Kronecker product (categories: Linear algebra, Matrix theory) BD D_size B_transformations AC_matrix P_matrices m_matrix AC X_basis AXB BD_matrix AXB_instance p_matrix Q_permutation k_scalar r_B_kronecker B_12 r_A r_B B_11 B_nq r_A_nonzero B_⊗ p_q Q_matrices n_matrix B_product C_1 D_

In [403]:
# Report for the clustering obtained with eps=6, min_pts=5.
evaluate.report_overall(hyperparam_res[(6, 5)], purity_threshold=0.8, sort_by='size')

overall purity 0.2294
number of high purity clusters of size at least 5 is 83

- Abstract algebra (id=79) size=45, purity=0.8444
- Coding theory (id=135) size=31, purity=0.9677
- Astronomical catalogues (id=1023) size=24, purity=1.0000
- Group theory (id=1406) size=24, purity=0.9167
- Solid mechanics (id=82) size=21, purity=0.8571
- Category theory (id=140) size=19, purity=0.9474
- Complex analysis (id=108) size=18, purity=1.0000
- Functional analysis (id=109) size=17, purity=0.8235
- Numerical analysis (id=20) size=16, purity=0.8750
- Probability theory (id=301) size=16, purity=0.8750
- Cartographic projections (id=531) size=16, purity=0.8750
- Chemical elements (id=496) size=15, purity=0.9333
- Fluid dynamics (id=519) size=15, purity=0.8000
- National Basketball Association seasons (id=644) size=15, purity=0.8000
- Mechanics (id=363) size=14, purity=0.9286
- Physical cosmology (id=454) size=14, purity=0.8571
- Cartography (id=226) size=12, purity=0.9167
- Radio frequency propagation 

##  K-means and Cosine

In [7]:
import kmeans

In [11]:
# Start again from the raw identifier counts of every document.
ids = [id_counter(d['identifiers']) for d in docs]

# Add "<identifier>_<stem>" keys, one increment per word of every definition.
# The loop variable is `identifier` rather than `id`, so that the builtin id()
# is not rebound for the rest of the session.
for idx in xrange(N_doc):
    doc_counter = ids[idx]

    for identifier, definitions in rels[idx].items():
        for definition, _score in definitions:
            for unigram in definition.lower().split():
                stem = snowball_stemmer.stem(unigram)
                doc_counter[u'%s_%s' % (identifier, stem)] += 1

# Total count of every key over the whole collection.
all_ids = Counter()
for id_cnt in ids:
    all_ids.update(id_cnt)

# Keys whose total count does not exceed `min_count` are removed.
min_count = 5
infrequent = set(el for (el, cnt) in all_ids.items() if cnt <= min_count)

for id_cnt in ids:
    for key in (set(id_cnt) & infrequent):
        del id_cnt[key]

del all_ids
del infrequent

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, randomized_svd

In [32]:
def unwrap_counter(cnt):
    """Expand a counter into a flat list of its keys.

    Every key is repeated as many times as its count; keys whose count is
    zero or negative do not appear in the result.
    """
    return [key for key, count in cnt.items() for _ in range(count)]

# TF-IDF matrix of the collection.  unwrap_counter is passed as the analyzer,
# so every document (a counter) is turned into its list of repeated keys.
vectorizer = TfidfVectorizer(analyzer=unwrap_counter, use_idf=True)
X = vectorizer.fit_transform(ids)

In [33]:
# Display the summary (shape, stored elements) of the TF-IDF matrix.
X

<22512x33049 sparse matrix of type '<type 'numpy.float64'>'
	with 512963 stored elements in Compressed Sparse Row format>

In [22]:
from sklearn.cluster import KMeans, MiniBatchKMeans

K-means applied directly to the TF-IDF matrix doesn't seem to give good results.

In [35]:
km_hyperparam_purity = {}
km_hyperparam_res = {}

ks = [k * 100 for k in xrange(20, 31)]

for k in ks:
    km = MiniBatchKMeans(n_clusters=k, init_size=k*3, n_init=10, init='random')
    km.fit(X)

    km_hyperparam_res[k] = km.labels_
    cluster_purity = evaluate.overall_purity(km.labels_)
    km_hyperparam_purity[k] = cluster_purity

    no_pure_clusters = len(evaluate.high_purity_clusters(km.labels_, threshold=0.8))

    print 'purity for k=%d is %0.4f number of >0.8 clusters: %d' % (k, cluster_purity, no_pure_clusters)

purity for k=2000 is 0.1413 number of >0.8 clusters: 1
purity for k=2100 is 0.1346 number of >0.8 clusters: 0
purity for k=2200 is 0.1289 number of >0.8 clusters: 0
purity for k=2300 is 0.1350 number of >0.8 clusters: 1


KeyboardInterrupt: 

In [30]:
from sklearn.preprocessing import Normalizer
# Row normaliser applied to the reduced matrices below; copy=False asks for
# the normalisation to be done without copying the input.
normalizer = Normalizer(copy=False)

In [36]:
n_components = [n * 100 for n in xrange(3, 8)]
ks = [k * 100 for k in xrange(25, 46)]

km_grid_hyperparam_purity = {}
km_grid_hyperparam_res = {}

for n in n_components:
    print 'doing LSA with %d components...' % n
    print

    U, S, Vt = randomized_svd(X, n_components=n)
    V = Vt.T

    X_red_grid = X.dot(V)
    X_red_grid = normalizer.fit_transform(X_red_grid)


    for k in ks:
        if k > 1000 and n > 70:
            km_grid = MiniBatchKMeans(n_clusters=k, init_size=k*3, n_init=10, init='random')
        else:
            km_grid = MiniBatchKMeans(n_clusters=k, init_size=k*3, n_init=10)
        km_grid.fit(X_red_grid)

        km_grid_hyperparam_res[(n, k)] = km_grid.labels_
        cluster_purity = evaluate.overall_purity(km_grid.labels_)
        km_grid_hyperparam_purity[(n, k)] = cluster_purity

        no_pure_clusters = len(evaluate.high_purity_clusters(km_grid.labels_, threshold=0.8))

        print 'purity for n=%d, k=%d is %0.4f, number of >0.8 clusters: %d' % (n, k, cluster_purity, no_pure_clusters)

doing LSA with 300 components...

purity for n=300, k=2500 is 0.3221, number of >0.8 clusters: 49
purity for n=300, k=2600 is 0.3210, number of >0.8 clusters: 29
purity for n=300, k=2700 is 0.3231, number of >0.8 clusters: 37
purity for n=300, k=2800 is 0.3306, number of >0.8 clusters: 39
purity for n=300, k=2900 is 0.3363, number of >0.8 clusters: 41
purity for n=300, k=3000 is 0.3366, number of >0.8 clusters: 38
purity for n=300, k=3100 is 0.3454, number of >0.8 clusters: 43
purity for n=300, k=3200 is 0.3505, number of >0.8 clusters: 53
purity for n=300, k=3300 is 0.3551, number of >0.8 clusters: 55
purity for n=300, k=3400 is 0.3576, number of >0.8 clusters: 48
purity for n=300, k=3500 is 0.3652, number of >0.8 clusters: 49
purity for n=300, k=3600 is 0.3628, number of >0.8 clusters: 53
purity for n=300, k=3700 is 0.3744, number of >0.8 clusters: 54
purity for n=300, k=3800 is 0.3674, number of >0.8 clusters: 43
purity for n=300, k=3900 is 0.3810, number of >0.8 clusters: 56
purity

In [39]:
# Report for the k-means clustering obtained with n=700 components and k=4500.
evaluate.report_overall(km_grid_hyperparam_res[(700, 4500)], purity_threshold=0.8, sort_by='size')

overall purity 0.3964
number of high purity clusters of size at least 5 is 73

- Astronomical catalogues (id=538) size=53, purity=0.9811
- Stochastic processes (id=1999) size=18, purity=0.8889
- Group theory (id=175) size=17, purity=0.8235
- Quantum mechanics (id=1751) size=16, purity=0.9375
- Quantum mechanics (id=1153) size=12, purity=0.9167
- Physics (id=530) size=11, purity=0.8182
- Probability distributions (id=3138) size=11, purity=0.8182
- Cartographic projections (id=4020) size=11, purity=0.9091
- Deformation (id=74) size=10, purity=0.8000
- Astrodynamics (id=826) size=10, purity=0.8000
- Thermodynamics (id=918) size=10, purity=0.9000
- Electromagnetism (id=1755) size=10, purity=0.8000
- Group theory (id=3301) size=10, purity=0.9000
- National Basketball Association seasons (id=282) size=9, purity=1.0000
- Electrochemistry (id=2) size=8, purity=0.8750
- Set theory (id=3525) size=8, purity=1.0000
- Enzymes (id=176) size=7, purity=0.8571
- Thermodynamics (id=271) size=7, purity=0

In [117]:
# Reload the module so that edits to cluster_evaluation are picked up, then
# rebuild the evaluator from the current titles, keys, definitions and categories.
reload(cluster_evaluation)
evaluate = cluster_evaluation.Evaluator(doc_titles=titles, doc_ids=ids, 
                                        doc_ids_definitions=rels, doc_categories=doc_categories_list)

In [121]:
# Look up one identifier in the clusters of the n=700, k=4500 run.
evaluate.find_identifier(km_grid_hyperparam_res[(700, 4500)], purity_threshold=0.8, id=u'σ', collection_weighting=0)

overall purity 0.3964
number of high purity clusters of size at least 5 is 73

category "Mathematical finance", cluster_id=28, size=5:
top categories: [(u'Mathematical finance', 5), (u'Mathematical science occupations', 4), (u'Fields of finance', 4), (u'Applied mathematics', 4), (u'Fields of application of statistics', 4)]
σ: (constant volatility: 2.71), (short rate volatility: 0.93), (model: 0.89), (security price: 0.86), (stock prices: 0.82)

category "Deformation", cluster_id=74, size=10:
top categories: [(u'Deformation', 8), (u'Plasticity', 8), (u'Mechanics', 8), (u'Solid mechanics', 8), (u'Continuum mechanics', 7)]
σ: (Examples: 0.99), (Main: 0.99), (Cauchy stress: 0.95), (articles: 0.95), (shear strength: 0.93), (Spinors in three dimensions: 0.89), (normal stress: 0.89)

category "General relativity", cluster_id=121, size=5:
top categories: [(u'General relativity', 4), (u'Theory of relativity', 4), (u'Physical cosmology', 3), (u'Theories of gravitation', 3), (u'Astronomical objec

In [123]:
# Inspect cluster 121 of the n=700, k=4500 run.
evaluate.print_cluster(km_grid_hyperparam_res[(700, 4500)], 121, collection_weighting=0)

size: 5

- Newman–Penrose formalism (categories: Theoretical physics, Mechanics, Mathematical physics, Modern physics, Mathematics, ...) n_ingo Λ_scalar Φ_22 Φ_21 Φ_20 α ε ν ρ D n_vector ϕ Φ_10 Φ_11 d ϕ_2 m_b m_a l Ψ_i_scalar ϝ_i ϵ t R_trace-fre Φ_01 Φ_00 Φ_02 β κ R_normal G S c g R_tensor ϝ_i_field ϕ_1 ϕ_0 Λ h_× γ λ σ F ϕ_j ϕ_i R_einstein R b j n l_b r Ψ_4 Ψ_2 Ψ_3 Ψ_0 Ψ_1 Ψ_i_ricci-np n_null Δ n_a n_b δ θ μ π τ r_direct Y R_relat l_a h_+ Ψ_i ϝ_i_scalar R_part Ψ_i_weyl-np r_propag m ∇_m ∇_l Ψ_4_scalar ∇_n Φ_12 ∇_a ∇_b
- Weyl scalar (categories: General relativity, Theory of relativity, Theories of gravitation, Physical cosmology) Ψ_4 Ψ_2 Ψ_3 Ψ_0 Ψ_1 Δ Λ n_a D h_× α γ β ε δ λ κ ν μ ρ π σ τ R h_+ Ψ_i m l n Ψ_4_scalar
- Non-expanding horizon (categories: Astrophysics, General relativity, Unsolved problems in physics, Astronomical objects, Density, ...) n_ingo Φ_20 α ε δ ρ D ω n_field F_field T Φ_10 v_ingo Λ_field d ϝ_i_field m_b m_a l Ψ_i_scalar ϝ_i D_connect h v_coordin g_genus Φ_01 Φ_00

In [125]:
# Same lookup for another identifier; collection_weighting is left at the
# method's default here.
evaluate.find_identifier(km_grid_hyperparam_res[(700, 4500)], purity_threshold=0.8, id=u'ϕ')

overall purity 0.3964
number of high purity clusters of size at least 5 is 73

category "Deformation", cluster_id=74, size=10:
top categories: [(u'Deformation', 8), (u'Plasticity', 8), (u'Mechanics', 8), (u'Solid mechanics', 8), (u'Continuum mechanics', 7)]
ϕ: (slope: 8.13), (angle: 6.15)

category "Statistics", cluster_id=185, size=5:
top categories: [(u'Statistics', 4), (u'Statistical theory', 4), (u'Scientific theories', 3), (u'Estimation theory', 2), (u'Bayesian statistics', 2)]
ϕ: (Jeffreys: 8.35)

category "Physics", cluster_id=530, size=11:
top categories: [(u'Physics', 9), (u'Theory of relativity', 8), (u'Special relativity', 7), (u'Concepts in physics', 6), (u'Concepts by field', 4)]
ϕ: (rapidity: 14.94), (hyperbolic angle: 8.70), (hyperbolic rotation: 7.51)

category "Differential equations", cluster_id=764, size=5:
top categories: [(u'Differential equations', 4), (u'Partial differential equations', 3), (u'Mathematical analysis', 2), (u'Fluid mechanics', 2), (u'Theoretical ph

The scores below were computed without df-weighting.

In [31]:
n_components = [200, 300, 400, 500]
ks = [k * 100 for k in xrange(20, 41)]

km_grid_hyperparam_purity = {}
km_grid_hyperparam_res = {}

for n in n_components:
    print 'doing LSA with %d components...' % n
    print

    U, S, Vt = randomized_svd(X, n_components=n)
    V = Vt.T

    X_red_grid = X.dot(V)
    X_red_grid = normalizer.fit_transform(X_red_grid)


    for k in ks:
        if k > 1000 and n > 70:
            km_grid = MiniBatchKMeans(n_clusters=k, init_size=k*3, n_init=10, init='random')
        else:
            km_grid = MiniBatchKMeans(n_clusters=k, init_size=k*3, n_init=10)
        km_grid.fit(X_red_grid)

        km_grid_hyperparam_res[(n, k)] = km_grid.labels_
        cluster_purity = evaluate.overall_purity(km_grid.labels_)
        km_grid_hyperparam_purity[(n, k)] = cluster_purity

        no_pure_clusters = len(evaluate.high_purity_clusters(km_grid.labels_, threshold=0.8))

        print 'purity for n=%d, k=%d is %0.4f, number of >0.8 clusters: %d' % (n, k, cluster_purity, no_pure_clusters)

doing LSA with 200 components...

purity for n=200, k=2000 is 0.2913, number of >0.8 clusters: 21
purity for n=200, k=2100 is 0.3085, number of >0.8 clusters: 44
purity for n=200, k=2200 is 0.3157, number of >0.8 clusters: 37
purity for n=200, k=2300 is 0.3182, number of >0.8 clusters: 40
purity for n=200, k=2400 is 0.3141, number of >0.8 clusters: 36
purity for n=200, k=2500 is 0.3291, number of >0.8 clusters: 45
purity for n=200, k=2600 is 0.3321, number of >0.8 clusters: 39
purity for n=200, k=2700 is 0.3269, number of >0.8 clusters: 36
purity for n=200, k=2800 is 0.3315, number of >0.8 clusters: 39
purity for n=200, k=2900 is 0.3433, number of >0.8 clusters: 38
purity for n=200, k=3000 is 0.3509, number of >0.8 clusters: 56
purity for n=200, k=3100 is 0.3592, number of >0.8 clusters: 49
purity for n=200, k=3200 is 0.3511, number of >0.8 clusters: 35
purity for n=200, k=3300 is 0.3499, number of >0.8 clusters: 46
purity for n=200, k=3400 is 0.3679, number of >0.8 clusters: 54
purity

KeyboardInterrupt: 

### cosine + dbscan