In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import json
import os
import pickle

from collections import defaultdict
from collections import Counter

from time import time

%matplotlib inline

import scipy.sparse

In [2]:
import matplotlib
# Plot styling: use the STIX fonts for math text and for regular text,
# and render axis labels in the 'large' size.
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
matplotlib.rc('axes', labelsize='large') 

In [3]:
def id_counter(id_list):
    """Build a Counter that maps each identifier to its recorded count.

    `id_list` is an iterable of mappings, each providing an 'element' key
    (the identifier) and a 'count' key (its count). Counts are assigned, not
    summed: if an identifier occurs more than once, its last count wins.
    """
    return Counter(dict((entry[u'element'], entry[u'count']) for entry in id_list))

# Definitions that are rejected by valid_def (compared in lower case):
# generic mathematical vocabulary, one fixed phrase and the Greek letter names.
def_black_list = set([
    # generic vocabulary
    'unit', 'units', 'value', 'values', 'axis', 'axes', 'factor', 'factors',
    'line', 'lines', 'point', 'points', 'number', 'numbers',
    'variable', 'variables', 'respect', 'case', 'cases',
    'vector', 'vectors', 'element', 'elements', 'example',
    'integer', 'integers', 'term', 'terms', 'parameter', 'parameters',
    'coefficient', 'coefficients',
    'formula', 'times', 'product', 'matrices', 'expression', 'complex',
    'real', 'zeros', 'bits',
    'sign',
    # fixed phrase
    'if and only if',
    # Greek letter names
    'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta',
    'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho',
    'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega',
])


def valid_def(definition):
    """Return True when `definition` is usable as an identifier definition.

    A definition is rejected when it is at most 3 characters long or when its
    lower-cased form is listed in `def_black_list`.
    """
    return len(definition) > 3 and definition.lower() not in def_black_list

def rel_to_dict(rels):
    """Group relations by identifier, keeping only those with a valid definition.

    `rels` is an iterable of mappings with the keys 'identifier', 'definition'
    and 'score'. Returns a defaultdict(list) that maps each identifier to the
    list of its (definition, score) pairs, in input order. Relations whose
    definition fails `valid_def` are skipped.
    """
    grouped = defaultdict(list)
    for rel in rels:
        definition = rel['definition']
        if valid_def(definition):
            grouped[rel['identifier']].append((definition, rel['score']))
    return grouped


# title -> set of its categories, and the inverse mapping category -> set of titles
doc_categories = defaultdict(set)
category_docs = defaultdict(set)

# Every line holds "<title>\t<category>", UTF-8 encoded.
# The `with` block makes sure the file handle is closed once the file is read.
with open('C:/tmp/mlp/category_info_refined.txt') as category_file:
    for line in category_file:
        title, cat = line.strip().split('\t')
        title = title.decode('utf-8')
        cat = cat.decode('utf-8')

        # let's also remove all documents from "OTHER" category
        if cat == u'OTHER':
            continue

        doc_categories[title].add(cat)
        category_docs[cat].add(title)

root = 'C:/tmp/mlp/mlp-output/'

docs = []
titles = []
ids = []
rels = []

empty = 0
small = 0
uncategorized = 0

for f in os.listdir(root): 
    for line in file(root + f):
        doc = json.loads(line)

        title = doc['title']        
        if title not in doc_categories:
            uncategorized = uncategorized + 1
            continue

        if '(disambiguation)' in title:
            continue

        id_bag = id_counter(doc['identifiers'])
        if len(id_bag) <= 1:
            if len(id_bag) == 0:
                empty = empty + 1
            else:
                small = small + 1
            continue

        docs.append(doc)
        titles.append(title)
        ids.append(id_bag)

        id_rels = rel_to_dict(doc['relations'])
        rels.append(id_rels)

print empty, small, uncategorized

N_doc = len(ids)
print N_doc

title_idx = {title: idx for (idx, title) in enumerate(titles)}

for doc, cats in doc_categories.items():
    if doc in title_idx:
        continue

    for cat in cats: 
        category_docs[cat].remove(doc)
    
    del doc_categories[doc]

print len(doc_categories)

doc_categories_list = [doc_categories[doc] for doc in titles]

0 0 7501
22512
22512


In [4]:
# Project-local evaluation helpers. The module is reloaded right after the
# import — presumably so that edits made to it while the kernel is running are
# picked up (TODO confirm).
import cluster_evaluation
reload(cluster_evaluation)
# The evaluator receives the four per-document lists built above
# (titles, identifier counters, definitions and category sets).
evaluate = cluster_evaluation.Evaluator(doc_titles=titles, doc_ids=ids, 
                                        doc_ids_definitions=rels, doc_categories=doc_categories_list)

## Using MSC and PACS

### MSC

Data taken from http://cran.r-project.org/web/classifications/MSC.html

In [5]:
# Three-level MSC hierarchy:
#   tree[top-level name][second-level name] -> list of third-level names
tree = {}

top_parent = None
parent = None

# The `with` block closes the file handle once the file has been parsed.
with open('C:/tmp/mlp/msc.txt') as msc_file:
    for line in msc_file:
        if line.startswith('#'):
            continue

        # Nesting level: number of spaces among the first 8 characters, four
        # spaces per level. Floor division (`//`) keeps this an integer on any
        # Python version, so a stray space does not change the level.
        tabs = line[:8].count(' ') // 4
        code, name = line.strip().split(': ', 1)
        name = name.decode('utf-8')
        name = name.replace('$', '')

        if code.endswith('99'):
            # 'Miscellaneous topics' or 'None of the above, but in this section'
            continue

        if tabs == 0:
            top_parent = name
            tree[top_parent] = {}
        elif tabs == 1:
            parent = name
            tree[top_parent][parent] = []
        else:
            tree[top_parent][parent].append(name)

Let's remove categories without subcategories: they are usually proceedings, historical notes or monographs anyway. The "Applications" subcategories don't seem to be very useful, so let's remove them as well. Finally, let's remove some top-level categories such as "General", "History and biography", etc.

Some categories are physics-specific and have many common words, so let's remove them and let PACS be used instead

In [6]:
# Whole top-level MSC sections that are dropped.
for section in ('General', 'History and biography', 'Mathematics education'):
    del tree[section]

# Individual second-level entries that are dropped from sections that are kept.
for section, subsection in (
        ('Quantum theory', 'Axiomatics, foundations, philosophy'),
        ('Quantum theory', 'Applications to specific physical systems'),
        ('Quantum theory', 'Groups and algebras in quantum theory'),
        ('Partial differential equations',
         'Equations of mathematical physics and other areas of application'),
        ('Statistics', 'Sufficiency and information'),
        ('Functional analysis', 'Other (nonclassical) types of functional analysis'),
        ('Functional analysis', 'Miscellaneous applications of functional analysis')):
    del tree[section][subsection]

In [7]:
# Remove second-level MSC entries that have no children, are called
# 'Applications', or whose name mentions proceedings / conferences / collections.
for k_top, top in tree.items():
    # Iterate over a snapshot: entries are deleted from `top` inside the loop,
    # which is only safe on the live dict where `items()` returns a copy.
    for k_2, v in list(top.items()):
        lowered = k_2.lower()
        if (not v
                or k_2 == u'Applications'
                or 'proceedings' in lowered
                or 'conferences' in lowered
                or 'collections' in lowered):
            del top[k_2]

### PACS

data taken from https://github.com/teefax/pacsparser

In [8]:
import re

tree_pacs = {}

top_top_parent = None
top_parent = None
parent = None

pacs_file = file('C:/tmp/mlp/pacs.txt')

see_also_re = re.compile('\(see also.+?\)')
for_see_re = re.compile('\(for.+?see.+?\)')
tags_re = re.compile('<[^ ].*?>')

for line in pacs_file:
    if line.startswith('#'):
        continue
    if not line.strip():
        continue

    line = line.strip()
    code = line[0:8]
    if code.strip() == '... ...':
        continue

    if code == 'APPENDIX':
        break
        
    name = line[9:].decode('utf-8').replace('$', '')

    name = see_also_re.sub('', name)
    name = for_see_re.sub('', name)
    name = tags_re.sub('', name)

    codes = code.split('.')
    if len(codes) < 3:
        print code
    is_top_level =  False

    if codes[0].isdigit():
        top_code = int(codes[0])
        is_top_level = (top_code % 10 == 0) & (codes[1] == '00') & (codes[2] == '00')

    if is_top_level:
        top_top_parent = name
        tree_pacs[top_top_parent] = {}
    elif (codes[1] == '00') & (codes[2] == '00'):
        top_parent = name
        tree_pacs[top_top_parent][top_parent] = {}
    elif codes[2][0] in ['+', '-']:
        parent = name
        tree_pacs[top_top_parent][top_parent][parent] = []
    else: # tabs == 0
        tree_pacs[top_top_parent][top_parent][parent].append(name)

In [9]:
# Drop the whole top-level 'GENERAL' section from the PACS hierarchy.
del tree_pacs['GENERAL']

In [10]:
# Flatten the two upper PACS levels into one: the key is "<section> <group>",
# the value is that group's subgroup dictionary.
pacs = dict(
    (section_name + ' ' + group_name, subgroups)
    for section_name, groups in tree_pacs.items()
    for group_name, subgroups in groups.items()
)

In [11]:
# Collapse the two lower PACS levels: every group is described by a flat list
# made of each subgroup name followed by that subgroup's leaf names.
general_pacs = {}
for section_name, groups in tree_pacs.items():
    flattened_groups = {}

    for group_name, subgroups in groups.items():
        description = []
        for subgroup_name, leaves in subgroups.items():
            description += [subgroup_name] + leaves
        flattened_groups[group_name] = description

    general_pacs[section_name] = flattened_groups

### Wikipedia Categories

MSC's "Statistics" category doesn't seem to be good enough - statistical clusters get mapped to anything but statistics. So let's create a statistics category ourselves using the Wikipedia category data

In [12]:
wiki_tree = {}

# category -> set of direct sub-categories / direct super-categories
narrower = defaultdict(set)
broader = defaultdict(set)

# Every line holds a category followed by its broader categories, separated by
# tabs. The `with` block closes the file handle once the file has been read.
with open('C:/tmp/mlp/skos_math_broader.txt') as skos_file:
    for line in skos_file:
        all_categories = line.decode('utf-8').strip().split('\t')

        child = all_categories[0]
        broader_list = all_categories[1:]

        for parent in broader_list:
            narrower[parent].add(child)
            broader[child].add(parent)

# Children and grandchildren of the 'Statistics' category.
wiki_stat_category = set(narrower['Statistics'])

for cat in narrower['Statistics']:
    wiki_stat_category.update(narrower[cat])

wiki_tree['Statistics'] = {'Wiki statistics': wiki_stat_category}

### ACM Classification Scheme

Can be downloaded from http://www.acm.org/about/class/class/2012 and has parsable skos format http://dl.acm.org/ft_gateway.cfm?id=2371137&ftid=1290922&dwn=1

In [13]:
import rdflib
from rdflib.namespace import SKOS

import logging
# Only let messages of level ERROR and above through from the 'rdflib.term' logger.
logging.getLogger('rdflib.term').setLevel(logging.ERROR)

In [14]:
# Load the ACM classification (SKOS) into an RDF graph.
g = rdflib.Graph()
g.load('C:/tmp/mlp/acm_skos_taxonomy.xml')

# Number of leading characters cut off every concept URI below.
# NOTE(review): presumably the concept URIs are resolved relative to the loaded
# file, so they all start with this file URI — confirm. The literal is tied to
# the path used in g.load above and must be changed together with it.
pref_len = len('file:///C://tmp/mlp/acm_skos_taxonomy.xml')

# shortened concept id -> English preferred label
acm_titles = {}

for s, p, o in g.triples((None, SKOS.prefLabel, None)):
    # labels in languages other than English are ignored
    if o.language != 'en':
        continue
    acm_titles[s[pref_len:]] = o.value

# label -> set of labels of its broader / narrower concepts
# (these rebind the `broader` / `narrower` names used for the wiki categories)
broader = defaultdict(set)
narrower = defaultdict(set)

for s, _, o in g.triples((None, SKOS.broader, None)):
    subj_title = acm_titles[s[pref_len:]]
    obj_title = acm_titles[o[pref_len:]]
    broader[subj_title].add(obj_title)
    narrower[obj_title].add(subj_title)

# Labels used as the roots of the ACM tree built in this notebook.
top_level = ['Hardware', 'Computer systems organization', 'Networks', 'Software and its engineering',
             'Theory of computation', 'Information systems', 'Security and privacy', 'Human-centered computing',
             'Computing methodologies']

# acm_tree[top-level label][child label] -> list with the child followed by all
# of its descendants
acm_tree = {}


def dfs(result, cat):
    """Append `cat` and all of its descendants to `result`, depth first.

    Descendants are looked up in the module-level `narrower` mapping. `result`
    is modified in place and also returned.
    """
    result.append(cat)
    # .get() avoids creating an empty entry in the defaultdict for leaf categories
    for n_cat in narrower.get(cat, ()):
        dfs(result, n_cat)
    return result


for most_top in top_level:
    acm_tree[most_top] = dict((child, dfs([], child)) for child in narrower[most_top])

## Category Index

Now we can build an index from these categories and find the most similar category for each cluster

But first, let's merge MSC, PACS and the other categories

In [15]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')

from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Merge the category trees into one: top-level name -> {subcategory -> terms}.
# When two sources share a top-level name and a subcategory name, the source
# that comes later in the tuple wins.
all_cat_tree = defaultdict(dict)

for source_tree in (tree, general_pacs, acm_tree):  # wiki_tree is currently not merged in
    for top_name, subcategories in source_tree.items():
        all_cat_tree[top_name].update(subcategories)

In [17]:
# Token list of every category document, and row number -> (top name, subcategory name)
all_categories = []
all_categories_idx = {}

# Stop words: NLTK's English list plus generic words added by hand.
extra_stopwords = ['etc', 'given', 'method', 'methods', 'theory', 'problem', 'problems', 'model', 'models',
                   'section', 'must', 'also']
english_sw = set(stopwords.words('english')).union(extra_stopwords)

cnt = 0

for k_top, subcategories in all_cat_tree.items():
    # The top-level name is put into the document three times.
    repeated_top = ' '.join([k_top] * 3)

    for k_2, v in subcategories.items():
        document = repeated_top + ' ' + k_2 + ' ' + ' '.join(v)
        # keep purely alphabetic tokens, lower-cased, without stop words
        tokens = [t.lower() for t in word_tokenize(document) if t.isalpha()]
        tokens = [t for t in tokens if t not in english_sw]
        # tokens = [snowball_stemmer.stem(t) for t in tokens]
        all_categories_idx[cnt] = (k_top, k_2)
        all_categories.append(tokens)
        cnt += 1


def identity(lst):
    """Return `lst` unchanged; passed as the analyzer because documents are already token lists."""
    return lst

category_vectorizer = TfidfVectorizer(analyzer=identity)
category_vectorizer.fit(all_categories)

cat_index = category_vectorizer.transform(all_categories)
cat_index

<533x5321 sparse matrix of type '<type 'numpy.float64'>'
	with 19661 stored elements in Compressed Sparse Row format>

### Indexing Clusters

Now let's apply the same transformation to clusters

In [18]:
# NOTE(security): unpickling can execute arbitrary code - only load files that
# were produced by a trusted run of this project.
# NOTE(review): the file is opened in the default (text) mode, as before; if it
# was written in binary mode it should be opened with 'rb' - confirm.
with open('C:/tmp/mlp/namespaces/best414.bin') as cluster_file:
    cluster_assignment = pickle.load(cluster_file)

In [19]:
# Cluster descriptions returned by the evaluator for the given purity threshold
# (0.8) and minimum size (3); each description exposes its cluster id under 'cluster'.
descriptions = evaluate.high_purity_clusters(cluster_assignment, threshold=0.8, all_categories=1, min_size=3)
desc_ids = [d['cluster'] for d in descriptions]
# number of selected clusters
len(desc_ids)

414

In [20]:
def counter_to_string(cnt, repeat=1):
    """Turn a word -> count mapping into a whitespace separated string.

    cnt    -- mapping from word to its count (e.g. a Counter)
    repeat -- if true, every word is written `count` times (each occurrence
              followed by a space); otherwise every word is written once

    The words are taken from `cnt` itself. The previous implementation iterated
    over the notebook-global `cl_cats` in the `repeat` branch and shadowed the
    `cnt` parameter with its loop variable, so it only gave the right answer
    when called with that very global.
    """
    if repeat:
        return ' '.join([(word + ' ') * count for word, count in cnt.items()])
    else:
        return ' '.join(cnt.keys())

def all_definitions(clustering, cluster_index):
    """Collect the words of all definitions found in the documents of one cluster.

    clustering    -- array with one cluster label per document
    cluster_index -- label of the cluster of interest

    Returns a flat list with the whitespace separated words of every definition
    stored in the module-level `rels` for the documents labelled `cluster_index`.
    """
    (member_positions,) = np.where(clustering == cluster_index)

    words = []
    for position in member_positions:
        for scored_definitions in rels[int(position)].values():
            for definition, _score in scored_definitions:
                words.extend(definition.split())
    return words

# One token list per selected cluster, in the order of `desc_ids`.
clusters_representation = []

for cl_id in desc_ids:
    # NOTE(review): `cl_cats` is also visible as a notebook global; verify that
    # no helper depends on that name before renaming it.
    cl_titles, cl_cats = evaluate.cluster_details(cluster_assignment, cl_id)
    #defs = all_definitions(clustering, cl_id)
    # The cluster "document" is made of its titles plus its categories
    # (the definitions are currently left out, see the commented parts).
    document = ' '.join(cl_titles) + ' ' + counter_to_string(cl_cats) #+ ' '.join(defs)
    # Same token filtering as for the category documents:
    # alphabetic tokens only, lower-cased, stop words removed.
    tokens = word_tokenize(document)
    tokens = [t.lower() for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if t not in english_sw]
    # tokens = [snowball_stemmer.stem(t) for t in tokens]
    clusters_representation.append(tokens)
    
# Project the clusters into the vector space fitted on the categories.
clus_index = category_vectorizer.transform(clusters_representation)
clus_index

<414x5321 sparse matrix of type '<type 'numpy.float64'>'
	with 10564 stored elements in Compressed Sparse Row format>

Now calculate the cosine similarity between the vector-space representations of the hierarchy categories and of the clusters

In [21]:
# Dense (cluster x category) matrix of dot products between the TF-IDF rows.
# NOTE(review): this equals the cosine similarity provided the rows are
# L2-normalised, which is TfidfVectorizer's default - confirm for the installed version.
clus_cat_sim = clus_index.dot(cat_index.T).toarray()
# For every cluster: the index of its best matching category and the score of that match.
clus_cat_assignment = np.argmax(clus_cat_sim, axis=1)
best_similarity = np.max(clus_cat_sim, axis=1)
clus_cat_assignment

array([400, 406, 441, 345, 489, 455, 173,  24, 367, 250,  71, 347, 406,
        27, 382, 399, 347, 419, 274, 527,  45, 455, 485,  24, 489, 370,
       307,   5,  73, 113,  73,  53, 138, 370, 442, 362, 370, 385, 343,
       478, 527,  25, 417, 329, 102, 311, 526, 410, 354, 195, 432,  64,
       370,  53,   8, 278, 397, 475, 409, 489, 417,  13, 530, 343, 324,
       274, 201, 422, 432, 262,  73, 201, 362, 101, 112, 326, 526,  67,
        51, 409, 432, 324,  81, 527, 281, 417, 528, 419, 262, 424, 274,
       304, 217, 517, 417, 384, 324, 489,   2, 364, 103, 445, 106, 307,
        51, 483, 195, 112,  84, 196, 379, 240, 266, 103, 354, 380, 347,
       267, 311,  67, 419, 274,   2, 475,  81, 264,  51, 113,   1,  51,
       112, 311, 513, 445, 173, 343, 362, 188,  85, 267,  25, 372, 193,
       452, 324,  63, 442, 372, 324,  73, 484, 103, 324, 490,  73, 372,
       341, 330, 341, 175, 398, 478,  64, 480, 422, 332, 343, 330, 526,
       406, 201,  97, 429, 390, 417, 323, 173, 387, 440, 377, 42

In [22]:
# Show the cluster stored at position 10 of the selected clusters.
clus_id = 10
evaluate.print_cluster(cluster_assignment, desc_ids[clus_id])

cluster 217, size: 5
- Stirling's approximation (categories: Mathematical analysis, Mathematical theorems, Number theory, Complex analysis, Special functions, ...) π B_k d Π s k j m Γ O n p S t N_+ y x c_n z
- Error function (categories: Complex analysis, Special functions, Types of functions, Analytic functions, Real analysis, ...) Γ E_n τ Φ γ ζ π σ E_0 D c_k c_m O N Q P F_1 w d g k j m l n q p erf erfc t R_N x z
- Clausen function (categories: Special functions, Analytic number theory, Algebraic number theory, Zeta and L-functions) Sl_4 Γ Sl_1 Sl_2 Sl_3 Λ Li_2 Cl_4 Cl_5 Cl_6 Cl_1 Cl_2 Cl_3 β Gl_m η ζ θ ψ_m π Li_n G φ S_z L R Ti_2 Z B_j C_z d B_n Cl_s k j m l n q p s t y x z
- Properties of polynomial roots (categories: Mathematical analysis, Abstract algebra, Fields of mathematics, Functions and mappings, Numerical analysis, ...) σ b_0 b_1 b_m d_1 N_n ζ z_j z_n π C B D R_p M O Q P R x_± z_1 c b d g h k j m l n q p t x z
- Struve function (categories: Combinatorics, Complex analysis, 

## Building a hierarchy

Let's build a hierarchy:

In [23]:
# parent category -> list of (namespace category, score, description, common keywords)
namespaces = defaultdict(list)
# list of (cluster id, (parent category, namespace category))
namespace_name = []

for idx, desc in enumerate(descriptions):
    cat_id = clus_cat_assignment[idx]
    score = best_similarity[idx]

    # words shared by the cluster document and its best matching category document
    common_keywords = set(clusters_representation[idx]).intersection(all_categories[cat_id])

    parent_cat, namespace_cat = all_categories_idx[cat_id]
    # Weak matches (low score or a single shared keyword) are filed under 'OTHER'.
    weak_match = score <= 0.2 or len(common_keywords) == 1
    if weak_match:
        parent_cat = 'OTHER'

    namespaces[parent_cat].append((namespace_cat, score, desc, common_keywords))
    namespace_name.append((desc['cluster'], (parent_cat, namespace_cat)))

# From here on `namespaces` is a list of (parent category, groups) pairs sorted by parent category.
namespaces = sorted(namespaces.items())

In [24]:
class Namespace(object):
    """A node of the namespace hierarchy.

    Every node has a name, an optional parent and a list of children. Nodes
    created for a cluster additionally carry the cluster's wiki category
    counts, its identifier relations and matching information.
    """

    # Defaults for nodes on which the corresponding setter was never called.
    _parent = None
    _name = None
    _identifiers = None
    _children = None
    _relations = None
    _wiki_cats = None

    _cluster_id = None
    _purity = None
    _matching_score = None
    _matching_terms = None

    def __init__(self, name, parent=None):
        """Create a node called `name`; if `parent` is given, register the node as its child."""
        self._name = name
        self._children = []

        if parent is not None:
            self._parent = parent
            self._parent._children.append(self)

    def set_wiki_categories(self, wiki_cats):
        """Store the wiki category counts (an object with a `most_common` method, e.g. a Counter)."""
        self._wiki_cats = wiki_cats

    def most_common_wiki_cat(self):
        """Return the most common (category, count) pair, or None if no categories are set."""
        if self._wiki_cats:
            return self._wiki_cats.most_common(1)[0]
        else:
            return None

    def set_relations(self, relations):
        """Store the relations as an iterable of (identifier, definition list) pairs."""
        self._relations = relations

    def set_additional_info(self, cluster_id, purity, matching_score, matching_terms):
        """Store the id of the underlying cluster, its purity and how it matched its category."""
        self._cluster_id = cluster_id
        self._purity = purity
        self._matching_score = matching_score
        self._matching_terms = matching_terms

    def print_ns(self, indend=0, print_rels=0):
        """Print this node and, recursively, its children.

        indend     -- nesting depth; every level indents by four spaces
        print_rels -- if true, the stored relations are printed as well

        Every print call takes one pre-formatted string, which produces the
        same output whether `print` is a statement or a function.
        """
        indend_str = ' ' * (4 * indend)
        print('%s %s' % (indend_str, 'Category: %s' % self._name))
        if self._wiki_cats:
            top_cats = ', '.join('%s (%d)' % (cat, cnt) for cat, cnt in self._wiki_cats.most_common(3))
            print('%s %s %s' % (indend_str, '          wiki categories:', top_cats))
        # `is not None` rather than truthiness: 0 is a valid cluster id.
        if self._cluster_id is not None:
            cluster_line = '          cluster_id: %d (matching score: %0.2f, purity: %0.2f)' % \
                                    (self._cluster_id, self._matching_score, self._purity)
            print('%s %s' % (indend_str, cluster_line))
            print('%s %s %s' % (indend_str, '          common:', ' '.join(self._matching_terms)))

        if print_rels and self._relations:
            for identifier, def_list in self._relations:
                print('%s %s %s' % (indend_str, '-', evaluate._string_def_list(identifier, def_list)))

        print('')
        for child in self._children:
            child.print_ns(indend+1, print_rels=print_rels)
            print('')

    def __repr__(self):
        return self._name

In [25]:
# Build the hierarchy: ROOT -> parent category -> one namespace per cluster.
ROOT = Namespace('ROOT')

for parent_cat, groups in namespaces:
    parent_namespace = Namespace(parent_cat, ROOT)

    for cat, score, desc, common in groups:
        cluster_id = desc['cluster']

        ns = Namespace(cat, parent_namespace)
        ns.set_wiki_categories(desc['all_categories'])
        ns.set_additional_info(cluster_id, desc['purity'], score, common)

        # identifier -> definitions of the cluster, stored sorted by identifier
        all_def = evaluate.find_all_def(cluster_assignment, cluster_id)
        ns.set_relations(sorted(all_def.items()))

        # NOTE(review): not used by any cell visible here.
        wiki_category = ns.most_common_wiki_cat()

In [26]:
# Print the whole hierarchy, without the relations of each namespace.
ROOT.print_ns(print_rels=0)

 Category: ROOT

     Category: ATOMIC AND MOLECULAR PHYSICS

         Category: Atomic properties and interactions with photons 
                   wiki categories: Atomic physics (3), Quantum mechanics (3), Atomic, molecular, and optical physics (3)
                   cluster_id: 4338 (matching score: 0.34, purity: 1.00)
                   common: optical molecular physics atomic



     Category: Algebraic geometry

         Category: Computational aspects in algebraic geometry
                   wiki categories: Sheaf theory (3), Theorems in geometry (3), Theorems in algebraic geometry (3)
                   cluster_id: 86 (matching score: 0.60, purity: 1.00)
                   common: surfaces geometry varieties algebraic


         Category: Computational aspects in algebraic geometry
                   wiki categories: Algebraic geometry (4), Algebraic varieties (3), Manifolds (3)
                   cluster_id: 2977 (matching score: 0.59, purity: 0.80)
                   common:

In [27]:
# Inspect a single cluster by its id.
evaluate.print_cluster(cluster_assignment, 2892)

cluster 2892, size: 4
- Early stopping (categories: Machine learning, Cybernetics, Artificial intelligence, Mathematical modeling, Networks, ...) L_2 E_z x_i y_i ρ E f_ρ H R Y X Z γ_t d f m n f_t t y x z
- Stability (learning theory) (categories: Machine learning, Artificial intelligence, Learning, Cognition, Behavior, ...) E_z x_m f_S o P_S E_S x_1 y_m β δ z_i z_m C E H L O S V Y X Z Z_m z_1 d f k m l y_1 n y x z
- Leave one out error (categories: Machine learning, Artificial intelligence, Learning) E_z x_m f_S o P_S y_m β δ z_i z_m V E F L S x_1 Y X Z Z_m z_1 f m l y_1 n y x z
- Statistical learning theory (categories: Machine learning, Artificial intelligence, Learning) x Z p d f H z γ y_1 n Θ y_n S R y_i V Y X θ y f_S

common terms: (z f n y x Y X Z)
top categories: (Machine learning, 4), (Learning, 4), (Artificial intelligence, 4), (Neural networks, 1), (Computational neuroscience, 1)
purity: 1.000
relations:
     E: (unknown probability measure: 0.83)
     H: (hypothesis space*: 

In [77]:
# First entry: (cluster id, (parent category, namespace category)).
namespace_name[0]

(24,
 (u'Numerical analysis',
  u'Mathematical programming, optimization and variational techniques'))

In [88]:
top_namespaces = set()
for (p, c), t in Counter([(p, c) for id, (p, c) in namespace_name]).most_common(15):
    print p + ': ' + c, t
    top_namespaces.add((p, c))

Fluid mechanics: Biological fluid mechanics 10
Partial differential equations: Close-to-elliptic equations 9
Field theory and polynomials: Differential and difference algebra 9
Game theory, economics, social and behavioral sciences: Mathematical economics 8
Probability theory and stochastic processes: Foundations of probability theory 8
Mathematical logic and foundations: General logic 7
Group theory and generalizations: Connections with homological algebra and category theory 7
Geometry: Analytic and descriptive geometry 7
OTHER: Properties of specific nuclei listed by mass ranges (an additional heading must be chosen with these entries, where the given mass number limits are, to some degree, arbitrary) 7
Probability theory and stochastic processes: Distribution theory 7
Optics, electromagnetic theory: General 7
Differential geometry: Classical differential geometry 7
Numerical analysis: Numerical linear algebra 7
Quantum theory: General mathematical topics and methods in quantum theo

In [89]:
# (parent category, namespace category) -> ids of the clusters assigned to it,
# restricted to the pairs collected in `top_namespaces`
top_ns = defaultdict(list)

for ns_cluster_id, pair in namespace_name:
    if pair not in top_namespaces:
        continue
    top_ns[pair].append(ns_cluster_id)

In [90]:
# Display the mapping from the most frequent namespaces to their cluster ids.
top_ns

defaultdict(<type 'list'>, {(u'Probability theory and stochastic processes', u'Distribution theory'): [373, 1992, 2668, 4251, 4308, 6153, 9112], (u'Group theory and generalizations', u'Connections with homological algebra and category theory'): [1328, 2746, 4764, 4861, 6748, 7228, 7566], (u'Quantum theory', u'General mathematical topics and methods in quantum theory'): [1767, 2652, 6256, 6783, 8354, 8648, 9145], (u'Partial differential equations', u'Close-to-elliptic equations'): [1576, 1665, 3919, 4748, 5596, 5640, 7407, 7954, 8443], (u'Differential geometry', u'Classical differential geometry'): [2234, 2492, 3453, 5302, 6325, 7885, 7959], ('OTHER', u'Computational methods'): [1855, 2107, 3388, 3508, 4995, 8893], (u'Field theory and polynomials', u'Differential and difference algebra'): [228, 359, 2566, 5649, 6823, 7402, 7890, 8391, 8847], (u'Mathematical logic and foundations', u'General logic'): [1123, 2369, 5751, 6281, 7379, 8524, 9249], (u'Probability theory and stochastic process

In [120]:
# Same statements as the cell that first creates `evaluate`: the module is
# reloaded and the evaluator rebuilt from the same four lists - presumably to
# pick up changes made to cluster_evaluation in the meantime (TODO confirm).
import cluster_evaluation
reload(cluster_evaluation)
evaluate = cluster_evaluation.Evaluator(doc_titles=titles, doc_ids=ids, 
                                        doc_ids_definitions=rels, doc_categories=doc_categories_list)

In [124]:
# For each of the most frequent namespaces, print a summary of all of its
# clusters taken together (the list of cluster ids is passed in one call).
for (p, c), id_list in top_ns.items():
    print p, ': ', c
    
    evaluate.print_cluster(cluster_assignment, id_list, print_docs=0, sort_by_score=1, normalize_score=1, top_k_def=10)
    
    # three blank lines between namespaces
    print 
    print
    print

Probability theory and stochastic processes :  Distribution theory
common terms: ()
top categories: (Probability distributions, 25), (Continuous distributions, 20), (Infinitely divisible probability distributions, 5), (Gaussian function, 5), (Exponential family distributions, 5)
purity: 8.333
relations:
μ: mean* (1.00)
σ: variance* (1.00)
f: probability density function* (1.00)
x: probability density function* (1.00)
β: skewness parameter* (1.00)
α: shape parameter* (0.99)
X: random variable* (0.99)
F: cumulative distribution function* (0.98)
π: interval* (0.98)
Γ: interval* (0.97)



Group theory and generalizations :  Connections with homological algebra and category theory
common terms: ()
top categories: (Group theory, 30), (Abstract algebra, 19), (Metric geometry, 16), (Algebraic structures, 5), (Theorems in group theory, 4)
purity: 10.000
relations:
G: group* (1.00)
H: subgroup* (1.00)
Z: group* (1.00)
K: subgroup* (0.99)
N: group* (0.99)
p: power* (0.98)
n: root* (0.95)
F: free 

In [148]:
for (p, c), id_list in top_ns.items():
    print (p, c)
    all_idx = np.array([], dtype=np.int)

    for cluster_id in id_list:
        indices, = np.where(cluster_assignment == cluster_id)
        all_idx = np.concatenate((all_idx, indices))
        
    all_articles = []
    for np_idx in all_idx:
        idx = int(np_idx)
        no_ids = sum(c for _, c in ids[idx].items())
        all_articles.append((idx, no_ids))
    
    all_articles = sorted(all_articles, key=lambda k: k[1], reverse=True)[:5]
    
    all_idx, _ = zip(*all_articles)
    
    for idx, size in all_articles:
        print '%s (%s),' % (titles[idx], size),
    
    print
    print

(u'Probability theory and stochastic processes', u'Distribution theory')
Chebyshev's inequality (1019), Gamma distribution (612), Noncentral chi-squared distribution (435), Sum of normally distributed random variables (395), Chi-squared distribution (286),

(u'Group theory and generalizations', u'Connections with homological algebra and category theory')
Free abelian group (100), Center (group theory) (93), Holomorph (mathematics) (84), P-group (68), Powerful p-group (61),

(u'Quantum theory', u'General mathematical topics and methods in quantum theory')
Gamma matrices (1626), Schrödinger equation (959), Theoretical and experimental justification for the Schrödinger equation (547), Quantization of the electromagnetic field (535), Quantum harmonic oscillator (507),

(u'Partial differential equations', u'Close-to-elliptic equations')
Orr–Sommerfeld equation (256), Helmholtz equation (250), Fictitious domain method (214), Green's function for the three-variable Laplace equation (209), Eik

In [142]:
# Document positions of the top articles of the namespace processed last by the loop that sets `all_idx`.
all_idx

(4656, 10447, 1076, 11115, 20959)

In [160]:
# Look up the identifier 'σ' in the clusters selected with the same purity threshold and minimum size as above.
evaluate.find_identifier(cluster_assignment, purity_threshold=0.8, id=u'σ', min_size=3)

overall purity 0.6336
number of high purity clusters of size at least 5 is 414

category "Category theory", cluster_id=56, size=3:
top categories: [(u'Category theory', 3), (u'Multilinear algebra', 3), (u'Monoidal categories', 3), (u'Closed categories', 1), (u'Dagger categories', 1)]
     σ: (natural isomorphisms: 0.89)
category "Physics", cluster_id=69, size=4:
top categories: [(u'Physics', 4), (u'Mechanics', 3), (u'Theoretical physics', 3), (u'Modern physics', 2), (u'Philosophy of physics', 2)]
     σ: (state: 1.94), (system: 0.87)
category "Special functions", cluster_id=217, size=5:
top categories: [(u'Special functions', 4), (u'Complex analysis', 3), (u'Types of functions', 2), (u'Mathematical analysis', 2), (u'Analytic functions', 2)]
     σ: (standard deviation: 1.94), (variance: 1.90), (mean density*: 1.70), (expected value: 0.89), (errors: 0.87), (probability: 0.86), (normal distribution: 0.84), (distance: 0.81)
category "Linear algebra", cluster_id=246, size=4:
top categories

## Some statistics

In [152]:
# Number of selected (high purity) clusters.
len(desc_ids)

414

In [154]:
# Number of documents that fall into one of the selected clusters ...
used = sum((cluster_assignment == id).sum() for id in desc_ids)
# ... shown with the total number of documents and the fraction of documents NOT covered.
used, len(ids), (len(ids) * 1.0 - used) / len(ids) 

(1774, 22512, 0.9211975835110163)