In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy.sparse

import json
import os
from collections import Counter

from time import time

In [2]:
from collections import defaultdict

# Maintenance categories that say nothing about an article's topic; pairs with
# one of these categories are ignored while loading.
category_blacklist = {u'Articles containing proofs',
                      u'Articles created via the Article Wizard',
                      u'Articles with example pseudocode'}

# Two mirrored indexes: title -> set of categories, category -> set of titles.
doc_categories = defaultdict(set)
category_docs = defaultdict(set)

# Every line is expected to hold exactly "<title>\t<category>"; any other
# number of tab-separated fields makes the unpacking below raise ValueError.
# The file is opened with open() inside a with-block so the handle is closed
# deterministically, even when a malformed line aborts the loop.
with open('C:/tmp/mlp/category_info.txt') as category_info:
    for line in category_info:
        title, cat = line.strip().split('\t')
        # Lines are read as byte strings; decode them to unicode.
        title = title.decode('utf-8')
        cat = cat.decode('utf-8')

        if cat in category_blacklist:
            continue

        doc_categories[title].add(cat)
        category_docs[cat].add(title)

# Number of distinct titles and distinct categories that were read.
print('%d %d' % (len(doc_categories), len(category_docs)))

# Categories attached to exactly one document.
small_cats = {cat for cat, members in category_docs.items() if len(members) == 1}

print(len(small_cats))

# Drop every such category from both indexes: first from the category sets of
# its documents, and (via pop) from the category -> documents index itself.
for cat in small_cats:
    for doc in category_docs.pop(cat):
        doc_categories[doc].remove(cat)

del small_cats

# Documents left without any category get the catch-all category 'OTHER'.
# Both indexes are updated together and only for those documents, so that
# category_docs[u'OTHER'] lists exactly the documents labelled 'OTHER'
# (previously every document was added to it, whatever its categories).
for doc, cats in doc_categories.items():
    if not cats:
        cats.add(u'OTHER')
        category_docs[u'OTHER'].add(doc)


22822 4798
508


In [3]:
def id_counter(id_list):
    """Build a Counter of identifier occurrences for one document.

    id_list is an iterable of mappings that each carry an u'element' key (the
    identifier) and an u'count' key (its count). The count is stored as-is;
    when the same element appears more than once, the last entry wins.
    """
    return Counter(dict((el[u'element'], el[u'count']) for el in id_list))

# Lower-cased definitions that are rejected by valid_def().
def_black_list = {
    # generic mathematical vocabulary
    'unit', 'units', 'value', 'values', 'axis', 'axes',
    'factor', 'factors', 'line', 'lines',
    'point', 'points', 'number', 'numbers', 'variable', 'variables',
    'respect', 'case', 'cases',
    'vector', 'vectors', 'element', 'elements', 'example',
    'integer', 'integers', 'term', 'terms', 'parameter', 'parameters',
    'coefficient', 'coefficients',
    'formula', 'times', 'product', 'matrices', 'expression',
    'complex', 'real', 'zeros', 'bits',
    'sign',
    'if and only if',
    # Greek letter names
    'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta',
    'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho',
    'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega',
}


def valid_def(definition):
    """Return True when a definition string is worth keeping.

    A definition is rejected when it has three characters or fewer, or when
    its lower-cased form is listed in def_black_list.
    """
    return len(definition) > 3 and definition.lower() not in def_black_list

def rel_to_dict(rels):
    """Map identifier -> definition for the relations that pass valid_def().

    rels is an iterable of mappings with 'identifier' and 'definition' keys.
    When an identifier occurs in several accepted relations, the last one wins.
    """
    id_to_def = {}
    for rel in rels:
        definition = rel['definition']
        if valid_def(definition):
            id_to_def[rel['identifier']] = definition
    return id_to_def

In [4]:
root = 'C:/tmp/mlp/mlp-output/'

# Parallel lists: entry i of each list describes the same kept document.
docs = []    # parsed JSON records
titles = []  # document titles
ids = []     # identifier Counters built by id_counter()
rels = []    # identifier -> definition dicts built by rel_to_dict()

# Counters for documents skipped because they have no / only one identifier.
empty = 0
small = 0

for f in os.listdir(root):
    # Each file holds one JSON document per line. Opening it in a with-block
    # closes the handle as soon as the file has been read (or on error)
    # instead of relying on garbage collection.
    with open(os.path.join(root, f)) as mlp_file:
        for line in mlp_file:
            doc = json.loads(line)

            title = doc['title']
            # Only documents with known categories are of interest.
            if title not in doc_categories:
                continue

            if '(disambiguation)' in title:
                continue

            id_bag = id_counter(doc['identifiers'])
            # Documents with fewer than two distinct identifiers are skipped.
            if len(id_bag) <= 1:
                if len(id_bag) == 0:
                    empty = empty + 1
                else:
                    small = small + 1
                continue

            docs.append(doc)
            titles.append(title)
            ids.append(id_bag)

            id_rels = rel_to_dict(doc['relations'])
            rels.append(id_rels)

In [6]:
# Position of every kept document in the parallel lists (title -> row index).
title_idx = dict((title, idx) for idx, title in enumerate(titles))

# Titles that have categories but were not kept while loading the documents.
dropped = [doc for doc in doc_categories if doc not in title_idx]

# Remove those titles from both indexes so they mirror `titles` again.
for doc in dropped:
    for cat in doc_categories[doc]:
        category_docs[cat].remove(doc)

    del doc_categories[doc]

print(len(doc_categories))

# Category set of each kept document, aligned with `titles`.
title_cats = [doc_categories[title] for title in titles]

22822

In [10]:
# Number of kept documents; used below to size the per-document arrays.
N_doc = len(titles)

Clustering

In [8]:
# Document frequency of each category: in how many documents' category sets it occurs.
df_cats = Counter(cat for cats in doc_categories.values() for cat in cats)

# Fix an order for the categories and remember the column index of each one.
categories = list(df_cats.keys())
categories_idx = dict(zip(categories, range(len(categories))))
len(categories)

4798

In [11]:
# Dense (documents x categories) matrix; a cell holds the document frequency of
# the category when the document belongs to it and 0 otherwise.
doc_category_matrix = np.zeros((N_doc, len(categories)))

for doc, cats in doc_categories.items():
    # Basic indexing returns a view, so writes to `row` land in the matrix.
    row = doc_category_matrix[title_idx[doc]]
    for cat in cats:
        row[categories_idx[cat]] = df_cats[cat]

In [12]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.preprocessing import Normalizer
# Row normalizer; copy=False asks scikit-learn to normalize in place rather than on a copy.
normalizer = Normalizer(copy=False)

In [13]:
# Rescale every document's category vector with the normalizer created above.
# NOTE(review): Normalizer is constructed without a norm argument, so the library
# default applies (L2 per the scikit-learn docs) -- confirm for the installed version.
doc_category_matrix = normalizer.fit_transform(doc_category_matrix)

In [None]:
t0 = time()

# Cluster label of every document; all documents start in cluster 0.
# The builtin int is used as dtype because the np.int alias is deprecated
# and no longer exists in recent NumPy releases.
labels = np.zeros(N_doc, dtype=int)


k_max = 1   # next unused cluster label (= number of labels handed out so far)
k = 2371    # stop once this many labels have been handed out

# Repeated bisection: each round splits the currently largest cluster in two.
while k_max < k:
    if k_max % 100 == 0:
        print('iteration %d...' % k_max)

    # Label and size of the most populated cluster.
    idx, count = Counter(labels).most_common()[0]
    # Membership mask of that cluster; it does not change during the trials.
    members = labels == idx

    # Try up to three times to get a split with more than one document on
    # each side; if every trial fails, the last split is used anyway.
    trials = 3
    while trials > 0:
        if count < 100:
            # The class imported from sklearn.cluster is spelled KMeans;
            # the previous spelling "Kmeans" raised NameError on this branch.
            model = KMeans(n_clusters=2)
        else:
            model = MiniBatchKMeans(n_clusters=2, init='random', n_init=10)

        km = model.fit(doc_category_matrix[members])

        it_labels = km.labels_
        if (it_labels == 1).sum() > 1 and (it_labels == 0).sum() > 1:
            break
        trials = trials - 1

    # Side 1 becomes the new cluster k_max, side 0 keeps the old label.
    # k_max is always >= 1, so the first assignment cannot create new zeros
    # that the second one would then relabel.
    it_labels[it_labels == 1] = k_max
    it_labels[it_labels == 0] = idx
    labels[members] = it_labels

    k_max = k_max + 1

print("done in %0.3fs." % (time() - t0))

In [None]:
# One centroid row per possible label 0..max(label); rows of labels that no
# document carries stay all-zero.
found_k = labels.max()
centroids = np.zeros((found_k + 1, len(categories)))

# np.unique yields exactly the labels that occur at least once.
for i in np.unique(labels):
    in_cluster = labels == i
    centroids[i, :] = doc_category_matrix[in_cluster, :].mean(axis=0)

# Each cluster is named after the category with the largest mean weight in it;
# indexing by `labels` then maps every document to its cluster's category.
centroid_top_cats = centroids.argmax(axis=1)
clustered_cat_labels = centroid_top_cats[labels]

category_names = np.array(categories)
clustered_categories = list(category_names[clustered_cat_labels])



How many documents have a unique category now?

In [14]:
# Clustered categories that ended up with exactly one document, and the share
# of all documents they account for.
category_sizes = Counter(clustered_categories)
low_freq = [cat for cat, size in category_sizes.most_common() if size == 1]
float(len(low_freq)) / N_doc

NameError: name 'clustered_categories' is not defined

In [None]:
import codecs
# Write one "<title>\t<clustered category>" line per document, UTF-8 encoded.
# The with-block closes the file even if a write fails.
with codecs.open('C:/tmp/mlp/doc_category_clustered.txt', 'w', encoding='utf-8') as category_file:
    for doc, cat in zip(titles, clustered_categories):
        # Write the title of the current pair; the stale global `title` used
        # here before put the same title on every line.
        category_file.write(u'%s\t%s\n' % (doc, cat))