In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import json
import os
from collections import Counter

In [3]:
from time import time

In [45]:
root = 'C:/tmp/mlp/output/output-test/'

docs = []

for f in os.listdir(root): 
    for line in file(root + f):
        doc = json.loads(line)
        docs.append(doc)

In [46]:
titles = [d['title'] for d in docs] 
ids = [d['identifiers'] for d in docs]

In [6]:
def rel_to_dict(rels):
    return {r['identifier']: r['definition'] for r in rels}

In [7]:
rels = [rel_to_dict(d['relations']) for d in docs]

In [8]:
all_ids = sum(ids, [])
c = Counter(all_ids)

In [9]:
frequent = set(el for (el, cnt) in c.items() if cnt > 2)

In [10]:
def keep_frequent(ids):
    return [i for i in ids if i in frequent]

In [47]:
ids = [keep_frequent(id_list) for id_list in ids]

In [52]:
title_idx = {title: idx for (idx, title) in enumerate(titles)}

In [53]:
def get_ids(title):
    if isinstance(title, (int, long)):
        return ' '.join(ids[title])
    else:
        return ' '.join(ids[title_idx[title]])
    
def print_ids(title):
    print get_ids(title)

In [54]:
print_ids(u'Laplace–Stieltjes transform')

s x d g L ε h F E X t_i l t t_0 t_1 t_n T τ_i π n R F_Y Y λ Y_1 Y_2 Y_n Z U b


In [15]:
sigmas = [idx for (idx, id_list) in enumerate(ids) if u'σ' in id_list]

In [16]:
len(sigmas)

1302

In [17]:
print titles[sigmas[0]]
print_ids(sigmas[0])

Analysis of variance
ε N σ j y_i t_j s n y S D F n_T M


In [18]:
print_ids('Momentum')

p m v p_1 p_2 m_1 v_1 m_2 v_2 r_cm r_1 r_2 v_cm Δ F t d t_1 t_2 u_1 u_2 x u C_R v_x v_y v_z p_x p_y p_z y z L T V q_j p_j H q p_i q_i γ c m_0 τ U_0 V_0 U_1 V_1 U_2 V_2 U_3 V_3 U R P E B μ_0 g S ρ J n_j j ϵ_0 E_i E_j B_i B_j M r ϕ p_k B_l ℏ ψ h λ D σ_zx μ σ f


In [19]:
print_ids('Cauchy stress tensor')

σ n_i T n j σ_11 σ_12 σ_13 σ_21 σ_22 σ_23 σ_31 σ_32 σ_33 σ_x σ_y σ_z m p d S σ_n τ P F b Δ M F_i x t F_n F_s ρ h n_1 n_2 n_3 σ_1 σ_2 σ_3 σ_4 σ_5 σ_6 n_j τ_n n_k V x_j F_k r k n_m K_n λ λ_i λ_1 λ_2 λ_3 y τ_max σ_max σ_min g π u_k x_k u s_12 J_1 J_2 J_3 s_1 s_2 s_3 s σ_e


In [55]:
def id_intersect(title1, title2):
    return set(ids[title_idx[title1]]) & set(ids[title_idx[title2]])

def id_union(title1, title2):
    return set(ids[title_idx[title1]]) | set(ids[title_idx[title2]])

def jaccard(title1, title2):
    return len(id_intersect(title1, title2)) * 1.0 / len(id_union(title1, title2))

In [56]:
print jaccard('Momentum', 'Cauchy stress tensor'), 
print " ".join(id_intersect('Momentum', 'Cauchy stress tensor'))

0.166666666667 Δ n_j λ ρ σ τ F M P S T V d g h j m p r u t y x


In [57]:
print jaccard('Momentum', 'Bayesian linear regression'), 
print " ".join(id_intersect('Momentum', 'Bayesian linear regression'))

print jaccard('Cauchy stress tensor', 'Bayesian linear regression'), 
print " ".join(id_intersect('Cauchy stress tensor', 'Bayesian linear regression'))

0.11320754717 ρ c d m σ p μ_0 T v y x μ
0.132653061224 ρ π σ b d k m n p s T y x


In [23]:
print jaccard('Least squares', 'Bayesian linear regression'),
print " ".join(id_intersect('Least squares', 'Bayesian linear regression'))

0.222222222222 σ k m x_i p β T y_i x y X n


## Feature extraction:

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
identity = lambda x: x
vectorizer = TfidfVectorizer(analyzer=identity)

X = vectorizer.fit_transform(ids)

In [26]:
X.shape

(20873, 3614)

## SVD and LSA

http://www.datascienceassn.org/sites/default/files/users/user1/lsa_presentation_final.pdf

In [422]:
from sklearn.decomposition import TruncatedSVD

In [483]:
lsa = TruncatedSVD(100, algorithm='randomized')
X_red = lsa.fit_transform(X)

In [484]:
X_red.shape

(20873L, 100L)

## Jaccard:

In [24]:
ids_sets = [set(id_list) for id_list in ids]

In [25]:
def calc_jaccard(set1, set2):
    union = len(set1 | set2)
    if not union: 
        return 0.0

    inter = len(set1 & set2)
    return inter * 1.0 / union

In [27]:
N_docs = len(ids_sets)
jaccard_dist = np.zeros((N_docs, N_docs))
jaccard_dist.shape

(20873L, 20873L)

In [28]:
t0 = time()

for i in xrange(N_docs):
    for j in xrange(i + 1, N_docs):
        jaccard_dist[i, j] = jaccard_dist[j, i] = 1 - calc_jaccard(ids_sets[i], ids_sets[j])

print "done in %0.3fs." % (time() - t0)

done in 927.599s.


In [89]:
momentum = jaccard_dist[title_idx['Momentum'], :]

momentum_neighbors = momentum[momentum.argsort()[:10]]
print momentum_neighbors

for i in np.nditer(momentum.argsort()[:10]):
    print titles[i], " ".join(id_intersect('Momentum', titles[i])), '\t', rels[i].get(u'ϕ', 'not found')

[ 0.          0.62962963  0.63207547  0.6407767   0.65909091  0.66666667
  0.66666667  0.68217054  0.68695652  0.7       ]
Momentum j p_j p_k p_i v ℏ v_x v_y v_z U_3 U_2 U_1 U_0 p_z Δ p_x p_y E_j E_i v_1 x V_2 v_2 p c p_2 γ μ_0 r H m_2 m_1 m_0 μ ρ r_cm σ B E τ n_j F ψ J M L C_R q_j V_1 t_2 S R ϕ q_i V V_0 D U λ p_1 B_i B_j t_1 B_l d g f h r_2 V_3 m T q u_2 u_1 ϵ_0 u t r_1 y σ_zx z P v_cm 	not found
Laws of science p_i q_i Δ μ_0 γ ψ m_2 λ μ ρ B E F H J M L P S R U ℏ V m_1 c d g h j m q p r u t v t_2 x t_1 T 	not found
Lorentz force U_3 U_2 U_1 U_0 p_z p_x p_y μ_0 γ r μ ρ σ B E τ F J L P S ϕ T V p_2 p_1 c d f m q p ϵ_0 u t v y x z 	electrostatic potential
Lagrangian p_i ℏ τ q_i μ_0 γ r μ ρ σ B E D F ψ J M L S R ϕ T V p_2 c d g f j m q ϵ_0 t v y x z 	volts
Hamiltonian mechanics p_j p_i q_j q_i p_z p_x p_y γ ρ B E F H L P ϕ T V c d f j m q p r t y x z 	electric scalar potential
List of common physics notations ℏ Δ D x γ μ ρ σ B E τ F H J M L P S R ϕ T V c d g f h j m q p r u t v y U 	not f

In [81]:
doc_name = 'Bayesian linear regression'
neighbors = jaccard_dist[title_idx[doc_name], :]

for i in np.nditer(neighbors.argsort()[:10]):
    print titles[i], " ".join(id_intersect(doc_name, titles[i]))

Bayesian linear regression Λ_n m x_i Γ Λ_0 b_0 Λ b_n ϵ_i y_n μ_0 β y_i μ ρ π σ v_0 χ N T X c b d k μ_n y_1 n p s v y x
Proofs involving ordinary least squares π σ b d χ m N p β T n x y X x_i
Möbius function π c b d k m N p s T x X n μ
Quartic reciprocity π c b d k m σ N p s β y x n μ
List of integrals of exponential functions π c b d k m σ n p Γ y x μ
FKG inequality x c b d b_n μ_n m n p Γ β v y X Λ μ
Normal-gamma distribution π σ b s m μ_0 x_i p Γ β T n x X N μ
Median π σ d k m c N p s x v y X n μ
Optical transfer function π σ d k c N y_n s n T x X x_i μ
Mean σ b d m c y_1 N y_n n y_i y x x_i μ


## Locality sensitive hashing

https://github.com/pixelogik/NearPy

In [574]:
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.distances import CosineDistance

# Dimension of our vector space
dims = 3614

# Create a random binary hash with 10 bits
rbp = RandomBinaryProjections('rbp', 10)

engine = Engine(dims, lshashes=[rbp], distance=CosineDistance())

for (idx, row) in enumerate(X):
    if (idx % 1000 == 0):
        print idx
    engine.store_vector(row.T, idx)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000


In [719]:
import scipy.sparse

def neighborhood_2(self, v):
    candidates = []
    for lshash in self.lshashes:
        for bucket_key in lshash.hash_vector(v, querying=True):
            bucket_content = self.storage.get_bucket(lshash.hash_name, bucket_key)
            #print 'Bucket %s size %d' % (bucket_key, len(bucket_content))
            candidates.extend(bucket_content)

    print 'Candidate count is %d' % len(candidates)

    #if scipy.sparse.issparse(v):
    candidate_vec, indexes = zip(*candidates)
    candidate_matrix = scipy.sparse.hstack(blocks=candidate_vec, format=v.format)

    cos = 1 - cosine_similarity(candidate_matrix.T, v.T)
    cos = cos.reshape(-1)

    candidates = zip(candidate_vec, indexes, cos)
    return candidates

Engine.neighborhood_sparse = neighborhood_2

In [720]:
query = X[50, :].T

candidates = engine.neighborhood_sparse(query)
_, indexes, cos = zip(*candidates)

Candidate count is 15


In [717]:
cand_idx = pd.DataFrame({'index': indexes, 'cosine': cos})
cand_idx.sort('cosine', inplace=True, ascending=False)
idx = cand_idx[cand_idx.cosine > 0].index 

idx

Int64Index([1, 5, 7, 13, 6, 11, 8, 3], dtype='int64')

In [583]:
query = X[10, :].T

N = engine.neighbours(query)
print len(N)

ValueError: setting an array element with a sequence.

## K-means

Link: http://brandonrose.org/clustering

In [425]:
from sklearn.cluster import KMeans, MiniBatchKMeans

In [485]:
num_clusters = 300
km = KMeans(n_clusters=num_clusters, max_iter=1000, random_state=10)

In [486]:
km.fit(X_red)

KMeans(copy_x=True, init='k-means++', max_iter=1000, n_clusters=300,
    n_init=10, n_jobs=1, precompute_distances=True, random_state=10,
    tol=0.0001, verbose=0)

http://stackoverflow.com/questions/27889873/clustering-text-documents-using-scikit-learn-kmeans-in-python

In [472]:
pd.Series(km.labels_).value_counts()

10     4654
194     262
278     239
3       225
22      189
176     185
276     184
51      184
1       183
11      181
152     174
244     169
227     160
111     157
219     151
...
174    12
124    12
293    11
246    10
179    10
94     10
113    10
266     9
162     8
182     8
8       8
292     7
205     5
13      5
241     5
Length: 300, dtype: int64

In [480]:
sigma_clusters = km.labels_[sigmas]
pd.Series(sigma_clusters).value_counts()

244    86
183    74
194    68
272    55
223    53
265    52
93     49
105    42
285    41
22     38
115    28
186    27
73     23
219    22
142    22
...
220    1
77     1
76     1
139    1
282    1
192    1
202    1
264    1
187    1
61     1
60     1
225    1
145    1
150    1
190    1
Length: 148, dtype: int64

In [488]:
for t0 in np.nditer(np.where(sigma_clusters == 244)[0]):
    t = sigmas[int(t0)]
    print titles[t], '\t', get_ids(t) #, '\t', rels[t].get(u'σ', 'not found')

Acoustic theory 	ρ_0 v t p m κ x y z c_0 c φ ρ s b σ μ T λ x_i v_i x_j v_j v_k x_k c_p c_v d γ r θ v_r v_θ v_z ω R Q α k J_α B_α C_α D_α
Arithmetic function 	χ n m p f k d σ_k ω τ p_1 p_2 φ J_k μ Ω q c_q gcd π c_r λ χ_0 x Π ϑ ψ Λ l t ϕ r_k g F_a s ζ R F_b b c j F_c δ J_r J_s σ r_2 r_4 ν r_6 r_8 r_24 b_n b_j c_n σ_3 σ_1 σ_5 σ_7 σ_9 σ_11 h D r r_3 H_n γ ν_p θ k_1 k_2 k_s m_1 m_2 d_2 d_1 u v
List of equations in classical mechanics 	m λ d l σ S ρ V r x_i r_i m_i N r_com M m_1 m_2 μ v t j ω n θ α p F t_1 t_2 Δ r_0 L τ W W_ON W_BY P E q Q T H v_average j_average ζ p_i F_E L_i τ_E Y Ω w E_k E_p r_2 r_1 k F_⊥ R F_c v_0 ω_1 ω_0 s F_app Λ τ_app x ϕ Θ b ω_res γ κ g U U_max
Force 	F m d p t v Δ m_0 c x F_x γ F_y F_z μ g m_⊕ R_⊕ G R r m_1 m_2 E q B l F_sf μ_sf F_N F_kf μ_kf k F_d F_g V P b σ τ α L dt t_1 t_2 W U m_n q_1 q_2 ϵ_0 π q_n
Felix Hausdorff 	ℵ_0 ℵ ℵ_α card T ℵ_1 μ ℵ_μ ω_ξ ω_η ω η W η_α δ σ R n ρ G_δ φ x L p F E
Lorentz transformation 	t m p γ v x c y z β X Λ r_⟂ r_∥ r r_⊥ T β_x β_y β_z v_

Usual k-means with euclidean distances doesn't work fine. Let's try to use cosine 

## Kernel KMneans

https://gist.github.com/mblondel/6230787

In [125]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity, euclidean_distances

In [140]:
def cosine_distance(X, Y=None):
    return 1 - cosine_similarity(X, Y)

In [141]:
kkm = KernelKMeans(n_clusters=10, kernel=cosine_distance, verbose=1)

In [142]:
idx = np.random.choice(range(X.shape[0]), size=2000)
X_sample = X_red[idx, :]
X_sample.shape

(2000L, 50L)

In [143]:
kkm.fit(X_sample)

Computing kernel...
Done computing kernel
iteration  0
iteration  1
iteration  2
iteration  3
iteration  4
iteration  5
iteration  6
iteration  7
iteration  8
iteration  9
iteration  10
iteration  11
iteration  12
iteration  13
iteration  14
iteration  15
iteration  16
iteration  17
iteration  18
iteration  19
iteration  20
iteration  21
iteration  22
iteration  23
iteration  24
iteration  25
iteration  26
iteration  27
iteration  28
iteration  29
iteration  30
iteration  31
iteration  32
iteration  33
iteration  34
iteration  35
iteration  36
iteration  37
iteration  38
iteration  39
iteration  40
iteration  41
iteration  42
iteration  43
iteration  44
iteration  45
iteration  46
iteration  47
iteration  48
iteration  49


KernelKMeans(kernel=<function cosine_distance at 0x0000000022B7B048>,
       max_iter=50, n_clusters=10, random_state=None, tol=0.001, verbose=1)

In [145]:
pd.Series(kkm.labels_).value_counts()

0    2000
dtype: int64

## Non-Negative Matrix Factorization

- http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf.html
- http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html#sklearn.decomposition.NMF

In [146]:
from sklearn.decomposition import NMF

In [147]:
idx = np.random.choice(range(X.shape[0]), size=2000)
X_sample = X[idx, :]
X_sample.shape

(2000, 3614)

In [376]:
t0 = time()

n_topics = 200
nmf = NMF(n_components=n_topics, random_state=1).fit(X_sample)

print "done in %0.3fs." % (time() - t0)

done in 514.840s.


In [377]:
feature_names = vectorizer.get_feature_names()
n_top_words = 10
for topic_idx, topic in enumerate(nmf.components_):
    print "Topic #%d:" % topic_idx, 
    print " ".join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]])

Topic #0: l p_t ε_r H_n t_0 C_A C_d p_θ L_r x_min
Topic #1: P R_t Alg P_X V_λ π_i U_a F_C F_D W_f
Topic #2: n V_1 B_L d_H P_n l_n m_n n_0 U_a U_p
Topic #3: x M_f l_n P_n E_A ℵ_1 M_i m_n D_K M_x
Topic #4: G Σ r_s T_A C_4 T_p G_i U_e C_p O_p
Topic #5: p pc m_i D_3 SL R_* R_⊙ U_a T_b U_p
Topic #6: X M_f B_r P_X π_1 X_t π_i Σ_j R_+ M_*
Topic #7: b AB h_f G_i Ax F_x r_N d_b U_a η_B
Topic #8: π D_3 M_x ×_G π_t R_I R_O x_s Ψ_3 ϵ_r
Topic #9: N n_0 Alg r_N OH C_4 π_i G_2 f_p S_N
Topic #10: T E_A R_t ε_r C_d T_b ρ_A Ax D_T η_A
Topic #11: S W_n N_v z_i B_ν T_b p_t L_0 λ_1 S_0
Topic #12: θ_m G_n H_n ω_o τ_g G_0 f_H G_1 ω_0 f_0
Topic #13: f L_U B_10 M_f θ_m f_t ℵ_0 k_I p_c Ψ_3
Topic #14: k Da R_eq k_i C_d k_g Th λ_d P_e x_d
Topic #15: B H_T AB GL ρ_A b_n k_I k_R U_a k_r
Topic #16: R V_D SL pd_R depth soc ρ_A R_R U_e Ax
Topic #17: r n_0 r_s years r_N ρ_A R_o R_s p_θ B_r
Topic #18: z z_0 d_b ℶ_1 B_r E_1 λ_r W_p d_H H_n
Topic #19: q M_x C_d k_f S_0 C_L _ cm u_* Ψ_3
Topic #20: x_2 x_1 x_N x_d y_2 x_3 x

In [378]:
X_topic = nmf.transform(X)
X_topic.shape

(20873L, 200L)

In [379]:
(X_topic.sum(axis=1) > 0).mean()

0.80884396109806933

In [380]:
X_1 = X_topic
X_1.sort(axis=1)
pd.Series(X_1[:, n_topics - 1]).describe()

count    20873.000000
mean         0.121951
std          0.095763
min          0.000000
25%          0.064970
50%          0.108711
75%          0.171699
max          0.495830
dtype: float64

In [371]:
threshold = 0.05
clustering = []
for doc, row in enumerate(X_topic):
    clus, = np.where(row >= threshold)
    if clus.shape[0] == 0:
        continue
    for c in np.nditer(clus):
        clustering.append((int(c), doc))

In [372]:
clustering = pd.DataFrame(clustering, columns=['cluster', 'document'])

In [373]:
clustering.head()

Unnamed: 0,cluster,document
0,21,1
1,35,1
2,47,1
3,2,2
4,3,2


In [374]:
gb = clustering.groupby('cluster', )
[(k, len(v)) for (k, v) in gb.groups.items()]

[(0, 991),
 (1, 1820),
 (2, 3685),
 (3, 3484),
 (4, 1842),
 (5, 1940),
 (6, 1989),
 (7, 3273),
 (8, 1545),
 (9, 1792),
 (10, 1346),
 (11, 2013),
 (12, 1753),
 (13, 998),
 (14, 2112),
 (15, 2014),
 (16, 1454),
 (17, 896),
 (18, 1459),
 (19, 758),
 (20, 1159),
 (21, 1370),
 (22, 1259),
 (23, 941),
 (24, 893),
 (25, 1193),
 (26, 907),
 (27, 911),
 (28, 884),
 (29, 668),
 (30, 518),
 (31, 1081),
 (32, 740),
 (33, 979),
 (34, 926),
 (35, 638),
 (36, 811),
 (37, 980),
 (38, 876),
 (39, 999),
 (40, 776),
 (41, 730),
 (42, 861),
 (43, 700),
 (44, 882),
 (45, 530),
 (46, 698),
 (47, 757),
 (48, 786),
 (49, 562)]

In [375]:
cluster_0 = gb.groups[0]

for t in cluster_0:
    print titles[t], '\t', rels[t].get(u'σ', 'not found')

Calculus 	not found
Carnot heat engine 	not found
Design of experiments 	not found
Group action 	not found
Laplace transform 	numbers
Physical quantity 	not found
Regular grammar 	not found
Beam diameter 	standard deviation
Waveguide 	not found
Intensity (physics) 	not found
Pythagorean tuning 	not found
Isomorphism theorem 	subgroup
Computer number format 	not found
Newton's laws of motion 	not found
Conjunctive normal form 	not found
Self-organizing map 	not found
A New Kind of Science 	not found
Proofs of Fermat's little theorem 	not found
Hebron, Connecticut 	not found
Spin glass 	not found
Value at risk 	not found
Liquid air cycle engine 	not found
Price discrimination 	not found
Profit maximization 	not found
Latent heat 	not found
Time hierarchy theorem 	not found
Impulse (physics) 	not found
Erythrocyte sedimentation rate 	not found
Subsequence 	not found
Mahlo cardinal 	not found
HSL and HSV 	not found
Ergodic theory 	not found
Curve sketching 	not found
Operational semantics 

IndexError: list index out of range

## Some stuff

In [20]:
dist_ids = 1 - cosine_similarity(X.T)
dist_ids.shape

(3614L, 3614L)

In [14]:
dist_docs = 1 - cosine_similarity(X)
dist_docs.shape

(20873L, 20873L)

## Hierarchical

https://de.dariah.eu/tatom/working_with_text.html

In [35]:
from scipy.cluster.hierarchy import ward, dendrogram

In [None]:
linkage_matrix = ward(dist_docs)