In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math
from collections import Counter, defaultdict, namedtuple
%matplotlib inline

In [2]:
np.random.seed(1)

In [3]:
import sys
import os
sys.path.append("/Users/aleverentz/ucsd-classes/research/hier-topic-models/")

In [4]:
from load_data import load_data, load_vocab
data = load_data("/Users/aleverentz/Downloads/ap/ap.dat")
vocab = load_vocab("/Users/aleverentz/Downloads/ap/vocab.txt")

Reading lines: 100%|██████████| 2246/2246 [00:00<00:00, 3657.11it/s]
Filling matrix: 100%|██████████| 2246/2246 [00:02<00:00, 971.83it/s]
Loading vocabulary: 100%|██████████| 10473/10473 [00:00<00:00, 1779860.04it/s]


In [5]:
data.shape, len(vocab)

((2246, 10473), 10473)

In [6]:
from scipy.sparse import dok_matrix, csr_matrix, isspmatrix_csr

In [7]:
from collections import defaultdict

In [8]:
from tqdm import tqdm

In [9]:
def make_q(doc_term_matrix):
    assert isspmatrix_csr(doc_term_matrix), "Must provide sparse-CSR matrix"
    D, V = doc_term_matrix.shape
    result = dok_matrix((V, V), dtype='float')
    total_by_term = defaultdict(float)
    # Iterate over rows (documents)
    for doc in tqdm(doc_term_matrix, total=D, desc="Processing documents"):
        for i, count_i in zip(doc.indices, doc.data):
            for j, count_j in zip(doc.indices, doc.data):
                assert count_i > 0 and count_j > 0
                if i == j:
                    to_add = count_i * (count_i - 1)
                else:
                    to_add = count_i * count_j
                result[i, j] += to_add
                total_by_term[i] += to_add
    # TODO: convert to result CSR format (before/after normalizing?)
    for i, j in tqdm(result.keys(), total=result.nnz, desc="Normalizing"):
        result[i, j] /= total_by_term[i]
    return result

In [10]:
%%time
q = make_q(data[:,:])
print(q.shape, q.nnz)

Processing documents: 100%|██████████| 2246/2246 [1:25:45<00:00,  2.29s/it]
Normalizing: 100%|██████████| 21226142/21226142 [09:05<00:00, 38894.66it/s]

(10473, 10473) 21226142
CPU times: user 38min 54s, sys: 13.8 s, total: 39min 8s
Wall time: 1h 34min 51s





In [11]:
%%time
row_sums = np.asarray(q.sum(axis=1)).squeeze()
print(row_sums.min(), row_sums.max())

1.0 1.0
CPU times: user 16.3 s, sys: 106 ms, total: 16.4 s
Wall time: 16.5 s


In [12]:
q.nnz / np.product(q.shape)

0.19352133806464586

In [13]:
row_sums.shape

(10473,)

In [None]:
def anchor_words(q_normalized, k):
    assert k < q_normalized.shape[0]
    def extend_set(current):
        if current is None:
            norms = np.linalg.norm(q_normalized, ord=2, axis=1)
            ii = np.argmax(norms)
            new_row = np.asarray(q_normalized[ii].todense())
            return new_row.reshape((1, -1))
        else:
            pass
    result = None
    for i in tqdm(range(k)):
        result = extend_set(result)
    return result

In [None]:
anchor_words(q, k=2)

  0%|          | 0/2 [00:00<?, ?it/s]