In [26]:
import sys
sys.path.append('/home/mquezada/phd/multimedia-summarization/')

import numpy as np
from scipy.sparse import lil_matrix
from tqdm import trange
import json
from docopt import docopt
from sklearn.cluster import AgglomerativeClustering, KMeans
from sqlalchemy.orm import sessionmaker

from clustering.clustering_online import OnlineClustering
from db.engines import engine_of215 as engine
from db.models_new import EventGroup, Cluster, DocumentCluster
from document_generation.documents import get_representatives
from document_representation.get_vectors import *
from nlp.filter_tweets import filter_tweets
from document_representation import discourse_vectors
from document_representation import generate_vectors_fasttext
import summaries_local


logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s | %(name)s | %(levelname)s : %(message)s', level=logging.INFO)

Session = sessionmaker(engine, autocommit=True)
session = Session()

event_name = 'libya_hotel'

from db.events import get_documents_from_event2
d = get_documents_from_event2(event_name, session)


tau = 0.99
a = 0.001
docs, doc_objs = get_fasttext_vectors(event_name, session)

2017-11-15 16:17:12,364 | root | INFO : loading documents from DB
2017-11-15 16:17:12,551 | root | INFO : loading data from data/fasttext_vectors_event_libya_hotel.npy
2017-11-15 16:17:12,570 | root | INFO : done loading documents


In [27]:
n = docs.shape[0]

d = docs @ docs.T
# docs * docs es element-wise product
norm = np.sqrt((docs * docs).sum(1, keepdims=True))

# cos(vi, vj) = ((vi * vj) / ||vi||) / ||vj||
s = (d / norm) / norm.T
np.fill_diagonal(s, tau)

del d
del norm

one = np.ones((n, 1), dtype='bool')
i = 0

c = lil_matrix((n, n), dtype='bool')
c[0, 0] = True

for j in trange(1, n):
    # elem j belongs to cluster i + 1
    c[i + 1, j] = 1

    # t := 1 / (# of elems in each cluster)
    t = 1 / c.dot(one)

    # inf values removed
    t[t > 1] = 0

    # sum of similarities of elem j to elems in each cluster
    v = c.dot(s[:, j])

    v = v * t.T

    v[v < tau] = 0

    k = np.argmax(v)

    c[i + 1, j] = 0
    c[k, j] = 1
    i += 1

labels_ = np.zeros(n, dtype=np.uint32)
c = c.tocoo()

for i, j, _ in zip(c.row, c.col, c.data):
    labels_[j] = i

n_clusters = int(max(labels_)) + 1

100%|██████████| 2857/2857 [00:05<00:00, 494.48it/s]


In [36]:
v1, v2 = 10, 20

docs[v1].dot(docs[v2]) / (np.sqrt(sum(docs[v1] ** 2)) * np.sqrt(sum(docs[v2] ** 2)))

0.97720288359336083

In [15]:
np.sqrt(sum(docs[0] ** 2))

1.896994420428693

In [29]:
n_clusters

3