In [1]:
%matplotlib inline

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [72]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans2
from scipy.spatial.distance import cdist
import pandas as pd

import datasets
import preprocessing

### Get data

In [5]:
# BBC corpus
bbc = datasets.get_bbc()

In [6]:
# bbc is a list of strings
type(bbc)

list

In [7]:
type(bbc[0])

str

In [8]:
len(bbc)

2225

In [9]:
# first 100 chars of the 1st doc
bbc[0][:100]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital'

In [10]:
# pre-process and vectorize
processor = preprocessing.NLPProcessor('tf-idf')
bbc_vectorized = processor.fit_transform(bbc)

In [11]:
# sparse matrix
type(bbc_vectorized)

scipy.sparse.csr.csr_matrix

In [12]:
# 2,225 docs with 29,421 sparse features
bbc_vectorized.shape

(2225, 29421)

### Similarity metric

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# cosine similarity between 1st and 2nd doc
cosine_similarity(bbc_vectorized[0], bbc_vectorized[1])

array([[0.09372963]])

In [15]:
# cosine similarity of all 2,225 docs
# in a matrix of 2,225 x 2,225
cosine_similarity(bbc_vectorized)

array([[1.        , 0.09372963, 0.16603523, ..., 0.14290773, 0.23992781,
        0.18738048],
       [0.09372963, 1.        , 0.08616861, ..., 0.04780253, 0.09995931,
        0.08760382],
       [0.16603523, 0.08616861, 1.        , ..., 0.09247675, 0.17008746,
        0.1621905 ],
       ...,
       [0.14290773, 0.04780253, 0.09247675, ..., 1.        , 0.11614638,
        0.08908685],
       [0.23992781, 0.09995931, 0.17008746, ..., 0.11614638, 1.        ,
        0.15640773],
       [0.18738048, 0.08760382, 0.1621905 , ..., 0.08908685, 0.15640773,
        1.        ]])

In [16]:
# ...

### Clustering

In [17]:
from sklearn.cluster import KMeans

In [18]:
k = KMeans(n_clusters=5)

In [19]:
# this will take a while
# k.fit(bbc_vectorized)

In [20]:
# take only the top 1,000 most occurring tokens in the corpus 
p = preprocessing.NLPProcessor(max_features=1000)
bbc_vectorized_features_bound = p.fit_transform(bbc)

In [21]:
bbc_vectorized_features_bound.shape

(2225, 1000)

In [22]:
%%time
k.fit(bbc_vectorized_features_bound)

CPU times: user 1min 12s, sys: 605 ms, total: 1min 12s
Wall time: 24.5 s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [23]:
k.predict(bbc_vectorized_features_bound)

array([4, 3, 2, ..., 1, 4, 2], dtype=int32)

In [24]:
# ...

### Interactive

In [25]:
import interactive

In [26]:
k = 5  # start with assuming there are 10 clusters
options = (1.1, 25, 0.01, 0)
userU = -1
docs = 1000

In [27]:
p = preprocessing.NLPProcessor(max_features=1000)
bbc_vectorized_features_bound = p.fit_transform(bbc)

In [28]:
data = bbc_vectorized_features_bound[:docs].todense()
terms = np.array(p.vec.get_feature_names()).reshape((1, 1000))

In [29]:
# run the algo with no user input
x = interactive.icluster(data, terms, [], k, userU=-1)
clusterDocs, clusterKeyterms, keyterms, silhouette_avg = x

In [30]:
# sample docs in clusters 0 to k-1
[x[:5] for x in clusterDocs]

[[5, 10, 11, 27, 30],
 [12, 13, 16, 19, 35],
 [1, 3, 17, 18, 20],
 [3, 4, 8, 9, 15],
 [2, 6, 7, 14, 17]]

In [31]:
# there seem to be some legitimate clusters forming
# some overlap
# and some catchall cluster for all common words (need stop word removal?)
[x[:5] for x in clusterKeyterms]

[['film', 'actor', 'her', 'actress', 'award'],
 ['growth', 'economy', 'analysts', 'shares', 'its'],
 ['digital', 'music', 'technology', 'video', 'work'],
 ['side', 'game', 'cup', 'coach', 'match'],
 ['election', 'party', 'labour', 'tory', 'mr']]

#### Get user input
Specify number of clusters and top terms

In [32]:
user_input = [x[:5] for x in clusterKeyterms]
user_input

[['film', 'actor', 'her', 'actress', 'award'],
 ['growth', 'economy', 'analysts', 'shares', 'its'],
 ['digital', 'music', 'technology', 'video', 'work'],
 ['side', 'game', 'cup', 'coach', 'match'],
 ['election', 'party', 'labour', 'tory', 'mr']]

In [33]:
# looks like there are 5 clusters at a first glance
user_input = [
    ['film', 'tv', 'music', 'oscar'],
    ['growth', 'economy', 'stock', 'investor'],
    ['technology', 'web', 'software'],
    ['olympic', 'game', 'cup', 'coach', 'game', 'club'],
    ['law', 'government', 'election', 'blair'],
]

In [34]:
# run it with user input
x = interactive.icluster(data, terms, user_input, len(user_input), userU=+1)
clusterDocs, clusterKeyterms, keyterms, silhouette_avg = x



In [35]:
# seem to be getting better clusters
[x[:5] for x in clusterKeyterms]

[['film', 'award', 'her', 'awards', 'actor'],
 ['growth', 'economy', 'analysts', 'shares', 'oil'],
 ['users', 'microsoft', 'technology', 'digital', 'video'],
 ['side', 'game', 'cup', 'coach', 'win'],
 ['mr', 'party', 'election', 'labour', 'government']]

In [36]:
len(bbc)

2225

In [37]:
sum([len(x) for x in clusterDocs])

1109

In [38]:
[x[:5] for x in clusterDocs]

[[5, 10, 11, 27, 30],
 [12, 13, 16, 19, 35],
 [1, 20, 21, 22, 23],
 [3, 4, 8, 9, 15],
 [1, 2, 6, 7, 14]]

In [39]:
clusterDocs[-1][-1]

994

In [40]:
for k in range(len(clusterDocs)):
    key = ','.join(clusterKeyterms[k][:5])
    sample_doc = bbc[clusterDocs[k][0]][:200]
    print(f'Cluster {k + 1}')
    print(f'Key terms  : {key}')
    print(f'Sample doc : {sample_doc}')
    print('=' * 10)

Cluster 1
Key terms  : film,award,her,awards,actor
Sample doc : howard hits back at mongrel jibe michael howard has said a claim by peter hain that the tory leader is acting like an  attack mongrel  shows labour is  rattled  by the opposition.  in an upbeat speech
Cluster 2
Key terms  : growth,economy,analysts,shares,oil
Sample doc : crude oil prices back above $50 cold weather across parts of the united states and much of europe has pushed us crude oil prices above $50 a barrel for the first time in almost three months.  freezing
Cluster 3
Key terms  : users,microsoft,technology,digital,video
Sample doc : worldcom boss  left books alone  former worldcom boss bernie ebbers  who is accused of overseeing an $11bn (£5.8bn) fraud  never made accounting decisions  a witness has told jurors.  david myers made
Cluster 4
Key terms  : side,game,cup,coach,win
Sample doc : yeading face newcastle in fa cup premiership side newcastle united face a trip to ryman premier league leaders yeading in the

### t-SNE viz

In [41]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from collections import defaultdict

In [42]:
# tsne = TSNE(n_components=2)
# pca = 

In [43]:
# %%time
# t = tsne.fit_transform(data)

In [44]:
pca = PCA(n_components=2).fit_transform(data)

In [45]:
cluster_data = defaultdict(list)
for i, docs in enumerate(clusterDocs):
    for doc in docs:
        cluster_data[i].append(pca[doc - 1])

In [46]:
color_alpha = 0.5
colors_base = [
    'rgba(47,126,216,{alpha})',
    'rgba(13,35,58,{alpha})',
    'rgba(139,188,33,{alpha})',
    'rgb(145,0,0,{alpha})',
    'rgba(26,173,206,{alpha})',
    'rgba(73,41,112,{alpha})',
    'rgba(242,143,67,{alpha})',
    'rgba(119,161,229,{alpha})',
    'rgba(196,37,37,{alpha})',
    'rgba(166,201,106,{alpha})'
]
colors = [x.format(alpha=0.5) for x in colors_base]

In [47]:
list(pca[0])

[-0.13420995604180042, -0.11349501273343268]

In [48]:
bbc[0][:100]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital'

In [49]:
[
        {
            'name': f'Cluster {k+1}',
            'color': colors[i],
            'data': cluster_data[i]
        }
        for i in range(k)
    ]

[{'name': 'Cluster 5',
  'color': 'rgba(47,126,216,0.5)',
  'data': [array([-0.04793498,  0.20335746]),
   array([-0.00076205,  0.06126828]),
   array([-0.0047744 ,  0.25279978]),
   array([0.18021007, 0.24146084]),
   array([0.0675272 , 0.23470283]),
   array([0.03294537, 0.08701081]),
   array([0.23106822, 0.19145731]),
   array([-0.06384574,  0.03542689]),
   array([0.06732423, 0.35935586]),
   array([0.0374498, 0.0010603]),
   array([0.11203538, 0.06359329]),
   array([0.07125666, 0.07220353]),
   array([0.1283694 , 0.06444156]),
   array([0.01881467, 0.30856319]),
   array([0.01655831, 0.17646431]),
   array([-0.10974802,  0.17023852]),
   array([0.01236621, 0.11687246]),
   array([0.09641634, 0.30689497]),
   array([0.07200972, 0.2867238 ]),
   array([-0.00612549,  0.09031556]),
   array([0.05944463, 0.30273381]),
   array([0.17640653, 0.28750069]),
   array([ 0.10367877, -0.03771667]),
   array([-0.07097861,  0.12623696]),
   array([0.16987637, 0.24632861]),
   array([ 0.2884232

In [50]:
color = ['red', 'blue', 'green', 'purple', 'yellow'] * 2

In [51]:
color[0]

'red'

In [52]:
plt.scatter(t[:, 0], t[:, 1], color=[color[int(c)] for c in assignments])

NameError: name 't' is not defined

In [None]:
plt.scatter(p[:, 0], p[:, 1], color=[color[int(c)] for c in assignments])

### Birch

In [58]:
from sklearn.cluster import Birch, SpectralClustering, AffinityPropagation, DBSCAN
from sklearn.metrics.pairwise import euclidean_distances

In [53]:
bbc_vectorized_features_bound

<2225x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 259748 stored elements in Compressed Sparse Row format>

In [56]:
b = Birch(n_clusters=3)
b.fit(bbc_vectorized_features_bound)

Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=3,
   threshold=0.5)

In [63]:
b.subcluster_labels_.shape

(1675,)

In [120]:
from functools import reduce

In [122]:
from operator import add

In [123]:
reduce(add, [Counter([1,2,3]), Counter([2,3])])

Counter({1: 1, 2: 2, 3: 2})

In [133]:
set(frozenset([1,2,3])).add({1, 2})

TypeError: unhashable type: 'set'

In [127]:
xx = [[1,2,3], [4,5,6]]

for x in xx:
    for y in x:
        print(y)

1
2
3
4
5
6


In [128]:
import string

In [129]:
"said people music said. government film year best like just".strip(string.punctuation)

'said people music said. government film year best like just'

In [65]:
b.transform(bbc_vectorized_features_bound).shape

(2225, 1675)

In [116]:
from collections import Counter

In [125]:
cc = Counter('thi sis it'.split()) + Counter('this is not it'.split())
cc.most_common(10)

[('it', 2), ('thi', 1), ('sis', 1), ('this', 1), ('is', 1), ('not', 1)]

In [108]:
bbc_vectorized_features_bound

<2225x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 259748 stored elements in Compressed Sparse Row format>

In [107]:
a = [[] for _ in range(10)]
a[0].append(1)
a

[[1], [], [], [], [], [], [], [], [], []]

In [57]:
b.subcluster_centers_

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.1918507 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.15039556, 0.0484423 ,
        0.        ],
       [0.06550933, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [104]:
pd.Series(b.labels_).value_counts()

0    1536
1     507
2     182
dtype: int64

### DBSCAN

In [77]:
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.preprocessing import StandardScaler


In [78]:
pca = IncrementalPCA(n_components=2).fit_transform(data)
pca = StandardScaler().fit_transform(pca)

In [79]:
db = DBSCAN(eps=.2)
db.fit(pca)

DBSCAN(algorithm='auto', eps=0.2, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=5, n_jobs=None, p=None)

In [80]:
pd.Series(db.labels_).value_counts()

 0    895
-1     68
 2     16
 1     12
 3      9
dtype: int64

In [82]:
db.core_sample_indices_.shape

(882,)

In [84]:
db.components_.shape

(882, 2)

In [None]:
db.

### Affinity

In [86]:
a = AffinityPropagation()
a.fit(pca)

AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True,
          damping=0.5, max_iter=200, preference=None, verbose=False)

In [88]:
a.cluster_centers_.shape

(29, 2)

In [89]:
a.labels_

array([ 3, 17, 22,  9, 18,  5, 23,  9, 28,  0, 18,  2, 27,  5, 14, 27, 23,
       16, 27, 25,  3,  3, 17,  6,  3,  0, 19, 21, 23, 19, 26,  4,  0,  0,
       15, 17, 27, 23, 23, 24, 15,  1, 27, 28, 25, 23, 17,  8, 26,  2, 14,
       23, 27, 27, 22,  3, 23, 10, 14,  4,  4,  9, 20, 28, 27, 25, 25, 13,
       25, 15, 26,  2, 15,  6,  3, 26, 27, 26,  9,  3, 20,  6, 15,  2, 11,
        5,  3, 20, 26, 13, 28,  9, 17, 28, 15,  7,  2, 14, 27,  6,  2, 28,
        4, 26,  8, 15,  6, 15,  0, 21, 24, 26, 28,  0, 14,  6, 11, 23,  6,
        6, 12, 17, 25, 21,  2, 27,  4, 10, 21, 28,  4,  2,  1,  2, 25,  4,
        0, 26,  1,  3,  4,  4, 23,  9, 23,  2,  6, 11, 14,  4, 26, 27, 16,
       24,  3, 26, 10,  2,  4,  3, 11,  5, 21,  0, 16, 24, 17, 17,  5, 11,
        5,  9,  2, 21, 23, 24, 19,  7, 25, 26, 10, 26, 10, 25, 12,  6, 25,
       28, 16, 19, 17,  4,  8, 25, 15,  1, 13, 21, 13, 15, 11, 24, 10,  3,
       11, 15,  8,  5, 15, 22,  6,  3, 19, 12,  6, 28,  0,  6,  3,  3, 11,
       13,  7,  8, 20, 13

In [91]:
a.cluster_centers_indices_

array([ 33,  41,  49,  74, 126, 161, 214, 249, 259, 306, 308, 312, 314,
       325, 334, 402, 552, 585, 586, 614, 649, 663, 684, 700, 806, 836,
       867, 973, 982])

In [96]:
bbc[74]

'call to save manufacturing jobs the trades union congress (tuc) is calling on the government to stem job losses in manufacturing firms by reviewing the help it gives companies.  the tuc said in its submission before the budget that action is needed because of 105 000 jobs lost from the sector over the last year. it calls for better pensions  child care provision and decent wages. the 36-page submission also urges the government to examine support other european countries provide to industry. tuc general secretary brendan barber called for  a commitment to policies that will make a real difference to the lives of working people.    greater investment in childcare strategies and the people delivering that childcare will increases the options available to working parents   he said.  a commitment to our public services and manufacturing sector ensures that we can continue to compete on a global level and deliver the frontline services that this country needs.  he also called for  practica

In [97]:
s = SpectralClustering(n_clusters=3)

In [98]:
s.fit(bbc_vectorized)

SpectralClustering(affinity='rbf', assign_labels='kmeans', coef0=1, degree=3,
          eigen_solver=None, eigen_tol=0.0, gamma=1.0, kernel_params=None,
          n_clusters=3, n_init=10, n_jobs=None, n_neighbors=10,
          random_state=None)

In [100]:
s.labels_

array([1, 0, 0, ..., 0, 2, 0], dtype=int32)

In [102]:
s.affinity_matrix_.shape

(2225, 2225)