# Mediods Analysis

This experiment has the purpose of analysing the medoid of some of the clusters defined in the previous experiment. According to it, the best number of cluster is 72 regarding the square of the 2-norm distance, and 802 regarding the silhouette score. With 802 clusters, the silhouette score was 0.1, a considerably low value. Therefore, we won't consider it in the next experiments, implying that 72 is the best number of clusters.

In [1]:
import sys
import os
sys.path.append('../')
from src import reader as r
from src import visualization as v

In [2]:
%matplotlib notebook
import numpy as np
import pandas as pd
import sklearn
print(sklearn.__version__)
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import pairwise_distances

0.20.0


In [3]:
X = r.readBOW()
npX = X.values
print(npX)
print(X.head(10))
print(X.shape)

['health.txt', 'bags.csv', 'word2vec.csv']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
   0     1     2     3     4     5     6     7     8     9     ...   1193  \
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
5   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
6   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
7   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
8   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
9   0.0   0.0   0.0   0.0   0.0

In [4]:
news = r.readNews()
print(news.head(10))
print(news.shape)

['health.txt', 'bags.csv', 'word2vec.csv']
                   id                    publish_date  \
0  576880531301801984  Sat Mar 14 23:00:11 +0000 2015   
1  576820122666471424  Sat Mar 14 19:00:08 +0000 2015   
2  576744652717461504  Sat Mar 14 14:00:15 +0000 2015   
3  576736754436304896  Sat Mar 14 13:28:52 +0000 2015   
4  576736614766010368  Sat Mar 14 13:28:18 +0000 2015   
5  576548368740052992  Sat Mar 14 01:00:17 +0000 2015   
6  576518190286536704  Fri Mar 13 23:00:22 +0000 2015   
7  576494177971732480  Fri Mar 13 21:24:57 +0000 2015   
8  576472874946433024  Fri Mar 13 20:00:18 +0000 2015   
9  576464606551490560  Fri Mar 13 19:27:27 +0000 2015   

                                       headline_test  
0  An abundance of online info can turn us into e...  
1  A plant-based diet that incorporates fish may ...  
2  It doesn't take much to damage your hearing at...  
3  RT @CNN: Forever young? Discover this island’s...  
4  RT @CNN: Is post-traumatic stress disorder in ...  

In [5]:
from MulticoreTSNE import MulticoreTSNE as TSNE

smp_sz = X.size
tsne_bow = TSNE(n_components=2, perplexity=10, verbose=True, n_jobs=-1)#500
tsne_bow_result = tsne_bow.fit_transform(X)

## Computing the Medoids
In here, we compute the medoids as the closest point to the centroids of the clusters.

In [6]:
best_K = 72
print("#############################")
print("Best K =", best_K)
print("Applying K-means")
best_cluster = KMeans(n_clusters=best_K, n_jobs=-1)
best_cluster_result = best_cluster.fit(X)
print("Finished")
print("#############################")    

#############################
Best K = 72
Applying K-means
Finished
#############################


In [7]:
y_pred = best_cluster_result.labels_
arrays=[]
for i in range(best_K):
    dist = np.linalg.norm(best_cluster_result.cluster_centers_[i]-npX[y_pred==i],axis=1)
    index = np.argmin(dist)
    arrays.append(npX[y_pred==i][index])   

medoids = np.vstack(arrays)
print(medoids.shape)
print(medoids)

(72, 1203)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [8]:
l = []
for i in range(best_K):
    l.append(np.where(np.all(npX==medoids[i],axis=1))[0][0]) # these last [0][0] return the index, since the output of this command is (array([id]),)
    
medoids_index = np.array(l)
print(medoids_index)
print(len(np.unique(medoids_index)))

[   99  2278  3257  3691  9451  9313  4181   777  2299  3697   569  7291
  9420  5444 12108  6089  5940   851 10224  3043  5875   817  2938  5067
   315  2751  7576  4566  4387  2798  2529  8891  5033 11025  2464  1546
  2944  3725  9979  4131  6852  4107  6063  2971  6390  9599  4130 11178
  3003   616   401  1168 10247   188  7991   962  5812  4897 12881  4016
   681   841  5091  2206  5435  3490 10133  1723  4614  4287  1298  2889]
72


## Selecting medoids neighbors
Here the medoids neighbors are selected in order to plot them and see if the clusters make sense.

In [9]:
nm = 5 #number of medoids to be analyzed
nn = 5 #number of neighbor of each medoids
np.random.seed(42)
selected_medoids_index = np.random.randint(0,61,nm)
selected_medoids = medoids[selected_medoids_index]
print("Random indexes:",selected_medoids_index)
indexes_per_medoid = []

for i in range(nm):
    p = selected_medoids[i]
    aux = npX[y_pred==selected_medoids_index[i]]
    d = np.linalg.norm(aux-p,axis=1)
    points = []
    points.append(medoids_index[selected_medoids_index[i]])
    
    for j in range(nn):
        min_d = d.argmin()
        while (d[min_d] == 0.):
            d[min_d] = np.inf
            min_d = d.argmin()
        points = points + (np.where(np.all(npX==aux[min_d],axis=1))[0]).tolist()
        d[min_d] = np.inf
        
    indexes_per_medoid.append(np.unique(points).tolist())

    
print("Points:",indexes_per_medoid)
print(len(indexes_per_medoid))

Random indexes: [38 51 28 14 42]
Points: [[9979, 10500, 10644, 10657, 10983, 12162], [734, 1168, 6787, 7699, 8094, 9746], [4387, 5871, 9304, 11738, 12027, 12129], [4401, 7138, 7638, 8515, 11321, 12108], [4668, 6063, 6664, 6763, 9070, 9178]]
5


## Analyzing the Medoids
Here we start the analysis of the medoids and their neighbors.

### Exporting the news of the medoids and clusters

In [10]:
print("Random indexes:",selected_medoids_index)
for i in range(best_K):
    np.savetxt("../output/clusters/k_"+str(i)+".txt", news[y_pred==i].values, fmt='%s')
for i in range(selected_medoids_index.size):
    p = indexes_per_medoid[i]
    np.savetxt("../output/medoids/k_"+str(selected_medoids_index[i])+".txt", np.take(news,p,0).values, fmt='%s')

Random indexes: [38 51 28 14 42]


### News of the medoids and their neighbors

In [11]:
for p in indexes_per_medoid:
    info = np.take(news,p,0).loc[:,"headline_test"].values
    print("----------------------------")
    for i in range(info.size):
        print(info[i],"\n")
print("----------------------------")

----------------------------
Ebola flights' UK passengers traced 

VIDEO: UK Ebola planning 'excellent' 

VIDEO: Ebola: UK 'must not be complacent' 

VIDEO: Ebola: 'Stringent procedures' in UK 

VIDEO: How the UK is guarding against Ebola 

The spinal pains of the UK workforce 

----------------------------
RT @jdwilson2: LOVE this new take on our #WeeklyWeighIn series -- She gained weight to love her body 

@PublicHealth We'd love to! 

Give love, get love and love your heart #valentinesday 

How to help someone you love #quitsmoking 

Why you fell in love with your shnookums, or whatever you like to call you call your significant other! 

How much do we love the NHS? 

----------------------------
Doctors’ unconscious bias may not influence their decisions  

Doctors dole out prescriptions for #exercise 

Coalition 'undermined NHS' - doctors 

Mad-doctors and inconvenient people 

The A&amp;E doctors moving to Australia 

Doctors aim to grow ears from fat 

--------------------------

### Generating cloud of words

In [15]:
%matplotlib inline
from wordcloud import WordCloud
import matplotlib
import matplotlib.pyplot as plt

# clusters
for i in range(best_K):
    info = ' '.join(news[y_pred==i].loc[:,"headline_test"])
    wordcloud = WordCloud(random_state=42).generate(info)
    fig = plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")    
    fig.savefig("../output/wordcloud_clusters/k_"+str(i))
    plt.close(fig)

#medoids
for i in range(selected_medoids_index.size):
    p = indexes_per_medoid[i]
    info = ' '.join(np.take(news,p,0).loc[:,"headline_test"])
    wordcloud = WordCloud(random_state=42).generate(info)
    fig = plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")    
    fig.savefig("../output/wordcloud_medoids/k_"+str(selected_medoids_index[i]))
    plt.close(fig)
