In [1]:
import numpy as np
import random
import plotly.offline as py
import plotly.graph_objs as go
from sklearn.datasets import fetch_20newsgroups
# Seed both NumPy's and the stdlib's global RNGs so repeated runs are comparable.
np.random.seed(42)
random.seed(42)
# Enable inline plotly rendering for the notebook.
py.init_notebook_mode(connected=True)
# Eight newsgroups: four "comp.*" groups followed by four "rec.*" groups.
categories = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

# Fetch every post (subset='all') from the selected groups, shuffled with a fixed seed.
newsgroups_data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

# Show the category names as stored on the loaded dataset.
print(newsgroups_data.target_names)

['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']


In [2]:
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize(text):
    """Split ``text`` into word tokens and keep only the alphanumeric ones.

    The text is tokenized with NLTK's ``word_tokenize``; any token for which
    ``str.isalnum()`` is false (punctuation, hyphenated words, ...) is
    discarded. Surviving tokens have leading/trailing punctuation stripped.

    Returns:
        list of str: the retained tokens, in their original order.
    """
    kept = []
    for word in nltk.tokenize.word_tokenize(text):
        if not word.isalnum():
            continue
        # A token that passed isalnum() carries no punctuation, so this strip
        # leaves it unchanged; it is kept to mirror the established pipeline.
        kept.append(word.strip(string.punctuation))
    return kept

# Build the TF-IDF matrix using the custom tokenizer above, English stop-word
# removal, and min_df=3 (terms found in fewer than 3 documents are dropped).
tfidfVectorizer = TfidfVectorizer(min_df=3, stop_words='english', tokenizer = tokenize)
tfidfVectors = tfidfVectorizer.fit_transform(newsgroups_data.data)
# shape[0] is reported as the document count and shape[1] as the vocabulary size.
print('When min_df=3, we get %d documents with %d terms.' % (tfidfVectors.shape[0], tfidfVectors.shape[1]))

When min_df=3, we get 7882 documents with 22553 terms.


In [3]:
from sklearn.cluster import KMeans
from sklearn import metrics

# Collapse the eight category targets into two classes: targets 0-3 map to 0
# and targets 4-7 map to 1 (presumably "comp.*" vs "rec.*", given the category
# order printed above -- confirm target indices follow target_names order).
vfunc = np.vectorize(lambda t : int(t / 4))
labels = vfunc(newsgroups_data.target)

# Baseline: 2-cluster K-means fitted directly on the full TF-IDF matrix.
# NOTE: `km` is re-fitted in later cells, so km.labels_ always reflects the
# most recent fit.
true_k = 2
km = KMeans(n_clusters = true_k, random_state=0,max_iter=1000,n_init=30)
km.fit(tfidfVectors)

def contingency_table(true_labels, pre_labels):
    """Count how many samples of each true class fall into each predicted cluster.

    Args:
        true_labels: 1-D sequence of ground-truth class labels.
        pre_labels: 1-D sequence of predicted cluster labels, same length as
            ``true_labels``.

    Returns:
        numpy.ndarray of floats with shape ``(n_true_classes, n_clusters)``.
        Row ``i`` corresponds to the i-th sorted unique true label and column
        ``j`` to the j-th sorted unique predicted label; entry ``[i, j]`` is
        the number of samples with that (true, predicted) pair. When the
        predicted labels are exactly ``0..k-1`` the column index equals the
        cluster id.

    Raises:
        ValueError: if the two label sequences differ in length.
    """
    true_arr = np.asarray(true_labels).ravel()
    pre_arr = np.asarray(pre_labels).ravel()
    if true_arr.shape[0] != pre_arr.shape[0]:
        raise ValueError(
            "true_labels and pre_labels must have the same length "
            "(got %d and %d)" % (true_arr.shape[0], pre_arr.shape[0])
        )

    # Map every label to its rank among the sorted unique labels so the table
    # is sized by what is actually present: the number of true classes and
    # the number of clusters are allowed to differ.
    uniq_true, true_idx = np.unique(true_arr, return_inverse=True)
    uniq_pre, pre_idx = np.unique(pre_arr, return_inverse=True)

    A = np.zeros(shape=(len(uniq_true), len(uniq_pre)))
    # np.add.at accumulates correctly when the same (row, col) pair repeats.
    np.add.at(A, (true_idx, pre_idx), 1)
    return A

import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
%matplotlib inline

def evaluate(labels, km_labels_):
    """Print clustering-quality scores and the contingency table.

    Reports homogeneity, completeness, V-measure, adjusted Rand index and
    adjusted mutual information of ``km_labels_`` against ``labels``, followed
    by the contingency table (true classes as rows, clusters as columns).

    Args:
        labels: ground-truth class label per sample.
        km_labels_: predicted cluster label per sample.

    Returns:
        None. With more than two true classes the table is drawn as a heat
        map; otherwise it is printed as an integer matrix.
    """
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km_labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km_labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km_labels_))
    print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km_labels_))
    print("Adjusted Mutual Information Score: %.3f" % metrics.adjusted_mutual_info_score(labels, km_labels_))
    print("Contingency Table: ")
    A = contingency_table(labels, km_labels_).astype(int)
    if len(np.unique(labels)) > 2:
        # Draw the heat map without printing matshow's return value (printing
        # it only emits an object repr). The colormap is given by name so this
        # does not depend on matplotlib.cm.get_cmap, which newer matplotlib
        # releases no longer provide.
        plt.matshow(A, cmap='Blues')
    else:
        print(A)


In [4]:
# Score the most recent K-means fit (on the full TF-IDF matrix when cells are
# run in order) against the two-class ground truth.
evaluate(labels, km.labels_)

Homogeneity: 0.251
Completeness: 0.332
V-measure: 0.286
Adjusted Rand-Index: 0.180
Adjusted Mutual Information Score: 0.251
Contingency Table: 
[[3897    6]
 [2264 1715]]



The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.



In [5]:
def plot_scores(x, scores, title):
    """Plot the five clustering scores against ``x`` on a log-scaled x axis.

    Args:
        x: x-axis values shared by every curve (the axis is titled 'R').
        scores: indexable of five sequences, ordered as homogeneity,
            completeness, V-measure, adjusted Rand, adjusted mutual info.
        title: figure title.

    Returns:
        None; the figure is rendered inline through plotly.
    """
    metric_names = ['Homogeneity', 'Completeness', 'V-measure', 'Adjusted Rand', 'Adjusted Mutual Info']
    fig_width, fig_height = 800, 600

    # One line trace per metric, in the fixed order given by metric_names.
    traces = [
        go.Scatter(x=x,
                   y=scores[idx],
                   mode='lines',
                   line={'width': 2},
                   name=metric_names[idx])
        for idx in range(5)
    ]

    x_axis = {'title': 'R', 'type': 'log', 'ticks': 'outside', 'mirror': True, 'linewidth': 1}
    # Every score is displayed on a fixed [0, 1] y range.
    y_axis = {'title': 'Scores', 'ticks': 'outside', 'mirror': True, 'linewidth': 1, 'range': [0, 1]}
    legend_box = {'x': .7, 'y': .9, 'bordercolor': '#D3D3D3', 'borderwidth': 1}

    layout = go.Layout(title=title,
                       autosize=False,
                       width=fig_width,
                       height=fig_height,
                       xaxis=x_axis,
                       yaxis=y_axis,
                       legend=legend_box)

    py.iplot(go.Figure(data=traces, layout=layout))

In [6]:
from sklearn.decomposition import TruncatedSVD

# Reduce dimension to 1000
svd = TruncatedSVD(n_components = 1000, n_iter = 7, random_state = 42)
lsiVectors = svd.fit_transform(tfidfVectors)

# trace(X^T X) equals the sum of the squared entries of X. Computing it
# element-wise keeps the work on the sparse matrix itself: np.dot is not
# sparse-aware, and the explicit product would build a terms-by-terms Gram
# matrix only to read its diagonal.
total_var = tfidfVectors.multiply(tfidfVectors).sum()

# Diagonal of lsiVectors^T lsiVectors, i.e. the per-component sum of squares.
dim1000_diag = np.sum(lsiVectors * lsiVectors, axis=0)

# Running total over the leading components: var_retained[i] is the sum of
# dim1000_diag[0..i]. Kept as a plain list, one entry per SVD component.
var_retained = list(np.cumsum(dim1000_diag))

In [10]:
# Sweep the number of leading LSI components (1,2,3,5,10,20,50,100,300) and
# record the five clustering scores of K-means at each setting.
r = [1,2,3,5,10,20,50,100,300]
# Row order of svd_scores: homogeneity, completeness, V-measure,
# adjusted Rand, adjusted mutual information.
svd_metric_fns = (
    metrics.homogeneity_score,
    metrics.completeness_score,
    metrics.v_measure_score,
    metrics.adjusted_rand_score,
    metrics.adjusted_mutual_info_score,
)
svd_scores = [[],[],[],[],[]]

for n_dims in r:
    # Cluster on the first n_dims columns of the 1000-component projection.
    km.fit(lsiVectors[:, :n_dims])
    for score_row, score_fn in zip(svd_scores, svd_metric_fns):
        score_row.append(score_fn(labels, km.labels_))

plot_scores(r, svd_scores, 'SVD')


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use avera

In [66]:
# NMF in specified r (1,2,3,5,10,20,50,100,300) 
import pickle
from sklearn.decomposition import NMF
r = [1,2,3,5,10,20,50,100,300]
# Row order of nmf_scores: homogeneity, completeness, V-measure,
# adjusted Rand, adjusted mutual information.
nmf_metric_fns = (
    metrics.homogeneity_score,
    metrics.completeness_score,
    metrics.v_measure_score,
    metrics.adjusted_rand_score,
    metrics.adjusted_mutual_info_score,
)
nmf_scores = [[],[],[],[],[]]
for n_topics in r:
    # A fresh NMF factorisation is fitted for every rank before clustering.
    nmf = NMF(n_components = n_topics, init='random', random_state=0)
    nmfVectors = nmf.fit_transform(tfidfVectors)
    km.fit(nmfVectors)
    for score_row, score_fn in zip(nmf_scores, nmf_metric_fns):
        score_row.append(score_fn(labels, km.labels_))

# The sweep is slow; nmf_scores can be cached with pickle and reloaded on
# later runs (open the file in binary mode, 'wb' / 'rb').
plot_scores(r, nmf_scores, 'NMF')


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.


The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use avera

In [60]:
def plot_clusters(X, k_means, title='Title',show_center = False):
    """Scatter-plot the first two columns of ``X`` coloured by K-means cluster.

    Args:
        X: 2-D array of samples; only columns 0 and 1 are drawn.
        k_means: fitted estimator exposing ``labels_`` and ``cluster_centers_``.
        title: figure title.
        show_center: when true, also draw each cluster centre (its first two
            coordinates) as a larger, black-outlined marker.

    Returns:
        None; the figure is rendered inline through plotly.
    """
    palette = ['#92A8D1', '#F7CAC9', '#F7CAC9', '#F7CAC9', '#F7CAC9',
               '#F7CAC9', '#e6194b', '#3cb44b', '#3cb44b', '#3cb44b',
               '#3cb44b', '#ffe119', '#ffe119', '#ffe119', '#ffe119',
               '#f58231', '#911eb4', '#911eb4', '#911eb4', '#911eb4']

    hidden_axis = dict(showticklabels=False, ticks='', zeroline=False, showgrid=False)
    traces = []

    for k, center in enumerate(k_means.cluster_centers_):
        # Boolean mask selecting the samples assigned to cluster k.
        in_cluster = k_means.labels_ == k
        points = go.Scatter(x=X[in_cluster, 0], y=X[in_cluster, 1],
                            showlegend=False,
                            mode='markers',
                            marker=dict(color=palette[k], size=4),
                            name='cluster ' + str(k))
        center_marker = go.Scatter(x=[center[0]], y=[center[1]],
                                   showlegend=False,
                                   mode='markers',
                                   marker=dict(color=palette[k], size=14,
                                               line=dict(color='black', width=1)))
        traces.append(points)
        if show_center:
            traces.append(center_marker)

    layout = go.Layout(title=title,
                       autosize=False,
                       width=600,
                       height=600,
                       xaxis=dict(hidden_axis),
                       yaxis=dict(hidden_axis))

    py.iplot(go.Figure(data=traces, layout=layout))

In [64]:
def plot_clusters2(X, labels, title='Title',show_center = False):
    """Scatter-plot the first two columns of ``X`` coloured by a label vector.

    Unlike ``plot_clusters`` this takes the labels directly (e.g. ground
    truth) rather than a fitted estimator. One trace is drawn for every
    distinct value found in ``labels``, so any number of classes is supported.

    Args:
        X: 2-D array of samples; only columns 0 and 1 are drawn.
        labels: 1-D sequence of integer labels, one per row of ``X``.
        title: figure title.
        show_center: accepted for signature parity with ``plot_clusters``;
            not used, since no cluster centres are available here.

    Returns:
        None; the figure is rendered inline through plotly.
    """
    colors = ['#92A8D1', '#F7CAC9', '#F7CAC9', '#F7CAC9', '#F7CAC9', 
              '#F7CAC9', '#e6194b', '#3cb44b', '#3cb44b', '#3cb44b', 
              '#3cb44b', '#ffe119', '#ffe119', '#ffe119', '#ffe119', 
              '#f58231', '#911eb4', '#911eb4', '#911eb4', '#911eb4']

    # Accept plain lists too: the element-wise comparison below needs an array.
    labels = np.asarray(labels)

    data = []
    for k in np.unique(labels):
        my_members = labels == k
        # Colour is keyed on the label value (wrapping around the palette), so
        # labels 0 and 1 keep the colours they have always had.
        color = colors[int(k) % len(colors)]

        kmeans1 = go.Scatter(x=X[my_members, 0], y=X[my_members, 1],
                             showlegend=False,
                             mode='markers', 
                             marker=dict(color=color, size=4), 
                             name='cluster ' + str(k))

        data.append(kmeans1)

    layout = go.Layout(title=title,
                       autosize=False,
                       width=600,
                       height=600,
                       xaxis=dict(showticklabels=False, ticks='', zeroline=False, showgrid=False),
                       yaxis=dict(showticklabels=False, ticks='', zeroline=False, showgrid=False))

    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)

In [62]:
# Candidate ranks; must line up index-for-index with the columns of svd_scores.
r = [1,2,3,5,10,20,50,100,300]
# Choose the rank with the highest V-measure (row 2 of svd_scores);
# list.index returns the first position if the maximum is tied.
best_r = r[svd_scores[2].index(max(svd_scores[2]))]
print('Best r for SVD: %d' % best_r)

# Fit a fresh SVD at the chosen rank (rather than slicing lsiVectors) and
# re-run K-means on that projection.
svd2 = TruncatedSVD(n_components = best_r, n_iter = 7, random_state = 42)
lsiVectors2 = svd2.fit_transform(tfidfVectors)
km.fit(lsiVectors2)

evaluate(labels, km.labels_)

Best r for SVD: 3
Homogeneity: 0.634
Completeness: 0.639
V-measure: 0.637
Adjusted Rand-Index: 0.725
Adjusted Mutual Information Score: 0.634
Contingency Table: 
[[3435  468]
 [ 117 3862]]



The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.



In [63]:
# Visualise the K-means assignment on the best-rank LSI projection.
# plot_clusters draws only columns 0 and 1 of its input, so any further components are not shown.
plot_clusters(lsiVectors2, km, 'Clustering Results')

In [65]:
# Same projection coloured by the two-class ground truth; the trailing ';' suppresses the cell's output repr.
plot_clusters2(lsiVectors2,labels,'Ground Truth');

In [69]:
# Choose the NMF rank with the highest V-measure (row 2 of nmf_scores), the
# same criterion used to choose the SVD rank, so both reductions are tuned on
# one metric. V-measure combines homogeneity and completeness, whereas ranking
# on completeness alone (row 1) would favour lopsided clusterings.
# `r` is the list of candidate ranks that the NMF sweep iterated over.
best_r = r[nmf_scores[2].index(max(nmf_scores[2]))]
print('Best r for NMF: %d' % best_r)

# Refit NMF at the chosen rank and re-run K-means on the factorised features.
nmf2 = NMF(n_components = best_r, init='random', random_state=0)
nmfVectors2 = nmf2.fit_transform(tfidfVectors)
km.fit(nmfVectors2)

evaluate(labels, km.labels_)
# The final True asks plot_clusters to draw the cluster centres as well.
plot_clusters(nmfVectors2, km, 'Clustering Results', True)

Best r for NMF: 2
Homogeneity: 0.595
Completeness: 0.609
V-measure: 0.602
Adjusted Rand-Index: 0.655
Adjusted Mutual Information Score: 0.595
Contingency Table: 
[[3194  709]
 [  43 3936]]



The behavior of AMI will change in version 0.22. To match the behavior of 'v_measure_score', AMI will use average_method='arithmetic' by default.

