In [1]:
import numpy as np
%matplotlib inline
from random import Random
from texch.experiments import ClusteringExperiment, MultiClusteringExperiment
from texch.clustering.nltk import KMeansClusterer
from texch.preprocessing import PreprocessStep, Preprocessor
from texch.preprocessing.sklearn import TfidfVectorizer
from texch.clustering.nltk import KMeansClusterer

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroup categories used for this experiment.
newsgroup_categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Fetch only the 'test' split for the selected categories; random_state is
# pinned so that any randomisation done by the loader is repeatable.
dataset = fetch_20newsgroups(
    subset='test',
    categories=newsgroup_categories,
    random_state=42,
)

# Ground-truth targets of the loaded documents, and the number of distinct
# target values (used later as the number of clusters).
labels = dataset.target
true_k = len(np.unique(labels))

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# English stop words, stored as a set for constant-time membership tests.
stopset = set(stopwords.words('english'))

# The token pattern never changes, so build the tokenizer once instead of
# re-creating it for every sentence of every document.
tokenizer = RegexpTokenizer(r'\w+')

# data[i] is the flat list of lower-cased, stop-word-free tokens of
# dataset.data[i]; sentence boundaries are used only to drive tokenisation.
data = []
for text in dataset.data:
    sent_tokens = []
    for sentence in sent_tokenize(text):
        sent = sentence.lower()
        tokens = tokenizer.tokenize(sent)
        sent_tokens.extend(w for w in tokens if w not in stopset)
    data.append(sent_tokens)

In [6]:
from gensim import corpora, models, similarities

Fast version of gensim.models.doc2vec is being used
'pattern' package not found; tag filters are not available for English


In [7]:
# Bind the tokenised documents to `texts`, the name the gensim cells below use.
# This is an alias, not a copy: `texts` and `data` refer to the same list here.
texts = data

In [8]:
# Build the token -> integer id vocabulary from the tokenised documents.
dictionary = corpora.Dictionary(texts)

# Prune the vocabulary by document frequency: keep tokens that occur in at
# least 1 document and in at most 80% of the documents (comparable to the
# min_df / max_df step used when building a tf-idf matrix).
dictionary.filter_extremes(no_above=0.8, no_below=1)

# Encode every tokenised document as a bag-of-words vector using the pruned
# dictionary.
corpus = list(map(dictionary.doc2bow, texts))

adding document #0 to Dictionary(0 unique tokens: [])
built Dictionary(27114 unique tokens: [u'askew', u'woods', u'hanging', u'fractioning', u'5980']...) from 1353 documents (total 276586 corpus positions)
discarding 3 tokens: [(u'lines', 1351), (u'subject', 1353), (u'organization', 1315)]...
keeping 27111 tokens which were in no less than 1 and no more than 1082 (=80.0%) documents
rebuilding dictionary, shrinking gaps
resulting dictionary: Dictionary(27111 unique tokens: [u'askew', u'woods', u'hanging', u'fractioning', u'5980']...)


In [174]:
# Settings for the LDA topic model. With chunksize=100 and update_every=5 the
# model is updated once every 500 documents; the corpus is swept 10 times and
# random_state pins the initialisation so reruns are repeatable.
lda_options = {
    'num_topics': 2,
    'id2word': dictionary,
    'update_every': 5,
    'chunksize': 100,
    'passes': 10,
    'random_state': 15,
}

# Train the topic model on the bag-of-words corpus.
lda = models.LdaModel(corpus, **lda_options)

using symmetric alpha at 0.5
using symmetric eta at 3.68853970713e-05
using serial LDA version on this node
running online LDA training, 2 topics, 10 passes over the supplied corpus of 1353 documents, updating model once every 500 documents, evaluating perplexity every 1000 documents, iterating 50x with a convergence threshold of 0.001000
PROGRESS: pass 0, at document #100/1353
performing inference on a chunk of 100 documents
0/100 documents converged within 50 iterations
PROGRESS: pass 0, at document #200/1353
performing inference on a chunk of 100 documents
1/100 documents converged within 50 iterations
PROGRESS: pass 0, at document #300/1353
performing inference on a chunk of 100 documents
0/100 documents converged within 50 iterations
PROGRESS: pass 0, at document #400/1353
performing inference on a chunk of 100 documents
0/100 documents converged within 50 iterations
PROGRESS: pass 0, at document #500/1353
performing inference on a chunk of 100 documents
0/100 documents converged 

In [175]:
# Represent every document as a dense vector of topic scores, one slot per
# LDA topic. The vector length is taken from the trained model rather than
# hard-coded, so it stays correct if the number of topics is changed.
#
# NOTE: this rebinds `data`, which previously held the token lists; those
# remain reachable through `texts`.
data = []
for bow in corpus:
    # Start from all zeros: any topic that lda[bow] does not report keeps a
    # score of 0 (presumably gensim omits very low-probability topics - confirm).
    topic_scores = [0] * lda.num_topics
    for topic, score in lda[bow]:
        topic_scores[topic] = score
    data.append(topic_scores)

In [176]:
from texch.clustering.sklearn import KMeans

In [177]:
# Clustering method: KMeans with one cluster per distinct true label and a
# fixed random_state so the run is repeatable.
kmeans_method = KMeans(true_k, random_state=15)

# Preprocessing pipeline made of a single pass-through step (the input is
# handed to the method unchanged).
passthrough_preprocessor = Preprocessor([PreprocessStep(lambda x: x)])

# The experiment object ties the method and the preprocessor together under a
# display name that appears in the result tables.
c = ClusteringExperiment(
    method=kmeans_method,
    preprocessor=passthrough_preprocessor,
    verbose_name='stopwords removed unigrams',
)

In [178]:
# Feed the per-document topic-score vectors (the current value of `data`) to
# the experiment as its input.
c.set_input_data(data)

In [179]:
# Execute the experiment: the preprocessing steps run first, then the
# clustering method. The call's return value (a timing summary table) is shown
# as the cell output.
c.run()

Running experiment "stopwords removed unigrams (id=16)"...
Running preprocessing...
Step #0: PreprocessStep (id=16): finished in 8.10623168945e-06 sec
Finished preprocessing in 8.10623168945e-06
Running method...
Finished method in 0.0985021591187 sec
Finished experiment in 0.0985102653503 sec


Unnamed: 0,ExperimentID,ExperimentName,MethodSpent,PrepareFuncSpent,PreprocessorSpent,TotalSpent
0,16,stopwords removed unigrams,0.098502,0,8e-06,0.09851


In [180]:
# Register the ground-truth labels with the experiment, then compute scores
# without naming specific metrics; the resulting table (timings plus score
# columns) is shown as the cell output.
c.set_true_labels(labels)
c.compute_scores()

Unnamed: 0,ExperimentID,ExperimentName,MethodSpent,PrepareFuncSpent,PreprocessorSpent,TotalSpent,entropy,homogeneity,v_measure,adj_rand_index,completeness,mutual_info_score,normalized_mutual_info_score,adjusted_mutual_info_score,fowlkes_mallows_score
0,16,stopwords removed unigrams,0.098502,0,8e-06,0.09851,1.088598,0.372445,0.415186,0.375778,0.469007,0.51056,0.417947,0.370908,0.579327


In [181]:
# Score columns, in display order, to pull out of the experiment result table.
SCORES = [
    'homogeneity',
    'completeness',
    'v_measure',
    'adj_rand_index',
    'adjusted_mutual_info_score',
    'fowlkes_mallows_score',
    'silhouette_coefficient',
    'calinski_harabaz_score',
]

In [182]:
# These two metrics are requested by name here; they were not among the
# columns produced by the earlier compute_scores() call.
extra_scores = ['silhouette_coefficient', 'calinski_harabaz_score']
c.compute_scores(extra_scores)

# Show only the experiment name followed by the selected score columns.
summary_columns = ['ExperimentName'] + SCORES
c.result[summary_columns]

Unnamed: 0,ExperimentName,homogeneity,completeness,v_measure,adj_rand_index,adjusted_mutual_info_score,fowlkes_mallows_score,silhouette_coefficient,calinski_harabaz_score
0,stopwords removed unigrams,0.372445,0.469007,0.415186,0.375778,0.370908,0.579327,0.846652,44786.672173
