In [1]:
from texch import SingleExperiment

In [2]:
from texch.clustering import KMeans

In [3]:
from texch.preprocessing import Preprocessor

In [4]:
from texch.preprocessing.cleaner import ClearPunctuationRegex, LowerCase, ExcludeChars
from texch.preprocessing.tokenizer import TextToWordsTokenizer
from texch.preprocessing.utils import TokensToText
from texch.preprocessing.vectorizer import TfidfVectorizer

In [5]:
import numpy as np

In [6]:
from sklearn.datasets import fetch_20newsgroups

# The four newsgroups used as ground-truth classes in this notebook.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# Load the 'test' subset for those groups, shuffled with a fixed random_state.
dataset = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)
labels = dataset.target
# Number of distinct target labels; used below as the cluster count for KMeans.
true_k = len(np.unique(labels))

In [7]:
from string import punctuation
# Characters handed to ExcludeChars: ASCII punctuation, digits, and a few typographic marks.
exclude = set(punctuation + u'0123456789[]—«»–')

# Steps are listed in run order; the run log numbers them Step#0..Step#4.
preprocessing_steps = [
    ExcludeChars(exclude=exclude),
    LowerCase(),
    # The tokenizer followed by TokensToText is included only to show how these steps work.
    TextToWordsTokenizer(),
    TokensToText(),
    TfidfVectorizer(stop_words='english')
]
# The same preprocessor instance is passed to every experiment below.
preprocessor1 = Preprocessor(preprocessing_steps)

In [8]:
from random import Random

# One KMeans instance per distance metric, each asked for true_k clusters.
# Every instance receives its own Random(42), so all of them are seeded identically.
kmeans_euc = KMeans(true_k, distance='euclidean', rng=Random(42))

kmeans_cosine = KMeans(true_k, distance='cosine', rng=Random(42))

# The 'correlation' distance is the one referred to as Pearson correlation below.
kmeans_pearson = KMeans(true_k, distance='correlation', rng=Random(42))
kmeans_braycurtis = KMeans(true_k, distance='braycurtis', rng=Random(42))

kmeans_chebyshev = KMeans(true_k, distance='chebyshev', rng=Random(42))
# trace=True: the Chebyshev run prints per-iteration "difference" lines (see its run output).
kmeans_chebyshev.set_cluster_params(trace=True)

In [9]:
def prepare_data_for_kmeans(matrix):
    """Return the dense form of ``matrix``.

    Passed to each experiment as ``prepare_func``; the run log shows it being
    applied between preprocessing and clustering ("in-middle prepare function").

    :param matrix: object exposing ``todense()`` -- presumably the sparse
        TF-IDF matrix produced by the preprocessor (TODO confirm).
    :return: whatever ``matrix.todense()`` returns.
    """
    dense = matrix.todense()
    return dense

In [10]:
# Euclidean-distance k-means over the raw documents, using the shared preprocessor.
experiment_euc = SingleExperiment(
    data=dataset.data, 
    clustering_algorithm=kmeans_euc, 
    preprocessor=preprocessor1, 
    # verbose_name is the label shown in the run log and the summary header.
    verbose_name='Euclid Kmeans on TfIdf',
    # Applied between preprocessing and clustering (see the run log).
    prepare_func=prepare_data_for_kmeans
)

In [11]:
# Runs preprocessing, the prepare function, then clustering; timings are printed as it goes.
experiment_euc.run()

Running experiment Euclid Kmeans on TfIdf
Running preprocessing...
Step#0: Exclude chars: 0.335907936096 sec
Step#1: To lower case: 0.0128769874573 sec
Step#2: From text to words tokenizer: 2.64668798447 sec
Step#3: TokensToText: 0.0156409740448 sec
Step#4: TfidfVectorizer: 0.557484149933 sec
Running in-middle prepare function
Running clustering...
11.8588662148


In [12]:
# Called before true labels are set: this only prints a reminder to set them.
experiment_euc.summary()

To print summary set true_labels on experiment


In [13]:
# Convergence flag exposed by the clustering algorithm after the run.
experiment_euc.clustering_algorithm.converged

True

In [14]:
# Iteration count reported by the clustering algorithm for this run.
experiment_euc.clustering_algorithm.num_iterations

22

In [15]:
# Attach the newsgroup targets as true labels; summary() asks for these before printing in full.
experiment_euc.set_true_labels(labels)

In [16]:
# With true labels set, the summary lists the pipeline steps, cluster sizes and scores.
experiment_euc.summary()

Experiment Euclid Kmeans on TfIdf Summary
-------------------
Preprocessor:
Step #0: Exclude chars
Step #1: To lower case
Step #2: From text to words tokenizer
Step #3: TokensToText
Step #4: TfidfVectorizer
Clustering algorithm:
K means
Total objects to cluster: 1353
Total clusters found: 4
Cluster #0: 181 objects
Cluster #1: 423 objects
Cluster #2: 149 objects
Cluster #3: 600 objects
.......
Scores:
adj_rand_index: 0.341252415914
completeness: 0.461619902217
entropy: 1.23616079069
v_measure: 0.437773343662
homogeneity: 0.416269514837


In [17]:
# Silhouette coefficient of the Euclidean clustering (read as an attribute, not called).
experiment_euc.silhouette_coefficient

0.0061880964996659203

In [18]:
# Same data and preprocessor as the Euclidean experiment; only the KMeans distance differs.
experiment_cosine = SingleExperiment(
    data=dataset.data, 
    clustering_algorithm=kmeans_cosine, 
    preprocessor=preprocessor1, 
    # verbose_name is the label shown in the run log and the summary header.
    verbose_name='Cosine Kmeans on TfIdf',
    prepare_func=prepare_data_for_kmeans
)

In [19]:
# Re-runs the full preprocessing pipeline before clustering with the cosine distance.
experiment_cosine.run()

Running experiment Cosine Kmeans on TfIdf
Running preprocessing...
Step#0: Exclude chars: 0.330342054367 sec
Step#1: To lower case: 0.0121281147003 sec
Step#2: From text to words tokenizer: 2.09871387482 sec
Step#3: TokensToText: 0.0139529705048 sec
Step#4: TfidfVectorizer: 0.539141893387 sec
Running in-middle prepare function
Running clustering...
7.74876999855


In [21]:
# Attach the newsgroup targets so summary() can print its scores.
experiment_cosine.set_true_labels(labels)

In [23]:
# Convergence flag and iteration count of the cosine run.
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(experiment_cosine.clustering_algorithm.converged)
print(experiment_cosine.clustering_algorithm.num_iterations)

True
10


In [24]:
# Pipeline steps, cluster sizes and scores for the cosine run.
experiment_cosine.summary()

Experiment Cosine Kmeans on TfIdf Summary
-------------------
Preprocessor:
Step #0: Exclude chars
Step #1: To lower case
Step #2: From text to words tokenizer
Step #3: TokensToText
Step #4: TfidfVectorizer
Clustering algorithm:
K means
Total objects to cluster: 1353
Total clusters found: 4
Cluster #0: 332 objects
Cluster #1: 393 objects
Cluster #2: 233 objects
Cluster #3: 395 objects
.......
Scores:
adj_rand_index: 0.422088057154
completeness: 0.460662670533
entropy: 1.36620392679
v_measure: 0.459883388996
homogeneity: 0.459106739555


In [26]:
# Silhouette coefficient of the cosine clustering (read as an attribute, not called).
experiment_cosine.silhouette_coefficient

0.0062367881996818093

In [27]:
# Uses kmeans_pearson, i.e. the KMeans instance built with distance='correlation'.
experiment_pearson = SingleExperiment(
    data=dataset.data, 
    clustering_algorithm=kmeans_pearson, 
    preprocessor=preprocessor1, 
    # verbose_name is the label shown in the run log and the summary header.
    verbose_name='Pearson Correlation Kmeans on TfIdf',
    prepare_func=prepare_data_for_kmeans
)

In [28]:
# Re-runs the full preprocessing pipeline before clustering with the correlation distance.
experiment_pearson.run()

Running experiment Pearson Correlation Kmeans on TfIdf
Running preprocessing...
Step#0: Exclude chars: 0.363492965698 sec
Step#1: To lower case: 0.0151379108429 sec
Step#2: From text to words tokenizer: 2.26861786842 sec
Step#3: TokensToText: 0.013571023941 sec
Step#4: TfidfVectorizer: 0.555256128311 sec
Running in-middle prepare function
Running clustering...
24.4648089409


In [29]:
# Attach the newsgroup targets so summary() can print its scores.
experiment_pearson.set_true_labels(labels)

In [30]:
# Convergence flag and iteration count of the Pearson-correlation run.
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(experiment_pearson.clustering_algorithm.converged)
print(experiment_pearson.clustering_algorithm.num_iterations)

True
12


In [31]:
# Pipeline steps, cluster sizes and scores for the Pearson-correlation run.
experiment_pearson.summary()

Experiment Pearson Correlation Kmeans on TfIdf Summary
-------------------
Preprocessor:
Step #0: Exclude chars
Step #1: To lower case
Step #2: From text to words tokenizer
Step #3: TokensToText
Step #4: TfidfVectorizer
Clustering algorithm:
K means
Total objects to cluster: 1353
Total clusters found: 4
Cluster #0: 333 objects
Cluster #1: 390 objects
Cluster #2: 241 objects
Cluster #3: 389 objects
.......
Scores:
adj_rand_index: 0.427207915838
completeness: 0.462395533902
entropy: 1.3692983251
v_measure: 0.462136381981
homogeneity: 0.461877520383


In [32]:
# Timing record: a dict with the clustering time and a per-step dict for the preprocessor.
experiment_pearson.spent

{'clustering': 24.46480894088745,
 'preprocessor': {'Step#0: Exclude chars': 0.3634929656982422,
  'Step#1: To lower case': 0.015137910842895508,
  'Step#2: From text to words tokenizer': 2.268617868423462,
  'Step#3: TokensToText': 0.013571023941040039,
  'Step#4: TfidfVectorizer': 0.5552561283111572}}

In [34]:
# Same data and preprocessor again; clustering uses the Bray-Curtis KMeans instance.
experiment_braycurtis = SingleExperiment(
    data=dataset.data, 
    clustering_algorithm=kmeans_braycurtis, 
    preprocessor=preprocessor1, 
    # verbose_name is the label shown in the run log and the summary header.
    verbose_name='Bray-Curtis Kmeans on TfIdf',
    prepare_func=prepare_data_for_kmeans
)

In [35]:
# Re-runs the full preprocessing pipeline before clustering with the Bray-Curtis distance.
experiment_braycurtis.run()

Running experiment Bray-Curtis Kmeans on TfIdf
Running preprocessing...
Step#0: Exclude chars: 0.391551017761 sec
Step#1: To lower case: 0.0128858089447 sec
Step#2: From text to words tokenizer: 2.83308911324 sec
Step#3: TokensToText: 0.0148909091949 sec
Step#4: TfidfVectorizer: 0.69079208374 sec
Running in-middle prepare function
Running clustering...
27.2871630192


In [38]:
# Attach the newsgroup targets so summary() can print its scores.
experiment_braycurtis.set_true_labels(labels)

In [39]:
# Convergence flag and iteration count of the Bray-Curtis run.
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(experiment_braycurtis.clustering_algorithm.converged)
print(experiment_braycurtis.clustering_algorithm.num_iterations)

True
19


In [40]:
# Pipeline steps, cluster sizes and scores for the Bray-Curtis run.
experiment_braycurtis.summary()

Experiment Bray-Curtis Kmeans on TfIdf Summary
-------------------
Preprocessor:
Step #0: Exclude chars
Step #1: To lower case
Step #2: From text to words tokenizer
Step #3: TokensToText
Step #4: TfidfVectorizer
Clustering algorithm:
K means
Total objects to cluster: 1353
Total clusters found: 4
Cluster #0: 382 objects
Cluster #1: 388 objects
Cluster #2: 257 objects
Cluster #3: 326 objects
.......
Scores:
adj_rand_index: 0.436347148939
completeness: 0.449388184194
entropy: 1.37367028271
v_measure: 0.449852592712
homogeneity: 0.450317962085


In [41]:
# Chebyshev-distance k-means over the same data and preprocessor as the other experiments.
experiment_chebyshev = SingleExperiment(
    data=dataset.data,
    clustering_algorithm=kmeans_chebyshev,
    preprocessor=preprocessor1,
    # Label the run by the metric it actually uses. The label had been copy-pasted
    # from the Pearson experiment, so its log and summary were indistinguishable
    # from that run's.
    verbose_name='Chebyshev Kmeans on TfIdf',
    prepare_func=prepare_data_for_kmeans
)

In [42]:
# kmeans_chebyshev was configured with trace=True, so this run also prints per-iteration progress.
experiment_chebyshev.run()

Running experiment Pearson Correlation Kmeans on TfIdf
Running preprocessing...
Step#0: Exclude chars: 0.346951007843 sec
Step#1: To lower case: 0.0140719413757 sec
Step#2: From text to words tokenizer: 2.28541493416 sec
Step#3: TokensToText: 0.0138800144196 sec
Step#4: TfidfVectorizer: 0.539669036865 sec
Running in-middle prepare function
Running clustering...
k-means trial 0
iteration: 0
difference: 0.92084566733
iteration: 1
difference: 0.226764006085
iteration: 2
difference: 0.345843826502
iteration: 3
difference: 0.349472390515
iteration: 4
difference: 0.142473889569
iteration: 5
difference: 0.03251004882
iteration: 6
difference: 0.0459499288652
iteration: 7
difference: 0.0093600085925
iteration: 8
difference: 0.0154949761166
iteration: 9
difference: 0.00812926443584
iteration: 10
difference: 0.00741747468431
iteration: 11
difference: 0.0044482379098
iteration: 12
difference: 0.0
175.359150887


In [43]:
# Attach the newsgroup targets so summary() can print its scores.
experiment_chebyshev.set_true_labels(labels)

In [44]:
# Pipeline steps, cluster sizes and scores for the Chebyshev run.
experiment_chebyshev.summary()

Experiment Pearson Correlation Kmeans on TfIdf Summary
-------------------
Preprocessor:
Step #0: Exclude chars
Step #1: To lower case
Step #2: From text to words tokenizer
Step #3: TokensToText
Step #4: TfidfVectorizer
Clustering algorithm:
K means
Total objects to cluster: 1353
Total clusters found: 4
Cluster #0: 526 objects
Cluster #1: 384 objects
Cluster #2: 234 objects
Cluster #3: 209 objects
.......
Scores:
adj_rand_index: 0.0437650270214
completeness: 0.0412266770296
entropy: 1.31674025068
v_measure: 0.0403968925512
homogeneity: 0.0395998517823


# Bonus MultiExperiment

In [None]:
# The imported name was garbled by repeated text; the section heading names MultiExperiment.
from texch import MultiExperiment