In [1]:
import numpy as np
%matplotlib inline
from random import Random
from texch.experiments import ClusteringExperiment, MultiClusteringExperiment
from texch.clustering.nltk import KMeansClusterer
from texch.preprocessing import PreprocessStep, Preprocessor
from texch.preprocessing.sklearn import TfidfVectorizer
from texch.clustering.nltk import KMeansClusterer

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the test split of 20 Newsgroups, restricted to four categories.
dataset = fetch_20newsgroups(
    subset='test',
    categories=[
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ],
    random_state=42,
)

# Ground-truth targets of the loaded documents; the number of distinct
# targets is used as the cluster count for every k-means run below.
labels = dataset.target
true_k = len(np.unique(labels))

In [3]:
from scipy.spatial.distance import cosine, euclidean, correlation, braycurtis, chebyshev

In [4]:
def make_kmeans_experiment(distance, verbose_name):
    """Build one k-means clustering experiment for a given distance function.

    All experiments share the same setup and differ only in the distance
    passed to the clusterer and in the label shown in the result tables.

    Parameters
    ----------
    distance : callable
        Distance function handed to ``KMeansClusterer`` (here: functions from
        ``scipy.spatial.distance``).
    verbose_name : str
        Human-readable experiment name used in logs and result tables.

    Returns
    -------
    ClusteringExperiment
        A new, not-yet-run experiment with its own clusterer and preprocessor.
    """
    return ClusteringExperiment(
        method=KMeansClusterer(
            true_k,
            distance=distance,
            # A fresh, identically seeded RNG per experiment, so every
            # distance starts from the same random state.
            rng=Random(15),
        ),
        preprocessor=Preprocessor(
            [TfidfVectorizer(
                stop_words='english', max_df=0.7, min_df=6
            ).as_preprocess_step()],
        ),
        # Convert the preprocessed matrix with .todense() before clustering
        # (presumably because the clusterer cannot consume a sparse matrix).
        prepare_func=lambda d: d.todense(),
        verbose_name=verbose_name,
    )


# One experiment per distance measure, in the order they are run and reported.
# NOTE: the first label was misspelled 'eucedian'; outputs recorded before this
# fix still show the old spelling until the notebook is re-run.
experiments = [
    make_kmeans_experiment(euclidean, 'euclidean'),
    make_kmeans_experiment(cosine, 'cosine'),
    make_kmeans_experiment(correlation, 'correlation'),
    make_kmeans_experiment(chebyshev, 'chebyshev'),
    make_kmeans_experiment(braycurtis, 'braycurtis'),
]

In [5]:
# Group the per-distance experiments into a single multi experiment that is
# fed the raw newsgroup documents.
kmeans_dif_measures = MultiClusteringExperiment(
    verbose_name='different distances for kmeans',
    experiments=experiments,
    data=dataset.data,
)

In [6]:
# Run every sub-experiment; the recorded log below shows each one going through
# preprocessing, the in-middle prepare function and the clustering method,
# with per-stage timings.
kmeans_dif_measures.run()

Running multi experiment consisting of 5 sub experiments

--------------------------------------------------
*****Experiment #0*****
Running experiment "eucedian (id=0)"...
Running preprocessing...
Step #0: PreprocessStep (id=0): finished in 0.687386989594 sec
Finished preprocessing in 0.687386989594
Running in-middle prepare function...
Finished in-middle prepare function in 0.0515501499176 sec
Running method...
Finished method in 4.64220714569 sec
Finished experiment in 5.3811442852 sec

--------------------------------------------------
*****Experiment #1*****
Running experiment "cosine (id=1)"...
Running preprocessing...
Step #0: PreprocessStep (id=1): finished in 0.606809139252 sec
Finished preprocessing in 0.606809139252
Running in-middle prepare function...
Finished in-middle prepare function in 0.0369789600372 sec
Running method...
Finished method in 3.70285201073 sec
Finished experiment in 4.34664011002 sec

--------------------------------------------------
*****Experiment #2

Unnamed: 0,ExperimentID,ExperimentName,PreprocessorSpent,MethodSpent,TotalSpent
0,0,eucedian,0.687387,4.642207,5.381144
1,1,cosine,0.606809,3.702852,4.34664
2,2,correlation,0.58669,11.939144,12.560774
3,3,chebyshev,0.58425,22.991494,23.60746
4,4,braycurtis,0.550823,3.362664,3.947022


In [10]:
# Result-table columns selected when comparing experiments.
# The last two entries are the scores that later cells request explicitly
# through compute_scores([...]).
SCORES = [
    'homogeneity',
    'completeness',
    'v_measure',
    'adj_rand_index',
    'adjusted_mutual_info_score',
    'fowlkes_mallows_score',
    'silhouette_coefficient',
    'calinski_harabaz_score',
]

In [11]:
# Attach the ground-truth targets to the multi experiment, then compute scores
# without naming any explicitly (i.e. whatever compute_scores() does by default).
kmeans_dif_measures.set_true_labels(labels)
kmeans_dif_measures.compute_scores()

Unnamed: 0,ExperimentID,ExperimentName,PreprocessorSpent,MethodSpent,TotalSpent,entropy,homogeneity,v_measure,adj_rand_index,completeness,mutual_info_score,normalized_mutual_info_score,adjusted_mutual_info_score,fowlkes_mallows_score,silhouette_coefficient,calinski_harabaz_score
0,0,eucedian,0.687387,4.642207,5.381144,1.369378,0.527135,0.527415,0.457742,0.527695,0.722614,0.527415,0.525981,0.597396,0.009779,9.381925
1,1,cosine,0.606809,3.702852,4.34664,1.372063,0.508019,0.507792,0.525129,0.507564,0.69641,0.507792,0.506364,0.647009,0.008843,8.609721
2,2,correlation,0.58669,11.939144,12.560774,1.380176,0.568609,0.566678,0.592506,0.56476,0.779468,0.566681,0.563705,0.696319,0.009761,9.252538
3,3,chebyshev,0.58425,22.991494,23.60746,1.329674,0.120954,0.122798,0.102604,0.124698,0.165808,0.122812,0.118808,0.342115,0.002828,4.212381
4,4,braycurtis,0.550823,3.362664,3.947022,1.373945,0.513505,0.512923,0.512755,0.512343,0.703931,0.512924,0.511156,0.637607,0.009951,9.019874


In [13]:
# Explicitly request the two scores named here, then display only the SCORES
# columns of the result table.
# NOTE(review): presumably these two are not part of the default
# compute_scores() set - for the single experiment `p` further down they only
# appear after being requested this way. Confirm against texch.
kmeans_dif_measures.compute_scores(['silhouette_coefficient', 'calinski_harabaz_score'])
kmeans_dif_measures.result[SCORES]

Unnamed: 0,homogeneity,completeness,v_measure,adj_rand_index,adjusted_mutual_info_score,fowlkes_mallows_score,silhouette_coefficient,calinski_harabaz_score
0,0.527135,0.527695,0.527415,0.457742,0.525981,0.597396,0.009779,9.381925
1,0.508019,0.507564,0.507792,0.525129,0.506364,0.647009,0.008843,8.609721
2,0.568609,0.56476,0.566678,0.592506,0.563705,0.696319,0.009761,9.252538
3,0.120954,0.124698,0.122798,0.102604,0.118808,0.342115,0.002828,4.212381
4,0.513505,0.512343,0.512923,0.512755,0.511156,0.637607,0.009951,9.019874


In [15]:
from scipy.spatial.distance import cityblock
# Stand-alone k-means experiment using the cityblock distance, configured
# like the experiments in the multi experiment above apart from the distance.
p = ClusteringExperiment(
    method=KMeansClusterer(
        true_k,
        distance=cityblock,
        # Same seed as the other experiments so the runs are comparable.
        rng=Random(15),
    ),
    preprocessor=Preprocessor(
        [TfidfVectorizer(
            stop_words='english', max_df=0.7, min_df=6
        ).as_preprocess_step()],
    ),
    # Convert the preprocessed matrix with .todense() before clustering
    # (presumably because the clusterer cannot consume a sparse matrix).
    prepare_func=lambda d: d.todense(),
    # Label matches the distance actually used. It was 'braycurtis' (a
    # copy-paste leftover), which mislabelled these results and collided with
    # the real braycurtis experiment; outputs recorded before this fix still
    # show the old label until the notebook is re-run.
    verbose_name='cityblock'
)

In [16]:
# Attach the ground-truth targets to the stand-alone experiment.
p.set_true_labels(labels)

In [19]:
# Feed the raw newsgroup documents to the stand-alone experiment (the multi
# experiment received the same data through its `data=` argument).
p.set_input_data(dataset.data)

In [20]:
# Run preprocessing, the prepare function and clustering for this experiment;
# timings for each stage are logged in the output below.
p.run()

Running experiment "braycurtis (id=5)"...
Running preprocessing...
Step #0: PreprocessStep (id=5): finished in 0.658637046814 sec
Finished preprocessing in 0.658637046814
Running in-middle prepare function...
Finished in-middle prepare function in 0.0600328445435 sec
Running method...
Finished method in 1.57240200043 sec
Finished experiment in 2.29107189178 sec


Unnamed: 0,ExperimentID,ExperimentName,MethodSpent,PrepareFuncSpent,PreprocessorSpent,TotalSpent
0,5,braycurtis,1.572402,0.060033,0.658637,2.291072


In [21]:
# NOTE(review): the true labels were already set on `p` in an earlier cell, so
# this repeat call looks redundant - confirm that set_true_labels() has no
# other effect before removing it.
p.set_true_labels(labels)
# Compute scores without naming any explicitly; the recorded table below has no
# silhouette / calinski_harabaz columns at this point.
p.compute_scores()

Unnamed: 0,ExperimentID,ExperimentName,MethodSpent,PrepareFuncSpent,PreprocessorSpent,TotalSpent,entropy,homogeneity,v_measure,adj_rand_index,completeness,mutual_info_score,normalized_mutual_info_score,adjusted_mutual_info_score,fowlkes_mallows_score
0,5,braycurtis,1.572402,0.060033,0.658637,2.291072,0.331498,0.089737,0.144525,0.014753,0.371088,0.123015,0.182484,0.087178,0.472681


In [22]:
# Explicitly request the two remaining scores, then display only the SCORES
# columns so the row can be compared with the multi-experiment table.
p.compute_scores(['silhouette_coefficient', 'calinski_harabaz_score'])
p.result[SCORES]

Unnamed: 0,homogeneity,completeness,v_measure,adj_rand_index,adjusted_mutual_info_score,fowlkes_mallows_score,silhouette_coefficient,calinski_harabaz_score
0,0.089737,0.371088,0.144525,0.014753,0.087178,0.472681,-0.012846,5.084816
