In [3]:
import numpy as np
%matplotlib inline
from random import Random
from texch.experiments import ClusteringExperiment, MultiClusteringExperiment
from texch.clustering.nltk import KMeansClusterer
from texch.preprocessing import PreprocessStep, Preprocessor
from texch.preprocessing.sklearn import TfidfVectorizer
from texch.clustering.sklearn import AgglomerativeClustering

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the 'test' split restricted to four newsgroups; random_state is fixed
# so repeated fetches return the same ordering.
dataset = fetch_20newsgroups(
    subset='test',
    categories=[
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ],
    random_state=42,
)

# Target values for the loaded subset, and how many distinct values there are
# (used below as the number of clusters to request).
labels = dataset.target
true_k = len(np.unique(labels))

In [48]:
# Size of the label array, i.e. how many target values the selected subset has.
len(labels)

1353

In [13]:
def _ag_experiment(verbose_name, **clustering_kwargs):
    """Build one agglomerative-clustering experiment over TF-IDF features.

    Each call constructs its own ``AgglomerativeClustering``, ``Preprocessor``
    and ``TfidfVectorizer`` instances, so no objects are shared between the
    experiments created with this helper.

    :param verbose_name: label passed to ``ClusteringExperiment`` as
        ``verbose_name``.
    :param clustering_kwargs: keyword arguments (e.g. ``linkage``,
        ``affinity``) forwarded unchanged to ``AgglomerativeClustering``
        after the cluster count ``true_k``. When none are given, the
        clusterer is built with its own defaults.
    :return: the configured ``ClusteringExperiment``.
    """
    return ClusteringExperiment(
        method=AgglomerativeClustering(true_k, **clustering_kwargs),
        preprocessor=Preprocessor(
            [TfidfVectorizer(
                stop_words='english', max_df=0.7, min_df=6
            ).as_preprocess_step()],
        ),
        # The vectorizer output is converted with .todense() before it is
        # handed to the clustering method.
        prepare_func=lambda d: d.todense(),
        verbose_name=verbose_name
    )


# One experiment per (linkage, affinity) combination under comparison.
# Labels are kept exactly as they appear in the recorded result tables.
experiments = [
    _ag_experiment('complete eucedian', linkage='complete', affinity='euclidean'),
    _ag_experiment('average eucedian', linkage='average', affinity='euclidean'),
    _ag_experiment('complete cosine', linkage='complete', affinity='cosine'),
    _ag_experiment('average cosine', linkage='average', affinity='cosine'),
    _ag_experiment('average cityblock', linkage='average', affinity='cityblock'),
    _ag_experiment('complete cityblock', linkage='complete', affinity='cityblock'),
    # No linkage/affinity given: relies on the clusterer's defaults
    # (presumably ward + euclidean, as the label suggests -- TODO confirm
    # against the texch wrapper).
    _ag_experiment('ward eucledian'),
]

In [14]:
# Bundle all configured experiments so they run against the same input data
# and report into one combined result table.
ag = MultiClusteringExperiment(
    verbose_name='different distances for ag',
    experiments=experiments,
    data=dataset.data,
)

In [15]:
# Run all sub-experiments of the multi-experiment; per-stage timings are
# logged and a timing summary table is displayed.
ag.run()

Running multi experiment consisting of 7 sub experiments

--------------------------------------------------
*****Experiment #0*****
Running experiment "complete eucedian (id=11)"...
Running preprocessing...
Step #0: PreprocessStep (id=11): finished in 0.620146036148 sec
Finished preprocessing in 0.620146036148
Running in-middle prepare function...
Finished in-middle prepare function in 0.0767779350281 sec
Running method...
Finished method in 5.8874630928 sec
Finished experiment in 6.58438706398 sec

--------------------------------------------------
*****Experiment #1*****
Running experiment "average eucedian (id=12)"...
Running preprocessing...
Step #0: PreprocessStep (id=12): finished in 0.610491991043 sec
Finished preprocessing in 0.610491991043
Running in-middle prepare function...
Finished in-middle prepare function in 0.0165438652039 sec
Running method...
Finished method in 6.08434605598 sec
Finished experiment in 6.71138191223 sec

----------------------------------------------

Unnamed: 0,ExperimentID,ExperimentName,PreprocessorSpent,MethodSpent,TotalSpent
0,11,complete eucedian,0.620146,5.887463,6.584387
1,12,average eucedian,0.610492,6.084346,6.711382
2,13,complete cosine,0.59958,5.585318,6.203791
3,14,average cosine,0.612488,5.734408,6.37264
4,15,average cityblock,0.633025,5.611346,6.263795
5,16,complete cityblock,0.590803,5.632785,6.239817
6,17,ward eucledian,0.582999,5.99371,6.592489


In [16]:
# Attach the dataset's target labels as the reference labelling, then compute
# the scores produced by an argument-less compute_scores() call.
ag.set_true_labels(labels)
ag.compute_scores()

Unnamed: 0,ExperimentID,ExperimentName,PreprocessorSpent,MethodSpent,TotalSpent,entropy,homogeneity,v_measure,adj_rand_index,completeness,mutual_info_score,normalized_mutual_info_score,adjusted_mutual_info_score,fowlkes_mallows_score
0,11,complete eucedian,0.620146,5.887463,6.584387,1.231584,0.115351,0.121524,0.061756,0.128394,0.158128,0.121698,0.11319,0.332452
1,12,average eucedian,0.610492,6.084346,6.711382,0.603545,0.214465,0.297811,0.187139,0.487115,0.293996,0.323217,0.212321,0.525075
2,13,complete cosine,0.59958,5.585318,6.203791,0.20116,0.033228,0.057952,0.003866,0.226437,0.04555,0.086741,0.030696,0.490682
3,14,average cosine,0.612488,5.734408,6.37264,0.032938,0.003679,0.007185,0.00048,0.153102,0.005043,0.023732,0.001031,0.504973
4,15,average cityblock,0.633025,5.611346,6.263795,0.028284,0.003604,0.007062,-0.000178,0.174661,0.00494,0.025088,0.001019,0.504865
5,16,complete cityblock,0.590803,5.632785,6.239817,0.047031,0.005458,0.010553,-0.000278,0.159072,0.007481,0.029464,0.002552,0.503296
6,17,ward eucledian,0.582999,5.99371,6.592489,0.869523,0.27847,0.340782,0.224536,0.439019,0.381737,0.349648,0.276686,0.514551


In [17]:
# Result columns shown when comparing experiments, in display order.
SCORES = [
    # Reported by the argument-less compute_scores() call.
    'homogeneity',
    'completeness',
    'v_measure',
    'adj_rand_index',
    'adjusted_mutual_info_score',
    'fowlkes_mallows_score',
    # Requested explicitly by name in the cells that follow.
    'silhouette_coefficient',
    'calinski_harabaz_score',
]

In [19]:
# Request the two extra scores by name (they are absent from the table produced
# by the argument-less compute_scores() call), then show only the SCORES columns.
ag.compute_scores(['silhouette_coefficient', 'calinski_harabaz_score'])
ag.result[SCORES]

Unnamed: 0,homogeneity,completeness,v_measure,adj_rand_index,adjusted_mutual_info_score,fowlkes_mallows_score,silhouette_coefficient,calinski_harabaz_score
0,0.115351,0.128394,0.121524,0.061756,0.11319,0.332452,0.001914,4.621019
1,0.214465,0.487115,0.297811,0.187139,0.212321,0.525075,0.005732,4.013101
2,0.033228,0.226437,0.057952,0.003866,0.030696,0.490682,-0.000526,3.14085
3,0.003679,0.153102,0.007185,0.00048,0.001031,0.504973,0.002221,1.449843
4,0.003604,0.174661,0.007062,-0.000178,0.001019,0.504865,-0.023403,1.327811
5,0.005458,0.159072,0.010553,-0.000278,0.002552,0.503296,-0.022785,1.695621
6,0.27847,0.439019,0.340782,0.224536,0.276686,0.514551,0.00866,9.804544


In [28]:
# Sanity check: distinct cluster ids assigned by the first experiment.
np.unique(ag.experiments[0].get_labels())

array([0, 1, 2, 3])

In [33]:
# Standalone run: average linkage with correlation distance.
# The label used to read 'ward eucledian' (copied from another experiment),
# which misreported this configuration; it now follows the
# '<affinity> <linkage>' form of the 'correlation complete' experiment.
# Saved output that still shows the old label predates this rename.
d = ClusteringExperiment(
        method=AgglomerativeClustering(
            true_k,
            linkage='average',
            affinity='correlation'
        ),
        preprocessor=Preprocessor(
            [TfidfVectorizer(
                stop_words='english', max_df=0.7, min_df=6
            ).as_preprocess_step()],
        ),
        # Convert the vectorizer output with .todense() before clustering.
        prepare_func=lambda d: d.todense(),
        verbose_name='correlation average'
    )

In [34]:
# Give the standalone experiment dataset.data as its input and run it.
d.set_input_data(dataset.data)
d.run()

Running experiment "ward eucledian (id=19)"...
Running preprocessing...
Step #0: PreprocessStep (id=19): finished in 0.581299066544 sec
Finished preprocessing in 0.581299066544
Running in-middle prepare function...
Finished in-middle prepare function in 0.0582098960876 sec
Running method...
Finished method in 5.90050292015 sec
Finished experiment in 6.54001188278 sec


Unnamed: 0,ExperimentID,ExperimentName,MethodSpent,PrepareFuncSpent,PreprocessorSpent,TotalSpent
0,19,ward eucledian,5.900503,0.05821,0.581299,6.540012


In [37]:
# Score the run against the dataset's target labels: first the argument-less
# score set, then the two scores requested by name; display the SCORES columns.
d.set_true_labels(labels)
d.compute_scores()
d.compute_scores(['silhouette_coefficient', 'calinski_harabaz_score'])
d.result[SCORES]

Unnamed: 0,homogeneity,completeness,v_measure,adj_rand_index,adjusted_mutual_info_score,fowlkes_mallows_score,silhouette_coefficient,calinski_harabaz_score
0,0.345176,0.649343,0.450746,0.382913,0.343401,0.624214,0.005752,4.637055


In [41]:
# Standalone run: complete linkage with correlation distance.
d2 = ClusteringExperiment(
    verbose_name='correlation complete',
    method=AgglomerativeClustering(
        true_k, linkage='complete', affinity='correlation'
    ),
    preprocessor=Preprocessor([
        TfidfVectorizer(
            stop_words='english',
            max_df=0.7,
            min_df=6,
        ).as_preprocess_step(),
    ]),
    # Convert the vectorizer output with .todense() before clustering.
    prepare_func=lambda features: features.todense(),
)

In [42]:
# Give the experiment dataset.data as its input and run it.
d2.set_input_data(dataset.data)
d2.run()

Running experiment "correlation complete (id=21)"...
Running preprocessing...
Step #0: PreprocessStep (id=21): finished in 0.595389127731 sec
Finished preprocessing in 0.595389127731
Running in-middle prepare function...
Finished in-middle prepare function in 0.0328040122986 sec
Running method...
Finished method in 6.93666291237 sec
Finished experiment in 7.5648560524 sec


Unnamed: 0,ExperimentID,ExperimentName,MethodSpent,PrepareFuncSpent,PreprocessorSpent,TotalSpent
0,21,correlation complete,6.936663,0.032804,0.595389,7.564856


In [43]:
# Score the run against the dataset's target labels: first the argument-less
# score set, then the two scores requested by name; display the SCORES columns.
d2.set_true_labels(labels)
d2.compute_scores()
d2.compute_scores(['silhouette_coefficient', 'calinski_harabaz_score'])
d2.result[SCORES]

Unnamed: 0,homogeneity,completeness,v_measure,adj_rand_index,adjusted_mutual_info_score,fowlkes_mallows_score,silhouette_coefficient,calinski_harabaz_score
0,0.14447,0.171722,0.156922,0.070397,0.142379,0.36401,0.003189,5.653887


In [44]:
# Standalone run: average linkage with Bray-Curtis distance.
# The label used to read 'ward eucledian' (copied from another experiment),
# which misreported this configuration; it now follows the
# '<affinity> <linkage>' form of the 'correlation complete' experiment.
# Saved output that still shows the old label predates this rename.
d3 = ClusteringExperiment(
        method=AgglomerativeClustering(
            true_k,
            linkage='average',
            affinity='braycurtis'
        ),
        preprocessor=Preprocessor(
            [TfidfVectorizer(
                stop_words='english', max_df=0.7, min_df=6
            ).as_preprocess_step()],
        ),
        # Convert the vectorizer output with .todense() before clustering.
        prepare_func=lambda d: d.todense(),
        verbose_name='braycurtis average'
    )

In [45]:
# Run the experiment on dataset.data, then score it against the dataset's
# target labels (argument-less score set plus the two scores requested by
# name) and display the SCORES columns.
d3.set_input_data(dataset.data)
d3.run()
d3.set_true_labels(labels)
d3.compute_scores()
d3.compute_scores(['silhouette_coefficient', 'calinski_harabaz_score'])
d3.result[SCORES]

Running experiment "ward eucledian (id=22)"...
Running preprocessing...
Step #0: PreprocessStep (id=22): finished in 0.603502988815 sec
Finished preprocessing in 0.603502988815
Running in-middle prepare function...
Finished in-middle prepare function in 0.0861752033234 sec
Running method...
Finished method in 8.55640792847 sec
Finished experiment in 9.24608612061 sec


Unnamed: 0,homogeneity,completeness,v_measure,adj_rand_index,adjusted_mutual_info_score,fowlkes_mallows_score,silhouette_coefficient,calinski_harabaz_score
0,0.004226,0.173837,0.008251,-0.000351,0.001458,0.504365,0.000603,1.347945


In [46]:
# Standalone run: complete linkage with Bray-Curtis distance.
# The label used to read 'ward eucledian' (copied from another experiment),
# which misreported this configuration; it now follows the
# '<affinity> <linkage>' form of the 'correlation complete' experiment.
# Saved output that still shows the old label predates this rename.
d4 = ClusteringExperiment(
        method=AgglomerativeClustering(
            true_k,
            linkage='complete',
            affinity='braycurtis'
        ),
        preprocessor=Preprocessor(
            [TfidfVectorizer(
                stop_words='english', max_df=0.7, min_df=6
            ).as_preprocess_step()],
        ),
        # Convert the vectorizer output with .todense() before clustering.
        prepare_func=lambda d: d.todense(),
        verbose_name='braycurtis complete'
    )

In [47]:
# Run the experiment on dataset.data, then score it against the dataset's
# target labels (argument-less score set plus the two scores requested by
# name) and display the SCORES columns.
d4.set_input_data(dataset.data)
d4.run()
d4.set_true_labels(labels)
d4.compute_scores()
d4.compute_scores(['silhouette_coefficient', 'calinski_harabaz_score'])
d4.result[SCORES]

Running experiment "ward eucledian (id=23)"...
Running preprocessing...
Step #0: PreprocessStep (id=23): finished in 0.615652084351 sec
Finished preprocessing in 0.615652084351
Running in-middle prepare function...
Finished in-middle prepare function in 0.0323851108551 sec
Running method...
Finished method in 8.02053189278 sec
Finished experiment in 8.66856908798 sec


Unnamed: 0,homogeneity,completeness,v_measure,adj_rand_index,adjusted_mutual_info_score,fowlkes_mallows_score,silhouette_coefficient,calinski_harabaz_score
0,0.05765,0.270702,0.095056,0.014255,0.055233,0.486098,-0.001707,3.194787
