# Seleccionando palabras

In [1]:
import string
import random
import numpy as np
import reuters_reader
import pickle
from collections import Counter
from sklearn.externals import joblib
from gensim.models.keyedvectors import KeyedVectors
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from copy import deepcopy
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_selection import chi2
from pandas import DataFrame
import sklearn 
from scipy.sparse import csr_matrix
# from scipy.sparse import *

np.set_printoptions(precision=3)

### RCV1 Dataset

Use the function ```reuters_reader.reader(path)``` to retrieve the available documents from the rcv1 dataset stored in `path`. This function returns a generator (```reader```) which yields a single document each time we call ```next(reader)```. Each document is a dictionary with the following useful keys:
 - "title" is the title of the document
 - "text" is the body of the document
 - "bip:topics:1.0" is the list of topics
 
There are a total of 804420 available documents, although some may have no topic.

#### Building the dataset
We build a balanced dataset that contains ```n_docs``` documents. To get a balanced dataset we iterate through the documents generator until we have ```n_docs / 2``` documents with the desired topic and the same number without it. 

#### Get the labels
Select a topic we want to classify using the variable topic (there is a list of the topics https://gist.github.com/gavinmh/6253739 ). Then build the list of labels using a 1 for those documents with that topic and 0 otherwise

#### Training and validation set
Finally, we split the dataset using the ```train_split``` value. 

In [2]:
# --- Configuration ---------------------------------------------------------
path = 'rcv1'        # location of the RCV1 corpus handed to reuters_reader
n_docs = 100000      # total documents to collect (half with `topic`, half without)
train_split = 0.8    # fraction of the collected documents used for training
topic = 'GCAT'       # topic code treated as the positive class

docs = []
reader = reuters_reader.reader(path)

# Number of documents seen so far with / without the topic. These count every
# non-empty document read, not only the ones that were kept.
topic_true = 0
topic_false = 0

while len(docs) < n_docs:
    # A default avoids an opaque StopIteration when the corpus is exhausted
    # before the balanced quota is met (this also happens when n_docs is odd,
    # because each class is capped at n_docs // 2).
    doc = next(reader, None)
    if doc is None:
        raise RuntimeError(
            'Corpus exhausted after collecting {} of {} documents '
            '({} with topic {}, {} without)'.format(
                len(docs), n_docs, topic_true, topic, topic_false))
    if doc['text'] == '':
        # Documents without a body carry no words to cluster.
        continue
    if topic in doc['bip:topics:1.0']:
        topic_true += 1
        if topic_true <= n_docs // 2:
            docs.append(doc)
    else:
        topic_false += 1
        if topic_false <= n_docs // 2:
            docs.append(doc)

# Shuffle so the train/validation split below is not ordered by corpus date.
random.shuffle(docs)

# 1 for documents carrying `topic`, 0 otherwise (same order as `docs`).
labels = [1 if topic in doc['bip:topics:1.0'] else 0 for doc in docs]

print('{} docs with topic {} (from {})'.format(np.sum(labels), topic, n_docs))

split_point = int(n_docs * train_split)
x_train, y_train = docs[:split_point], labels[:split_point]
x_val, y_val = docs[split_point:], labels[split_point:]

print('Training with {} docs'.format(len(x_train)))
print('Validating with {} docs'.format(len(x_val)))

19960917 / 59589newsML.xml failed to parse XML.
19970725 / 756041newsML.xml failed to parse XML.
50000 docs with topic GCAT (from 100000)
Training with 80000 docs
Validating with 20000 docs


### Word2vec model

We are loading the well known word2vec model from __[Google](https://code.google.com/archive/p/word2vec/)__ which is stored in the binary file `GoogleNews-vectors-negative300.bin`.

Load the word2vec model

In [3]:
# Load the pretrained vectors from the binary word2vec file into `w2v`.
w2v_name = 'GoogleNews-vectors-negative300.bin'
w2v = KeyedVectors.load_word2vec_format(w2v_name, binary=True)
# Disabled alternative: load text-format vectors from the wiki.en directory instead.
# w2v_name = 'wiki.en/wiki.en.w2v.txt'
# w2v = KeyedVectors.load_word2vec_format(w2v_name, binary=False)

### Get our vocabulary

Get all the vectors from the word2vec model for our vocabulary. Our vocabulary can include all the words used in the word2vec model or be limited to the words in our dataset.

We can change this behaviour with the flag ```dataset_vocabulary```. ```False``` will use all the words from the word2vec model and ```True``` will limit them to just the words that are in our dataset and in the model at the same time.

There is a ```count_threshold``` to remove those words appearing very few times because they are probably errors.

As we have to split each document in individual words, we already save this inside each document with the key "counter".

After this cell, ```X``` is a matrix including all the vectors we are going to use.

In [4]:
# Module-level vocabulary state shared by the rest of the notebook.
words_list = []        # vocabulary words, position == vocabulary index
words_embedding = []   # replaced below by a (len(words_list), vector_size) array
words_count = []       # corpus-wide occurrence count, aligned with words_list
word2idx = {}          # word -> index into words_list
idx2word = {}          # index -> word
word2cluster = {}      # filled later, once the clusters are built
cluster2words = {}     # filled later, once the clusters are built

# Words whose total count is not strictly greater than this are dropped.
count_threshold = 1

for doc in docs:
    # Tokenize on whitespace, trim surrounding punctuation and keep only the
    # tokens the word2vec model knows about.
    tokens = (token.strip(string.punctuation) for token in doc["text"].split())
    doc["counter"] = Counter(token for token in tokens if token in w2v)
    doc["word_count"] = sum(doc["counter"].values())

    for word, count in doc["counter"].items():
        # Explicit membership test instead of a bare `except:`, which would
        # also have hidden unrelated errors.
        known_idx = word2idx.get(word)
        if known_idx is None:
            word2idx[word] = len(words_list)
            words_list.append(word)
            words_count.append(count)
        else:
            words_count[known_idx] += count

keep_it = [count > count_threshold for count in words_count]

words_list = [word for idx, word in enumerate(words_list) if keep_it[idx]]
words_count = [count for idx, count in enumerate(words_count) if keep_it[idx]]
# Indices shift after filtering, so the lookup tables are rebuilt from scratch.
word2idx = {word: idx for idx, word in enumerate(words_list)}

words_embedding = np.zeros((len(words_list), w2v.vector_size), dtype=np.float32)
for idx, word in enumerate(words_list):
    words_embedding[idx, :] = w2v[word]

idx2word = dict(enumerate(words_list))

print("Vocabulary length: {}".format(len(words_list)))

# Spot-check that the index tables and the embedding matrix stay aligned.
# Skipped when the vocabulary is empty (there is nothing to sample from).
if words_list:
    for _ in range(100):
        idx = random.randrange(len(words_list))
        word = words_list[idx]
        assert word2idx[word] == idx
        assert (w2v[word] == words_embedding[idx]).all()

Vocabulary length: 106548


### Bag of words

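Represent each document using a bag-of-words model stored in a sparse matrix. Note that the cell below only records term presence (a 1 for every vocabulary word that occurs in the document); no term frequency–inverse document frequency (tf-idf) weighting is applied at this stage. 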

In [5]:
# Build the CSR pieces row by row: one row per document, one column per
# vocabulary word, and a 1 wherever the word occurs in the document.
indptr = [0]
indices = []

for doc in docs:
    # Words that were filtered out of the vocabulary have no column.
    indices.extend(word2idx[word] for word in doc["counter"] if word in word2idx)
    indptr.append(len(indices))

# Every stored entry is a presence flag.
data = [1] * len(indices)

matrix = csr_matrix((data, indices, indptr), dtype=float)

### Baseline

Using the bag-of-words representation we can already classify the documents, getting a good baseline to compare our algorithms to. 

Note: this classification takes quite some time (a couple of hours at least). That is why the lines below are commented out. 

In [6]:
# split_point = int(n_docs * train_split)
# baseline_x_train, baseline_y_train = matrix[:split_point], labels[:split_point]
# baseline_x_val, baseline_y_val = matrix[split_point:], labels[split_point:]

# lasso = fit_lasso(baseline_x_train, baseline_y_train)
# validate(lasso, baseline_x_val, baseline_y_val)

### Initial Cluster

Train the first Kmeans cluster using the complete set of words in our dataset.

#### Initial centers

Supposedly, we can improve our results if the first time we build our clusters we know which are the good center candidates.

To select these initial centers we use a feature selection algorithm and retrieve the best-ranked features.

### Print each cluster

To get an idea of the clusters we are working with, we can print some of their members. The option selected here is to print the most representative members of each cluster (those that appear the most times in the dataset).

In [7]:
def print_clusters(cluster2words, word_counts, word2idx, topn=5, print_cluster_words=True): 
    """Print the most frequent words of every cluster plus size statistics.

    cluster2words is indexed by the integers 0..len(cluster2words)-1 and maps
    each cluster to its words. word_counts holds the occurrence count of each
    vocabulary word and word2idx maps a word to its position in word_counts.
    When print_cluster_words is true, the topn most frequent words of each
    cluster are printed first; the mean, minimum (with the number of clusters
    of that size) and maximum cluster sizes are always printed.
    """
    n_clusters = len(cluster2words)
    sizes = [len(cluster2words[cluster]) for cluster in range(n_clusters)]

    if print_cluster_words:
        for cluster in range(n_clusters):
            frequencies = Counter()
            for member in cluster2words[cluster]:
                frequencies[member] = word_counts[word2idx[member]]
            print(frequencies.most_common(topn))

    print("Clusters mean length: %d" % (np.mean(sizes)))
    smallest = np.min(sizes)
    n_smallest = sum(1 for size in sizes if size == smallest)
    print("Clusters min length: %d (%d clusters)" % (smallest, n_smallest))
    print("Clusters max length: %d" % (np.max(sizes)))
    

# print_clusters(cluster2words, words_count, word2idx)

### Representing the documents using the clusters

Using the clusters from the k-means model, build a bag of words (bow) for each document. This bag of words can be normalized using the frequency of each word with ```useFrequency=True```. 

How? 
* For each document
    * For each word
        * Obtain the w2v vector for that word
        * Obtain the cluster for that vector
        * Add 1 to that cluster in the document bow

To improve the performance the cluster for each word is saved in a dictionary. This way for each word we first check that dictionary instead of first the w2v model and then the classifier.


In [8]:
def clusters_bow(docs, n_clusters, word2cluster, tfidf_reference=None, useFrequency=True, normalize=True, verbose=False):
    """Build a sparse bag-of-clusters matrix with one row per document.

    Parameters
    ----------
    docs : sequence of dict
        Each document must carry a "counter" mapping word -> occurrence count.
    n_clusters : int
        Number of columns of the resulting matrix.
    word2cluster : dict
        Maps a word to its cluster index. Words that are missing, or whose
        cluster index falls outside ``n_clusters``, are skipped.
    tfidf_reference : scipy sparse matrix, optional
        Matrix whose non-zero pattern is used for the document frequencies
        (and returned, binarized, in the ``normalize`` mode). Defaults to the
        matrix built from ``docs``.
    useFrequency : bool
        If true, return counts divided by the row maximum and multiplied by
        ``log(n_rows / (document_frequency + 1))``.
    normalize : bool
        Only read when ``useFrequency`` is false: if true, return the binary
        presence matrix, otherwise the raw counts (which requires
        ``tfidf_reference`` to be None).
    verbose : bool
        If true, print up to the first 10 rows of the result.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (len(docs), n_clusters), except in the
    binary mode with a ``tfidf_reference``, where the binarized copy of that
    reference is returned.
    """
    data = []
    indices = []
    indptr = [0]

    for doc in docs:
        doc_clusters = np.zeros((n_clusters,))
        for (word, count) in doc["counter"].items():
            try:
                doc_clusters[word2cluster[word]] += count
            except (KeyError, IndexError):
                # Best effort: words without a (valid) cluster do not
                # contribute. Anything else is a real error and propagates.
                continue
        active = np.flatnonzero(doc_clusters)
        indices.extend(active.tolist())
        data.extend(doc_clusters[active].tolist())
        indptr.append(len(indices))

    bo_clusters = csr_matrix((data, indices, indptr), dtype=float, shape=(len(docs), n_clusters))

    # Binary presence matrix, used for document frequencies and as the result
    # of the "normalize" mode.
    reference = bo_clusters if tfidf_reference is None else tfidf_reference
    normalized = reference.copy()
    normalized.data[:] = 1

    if useFrequency:
        tf = bo_clusters
        row_max = tf.max(axis=1).toarray().ravel()
        assert len(row_max) == len(tf.indptr) - 1
        for row in range(tf.shape[0]):
            start, end = tf.indptr[row], tf.indptr[row + 1]
            if end > start:  # empty rows have nothing to scale
                tf.data[start:end] /= row_max[row]
        # +1 keeps the ratio finite for clusters that never occur.
        count = np.asarray(normalized.sum(axis=0)).ravel() + 1
        idf = np.log(normalized.shape[0] / count)
        tf.data *= idf[tf.indices]
        boc = tf
    elif normalize:
        boc = normalized
    else:
        # Raw counts cannot honour an external reference matrix.
        assert tfidf_reference is None, "tfidf_reference is not used when returning raw counts"
        boc = bo_clusters

    if verbose:
        for i in range(min(10, boc.shape[0])):
            row = boc[i].toarray().ravel()
            print('Document %d: %s (sum = %.2f)' % (i + 1, np.array2string(row), row.sum()))

    return boc

# bows = clusters_bow(docs[:10], kmeans, word2cluster, useFrequency=False)

### How good is the cluster for this classification task?

Are words from this cluster more common in documents of this class or in documents of other classes?

We can measure it by computing the value $p_i$ for each cluster $i$. Since this is a binary classification task, documents that belong to the topic have label 1 and all other documents have label 0.
 

$$ p_i = \frac{\sum_{j \,:\, label(j) = 1} tf(j, i)}{\sum_{j} tf(j, i)} $$

where $j$ ranges over the documents and $tf(j, i)$ is the value obtained for document $j$ and cluster $i$ in the bag-of-clusters representation. Using that value we can measure how good each cluster is with:

$$ Uncertainty(cluster_i) = -p_i * log_2(p_i) - (1-p_i) *log_2(1-p_i)$$



In [9]:
def clusters_uncertainty(bows, labels):
    """Return the binary entropy of the label distribution of every column.

    bows is a CSR matrix (rows are documents, columns are clusters) and labels
    holds one truthy/falsy value per row. For each column, p is the share of
    the column total contributed by rows with a truthy label; the result is
    -p*log2(p) - (1-p)*log2(1-p) as a 1-D array with one entry per column.
    Small constants keep the division and the logarithms finite. bows itself
    is left untouched.
    """
    positives_only = bows.copy()
    column_total = positives_only.sum(axis=0)

    # Blank out the stored entries of every row whose label is falsy so that
    # only positive documents remain in the second column sum.
    for row, label in enumerate(labels):
        if label:
            continue
        start, stop = bows.indptr[row], bows.indptr[row + 1]
        positives_only.data[start:stop] = 0
    column_positive = positives_only.sum(axis=0)

    ratio = column_positive / (column_total + 0.00000001)
    p = ratio.A1

    # The offsets inside the logarithms avoid nan from log2(0).
    positive_term = -p * np.log2(p + 0.00000001)
    negative_term = (1 - p) * np.log2(1.00000001 - p)
    return positive_term - negative_term

# uncertainty = clusters_uncertainty(bows, y_train[:10])
# print(uncertainty)

### Fit a classifier

Use the representation of the documents to fit a classifier.

This classifier must return a weight for each feature (each cluster in this case) so we can decide which are the important clusters and which are not relevant

#### SVM classifier

Linear classifier, otherwise we don't have meaningful weights for each cluster

In [10]:
def fit_svm(bows, labels):
    """Fit a linear SVM whose regularization strength depends on the feature count.

    The C value is looked up in a table keyed by number of features, using the
    key closest to ``bows.shape[1]`` (ties go to the smaller key; feature
    counts below the first key or above the last one use that end of the
    table).

    Parameters
    ----------
    bows : array-like or sparse matrix of shape (n_documents, n_features)
    labels : sequence with one class label per row of ``bows``

    Returns
    -------
    The fitted ``LinearSVC``; its ``coef_`` holds one weight per feature.
    """
    # Imported here because a bare `import sklearn` does not guarantee that
    # the `svm` submodule is loaded as an attribute of the package.
    from sklearn.svm import LinearSVC

    features2c = {50: 8, 100: 8, 150: 8, 200: 8, 250: 1, 300: 0.5, 350: 0.25, 400: 0.25,
                  500: 0.25, 750: 0.25, 1000: 0.03, 1500: 0.03, 2000: 0.015625, 5000: 0.0078125, 10000: 0.0078125}
    n_features = bows.shape[1]

    # Closest table key; the second tuple element breaks ties towards the
    # smaller key.
    nearest_threshold = min(
        features2c,
        key=lambda threshold: (abs(threshold - n_features), threshold),
    )
    selected_c = features2c[nearest_threshold]

    clf = LinearSVC(dual=False, C=selected_c)
    clf.fit(bows, labels)

    return clf

# svm_classifier = fit_svm(bows, y_train[:10])

#### Lasso classifier

We build a Lasso model using the sklearn functions. Lasso is configured to only use positive coefficients (because it is easier to visualize them).

If we do not have an alpha value the function uses cross validation to obtain it.

In [11]:
def fit_lasso(bows, labels, alpha=None, verbose=False):
    """Fit a Lasso regression restricted to non-negative coefficients.

    Parameters
    ----------
    bows : feature matrix, one row per document
    labels : target value for each row of ``bows``
    alpha : float, optional
        Regularization strength. When omitted (or falsy), ``LassoCV`` picks it
        by cross-validation.
    verbose : bool
        If true, print the fitted coefficients.

    Returns
    -------
    (clf, alpha) : the fitted model and the alpha it was trained with, i.e.
        the value passed in, or the one selected by cross-validation.
    """
    if alpha:
        clf = linear_model.Lasso(alpha=alpha, positive=True)
    else:
        clf = linear_model.LassoCV(positive=True)
    clf.fit(bows, labels)

    if verbose:
        print('Lasso coefficients: %s' % (np.array2string(clf.coef_, suppress_small=True)))

    # Only LassoCV exposes the selected value as `alpha_`; a plain Lasso was
    # fitted with the caller's alpha. (The two branches used to be swapped.)
    if alpha:
        return clf, alpha
    return clf, clf.alpha_

# lasso, alpha = fit_lasso(bows_train, y_train)
# lasso.coef_

### Analysis of the clusters

#### Keep, discard, or split each cluster?

We can study what to do with each cluster using the previous uncertainty and the weight given by the classifier.

For a given cluster, if the weight assigned by the classifier is low, the cluster is not useful, at least for this classification problem. However, the cluster may be bad for the classification task because it mixes several semantic families, some in favor of the label and some against it. We can check this using the uncertainty value. 

If the cluster has a low weight and low uncertainty we can deactivate it, removing the words it contains from the complete set of words. 
If the cluster has a low weight and high uncertainty we can split it and hopefully the new 2 clusters will have less uncertainty. 

Currently, to consider low weight the value must be lower than 0.4 times the highest weight. 

We assume low uncertainty for values lower than 0.7219 (meaning that p is lower than 0.2 or higher than 0.8) and high uncertainty for values higher than 0.8813 (p between 0.3 and 0.7)


In [12]:
def study_clusters(cluster_weights, uncertainty, cluster2words, 
                   low_uncertainty_p=0.5, high_uncertainty_p=0.5, cluster_weights_p=0.5):
    """Decide for every cluster whether to keep, split or deactivate it.

    The three ``*_p`` arguments are fractions of the number of clusters; each
    one selects a threshold as the value found at that position of the sorted
    uncertainties (or of the sorted squared weights).

    A cluster whose squared weight is at most the weight threshold is
    * deactivated when its uncertainty is below the low-uncertainty threshold,
    * split when its uncertainty is above the high-uncertainty threshold.
    Every other cluster is kept.

    Parameters
    ----------
    cluster_weights : sequence of numbers, one per cluster
    uncertainty : sequence of numbers, one per cluster
    cluster2words : unused, accepted for interface compatibility
    low_uncertainty_p, high_uncertainty_p, cluster_weights_p : float

    Returns
    -------
    (keep, split, deactivate) : three lists of booleans, one entry per cluster.
    """
    assert len(cluster_weights) == len(uncertainty)

    n_clusters = len(uncertainty)
    if n_clusters == 0:
        return [], [], []

    def value_at_fraction(sorted_values, fraction):
        # Clamp the position to the valid range: without this a small fraction
        # produced index -1 (silently picking the largest value) and a
        # fraction above 1 raised IndexError.
        position = int(fraction * len(sorted_values)) - 1
        position = min(max(position, 0), len(sorted_values) - 1)
        return sorted_values[position]

    uncertainty = np.asarray(uncertainty, dtype=float)
    sorted_uncertainty = np.sort(uncertainty)
    low_uncertainty_threshold = value_at_fraction(sorted_uncertainty, low_uncertainty_p)
    high_uncertainty_threshold = value_at_fraction(sorted_uncertainty, high_uncertainty_p)

    # Squaring ranks the weights by magnitude regardless of sign.
    squared_weights = np.square(np.asarray(cluster_weights, dtype=float))
    bad_cluster_weight = value_at_fraction(np.sort(squared_weights), cluster_weights_p)

    low_weight = squared_weights <= bad_cluster_weight
    deactivate = [bool(flag) for flag in low_weight & (uncertainty < low_uncertainty_threshold)]
    split = [bool(flag) for flag in low_weight & (uncertainty > high_uncertainty_threshold)]
    keep = [not s and not d for s, d in zip(split, deactivate)]

    return keep, split, deactivate

# keep, split, deactivate = study_clusters(svm_classifier.coef_[0], uncertainty, 0.5, 0.5)

# print('Keeping %d clusters' % (sum(keep)))
# print('Spliting %d clusters' % (sum(split)))
# print('Deactivating %d clusters' % (sum(deactivate)))

### Build new clusters

Once we know which clusters to keep and which to split, we can build the new ones. 

To split one cluster we need to select all the points belonging to that cluster and use a 2-means clustering algorithm.

In [13]:
def split_clusters(words_embedding, word2idx, word2cluster, cluster2words, keep, split):    
    """Renumber the surviving clusters and split the ones flagged for it.

    Clusters are visited in order and receive consecutive new indices:
    * a kept cluster gets one new index,
    * a cluster flagged for splitting is divided in two with a 2-means model
      fitted on the embeddings of its words and gets two new indices; with
      fewer than two words it cannot be divided and keeps a single index,
    * a cluster that is neither kept nor split is dropped and its words are
      removed from ``word2cluster``.

    Parameters
    ----------
    words_embedding : array indexed by vocabulary index, one row per word
    word2idx : dict mapping word -> row of ``words_embedding``
    word2cluster : dict mapping word -> cluster index; updated IN PLACE
    cluster2words : sequence mapping old cluster index -> list of words
    keep, split : sequences of booleans, one entry per old cluster

    Returns
    -------
    (word2cluster, n_clusters) : the updated mapping (same object) and the
        number of new cluster indices handed out.
    """
    # Drop the words of discarded clusters first. Leaving them behind would
    # keep stale old indices in the mapping, which then collide with the new
    # numbering.
    for idx in range(len(keep)):
        if not keep[idx] and not split[idx]:
            for word in cluster2words[idx]:
                word2cluster.pop(word, None)

    new_idx = 0
    for idx in range(len(keep)):
        if keep[idx]:
            for word in cluster2words[idx]:
                word2cluster[word] = new_idx
            new_idx += 1
        if split[idx]:
            cluster_words = cluster2words[idx]
            if len(cluster_words) < 2:
                # Nothing to divide: treat it as a kept cluster.
                for word in cluster_words:
                    word2cluster[word] = new_idx
                new_idx += 1
            else:
                embeddings_idx = [word2idx[word] for word in cluster_words]
                cluster_embeddings = words_embedding[embeddings_idx]
                kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, compute_labels=True)
                kmeans.fit(cluster_embeddings)
                # One batched prediction instead of one call per word.
                halves = kmeans.predict(cluster_embeddings)
                for word, half in zip(cluster_words, halves):
                    word2cluster[word] = new_idx + int(half)
                new_idx += 2

    return word2cluster, new_idx
        
def deactivate_clusters(word2idx, cluster2words, words_list, words_count, words_embedding, deactivate, split, keep):
    """Remove the words of every deactivated cluster from the vocabulary.

    Parameters
    ----------
    word2idx : dict mapping word -> position in the three vocabulary structures
    cluster2words : sequence mapping cluster index -> list of words
    words_list, words_count : lists aligned by vocabulary position
    words_embedding : array with one row per vocabulary position
    deactivate : sequence of booleans, one per cluster; truthy means "remove"
    split, keep : unused, accepted for interface compatibility

    Returns
    -------
    (words_list, words_count, words_embedding) : filtered copies; the inputs
        are not modified.
    """
    # Collect the vocabulary positions to drop. A plain set is used because
    # concatenating with numpy turned the indices into floats as soon as one
    # deactivated cluster was empty, which then failed as a list index.
    dropped = {
        word2idx[word]
        for cluster, flagged in enumerate(deactivate) if flagged
        for word in cluster2words[cluster]
    }

    survives = [position not in dropped for position in range(len(words_list))]

    words_list = [word for word, ok in zip(words_list, survives) if ok]
    words_count = [count for count, ok in zip(words_count, survives) if ok]
    words_embedding = words_embedding[np.asarray(survives, dtype=bool)]

    return words_list, words_count, words_embedding

# words_list, words_count, words_embedding = update_dataset(word2idx, cluster2words, words_list, words_count, words_embedding, deactivate)

# word2idx = {word: idx for idx, word in enumerate(words_list)}
# idx2word = dict(enumerate(words_list))

# word2cluster = update_centers(words_embedding, word2idx, word2cluster, cluster2words, keep, split)    

# cluster2words = [[] for i in range(kmeans.n_clusters)]
# for (word, cluster) in word2cluster.items():
#     cluster2words[cluster].append(word)

### Validation

In [14]:
def validate(lasso, bows, y_true, threshold=None):
    """Score a fitted model on a labelled set.

    The model output is binarized before scoring: predictions strictly above
    ``threshold`` become 1, everything else 0.

    Parameters
    ----------
    lasso : fitted estimator exposing ``predict``
    bows : feature matrix passed to ``lasso.predict``
    y_true : true binary labels, one per row of ``bows``
    threshold : float, optional
        Decision threshold; defaults to the mean of the predictions.

    Returns
    -------
    (accuracy, kappa) : accuracy and Cohen's kappa of the binarized
        predictions against ``y_true``.
    """
    y_predicted = lasso.predict(bows)
    if threshold is None:
        threshold = np.mean(y_predicted)
    y_predicted = [1 if score > threshold else 0 for score in y_predicted]

    # Score against the labels that were passed in; the global `y_val` was
    # used here before, which ignored the `y_true` argument.
    accuracy = accuracy_score(y_true, y_predicted)
    kappa = cohen_kappa_score(y_true, y_predicted)

    return accuracy, kappa

## Play time

Start by saving the first kmeans (so we can use it multiple times) and printing the first set of clusters. 

In [15]:
# Independent snapshots of the current lookup tables, so later cells can
# restore them after modifying the live ones.
# NOTE(review): in a top-to-bottom run word2cluster and cluster2words still
# appear to be the empty containers from the vocabulary cell here - confirm.
(original_word2cluster,
 original_cluster2words,
 original_word2idx,
 original_idx2word) = map(deepcopy, (word2cluster, cluster2words, word2idx, idx2word))

## Run

In [16]:
# Independent snapshots of the vocabulary (words, counts, embedding matrix)
# that the run cell restores before every experiment.
(original_words_list,
 original_words_count,
 original_words_embedding) = map(deepcopy, (words_list, words_count, words_embedding))

In [18]:
# Start every run from the untouched vocabulary snapshot.
words_list = deepcopy(original_words_list)
words_count = deepcopy(original_words_count)
words_embedding = deepcopy(original_words_embedding)

# The chi2 ranking depends only on `matrix` and `labels`, which do not change
# inside the loop, so it is computed once instead of once per cluster count.
scores, _ = chi2(matrix, labels)
ranked_idx = np.argsort(scores, kind="mergesort")

# Cluster-analysis settings (not read anywhere in this cell).
cluster_weights_p = 8
low_uncertainty_p = 4
high_uncertainty_p = 6

epochs = 1
patience = 5

columns = [
    'Epoch',
    'Clusters',
    'Min Length',
    'P25 Length',
    'Median Length',
    'P75 Length',
    'Max Length',
    'Accuracy',
    'Kappa'
]

for n_clusters in range(50, 250, 10):
    # Seed k-means with the embeddings of the n_clusters best-ranked words.
    sorted_idx = ranked_idx[-n_clusters:]
    words = [words_list[idx] for idx in sorted_idx]
    initial_centers = words_embedding[sorted_idx]

    print(words)

    kmeans = MiniBatchKMeans(n_clusters=n_clusters,
                             init=initial_centers,
                             random_state=0,
                             compute_labels=True)
    kmeans.fit(words_embedding)

    # One batched prediction for the whole vocabulary instead of one
    # predict() call per word.
    cluster_ids = kmeans.predict(words_embedding)
    word2cluster = dict(zip(words_list, cluster_ids))

    cluster2words = [[] for _ in range(kmeans.n_clusters)]
    for (word, cluster) in word2cluster.items():
        cluster2words[cluster].append(word)

    tries = 0
    best_accuracy = 0

    # One results table per cluster count.
    history = []

    for i in range(epochs):
        print('Epoch %d of %d' % (i + 1, epochs))
        print('Number of clusters: %d' % (n_clusters))

        clusters_len = [len(cluster_words) for cluster_words in cluster2words]
        [pmin, p25, p50, p75, pmax] = np.percentile(clusters_len, [0, 25, 50, 75, 100])

        bows_train = clusters_bow(x_train, n_clusters, word2cluster, useFrequency=False)
        svm = fit_svm(bows_train, y_train)

        # Per-cluster diagnostics; left in the namespace for inspection.
        weights, _ = chi2(bows_train, y_train)
        uncertainty = clusters_uncertainty(bows_train, y_train)

        bows_val = clusters_bow(x_val, n_clusters, word2cluster, useFrequency=False)
        accuracy, kappa = validate(svm, bows_val, y_val)

        print('Accuracy: %.3f' % (accuracy))

        history.append([
            i,
            n_clusters,
            pmin,
            p25,
            p50,
            p75,
            pmax,
            accuracy,
            kappa
        ])

    df = DataFrame(history, columns=columns).set_index('Epoch')
    display(df)

['after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'opposition', 'out', 'officials', 'them', 'party', 'been', 'when', 'Sunday', 'share', 'per', 'Prime', 'has', 'police', 'political', 'Net', 'former', 'market', 'shares', 'leader', 'told', 'have', 'him', 'not', 'government', 'people', 'he', 'Minister', 'their', 'President', 'who', 'his']




Epoch 1 of 1
Number of clusters: 50
Accuracy: 0.821


Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,50,1.0,10.25,565.5,3312.25,13251.0,0.82055,0.641105


['called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'opposition', 'out', 'officials', 'them', 'party', 'been', 'when', 'Sunday', 'share', 'per', 'Prime', 'has', 'police', 'political', 'Net', 'former', 'market', 'shares', 'leader', 'told', 'have', 'him', 'not', 'government', 'people', 'he', 'Minister', 'their', 'President', 'who', 'his']
Epoch 1 of 1
Number of clusters: 60
Accuracy: 0.852


Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,60,1.0,123.25,1232.5,2211.25,8493.0,0.8524,0.70482


['prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'opposition', 'out', 'officials', 'them', 'party', 'been', 'when', 'Sunday', 'share', 'per', 'Prime', 'has', 'police', 'political', 'Net', 'former', 'market', 'shares', 'leader', 'told', 'have', 'him', 'not', 'government', 'people', 'he', 'Minister', 'their', 'President', 'who', 'his']
Epoch 1 of 1
Number of clusters: 70
Accuracy: 0.847


Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,70,1.0,8.25,892.0,2162.5,8142.0,0.84705,0.694102


['United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but', 'prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'opposition', 'out', 'officials', 'them', 'party', 'been', 'when', 'Sunday', 'share', 'per', 'Prime', 'has', 'police', 'political', 'Net', 'former', 'market', 'shares', 'leader', 'told', 'have', 'him', 'not', 'government', 'people', 'he', 'Minister', 'their', 'President', 'who', 'his']
Epoch 1 of 1
Number of clusters: 80
Accuracy: 0.851


Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,80,1.0,9.75,919.0,1948.0,8266.0,0.8512,0.702426


['law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union', 'vouch', 'United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but', 'prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'opposition', 'out', 'officials', 'them', 'party', 'been', 'when', 'Sunday', 'share', 'per', 'Prime', 'has', 'police', 'political', 'Net', 'former', 'market', 'shares', 'leader', 'told', 'have', 'him', 'not', 'government', 'people', 'he', 'Minister', 'their', 'President', 'who', 'his']
Epoch 1 of 1
Number of clusters: 90
Accuracy: 0.881


Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,90,1.0,19.5,846.5,1538.75,5720.0,0.8813,0.762617


['stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town', 'troops', 'law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union', 'vouch', 'United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but', 'prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'opposition', 'out', 'officials', 'them', 'party', 'been', 'when', 'Sunday', 'share', 'per', 'Prime', 'has', 'police', 'political', 'Net', 'former', 'market', 'shares', 'leader', 'told', 'have', 'him', 'not', 'government', 'people', 'he', 'Minister', 'their', 'President', 'who', 'his']
Epoch 1 of 1
Number of clusters: 100
Accuracy: 0.888


Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,100,1.0,10.5,802.0,1558.5,6420.0,0.88815,0.776303


['But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', 'her', 'since', 'stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town', 'troops', 'law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union', 'vouch', 'United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but', 'prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'opposition', 'out', 'officials', 'them', 'party', 'been', 'when', 'Sunday', 'share', 'per', 'Prime', 'has', 'police', 'political', 'Net', 'former', 'market', 'shares', 'leader', 'told', 'have', 'him', 'not', 'government', 'people', 'he', 'Minister', 'their', 'President', 'who', 'hi

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,110,1.0,10.0,632.0,1579.0,6205.0,0.88315,0.766306


['as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work', 'Democratic', 'would', 'But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', 'her', 'since', 'stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town', 'troops', 'law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union', 'vouch', 'United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but', 'prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'opposition', 'out', 'officials', 'them', 'party', 'been', 'when', 'Sunday', 'share', 'per', 'Prime', 'has', 'police', 'political', 'Net', 'former', 'market', 'shares', 'leader', 'told', 'have',

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,120,1.0,8.5,589.0,1282.25,7849.0,0.88185,0.763711


['first', 'team', 'parties', 'cents', 'percent', 'found', 'with', 'yen', 'newsroom', 'left', 'as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work', 'Democratic', 'would', 'But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', 'her', 'since', 'stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town', 'troops', 'law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union', 'vouch', 'United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but', 'prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'opposition', 'out', 'officials', 'them', 'party', 'been', 'when', 'Sunday', 'share', 'per', 'Prime', 

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,130,1.0,14.75,516.5,1206.5,5761.0,0.89095,0.781921


['workers', 'stocks', 'traded', 'force', 'time', 'Sales', 'national', 'won', 'under', 'European', 'first', 'team', 'parties', 'cents', 'percent', 'found', 'with', 'yen', 'newsroom', 'left', 'as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work', 'Democratic', 'would', 'But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', 'her', 'since', 'stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town', 'troops', 'law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union', 'vouch', 'United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but', 'prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'oppo

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,140,1.0,8.75,491.5,1135.75,7126.0,0.9027,0.805412


['must', 'union', 'Newsroom', 'took', 'men', 'could', 'vote', 'trader', 'which', 'profit', 'workers', 'stocks', 'traded', 'force', 'time', 'Sales', 'national', 'won', 'under', 'European', 'first', 'team', 'parties', 'cents', 'percent', 'found', 'with', 'yen', 'newsroom', 'left', 'as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work', 'Democratic', 'would', 'But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', 'her', 'since', 'stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town', 'troops', 'law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union', 'vouch', 'United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but', 'prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', '

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,150,1.0,5.0,444.0,933.0,7136.0,0.8996,0.799208


['last', 'Cup', 'fighting', 'night', 'death', 'this', 'net', 'any', 'man', 'sales', 'must', 'union', 'Newsroom', 'took', 'men', 'could', 'vote', 'trader', 'which', 'profit', 'workers', 'stocks', 'traded', 'force', 'time', 'Sales', 'national', 'won', 'under', 'European', 'first', 'team', 'parties', 'cents', 'percent', 'found', 'with', 'yen', 'newsroom', 'left', 'as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work', 'Democratic', 'would', 'But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', 'her', 'since', 'stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town', 'troops', 'law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union', 'vouch', 'United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but', 'prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 'over', 'traders', 'war', 'talks', 'court', 'says', 'Inc', 'parliament', 'security', 'after', 'h

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,160,1.0,3.0,429.0,882.25,8664.0,0.898,0.796017


['countries', 'take', 'way', 'Corp', 'volume', 'Police', 'without', 'member', 'these', 'attack', 'last', 'Cup', 'fighting', 'night', 'death', 'this', 'net', 'any', 'man', 'sales', 'must', 'union', 'Newsroom', 'took', 'men', 'could', 'vote', 'trader', 'which', 'profit', 'workers', 'stocks', 'traded', 'force', 'time', 'Sales', 'national', 'won', 'under', 'European', 'first', 'team', 'parties', 'cents', 'percent', 'found', 'with', 'yen', 'newsroom', 'left', 'as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work', 'Democratic', 'would', 'But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', 'her', 'since', 'stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town', 'troops', 'law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union', 'vouch', 'United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but', 'prices', 'forces', 'elections', 'by', 'if', 'all', 'into', 'NOTE', 'years', 'no', 'called', 

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,170,1.0,9.25,393.0,864.0,7196.0,0.9013,0.802617


['other', 'office', 'rule', 'violence', 'Shares', 'three', 'meeting', 'do', 'case', 'victory', 'countries', 'take', 'way', 'Corp', 'volume', 'Police', 'without', 'member', 'these', 'attack', 'last', 'Cup', 'fighting', 'night', 'death', 'this', 'net', 'any', 'man', 'sales', 'must', 'union', 'Newsroom', 'took', 'men', 'could', 'vote', 'trader', 'which', 'profit', 'workers', 'stocks', 'traded', 'force', 'time', 'Sales', 'national', 'won', 'under', 'European', 'first', 'team', 'parties', 'cents', 'percent', 'found', 'with', 'yen', 'newsroom', 'left', 'as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work', 'Democratic', 'would', 'But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', 'her', 'since', 'stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town', 'troops', 'law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union', 'vouch', 'United', 'rose', 'I', 'accuracy', 'say', 'are', 'does', 'election', 'Ltd', 'but',

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,180,0.0,6.0,383.0,743.75,8495.0,0.9007,0.801417


['earnings', 'EU', 'being', 'there', 'can', 'Foreign', 'for', 'coalition', 'made', 'decision', 'other', 'office', 'rule', 'violence', 'Shares', 'three', 'meeting', 'do', 'case', 'victory', 'countries', 'take', 'way', 'Corp', 'volume', 'Police', 'without', 'member', 'these', 'attack', 'last', 'Cup', 'fighting', 'night', 'death', 'this', 'net', 'any', 'man', 'sales', 'must', 'union', 'Newsroom', 'took', 'men', 'could', 'vote', 'trader', 'which', 'profit', 'workers', 'stocks', 'traded', 'force', 'time', 'Sales', 'national', 'won', 'under', 'European', 'first', 'team', 'parties', 'cents', 'percent', 'found', 'with', 'yen', 'newsroom', 'left', 'as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work', 'Democratic', 'would', 'But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', 'her', 'since', 'stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town', 'troops', 'law', 'accused', 'index', 'press', 'spokesman', 'verified', 'saying', 'home', 'Union

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,190,1.0,66.75,393.5,755.75,7107.0,0.9059,0.811816


['want', 'rebels', 'many', 'power', 'ministers', 'miles', 'data', 'million', 'higher', 'Clinton', 'earnings', 'EU', 'being', 'there', 'can', 'Foreign', 'for', 'coalition', 'made', 'decision', 'other', 'office', 'rule', 'violence', 'Shares', 'three', 'meeting', 'do', 'case', 'victory', 'countries', 'take', 'way', 'Corp', 'volume', 'Police', 'without', 'member', 'these', 'attack', 'last', 'Cup', 'fighting', 'night', 'death', 'this', 'net', 'any', 'man', 'sales', 'must', 'union', 'Newsroom', 'took', 'men', 'could', 'vote', 'trader', 'which', 'profit', 'workers', 'stocks', 'traded', 'force', 'time', 'Sales', 'national', 'won', 'under', 'European', 'first', 'team', 'parties', 'cents', 'percent', 'found', 'with', 'yen', 'newsroom', 'left', 'as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work', 'Democratic', 'would', 'But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', 'her', 'since', 'stock', 'A', 'futures', 'be', 'price', 'only', 'ruling', 'trading', 'town'

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,200,1.0,40.5,340.0,712.5,7542.0,0.90695,0.813913


['vs', 'Reuters', 'km', 'played', 'authorities', 'Court', 'Year', 'my', 'asked', 'during', 'want', 'rebels', 'many', 'power', 'ministers', 'miles', 'data', 'million', 'higher', 'Clinton', 'earnings', 'EU', 'being', 'there', 'can', 'Foreign', 'for', 'coalition', 'made', 'decision', 'other', 'office', 'rule', 'violence', 'Shares', 'three', 'meeting', 'do', 'case', 'victory', 'countries', 'take', 'way', 'Corp', 'volume', 'Police', 'without', 'member', 'these', 'attack', 'last', 'Cup', 'fighting', 'night', 'death', 'this', 'net', 'any', 'man', 'sales', 'must', 'union', 'Newsroom', 'took', 'men', 'could', 'vote', 'trader', 'which', 'profit', 'workers', 'stocks', 'traded', 'force', 'time', 'Sales', 'national', 'won', 'under', 'European', 'first', 'team', 'parties', 'cents', 'percent', 'found', 'with', 'yen', 'newsroom', 'left', 'as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work', 'Democratic', 'would', 'But', 'company', 'visit', 'where', 'campaign', 'army', 'reporters', 'win', '

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,210,1.0,48.0,335.0,651.75,6177.0,0.9094,0.818811


['They', 'parliamentary', 'radio', 'place', 'dealer', 'Desk', 'Council', 'human', 'Operating', 'prime', 'vs', 'Reuters', 'km', 'played', 'authorities', 'Court', 'Year', 'my', 'asked', 'during', 'want', 'rebels', 'many', 'power', 'ministers', 'miles', 'data', 'million', 'higher', 'Clinton', 'earnings', 'EU', 'being', 'there', 'can', 'Foreign', 'for', 'coalition', 'made', 'decision', 'other', 'office', 'rule', 'violence', 'Shares', 'three', 'meeting', 'do', 'case', 'victory', 'countries', 'take', 'way', 'Corp', 'volume', 'Police', 'without', 'member', 'these', 'attack', 'last', 'Cup', 'fighting', 'night', 'death', 'this', 'net', 'any', 'man', 'sales', 'must', 'union', 'Newsroom', 'took', 'men', 'could', 'vote', 'trader', 'which', 'profit', 'workers', 'stocks', 'traded', 'force', 'time', 'Sales', 'national', 'won', 'under', 'European', 'first', 'team', 'parties', 'cents', 'percent', 'found', 'with', 'yen', 'newsroom', 'left', 'as', 'died', 'between', 'Co', 'city', 'what', 'dealers', 'work

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,220,1.0,75.25,301.0,623.75,5828.0,0.9087,0.817413


['protest', 'hours', 'We', 'France', 'newspapers', 'because', 'children', 'His', 'Securities', 'presidential', 'They', 'parliamentary', 'radio', 'place', 'dealer', 'Desk', 'Council', 'human', 'Operating', 'prime', 'vs', 'Reuters', 'km', 'played', 'authorities', 'Court', 'Year', 'my', 'asked', 'during', 'want', 'rebels', 'many', 'power', 'ministers', 'miles', 'data', 'million', 'higher', 'Clinton', 'earnings', 'EU', 'being', 'there', 'can', 'Foreign', 'for', 'coalition', 'made', 'decision', 'other', 'office', 'rule', 'violence', 'Shares', 'three', 'meeting', 'do', 'case', 'victory', 'countries', 'take', 'way', 'Corp', 'volume', 'Police', 'without', 'member', 'these', 'attack', 'last', 'Cup', 'fighting', 'night', 'death', 'this', 'net', 'any', 'man', 'sales', 'must', 'union', 'Newsroom', 'took', 'men', 'could', 'vote', 'trader', 'which', 'profit', 'workers', 'stocks', 'traded', 'force', 'time', 'Sales', 'national', 'won', 'under', 'European', 'first', 'team', 'parties', 'cents', 'percent

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,230,1.0,82.25,313.0,602.75,7675.0,0.91355,0.827111


['buying', 'whether', 'on', 'taken', 'soldiers', 'arrested', 'Bill', 'match', 'armed', 'more', 'protest', 'hours', 'We', 'France', 'newspapers', 'because', 'children', 'His', 'Securities', 'presidential', 'They', 'parliamentary', 'radio', 'place', 'dealer', 'Desk', 'Council', 'human', 'Operating', 'prime', 'vs', 'Reuters', 'km', 'played', 'authorities', 'Court', 'Year', 'my', 'asked', 'during', 'want', 'rebels', 'many', 'power', 'ministers', 'miles', 'data', 'million', 'higher', 'Clinton', 'earnings', 'EU', 'being', 'there', 'can', 'Foreign', 'for', 'coalition', 'made', 'decision', 'other', 'office', 'rule', 'violence', 'Shares', 'three', 'meeting', 'do', 'case', 'victory', 'countries', 'take', 'way', 'Corp', 'volume', 'Police', 'without', 'member', 'these', 'attack', 'last', 'Cup', 'fighting', 'night', 'death', 'this', 'net', 'any', 'man', 'sales', 'must', 'union', 'Newsroom', 'took', 'men', 'could', 'vote', 'trader', 'which', 'profit', 'workers', 'stocks', 'traded', 'force', 'time', 

Unnamed: 0_level_0,Clusters,Min Length,P25 Length,Median Length,P75 Length,Max Length,Accuracy,Kappa
Epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,240,1.0,82.0,280.0,577.0,5620.0,0.9157,0.83141
