# Seleccionando palabras

In [1]:
import string
import random
import numpy as np
import reuters_reader
import pickle
from collections import Counter
from sklearn.externals import joblib
from gensim.models.keyedvectors import KeyedVectors
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from copy import deepcopy
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_selection import chi2
from pandas import DataFrame
import sklearn 
from scipy.sparse import csr_matrix
# from scipy.sparse import *

np.set_printoptions(precision=3)

### RCV1 Dataset

Use the function ```reuters_reader.reader(path)``` to retrieve the available documents from the rcv1 dataset stored in `path`. This function returns a generator (```reader```) which yields a single document each time we call ```next(reader)```. Each document is a dictionary with the followitn useful keys:
 - "title" is the title of the document
 - "text" is the body of the document
 - "bip:topics:1.0" is the list of topics
 
There are a total of 804420 available documents, although some may have no topic.

#### Building the dataset
We build a balanced dataset that contains ```n_docs```. To get a balanced dataset we iterate through the documents generator until we have ```n_docs / 2``` documents with the desired topic and the same amount without it. 

#### Get the labels
Select a topic we want to classify using the variable topic (there is a list of the topics https://gist.github.com/gavinmh/6253739 ). Then build the list of labels using a 1 for those documents with that topic and 0 otherwise

#### Training and validation set
Finally, we split the dataset using the ```train_split``` value. 

In [2]:
path = 'rcv1'
n_docs = 100000
train_split = 0.8
topic = 'GCAT'

docs = []
reader = reuters_reader.reader(path)

topic_true = 0
topic_false = 0

while len(docs) < n_docs:
    doc = next(reader)
    if doc['text'] == '':
        continue
    if topic in doc['bip:topics:1.0']:
        topic_true += 1
        if topic_true <= n_docs // 2:
            docs.append(doc)
    else:
        topic_false += 1
        if topic_false <= n_docs // 2:
            docs.append(doc)
     
random.shuffle(docs)

labels = np.zeros((n_docs), dtype=np.int16)
labels = [1 if topic in doc['bip:topics:1.0'] else 0 for doc in docs]

print('{} docs with topic {} (from {})'.format(np.sum(labels), topic, n_docs))

split_point = int(n_docs * train_split)
x_train, y_train = docs[:split_point], labels[:split_point]
x_val, y_val = docs[split_point:], labels[split_point:]

print('Training with {} docs'.format(len(x_train)))
print('Validating with {} docs'.format(len(x_val)))

19960917 / 59589newsML.xml failed to parse XML.
19970725 / 756041newsML.xml failed to parse XML.
50000 docs with topic GCAT (from 100000)
Training with 80000 docs
Validating with 20000 docs


### Word2vec model

We are loading the well known word2vec model from __[Google](https://code.google.com/archive/p/word2vec/)__ which is stored in the binary file `GoogleNews-vectors-negative300.bin`.

Load the word2vec model

In [3]:
w2v_name = 'GoogleNews-vectors-negative300.bin'
w2v = KeyedVectors.load_word2vec_format(w2v_name, binary=True)

### Get our vocabulary

Get all the the vectors from the word2vec for our vocabulary. Our vocabulary can include all the words used in the word2vec model or be limited to the words in our dataset.

We can change this behaviour with the flag ```dataset_vocabulary```. ```False``` will use all the words from the word2vec model and ```True``` will limit them to just the words that are in our dataset and in the model at the same time.

There is a ```count_threshold``` to remove those words appearing very few times because they are probably errors.

As we have to split each document in individual words, we already save this inside each document with the key "counter".

After this cell, ```X``` is a matrix including all the vectors we are going to use.

In [4]:
words_list = []
words_embedding = []
words_count = []
word2idx = {}
idx2word = {}
word2cluster = {}
cluster2words = {}

count_threshold = 5

for doc in docs:
    doc["counter"] = Counter()
    doc["word_count"] = 0
    words = doc["text"].split()
    words = [word.strip(string.punctuation) for word in words]
    for word in words:
        if word in w2v:
            doc["counter"][word] += 1
            doc["word_count"] += 1
    for word, count in doc["counter"].items():
        try:
            words_count[word2idx[word]] += count
        except:
            words_list.append(word)
            words_count.append(count)
            word2idx[word] = len(words_list) - 1
                
keep_it = [count > count_threshold for count in words_count] 
    
words_list = [word for idx, word in enumerate(words_list) if keep_it[idx]]
words_count = [count for idx, count in enumerate(words_count) if keep_it[idx]]
word2idx = {word: idx for idx, word in enumerate(words_list)}
    
words_embedding = np.zeros((len(words_list), w2v.vector_size), dtype=np.float32)
for idx, word in enumerate(words_list):
    words_embedding[idx, :] += w2v[word]
    
idx2word = dict(enumerate(words_list))

print("Vocabulary length: {}".format(len(words_list)))

for _ in range(100):
    idx = random.randint(0, len(words_list) - 1)
    word = words_list[idx]
    assert(word2idx[word] == idx)
    assert((w2v[word] == words_embedding[idx]).all())

Vocabulary length: 61651


### Bag of words

Represent each document using a bag-of-words model. This representation is done using term frequency–inverse document frequency (tf-idf) and stored in a sparse matrix. 

In [5]:
indptr = [0]
indices = []
data = []

for doc in docs:
    for word in doc["counter"]:
        if word in word2idx:
            indices.append(word2idx[word])
            data.append(1)
    indptr.append(len(indices))
    
matrix = csr_matrix((data, indices, indptr), dtype=float)

### Baseline

Using the bag-of-words representation we can already classify the documents, getting a good baseline to compare our algorithms to. 

Note: this classification takes quite some time (a couple of hours at least). That is the reason they are commented 

In [6]:
# split_point = int(n_docs * train_split)
# baseline_x_train, baseline_y_train = matrix[:split_point], labels[:split_point]
# baseline_x_val, baseline_y_val = matrix[split_point:], labels[split_point:]

# lasso = fit_lasso(baseline_x_train, baseline_y_train)
# validate(lasso, baseline_x_val, baseline_y_val)

### Initial Cluster

Train the first Kmeans cluster using the complete set of words in our dataset.

#### Initial centers

Supposedly, we can improve our results if the first time we build our clusters we know which are the good center candidates.

To select this initial centers we use a feature selection algorithm and retrieve the best ranked features

In [7]:
scores, _ = chi2(matrix, labels)
sorted_idx = np.argsort(scores, kind="mergesort")[-50:]
words = [words_list[idx] for idx in sorted_idx]
initial_centers = np.array([words_embedding[idx] for idx in sorted_idx])
print(words)

['after', 'had', 'members', 'peace', 'Saturday', 'an', 'country', 'stories', 'state', 'that', 'against', 'minister', 'leaders', 'killed', 'two', 'military', 'they', 'Party', 'He', 'opposition', 'out', 'officials', 'them', 'party', 'been', 'when', 'Sunday', 'share', 'per', 'Prime', 'has', 'police', 'political', 'Net', 'former', 'market', 'shares', 'leader', 'told', 'have', 'him', 'not', 'government', 'people', 'he', 'Minister', 'their', 'President', 'who', 'his']


In [8]:
n_clusters = 50

In [9]:
kmeans = MiniBatchKMeans(n_clusters=n_clusters, 
                         init=initial_centers, 
                         random_state=0, 
                         compute_labels=True)
kmeans.fit(words_embedding)

  """


MiniBatchKMeans(batch_size=100, compute_labels=True,
        init=array([[ 0.09375, -0.05005, ...,  0.06396, -0.02637],
       [-0.05811,  0.05811, ..., -0.02307, -0.04346],
       ...,
       [ 0.0791 ,  0.09668, ...,  0.07129, -0.05005],
       [ 0.31836,  0.17676, ...,  0.05249, -0.00233]], dtype=float32),
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=50,
        n_init=3, random_state=0, reassignment_ratio=0.01, tol=0.0,
        verbose=0)

In [10]:
word2cluster = {word: kmeans.predict([words_embedding[i]])[0] 
                 for i, word in enumerate(words_list)}

In [11]:
cluster2words = [[] for i in range(kmeans.n_clusters)]
for (word, cluster) in word2cluster.items():
    cluster2words[cluster].append(word)

### Print each cluster

To have an idea of the clusters we are workings with we can print some of their neighbords. The option selected here is to print the most representative neighbords (those that appear more times in the dataset)

In [12]:
def print_clusters(cluster2words, word_counts, word2idx, topn=5): 
    clusters_len = [len(cluster2words[i]) for i in range(len(cluster2words))]
    for cluster_index in range(len(cluster2words)):
        cluster_words = cluster2words[cluster_index]
        cluster_counter = Counter({w: word_counts[word2idx[w]] for w in cluster_words})
        print(cluster_counter.most_common(topn))
    
    print("Clusters mean length: %d" % (np.mean(clusters_len)))
    print("Clusters min length: %d" % (np.min(clusters_len)))
    print("Clusters max length: %d" % (np.max(clusters_len)))
    

# print_clusters(cluster2words, words_count, word2idx)

### Representing the documents using the clusters

Using the clusters from the kmeans classifier build a bow for each document. This bag of words can be normalized usign the frequency of each word with ```useFrequency=True```. 

How? 
* For each document
    * For each word
        * Obtain the w2v vector for that word
        * Obtain the cluster for that vector
        * Add 1 to that cluster in the document bow

To improve the performance the cluster for each word is saved in a dictionary. This way for each word we first check that dictionary instead of first the w2v model and then the classifier.


In [13]:
def clusters_bow(docs, n_clusters, word2cluster, tfidf_reference=None, useFrequency=True, normalize=True, verbose=False):
#     n_clusters = kmeans.n_clusters
    
    data = []
    indices = []
    indptr = [0]
  
    hashed_clusters = {}
    for doc in docs:
        doc_clusters = np.zeros((n_clusters,))
        for (word, count) in doc["counter"].items():
            try:
                cluster = word2cluster[word]
                doc_clusters[cluster] += count
            except:
                pass
#                 print("Skipping word: " + word)
        for i in range(n_clusters):
            if doc_clusters[i]:
                indices.append(i)
                data.append(doc_clusters[i])
        indptr.append(len(indices))

    bo_clusters = csr_matrix((data, indices, indptr), dtype=float, shape=(len(docs), n_clusters))
    
    if tfidf_reference is None:
        normalized = bo_clusters.copy()
    else:
        normalized = tfidf_reference.copy()
    normalized.data[:] = 1
            
    if useFrequency:
        max_doc = np.max(bo_clusters, axis=1).todense()
        tf = bo_clusters
        assert len(max_doc) == len(tf.indptr) - 1
        for i in range(1, len(tf.indptr)):
            tf.data[indptr[i-1]:indptr[i]] /= max_doc[i - 1, 0]
        count = np.sum(normalized, axis=0) + 1
        idf = np.log(normalized.shape[0] / count)
        boc = tf
        for i in range(len(tf.data)):
            boc.data[i] *= idf[0, tf.indices[i]]
    else:
        if normalize:
            boc = normalized
        else:
            assert tfidf is None
            boc = bo_clusters
            
    if verbose:
        for i in range(10):
            print('Document %d: %s (sum = %.2f)' % (i + 1, np.array2string(boc[i, :]), np.sum(boc[i, :])))
        
    return boc

# bows = clusters_bow(docs[:10], kmeans, word2cluster, useFrequency=False)

### How good is the cluster for this classification task?

Words from this cluster are more usual in documents of this class or in documents of other classes?

We can measure it obtaing the value $p_i$ for each cluster $i$. As we have a binary classification topic, documents belonging to the topic have a label 1 and 0 if not
 

$$ p_i = \frac{\sum_{j = doc}tf(j, i) \mid label(j) = 1}{\sum_{j = doc}tf(j, i)} $$

where $tf\_idf(j, i)$ is the value obtained for document $j$ and cluster $i$ in the bag-of-clusters representation. Using that value we can measure how good each cluster is with:

$$ Uncertainty(cluster_i) = -p_i * log_2(p_i) - (1-p_i) *log_2(1-p_i)$$



In [14]:
def clusters_uncertainty(bows, labels):
    bows_copy = bows.copy()
    total = bows_copy.sum(axis=0)
    
    negative_rows = [i for (i, l) in enumerate(labels) if not l]
    for row in negative_rows:
        bows_copy.data[bows.indptr[row]:bows.indptr[row + 1]] = 0
    possitive = bows_copy.sum(axis=0)
    p = possitive / (total + 0.00000001)
    p = p.A1
    # Add 0.000001 to avoid nan with log2(0)
    uncertainty = -p * np.log2(p + 0.00000001) - (1 - p) * np.log2(1.00000001 - p)

    return uncertainty

# uncertainty = clusters_uncertainty(bows, y_train[:10])
# print(uncertainty)

### Fit a classifier

Use the representation of the documents to fit a classifier.

This classifier must return a weight for each feature (each cluster in this case) so we can decide which are the important clusters and which are not relevant

#### SVM classifier

Linear classifier, otherwise we don't have meaningful weights for each cluster

In [15]:
def fit_svm(bows, labels):
    features2c = {50: 8, 100: 8, 150: 8, 200: 8, 250: 1, 300: 0.5, 350: 0.25, 400: 0.25, 
                    500: 0.25, 750: 0.25, 1000: 0.03, 1500: 0.03, 2000: 0.015625, 5000: 0.0078125, 10000: 0.0078125}
    n_features = bows.shape[1]

    features_thresholds = sorted(list(features2c.keys()))
    for i, val in enumerate(features_thresholds):
        if n_features < val:
            break

    if i == 0:
        selected_c = features2c[features_thresholds[0]]
    elif i == len(features_thresholds) - 1:
        selected_c = features2c[features_thresholds[len(features_thresholds) - 1]]
    else:
        left_threshold = features_thresholds[i - 1]
        right_threshold = features_thresholds[i]
        if n_features - left_threshold > right_threshold - n_features:
            selected_c = features2c[features_thresholds[i]]
        else:
            selected_c = features2c[features_thresholds[i - 1]]

    clf = sklearn.svm.LinearSVC(dual=False, C=selected_c)
    clf.fit(bows, labels)
        
    return clf

# svm_classifier = fit_svm(bows, y_train[:10])

#### Lasso classifier

We build a Lasso model using the sklearn functions. Lasso is configured to only use positive coefficients (because it is easir to visualize them).

If we do not have an alpha value the function uses cross validation to obtain it.

In [16]:
def fit_lasso(bows, labels, alpha=None, verbose=False):
    if alpha:
        clf = linear_model.Lasso(alpha=alpha, positive=True)
    else:
        clf = linear_model.LassoCV(positive=True)
    clf.fit(bows, labels)
    
    if verbose:
        print('Lasso coefficients: %s' % (np.array2string(clf.coef_, suppress_small=True)))
        
    if alpha:
        return clf, clf.alpha_
    else:
        return clf, alpha

# lasso, alpha = fit_lasso(bows_train, y_train)
# lasso.coef_

### Analysis of the clusters

#### Keep, discard, or split each cluster?

We can study what to do with each cluster using the previous uncertainty and the weight given by the classifier.

For a certain cluster, if the weight given by the classifier is low, the cluster is useless, at least for this classification problem. But, the cluster may be bad for the classification task because it contains a lot of semmantic families, some in favor some against the label. We can check this using the uncertainty value. 

If the cluster has a low weight and low uncertainty we can deactivate it, removing the words it contains from the complete set of words. 
If the cluster has a low weight and high uncertainty we can split it and hopefully the new 2 clusters will have less uncertainty. 

Currently, to consider low weight the value must be lower than 0.4 times the highest weight. 

We assume low uncertainty for values lower than 0.7219 (meaning that p is lower than 0.2 or higher than 0.8) and high uncertainty for values higher than 0.8813 (p between 0.3 and 0.7)


In [26]:
def study_clusters(cluster_weights, uncertainty, weights_p):
    assert len(cluster_weights) == len(uncertainty)
        
    clusters = range(len(cluster_weights))
        
    low_uncertainty_threshold = 0.722 # value for p = 0.8
    high_uncertainty_threshold = 0.722 # value for p = 0.3 or 0.7

    cluster_weights = np.abs(cluster_weights)
    deactivate_value = np.sort(cluster_weights)[int(weights_p * len(cluster_weights)) - 1]
#     deactivate_value = np.inf
    
    deactivate = [cluster_weights[c] <= deactivate_value 
                  and uncertainty[c] <= low_uncertainty_threshold 
                  for c in clusters]
    split = [cluster_weights[c] <= deactivate_value
             and uncertainty[c] >= high_uncertainty_threshold
             for c in clusters]
    keep = [not deactivate[c] and not split[c] for c in clusters]

    return keep, split, deactivate

# keep, split, deactivate = study_clusters(svm_classifier.coef_[0], uncertainty, 0.5, 0.5)

# print('Keeping %d clusters' % (sum(keep)))
# print('Spliting %d clusters' % (sum(split)))
# print('Deactivating %d clusters' % (sum(deactivate)))

### Build new clusters

Once we know which clusters to keep and which to split, we can build the new ones. 

To split one cluster we need to select all the points belonging to that cluster and use a 2-means clustering algorithm.

In [18]:
def split_clusters(words_embedding, word2idx, word2cluster, cluster2words, keep, split):    
    new_idx = 0
    for idx in range(len(keep)):
        if keep[idx]:
            for word in cluster2words[idx]:
                word2cluster[word] = new_idx
            new_idx += 1
        if split[idx]:
#             print('Splitting cluster %d' % (idx))
            # create kmeans with this data
            cluster_words = cluster2words[idx]
#             if len(cluster_words) < 20:
#                 print(cluster_words)
            if len(cluster_words) < 2:
                continue
            embeddings_idx = [word2idx[word] for word in cluster_words if word in word2idx]
            cluster_embeddings = words_embedding[embeddings_idx]
            kmeans = MiniBatchKMeans(n_clusters=2, random_state=0, compute_labels=True)
            kmeans.fit(cluster_embeddings)
            for word, embedding in zip(cluster_words, cluster_embeddings):
                word2cluster[word] = new_idx + kmeans.predict([embedding])[0] 
            new_idx += 2
            
    return word2cluster, new_idx
        
def deactivate_clusters(word2idx, cluster2words, words_list, words_count, words_embedding, deactivate, split, keep):
#     print('Deactivating %d clusters' % (sum(deactivate)))
    clusters_idx = [idx for idx in range(len(deactivate)) if deactivate[idx] or 
                   (split[idx] and len(cluster2words[idx]) < 2)]
    words_idx = []
    for idx in clusters_idx:
        words_idx.append([word2idx[word] for word in cluster2words[idx]])
    
#     print(words_idx)
    if words_idx: 
        words_idx = np.concatenate(words_idx)
#     print(words_idx)
    keep = [True for _ in range(len(words_list))]
    for idx in words_idx:
        keep[idx] = False
    
    words_list = [word for idx, word in enumerate(words_list) if keep[idx]]
    words_count = [count for idx, count in enumerate(words_count) if keep[idx]]
    words_embedding = words_embedding[keep]
    
    return words_list, words_count, words_embedding

# words_list, words_count, words_embedding = update_dataset(word2idx, cluster2words, words_list, words_count, words_embedding, deactivate)

# word2idx = {word: idx for idx, word in enumerate(words_list)}
# idx2word = dict(enumerate(words_list))

# word2cluster = update_centers(words_embedding, word2idx, word2cluster, cluster2words, keep, split)    

# cluster2words = [[] for i in range(kmeans.n_clusters)]
# for (word, cluster) in word2cluster.items():
#     cluster2words[cluster].append(word)

### Remove words from the dataset

If we have to deactivate some cluster, we need to remove the words it included from the original set of words

In [19]:
# def update_dataset(kmeans, X, vocab, deactivate):
#     clusters = [i for i in range(len(deactivate)) if deactivate[i]]
#     words = np.empty((0,), dtype=np.int)
#     for cluster_i in clusters:
#         words = np.concatenate((words, np.array(np.where(kmeans.labels_ == cluster_i)[0])))
#     mask = np.ones(X.shape[0], dtype=bool)
#     mask[words] = False
#     X = X[mask, :]
#     new_vocab = {w: c for i, (w, c) in enumerate(vocab.items()) if mask[i]}
#     return X, new_vocab

# X2, vocab2 = update_dataset(kmeans, X, vocab, deactivate)

### Update the classifier

We have to use the new clusters to classify the data in the following steps. One possible to solution to use these new clusters is to update the classifier centers and the relevant attributes. Then we can use the classifier's function ```predict``` as before. Moreover, to keep using this classifier to build the next cluster we also need to update the ```labels_``` and ```counts_``` parameters. 

In [20]:
# def update_kmeans(kmeans, new_centers, x):
#     kmeans.cluster_centers_ = new_centers
#     kmeans.n_clusters = len(new_centers)
#     kmeans.labels_, _ = kmeans._labels_inertia_minibatch(x)
#     kmeans.counts_ = np.zeros(kmeans.n_clusters, dtype=np.int32)
#     for i in range(kmeans.n_clusters):
#         kmeans.counts_[i] = np.sum(kmeans.labels_ == i)
#         print('Cluster %d contains %d elements' % (i, kmeans.counts_[i]))
    
#     return kmeans

# kmeans = update_kmeans(kmeans, new_centers, X)

### Validation

In [21]:
def validate(lasso, bows, y_true, threshold=None):
    y_predicted = lasso.predict(bows)
#     print("Predicted:")
#     print(y_predicted[:10])
#     print(y_predicted[-10:])
#     print("True:")
#     print(y_true[:10])
#     print(y_true[-10:])
    if threshold is None:
        threshold = np.mean(y_predicted)
    y_predicted = [1 if i > threshold else 0 for i in y_predicted]
    accuracy = accuracy_score(y_val, y_predicted)
    kappa = cohen_kappa_score(y_val, y_predicted)
    
    return accuracy, kappa

## Play time

Start by saving the first kmeans (so we can use it multiple times) and printing the first set of clusters. 

In [22]:
original_words_list = deepcopy(words_list)
original_words_count = deepcopy(words_count)
original_words_embedding = deepcopy(words_embedding)
original_word2cluster = deepcopy(word2cluster)
original_cluster2words = deepcopy(cluster2words)
original_word2idx = deepcopy(word2idx)
original_idx2word = deepcopy(idx2word)

## Run

In [23]:
low =  [0.469, 0.469, 0.469, 0.469, 0.722, 0.722, 0.722, 0.881, 0.881, 0.971]
high = [0.971, 0.881, 0.722, 0.469, 0.971, 0.881, 0.722, 0.971, 0.881, 0.971]

for value_index in range(len(low)):
    print('Low: %.3f High: %.3f\n' % (low[value_index], high[value_index]))
    epochs = 20

    patience = 3
    tries = 0
    best_accuracy = 0

    words_list = deepcopy(original_words_list)
    words_count = deepcopy(original_words_count)
    words_embedding = deepcopy(original_words_embedding)
    word2cluster = deepcopy(original_word2cluster)
    cluster2words = deepcopy(original_cluster2words)
    word2idx = deepcopy(original_word2idx)
    idx2word = deepcopy(original_idx2word)
    n_clusters = 50
    
    history = []
    
    for i in range(epochs):
        print('Epoch %d of %d' % (i + 1, epochs))
        print('Number of clusters: %d' % (n_clusters))
        bows_train = clusters_bow(x_train, n_clusters, word2cluster, useFrequency=False)
        svm = fit_svm(bows_train, y_train)

        uncertainty = clusters_uncertainty(bows_train, y_train)

        bows_val = clusters_bow(x_val, n_clusters, word2cluster, useFrequency=False)    
        accuracy, kappa = validate(svm, bows_val, y_val)

        print('Accuracy: %.3f' % (accuracy))
#         print('Kappa: %.3f' % (kappa))

        if accuracy > best_accuracy:
            best_accuracy = accuracy
#             pickle.dump([kmeans, svm], open("best.classifier.pckl", "wb"))
            tries = 0
        else:
            tries += 1
            if tries >= patience:
#                 [kmeans, svm] = pickle.load(open("best.classifier.pckl", "rb"))
                break      

        keep, split, deactivate = study_clusters(svm.coef_[0], uncertainty, low[value_index], high[value_index])

        keep_count = sum(keep)
        split_count = sum(split)
        deactivate_count = sum(deactivate)

#         print('Keeping %d clusters' % (keep_count))
#         print('Spliting %d clusters' % (split_count))
#         print('Deactivating %d clusters' % (deactivate_count))

        before = len(words_list)
#         print('Before %d' % (before))
        assert before == len(words_count)
        assert before == words_embedding.shape[0]
        
        words_list, words_count, words_embedding = deactivate_clusters(word2idx, cluster2words, words_list, words_count, words_embedding, deactivate, split, keep)
        
        word2idx = {word: idx for idx, word in enumerate(words_list)}
        idx2word = dict(enumerate(words_list))

        word2cluster, next_n_clusters = split_clusters(words_embedding, word2idx, word2cluster, cluster2words, keep, split)    

        assert next_n_clusters <= n_clusters - deactivate_count + split_count    
    
        cluster2words = [[] for i in range(next_n_clusters)]
        for word in words_list:
            cluster2words[word2cluster[word]].append(word)

        after = len(words_list)
#         print('After %d' % (after))
        assert after == len(words_count)
        assert after == words_embedding.shape[0]

        history.append([
            i, 
            n_clusters, 
            keep_count, 
            split_count,
            deactivate_count,
            before,
            after,
            accuracy,
            kappa
        ])

        n_clusters = next_n_clusters
#         kmeans = update_kmeans(kmeans, next_centers, X)

    columns = [
    'Epoch',
    'Clusters', 
    'Keep', 
    'Split',
    'Deactivate',
    'Words before',
    'Words after',
    'Accuracy',
    'Kappa'
    ]

    df = DataFrame(history, columns=columns)
    display(df)


Low: 0.469 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 44
Accuracy: 0.797
Epoch 3 of 20
Number of clusters: 51
Accuracy: 0.832
Epoch 4 of 20
Number of clusters: 59
Accuracy: 0.842
Epoch 5 of 20
Number of clusters: 68
Accuracy: 0.855
Epoch 6 of 20
Number of clusters: 80
Accuracy: 0.854
Epoch 7 of 20
Number of clusters: 97
Accuracy: 0.872
Epoch 8 of 20
Number of clusters: 113
Accuracy: 0.877
Epoch 9 of 20
Number of clusters: 136
Accuracy: 0.885
Epoch 10 of 20
Number of clusters: 168
Accuracy: 0.890
Epoch 11 of 20
Number of clusters: 208
Accuracy: 0.896
Epoch 12 of 20
Number of clusters: 266
Accuracy: 0.903
Epoch 13 of 20
Number of clusters: 334
Accuracy: 0.907
Epoch 14 of 20
Number of clusters: 406
Accuracy: 0.911
Epoch 15 of 20
Number of clusters: 478
Accuracy: 0.913
Epoch 16 of 20
Number of clusters: 543
Accuracy: 0.914
Epoch 17 of 20
Number of clusters: 615
Accuracy: 0.915
Epoch 18 of 20
Number of clusters: 678
Accuracy: 0.915
Ep

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,28,8,14,61651,61562,0.80775,0.615523
1,1,44,35,8,1,61562,61561,0.7975,0.595018
2,2,51,37,11,3,61561,61550,0.8321,0.664203
3,3,59,44,13,2,61550,61528,0.8418,0.6836
4,4,68,50,16,2,61528,61525,0.8549,0.709796
5,5,80,61,18,1,61525,61524,0.854,0.70799
6,6,97,77,19,1,61524,61516,0.87225,0.744491
7,7,113,86,26,1,61516,61514,0.87695,0.753894
8,8,136,96,37,3,61514,61050,0.8846,0.769193
9,9,168,120,45,3,61050,61043,0.88975,0.779493


Low: 0.469 High: 0.881

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 50
Accuracy: 0.798
Epoch 3 of 20
Number of clusters: 68
Accuracy: 0.834
Epoch 4 of 20
Number of clusters: 87
Accuracy: 0.845
Epoch 5 of 20
Number of clusters: 123
Accuracy: 0.866
Epoch 6 of 20
Number of clusters: 173
Accuracy: 0.877
Epoch 7 of 20
Number of clusters: 250
Accuracy: 0.898
Epoch 8 of 20
Number of clusters: 359
Accuracy: 0.904
Epoch 9 of 20
Number of clusters: 514
Accuracy: 0.908
Epoch 10 of 20
Number of clusters: 710
Accuracy: 0.914
Epoch 11 of 20
Number of clusters: 961
Accuracy: 0.922
Epoch 12 of 20
Number of clusters: 1200
Accuracy: 0.924
Epoch 13 of 20
Number of clusters: 1560
Accuracy: 0.929
Epoch 14 of 20
Number of clusters: 1906
Accuracy: 0.932
Epoch 15 of 20
Number of clusters: 2300
Accuracy: 0.933
Epoch 16 of 20
Number of clusters: 2710
Accuracy: 0.936
Epoch 17 of 20
Number of clusters: 3074
Accuracy: 0.936
Epoch 18 of 20
Number of clusters: 3448
Accuracy

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,16,20,14,61651,61559,0.80775,0.615523
1,1,50,20,27,3,61559,61553,0.79845,0.596919
2,2,68,29,36,3,61553,61524,0.83445,0.668898
3,3,87,35,48,4,61524,61340,0.8449,0.68979
4,4,123,61,58,4,61340,61334,0.86575,0.731492
5,5,173,74,89,10,61334,61009,0.8768,0.753592
6,6,250,105,133,12,61009,60917,0.89845,0.796894
7,7,359,158,186,15,60917,60834,0.9042,0.808396
8,8,514,238,246,30,60834,60302,0.9077,0.815395
9,9,710,323,346,41,60302,60145,0.91435,0.828696


Low: 0.469 High: 0.722

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 55
Accuracy: 0.808
Epoch 3 of 20
Number of clusters: 79
Accuracy: 0.842
Epoch 4 of 20
Number of clusters: 112
Accuracy: 0.853
Epoch 5 of 20
Number of clusters: 175
Accuracy: 0.870
Epoch 6 of 20
Number of clusters: 257
Accuracy: 0.886
Epoch 7 of 20
Number of clusters: 400
Accuracy: 0.904
Epoch 8 of 20
Number of clusters: 607
Accuracy: 0.911
Epoch 9 of 20
Number of clusters: 897
Accuracy: 0.920
Epoch 10 of 20
Number of clusters: 1280
Accuracy: 0.924
Epoch 11 of 20
Number of clusters: 1771
Accuracy: 0.930
Epoch 12 of 20
Number of clusters: 2266
Accuracy: 0.933
Epoch 13 of 20
Number of clusters: 2935
Accuracy: 0.937
Epoch 14 of 20
Number of clusters: 3673
Accuracy: 0.939
Epoch 15 of 20
Number of clusters: 4416
Accuracy: 0.939
Epoch 16 of 20
Number of clusters: 5075
Accuracy: 0.941
Epoch 17 of 20
Number of clusters: 5679
Accuracy: 0.941
Epoch 18 of 20
Number of clusters: 6117
Accur

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,9,27,14,61651,61558,0.80775,0.615523
1,1,55,13,37,5,61558,61545,0.80815,0.616283
2,2,79,18,55,6,61545,61224,0.842,0.683982
3,3,112,25,79,8,61224,60660,0.8525,0.70498
4,4,175,39,118,18,60660,60586,0.8701,0.740189
5,5,257,64,171,22,60586,60400,0.88645,0.772893
6,6,400,93,272,35,60400,60181,0.904,0.807994
7,7,607,161,391,55,60181,59849,0.9108,0.821596
8,8,897,236,574,87,59849,58951,0.91965,0.839295
9,9,1280,337,816,127,58951,58439,0.9239,0.847796


Low: 0.469 High: 0.469

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 62
Accuracy: 0.812
Epoch 3 of 20
Number of clusters: 94
Accuracy: 0.836
Epoch 4 of 20
Number of clusters: 138
Accuracy: 0.854
Epoch 5 of 20
Number of clusters: 208
Accuracy: 0.864
Epoch 6 of 20
Number of clusters: 312
Accuracy: 0.886
Epoch 7 of 20
Number of clusters: 488
Accuracy: 0.904
Epoch 8 of 20
Number of clusters: 748
Accuracy: 0.911
Epoch 9 of 20
Number of clusters: 1130
Accuracy: 0.920
Epoch 10 of 20
Number of clusters: 1656
Accuracy: 0.925
Epoch 11 of 20
Number of clusters: 2310
Accuracy: 0.930
Epoch 12 of 20
Number of clusters: 2930
Accuracy: 0.934
Epoch 13 of 20
Number of clusters: 3624
Accuracy: 0.935
Epoch 14 of 20
Number of clusters: 4396
Accuracy: 0.938
Epoch 15 of 20
Number of clusters: 5124
Accuracy: 0.939
Epoch 16 of 20
Number of clusters: 5750
Accuracy: 0.944
Epoch 17 of 20
Number of clusters: 6168
Accuracy: 0.943
Epoch 18 of 20
Number of clusters: 6324
Accu

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,0,36,14,61651,61557,0.80775,0.615523
1,1,62,0,54,8,61557,60862,0.8118,0.623583
2,2,94,0,83,11,60862,60296,0.8362,0.672377
3,3,138,0,115,23,60296,59260,0.85405,0.708078
4,4,208,0,180,28,59260,58537,0.86435,0.728685
5,5,312,0,271,41,58537,57719,0.88595,0.77189
6,6,488,0,403,85,57719,56862,0.9044,0.808793
7,7,748,0,636,112,56862,55950,0.91115,0.822295
8,8,1130,0,949,181,55950,54164,0.91985,0.839695
9,9,1656,0,1374,282,54164,52758,0.92495,0.849896


Low: 0.722 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 35
Accuracy: 0.777
Epoch 3 of 20
Number of clusters: 37
Accuracy: 0.812
Epoch 4 of 20
Number of clusters: 43
Accuracy: 0.817
Epoch 5 of 20
Number of clusters: 51
Accuracy: 0.846
Epoch 6 of 20
Number of clusters: 59
Accuracy: 0.846
Epoch 7 of 20
Number of clusters: 72
Accuracy: 0.861
Epoch 8 of 20
Number of clusters: 87
Accuracy: 0.864
Epoch 9 of 20
Number of clusters: 109
Accuracy: 0.881
Epoch 10 of 20
Number of clusters: 132
Accuracy: 0.890
Epoch 11 of 20
Number of clusters: 165
Accuracy: 0.894
Epoch 12 of 20
Number of clusters: 215
Accuracy: 0.900
Epoch 13 of 20
Number of clusters: 275
Accuracy: 0.904
Epoch 14 of 20
Number of clusters: 336
Accuracy: 0.908
Epoch 15 of 20
Number of clusters: 400
Accuracy: 0.909
Epoch 16 of 20
Number of clusters: 454
Accuracy: 0.911
Epoch 17 of 20
Number of clusters: 494
Accuracy: 0.912
Epoch 18 of 20
Number of clusters: 546
Accuracy: 0.915
Epo

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,19,8,23,61651,60133,0.80775,0.615523
1,1,35,21,9,5,60133,60072,0.77705,0.554112
2,2,37,21,13,3,60072,59596,0.81195,0.623898
3,3,43,25,13,5,59596,59560,0.81715,0.634317
4,4,51,31,14,6,59560,59502,0.8455,0.691002
5,5,59,36,19,4,59502,59369,0.8462,0.692392
6,6,72,49,19,4,59369,59330,0.8606,0.721192
7,7,87,59,26,2,59330,59323,0.8635,0.726994
8,8,109,68,33,8,59323,58830,0.8811,0.762194
9,9,132,77,45,10,58830,58749,0.88975,0.779494


Low: 0.722 High: 0.881

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 41
Accuracy: 0.777
Epoch 3 of 20
Number of clusters: 55
Accuracy: 0.829
Epoch 4 of 20
Number of clusters: 71
Accuracy: 0.835
Epoch 5 of 20
Number of clusters: 101
Accuracy: 0.860
Epoch 6 of 20
Number of clusters: 138
Accuracy: 0.869
Epoch 7 of 20
Number of clusters: 206
Accuracy: 0.897
Epoch 8 of 20
Number of clusters: 298
Accuracy: 0.903
Epoch 9 of 20
Number of clusters: 421
Accuracy: 0.905
Epoch 10 of 20
Number of clusters: 567
Accuracy: 0.910
Epoch 11 of 20
Number of clusters: 754
Accuracy: 0.918
Epoch 12 of 20
Number of clusters: 953
Accuracy: 0.924
Epoch 13 of 20
Number of clusters: 1232
Accuracy: 0.926
Epoch 14 of 20
Number of clusters: 1544
Accuracy: 0.931
Epoch 15 of 20
Number of clusters: 1855
Accuracy: 0.931
Epoch 16 of 20
Number of clusters: 2181
Accuracy: 0.933
Epoch 17 of 20
Number of clusters: 2425
Accuracy: 0.933
Epoch 18 of 20
Number of clusters: 2676
Accuracy:

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,7,20,23,61651,60130,0.80775,0.615523
1,1,41,9,24,8,60130,60065,0.7774,0.554814
2,2,55,15,31,9,60065,59704,0.8286,0.657197
3,3,71,23,40,8,59704,59232,0.83505,0.670096
4,4,101,34,54,13,59232,58226,0.85955,0.719093
5,5,138,42,82,14,58226,57719,0.869,0.737993
6,6,206,62,122,22,57719,56343,0.89695,0.793894
7,7,298,79,176,43,56343,55430,0.90325,0.806495
8,8,421,127,232,62,55430,54102,0.9054,0.810795
9,9,567,190,299,78,54102,53327,0.90985,0.819696


Low: 0.722 High: 0.722

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 46
Accuracy: 0.800
Epoch 3 of 20
Number of clusters: 64
Accuracy: 0.837
Epoch 4 of 20
Number of clusters: 88
Accuracy: 0.841
Epoch 5 of 20
Number of clusters: 138
Accuracy: 0.861
Epoch 6 of 20
Number of clusters: 200
Accuracy: 0.882
Epoch 7 of 20
Number of clusters: 304
Accuracy: 0.899
Epoch 8 of 20
Number of clusters: 452
Accuracy: 0.904
Epoch 9 of 20
Number of clusters: 666
Accuracy: 0.915
Epoch 10 of 20
Number of clusters: 920
Accuracy: 0.920
Epoch 11 of 20
Number of clusters: 1248
Accuracy: 0.925
Epoch 12 of 20
Number of clusters: 1558
Accuracy: 0.928
Epoch 13 of 20
Number of clusters: 1930
Accuracy: 0.934
Epoch 14 of 20
Number of clusters: 2378
Accuracy: 0.935
Epoch 15 of 20
Number of clusters: 2784
Accuracy: 0.936
Epoch 16 of 20
Number of clusters: 3120
Accuracy: 0.939
Epoch 17 of 20
Number of clusters: 3402
Accuracy: 0.940
Epoch 18 of 20
Number of clusters: 3574
Accurac

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,0,27,23,61651,60129,0.80775,0.615523
1,1,46,0,34,12,60129,59970,0.8003,0.60058
2,2,64,0,53,11,59970,57951,0.83715,0.674279
3,3,88,0,72,16,57951,56523,0.84115,0.682283
4,4,138,0,105,33,56523,55020,0.8614,0.72279
5,5,200,0,157,43,55020,52548,0.8824,0.764793
6,6,304,0,239,65,52548,50741,0.8995,0.798994
7,7,452,0,350,102,50741,48231,0.9043,0.808596
8,8,666,0,496,170,48231,45525,0.91465,0.829296
9,9,920,0,696,224,45525,43505,0.92,0.839997


Low: 0.881 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 28
Accuracy: 0.747
Epoch 3 of 20
Number of clusters: 32
Accuracy: 0.790
Epoch 4 of 20
Number of clusters: 39
Accuracy: 0.796


Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,12,8,30,61651,57677,0.80775,0.615523
1,1,28,16,9,3,57677,57620,0.7465,0.493046
2,2,32,15,13,4,57620,55817,0.7903,0.58058


Low: 0.881 High: 0.881

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 34
Accuracy: 0.757
Epoch 3 of 20
Number of clusters: 46
Accuracy: 0.815
Epoch 4 of 20
Number of clusters: 56
Accuracy: 0.823
Epoch 5 of 20
Number of clusters: 66
Accuracy: 0.846
Epoch 6 of 20
Number of clusters: 92
Accuracy: 0.850
Epoch 7 of 20
Number of clusters: 140
Accuracy: 0.885
Epoch 8 of 20
Number of clusters: 188
Accuracy: 0.891
Epoch 9 of 20
Number of clusters: 252
Accuracy: 0.898
Epoch 10 of 20
Number of clusters: 308
Accuracy: 0.901
Epoch 11 of 20
Number of clusters: 396
Accuracy: 0.907
Epoch 12 of 20
Number of clusters: 498
Accuracy: 0.914
Epoch 13 of 20
Number of clusters: 616
Accuracy: 0.919
Epoch 14 of 20
Number of clusters: 778
Accuracy: 0.921
Epoch 15 of 20
Number of clusters: 886
Accuracy: 0.923
Epoch 16 of 20
Number of clusters: 982
Accuracy: 0.923
Epoch 17 of 20
Number of clusters: 1062
Accuracy: 0.923
Epoch 18 of 20
Number of clusters: 1076
Accuracy: 0.925

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,0,20,30,61651,57674,0.80775,0.615523
1,1,34,0,24,10,57674,56841,0.7566,0.513237
2,2,46,0,30,16,56841,52804,0.81505,0.63009
3,3,56,0,36,20,52804,50526,0.82275,0.645496
4,4,66,0,46,20,50526,47232,0.84615,0.692295
5,5,92,0,70,22,47232,45525,0.84965,0.699294
6,6,140,0,95,45,45525,40425,0.88535,0.770693
7,7,188,0,130,58,40425,36525,0.8908,0.781596
8,8,252,0,157,95,36525,32789,0.8979,0.795794
9,9,308,0,208,100,32789,29020,0.9008,0.801595


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 16
Accuracy: 0.747
Epoch 3 of 20
Number of clusters: 16
Accuracy: 0.766
Epoch 4 of 20
Number of clusters: 20
Accuracy: 0.737


Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,0,8,42,61651,53579,0.80775,0.615523
1,1,16,0,8,8,53579,45856,0.747,0.494076
2,2,16,0,10,6,45856,44043,0.7656,0.531138


In [27]:
weights = [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0]

for weights_p in weights:
    print('Low: %.3f High: %.3f\n' % (low[value_index], high[value_index]))
    epochs = 20

    patience = 3
    tries = 0
    best_accuracy = 0

    words_list = deepcopy(original_words_list)
    words_count = deepcopy(original_words_count)
    words_embedding = deepcopy(original_words_embedding)
    word2cluster = deepcopy(original_word2cluster)
    cluster2words = deepcopy(original_cluster2words)
    word2idx = deepcopy(original_word2idx)
    idx2word = deepcopy(original_idx2word)
    n_clusters = 50
    
    history = []
    
    for i in range(epochs):
        print('Epoch %d of %d' % (i + 1, epochs))
        print('Number of clusters: %d' % (n_clusters))
        bows_train = clusters_bow(x_train, n_clusters, word2cluster, useFrequency=False)
        svm = fit_svm(bows_train, y_train)

        uncertainty = clusters_uncertainty(bows_train, y_train)

        bows_val = clusters_bow(x_val, n_clusters, word2cluster, useFrequency=False)    
        accuracy, kappa = validate(svm, bows_val, y_val)

        print('Accuracy: %.3f' % (accuracy))
#         print('Kappa: %.3f' % (kappa))

        if accuracy > best_accuracy:
            best_accuracy = accuracy
#             pickle.dump([kmeans, svm], open("best.classifier.pckl", "wb"))
            tries = 0
        else:
            tries += 1
            if tries >= patience:
#                 [kmeans, svm] = pickle.load(open("best.classifier.pckl", "rb"))
                break      

        keep, split, deactivate = study_clusters(svm.coef_[0], uncertainty, weights_p)

        keep_count = sum(keep)
        split_count = sum(split)
        deactivate_count = sum(deactivate)

#         print('Keeping %d clusters' % (keep_count))
#         print('Spliting %d clusters' % (split_count))
#         print('Deactivating %d clusters' % (deactivate_count))

        before = len(words_list)
#         print('Before %d' % (before))
        assert before == len(words_count)
        assert before == words_embedding.shape[0]
        
        words_list, words_count, words_embedding = deactivate_clusters(word2idx, cluster2words, words_list, words_count, words_embedding, deactivate, split, keep)
        
        word2idx = {word: idx for idx, word in enumerate(words_list)}
        idx2word = dict(enumerate(words_list))

        word2cluster, next_n_clusters = split_clusters(words_embedding, word2idx, word2cluster, cluster2words, keep, split)    

        assert next_n_clusters <= n_clusters - deactivate_count + split_count    
    
        cluster2words = [[] for i in range(next_n_clusters)]
        for word in words_list:
            cluster2words[word2cluster[word]].append(word)

        after = len(words_list)
#         print('After %d' % (after))
        assert after == len(words_count)
        assert after == words_embedding.shape[0]

        history.append([
            i, 
            n_clusters, 
            keep_count, 
            split_count,
            deactivate_count,
            before,
            after,
            accuracy,
            kappa
        ])

        n_clusters = next_n_clusters
#         kmeans = update_kmeans(kmeans, next_centers, X)

    columns = [
    'Epoch',
    'Clusters', 
    'Keep', 
    'Split',
    'Deactivate',
    'Words before',
    'Words after',
    'Accuracy',
    'Kappa'
    ]

    df = DataFrame(history, columns=columns)
    display(df)


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 46
Accuracy: 0.800
Epoch 3 of 20
Number of clusters: 64
Accuracy: 0.837
Epoch 4 of 20
Number of clusters: 88
Accuracy: 0.841
Epoch 5 of 20
Number of clusters: 138
Accuracy: 0.861
Epoch 6 of 20
Number of clusters: 200
Accuracy: 0.882
Epoch 7 of 20
Number of clusters: 304
Accuracy: 0.899
Epoch 8 of 20
Number of clusters: 452
Accuracy: 0.904
Epoch 9 of 20
Number of clusters: 666
Accuracy: 0.915
Epoch 10 of 20
Number of clusters: 920
Accuracy: 0.920
Epoch 11 of 20
Number of clusters: 1248
Accuracy: 0.925
Epoch 12 of 20
Number of clusters: 1558
Accuracy: 0.928
Epoch 13 of 20
Number of clusters: 1930
Accuracy: 0.934
Epoch 14 of 20
Number of clusters: 2378
Accuracy: 0.935
Epoch 15 of 20
Number of clusters: 2784
Accuracy: 0.936
Epoch 16 of 20
Number of clusters: 3120
Accuracy: 0.939
Epoch 17 of 20
Number of clusters: 3402
Accuracy: 0.940
Epoch 18 of 20
Number of clusters: 3574
Accurac

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,0,27,23,61651,60129,0.80775,0.615523
1,1,46,0,34,12,60129,59970,0.8003,0.60058
2,2,64,0,53,11,59970,57951,0.83715,0.674279
3,3,88,0,72,16,57951,56523,0.84115,0.682283
4,4,138,0,105,33,56523,55020,0.8614,0.72279
5,5,200,0,157,43,55020,52548,0.8824,0.764793
6,6,304,0,239,65,52548,50741,0.8995,0.798994
7,7,452,0,350,102,50741,48231,0.9043,0.808596
8,8,666,0,496,170,48231,45525,0.91465,0.829296
9,9,920,0,696,224,45525,43505,0.92,0.839997


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 49
Accuracy: 0.811
Epoch 3 of 20
Number of clusters: 71
Accuracy: 0.820
Epoch 4 of 20
Number of clusters: 98
Accuracy: 0.846
Epoch 5 of 20
Number of clusters: 156
Accuracy: 0.860
Epoch 6 of 20
Number of clusters: 218
Accuracy: 0.874
Epoch 7 of 20
Number of clusters: 324
Accuracy: 0.884
Epoch 8 of 20
Number of clusters: 449
Accuracy: 0.895
Epoch 9 of 20
Number of clusters: 615
Accuracy: 0.901
Epoch 10 of 20
Number of clusters: 854
Accuracy: 0.911
Epoch 11 of 20
Number of clusters: 1118
Accuracy: 0.918
Epoch 12 of 20
Number of clusters: 1300
Accuracy: 0.921
Epoch 13 of 20
Number of clusters: 1584
Accuracy: 0.923
Epoch 14 of 20
Number of clusters: 1865
Accuracy: 0.926
Epoch 15 of 20
Number of clusters: 2137
Accuracy: 0.927
Epoch 16 of 20
Number of clusters: 2410
Accuracy: 0.927
Epoch 17 of 20
Number of clusters: 2639
Accuracy: 0.929
Epoch 18 of 20
Number of clusters: 2732
Accurac

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,5,26,19,61651,60171,0.80775,0.615523
1,1,49,5,35,9,60171,60105,0.8107,0.621411
2,2,71,8,54,9,60105,59397,0.8197,0.639382
3,3,98,10,76,12,59397,57558,0.8457,0.691383
4,4,156,16,107,33,57558,55984,0.8595,0.718981
5,5,218,22,156,40,55984,53925,0.8738,0.747587
6,6,324,33,221,70,53925,51219,0.8842,0.768386
7,7,449,45,306,98,51219,49543,0.89485,0.78969
8,8,615,62,427,126,49543,48181,0.9012,0.802391
9,9,854,86,575,193,48181,45983,0.9108,0.821593


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 54
Accuracy: 0.812
Epoch 3 of 20
Number of clusters: 73
Accuracy: 0.836
Epoch 4 of 20
Number of clusters: 105
Accuracy: 0.852
Epoch 5 of 20
Number of clusters: 159
Accuracy: 0.859
Epoch 6 of 20
Number of clusters: 232
Accuracy: 0.874
Epoch 7 of 20
Number of clusters: 341
Accuracy: 0.882
Epoch 8 of 20
Number of clusters: 483
Accuracy: 0.886
Epoch 9 of 20
Number of clusters: 661
Accuracy: 0.889
Epoch 10 of 20
Number of clusters: 867
Accuracy: 0.898
Epoch 11 of 20
Number of clusters: 1168
Accuracy: 0.902
Epoch 12 of 20
Number of clusters: 1314
Accuracy: 0.901
Epoch 13 of 20
Number of clusters: 1577
Accuracy: 0.906
Epoch 14 of 20
Number of clusters: 1760
Accuracy: 0.907
Epoch 15 of 20
Number of clusters: 1860
Accuracy: 0.906
Epoch 16 of 20
Number of clusters: 1992
Accuracy: 0.908
Epoch 17 of 20
Number of clusters: 1973
Accuracy: 0.906
Epoch 18 of 20
Number of clusters: 1907
Accura

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,10,25,15,61651,61505,0.80775,0.615523
1,1,54,11,35,8,61505,61434,0.81235,0.624716
2,2,73,15,53,5,61434,61420,0.83555,0.671098
3,3,105,21,72,12,61420,60366,0.85175,0.703492
4,4,159,32,106,21,60366,59728,0.8588,0.71759
5,5,232,47,152,33,59728,58152,0.87415,0.748291
6,6,341,69,218,54,58152,56421,0.88235,0.764691
7,7,483,97,302,84,56421,54339,0.8859,0.771792
8,8,661,133,404,124,54339,53187,0.8895,0.778992
9,9,867,174,541,152,53187,52279,0.8977,0.795394


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 59
Accuracy: 0.817
Epoch 3 of 20
Number of clusters: 80
Accuracy: 0.838
Epoch 4 of 20
Number of clusters: 110
Accuracy: 0.852
Epoch 5 of 20
Number of clusters: 159
Accuracy: 0.857
Epoch 6 of 20
Number of clusters: 226
Accuracy: 0.872
Epoch 7 of 20
Number of clusters: 322
Accuracy: 0.881
Epoch 8 of 20
Number of clusters: 467
Accuracy: 0.887
Epoch 9 of 20
Number of clusters: 627
Accuracy: 0.888
Epoch 10 of 20
Number of clusters: 841
Accuracy: 0.895
Epoch 11 of 20
Number of clusters: 1107
Accuracy: 0.898
Epoch 12 of 20
Number of clusters: 1249
Accuracy: 0.899
Epoch 13 of 20
Number of clusters: 1467
Accuracy: 0.903
Epoch 14 of 20
Number of clusters: 1615
Accuracy: 0.904
Epoch 15 of 20
Number of clusters: 1767
Accuracy: 0.904
Epoch 16 of 20
Number of clusters: 1797
Accuracy: 0.905
Epoch 17 of 20
Number of clusters: 1808
Accuracy: 0.902
Epoch 18 of 20
Number of clusters: 1799
Accura

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,15,24,11,61651,61531,0.80775,0.615523
1,1,59,18,34,7,61531,61459,0.81725,0.634509
2,2,80,24,50,6,61459,61433,0.8376,0.675196
3,3,110,33,68,9,61433,60654,0.85215,0.70429
4,4,159,48,94,17,60654,60014,0.85705,0.714086
5,5,226,68,133,25,60014,58032,0.8716,0.743191
6,6,322,97,193,32,58032,56623,0.8809,0.76179
7,7,467,141,255,71,56623,55503,0.8869,0.77379
8,8,627,189,349,89,55503,54412,0.88805,0.776091
9,9,841,253,460,128,54412,53494,0.89535,0.79069


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 64
Accuracy: 0.817
Epoch 3 of 20
Number of clusters: 86
Accuracy: 0.838
Epoch 4 of 20
Number of clusters: 113
Accuracy: 0.853
Epoch 5 of 20
Number of clusters: 158
Accuracy: 0.858
Epoch 6 of 20
Number of clusters: 228
Accuracy: 0.868
Epoch 7 of 20
Number of clusters: 312
Accuracy: 0.878
Epoch 8 of 20
Number of clusters: 447
Accuracy: 0.886
Epoch 9 of 20
Number of clusters: 599
Accuracy: 0.888
Epoch 10 of 20
Number of clusters: 786
Accuracy: 0.893
Epoch 11 of 20
Number of clusters: 1031
Accuracy: 0.898
Epoch 12 of 20
Number of clusters: 1135
Accuracy: 0.899
Epoch 13 of 20
Number of clusters: 1308
Accuracy: 0.904
Epoch 14 of 20
Number of clusters: 1456
Accuracy: 0.904
Epoch 15 of 20
Number of clusters: 1593
Accuracy: 0.904
Epoch 16 of 20
Number of clusters: 1666
Accuracy: 0.907
Epoch 17 of 20
Number of clusters: 1709
Accuracy: 0.904
Epoch 18 of 20
Number of clusters: 1714
Accura

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,20,24,6,61651,61555,0.80775,0.615523
1,1,64,26,32,6,61555,61503,0.8172,0.634408
2,2,86,35,45,6,61503,61478,0.8379,0.675796
3,3,113,46,59,8,61478,60873,0.8527,0.705391
4,4,158,64,86,8,60873,60398,0.8577,0.715389
5,5,228,92,113,23,60398,58409,0.86835,0.736687
6,6,312,125,165,22,58409,57118,0.87825,0.756489
7,7,447,179,218,50,57118,56187,0.8857,0.771388
8,8,599,240,287,72,56187,55324,0.888,0.775989
9,9,786,315,382,89,55324,54797,0.89315,0.78629


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 63
Accuracy: 0.817
Epoch 3 of 20
Number of clusters: 86
Accuracy: 0.837
Epoch 4 of 20
Number of clusters: 111
Accuracy: 0.849
Epoch 5 of 20
Number of clusters: 150
Accuracy: 0.857
Epoch 6 of 20
Number of clusters: 201
Accuracy: 0.861
Epoch 7 of 20
Number of clusters: 269
Accuracy: 0.870
Epoch 8 of 20
Number of clusters: 367
Accuracy: 0.874
Epoch 9 of 20
Number of clusters: 498
Accuracy: 0.875
Epoch 10 of 20
Number of clusters: 649
Accuracy: 0.881
Epoch 11 of 20
Number of clusters: 827
Accuracy: 0.887
Epoch 12 of 20
Number of clusters: 1014
Accuracy: 0.892
Epoch 13 of 20
Number of clusters: 1113
Accuracy: 0.894
Epoch 14 of 20
Number of clusters: 1271
Accuracy: 0.897
Epoch 15 of 20
Number of clusters: 1392
Accuracy: 0.899
Epoch 16 of 20
Number of clusters: 1490
Accuracy: 0.898
Epoch 17 of 20
Number of clusters: 1593
Accuracy: 0.899
Epoch 18 of 20
Number of clusters: 1637
Accurac

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,25,21,4,61651,61631,0.80775,0.615523
1,1,63,32,29,2,61631,61584,0.8168,0.633609
2,2,86,43,38,5,61584,61555,0.8369,0.673797
3,3,111,56,50,5,61555,61237,0.849,0.697993
4,4,150,75,66,9,61237,60801,0.85745,0.714892
5,5,201,101,84,16,60801,59640,0.86055,0.72109
6,6,269,135,118,16,59640,58694,0.87005,0.74009
7,7,367,184,163,20,58694,58506,0.8736,0.747188
8,8,498,249,212,37,58506,57713,0.875,0.74999
9,9,649,325,266,58,57713,57279,0.88125,0.762492


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 62
Accuracy: 0.817
Epoch 3 of 20
Number of clusters: 80
Accuracy: 0.829
Epoch 4 of 20
Number of clusters: 100
Accuracy: 0.836
Epoch 5 of 20
Number of clusters: 124
Accuracy: 0.841
Epoch 6 of 20
Number of clusters: 161
Accuracy: 0.846
Epoch 7 of 20
Number of clusters: 207
Accuracy: 0.851
Epoch 8 of 20
Number of clusters: 271
Accuracy: 0.853
Epoch 9 of 20
Number of clusters: 347
Accuracy: 0.860
Epoch 10 of 20
Number of clusters: 433
Accuracy: 0.867
Epoch 11 of 20
Number of clusters: 520
Accuracy: 0.869
Epoch 12 of 20
Number of clusters: 628
Accuracy: 0.872
Epoch 13 of 20
Number of clusters: 753
Accuracy: 0.876
Epoch 14 of 20
Number of clusters: 898
Accuracy: 0.879
Epoch 15 of 20
Number of clusters: 949
Accuracy: 0.881
Epoch 16 of 20
Number of clusters: 992
Accuracy: 0.882
Epoch 17 of 20
Number of clusters: 1040
Accuracy: 0.881
Epoch 18 of 20
Number of clusters: 1038
Accuracy: 0.

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,30,17,3,61651,61645,0.80775,0.615523
1,1,62,38,23,1,61645,61641,0.817,0.634008
2,2,80,48,29,3,61641,61635,0.82855,0.657094
3,3,100,60,37,3,61635,61377,0.83605,0.672093
4,4,124,75,44,5,61377,61182,0.8409,0.681793
5,5,161,97,56,8,61182,60810,0.8456,0.691192
6,6,207,125,74,8,60810,60666,0.8507,0.701392
7,7,271,163,94,14,60666,60587,0.853,0.705989
8,8,347,209,118,20,60587,60453,0.8603,0.720593
9,9,433,260,141,32,60453,59670,0.8667,0.733393


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 59
Accuracy: 0.817
Epoch 3 of 20
Number of clusters: 72
Accuracy: 0.820
Epoch 4 of 20
Number of clusters: 83
Accuracy: 0.831
Epoch 5 of 20
Number of clusters: 97
Accuracy: 0.835
Epoch 6 of 20
Number of clusters: 120
Accuracy: 0.836
Epoch 7 of 20
Number of clusters: 142
Accuracy: 0.838
Epoch 8 of 20
Number of clusters: 178
Accuracy: 0.841
Epoch 9 of 20
Number of clusters: 215
Accuracy: 0.845
Epoch 10 of 20
Number of clusters: 265
Accuracy: 0.848
Epoch 11 of 20
Number of clusters: 316
Accuracy: 0.850
Epoch 12 of 20
Number of clusters: 376
Accuracy: 0.853
Epoch 13 of 20
Number of clusters: 454
Accuracy: 0.854
Epoch 14 of 20
Number of clusters: 510
Accuracy: 0.858
Epoch 15 of 20
Number of clusters: 585
Accuracy: 0.861
Epoch 16 of 20
Number of clusters: 662
Accuracy: 0.862
Epoch 17 of 20
Number of clusters: 730
Accuracy: 0.860
Epoch 18 of 20
Number of clusters: 761
Accuracy: 0.866


Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,35,12,3,61651,61646,0.80775,0.615523
1,1,59,42,16,1,61646,61643,0.8168,0.633611
2,2,72,51,18,3,61643,61637,0.82045,0.640906
3,3,83,59,22,2,61637,61402,0.83095,0.661895
4,4,97,68,28,1,61402,61399,0.8345,0.668993
5,5,120,84,29,7,61399,61029,0.83565,0.671294
6,6,142,100,40,2,61029,60933,0.8379,0.675795
7,7,178,125,45,8,60933,60853,0.8407,0.681393
8,8,215,151,57,7,60853,60834,0.8452,0.690394
9,9,265,186,70,9,60834,60787,0.84785,0.695695


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 54
Accuracy: 0.819
Epoch 3 of 20
Number of clusters: 64
Accuracy: 0.820
Epoch 4 of 20
Number of clusters: 70
Accuracy: 0.822
Epoch 5 of 20
Number of clusters: 80
Accuracy: 0.824
Epoch 6 of 20
Number of clusters: 92
Accuracy: 0.823
Epoch 7 of 20
Number of clusters: 104
Accuracy: 0.823
Epoch 8 of 20
Number of clusters: 118
Accuracy: 0.828
Epoch 9 of 20
Number of clusters: 139
Accuracy: 0.831
Epoch 10 of 20
Number of clusters: 166
Accuracy: 0.833
Epoch 11 of 20
Number of clusters: 187
Accuracy: 0.835
Epoch 12 of 20
Number of clusters: 214
Accuracy: 0.844
Epoch 13 of 20
Number of clusters: 244
Accuracy: 0.845
Epoch 14 of 20
Number of clusters: 268
Accuracy: 0.847
Epoch 15 of 20
Number of clusters: 303
Accuracy: 0.848
Epoch 16 of 20
Number of clusters: 331
Accuracy: 0.850
Epoch 17 of 20
Number of clusters: 375
Accuracy: 0.852
Epoch 18 of 20
Number of clusters: 430
Accuracy: 0.857
E

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,40,7,3,61651,61646,0.80775,0.615523
1,1,54,44,10,0,61646,61646,0.81865,0.63731
2,2,64,52,10,2,61646,61642,0.8204,0.640808
3,3,70,56,14,0,61642,61640,0.82175,0.643508
4,4,80,64,15,1,61640,61637,0.8236,0.647206
5,5,92,74,17,1,61637,61634,0.82305,0.646108
6,6,104,84,18,2,61634,61556,0.8227,0.645404
7,7,118,95,22,1,61556,61554,0.82795,0.655904
8,8,139,112,27,0,61554,61554,0.8312,0.662403
9,9,166,133,27,6,61554,61533,0.83255,0.665104


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 51
Accuracy: 0.808
Epoch 3 of 20
Number of clusters: 56
Accuracy: 0.808
Epoch 4 of 20
Number of clusters: 61
Accuracy: 0.810
Epoch 5 of 20
Number of clusters: 63
Accuracy: 0.811
Epoch 6 of 20
Number of clusters: 65
Accuracy: 0.812
Epoch 7 of 20
Number of clusters: 69
Accuracy: 0.814
Epoch 8 of 20
Number of clusters: 71
Accuracy: 0.814
Epoch 9 of 20
Number of clusters: 76
Accuracy: 0.813
Epoch 10 of 20
Number of clusters: 83
Accuracy: 0.813


Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,45,3,2,61651,61647,0.80775,0.615523
1,1,51,46,5,0,61647,61647,0.8079,0.615823
2,2,56,51,5,0,61647,61647,0.808,0.616023
3,3,61,55,4,2,61647,61633,0.8104,0.620816
4,4,63,57,4,2,61633,61631,0.811,0.622015
5,5,65,59,6,0,61631,61630,0.81155,0.623112
6,6,69,63,5,1,61630,61627,0.814,0.628013
7,7,71,64,6,1,61627,61626,0.8139,0.627811
8,8,76,69,7,0,61626,61626,0.8132,0.626411


Low: 0.971 High: 0.971

Epoch 1 of 20
Number of clusters: 50
Accuracy: 0.808
Epoch 2 of 20
Number of clusters: 46
Accuracy: 0.800
Epoch 3 of 20
Number of clusters: 64
Accuracy: 0.837
Epoch 4 of 20
Number of clusters: 88
Accuracy: 0.841
Epoch 5 of 20
Number of clusters: 138
Accuracy: 0.861
Epoch 6 of 20
Number of clusters: 200
Accuracy: 0.882
Epoch 7 of 20
Number of clusters: 304
Accuracy: 0.899
Epoch 8 of 20
Number of clusters: 452
Accuracy: 0.904
Epoch 9 of 20
Number of clusters: 666
Accuracy: 0.915
Epoch 10 of 20
Number of clusters: 920
Accuracy: 0.920
Epoch 11 of 20
Number of clusters: 1248
Accuracy: 0.925
Epoch 12 of 20
Number of clusters: 1558
Accuracy: 0.928
Epoch 13 of 20
Number of clusters: 1930
Accuracy: 0.934
Epoch 14 of 20
Number of clusters: 2378
Accuracy: 0.935
Epoch 15 of 20
Number of clusters: 2784
Accuracy: 0.936
Epoch 16 of 20
Number of clusters: 3120
Accuracy: 0.939
Epoch 17 of 20
Number of clusters: 3402
Accuracy: 0.940
Epoch 18 of 20
Number of clusters: 3574
Accurac

Unnamed: 0,Epoch,Clusters,Keep,Split,Deactivate,Words before,Words after,Accuracy,Kappa
0,0,50,0,27,23,61651,60129,0.80775,0.615523
1,1,46,0,34,12,60129,59970,0.8003,0.60058
2,2,64,0,53,11,59970,57951,0.83715,0.674279
3,3,88,0,72,16,57951,56523,0.84115,0.682283
4,4,138,0,105,33,56523,55020,0.8614,0.72279
5,5,200,0,157,43,55020,52548,0.8824,0.764793
6,6,304,0,239,65,52548,50741,0.8995,0.798994
7,7,452,0,350,102,50741,48231,0.9043,0.808596
8,8,666,0,496,170,48231,45525,0.91465,0.829296
9,9,920,0,696,224,45525,43505,0.92,0.839997
