# Seleccionando palabras

In [1]:
import string
import random
import numpy as np
import reuters_reader
from sklearn.cluster import MiniBatchKMeans
from collections import Counter
from sklearn.externals import joblib
from gensim.models.keyedvectors import KeyedVectors
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from copy import deepcopy

np.set_printoptions(precision=3)

### RCV1 Dataset

Use the function ```reuters_reader.reader(path)``` to retrieve the available documents from the rcv1 dataset stored in `path`. This function returns a generator (```reader```) which yields a single document each time we call ```next(reader)```. Each document is a dictionary with the followitn useful keys:
 - "title" is the title of the document
 - "text" is the body of the document
 - "bip:topics:1.0" is the list of topics
 
There are a total of 804420 available documents, although some may have no topic.

#### Building the dataset
We build a balanced dataset that contains ```n_docs```. To get a balanced dataset we iterate through the documents generator until we have ```n_docs / 2``` documents with the desired topic and the same amount without it. 

#### Get the labels
Select a topic we want to classify using the variable topic (there is a list of the topics https://gist.github.com/gavinmh/6253739). Then build the list of labels using a 1 for those documents with that topic and 0 otherwise

#### Training and validation set
Finally, we split the dataset using the ```train_split``` value. 

In [2]:
path = 'rcv1'
n_docs = 100000
train_split = 0.8
topic = 'GCAT'

docs = []
reader = reuters_reader.reader(path)

topic_true = 0
topic_false = 0

while len(docs) < n_docs:
    doc = next(reader)
    if topic in doc['bip:topics:1.0']:
        topic_true += 1
        if topic_true <= n_docs // 2:
            docs.append(doc)
    else:
        topic_false += 1
        if topic_false <= n_docs // 2:
            docs.append(doc)
     
random.shuffle(docs)

labels = np.zeros((n_docs), dtype=np.int16)
labels = [1 if topic in doc['bip:topics:1.0'] else 0 for doc in docs]

print('{} docs with topic {} (from {})'.format(np.sum(labels), topic, n_docs))

split_point = int(n_docs * train_split)
x_train, y_train = docs[:split_point], labels[:split_point]
x_val, y_val = docs[split_point:], labels[split_point:]

print('Training with {} docs'.format(len(x_train)))
print('Validating with {} docs'.format(len(x_val)))

19961028 / 146544newsML.xml failed to parse XML.
19970601 / 629003newsML.xml failed to parse XML.
19970725 / 756041newsML.xml failed to parse XML.
50000 docs with topic GCAT (from 100000)
Training with 80000 docs
Validating with 20000 docs


### Word2vec model

We are loading the well known word2vec model from __[Google](https://code.google.com/archive/p/word2vec/)__ which is stored in the binary file `GoogleNews-vectors-negative300.bin`.

Load the word2vec model

In [3]:
w2v_name = 'GoogleNews-vectors-negative300.bin'
w2v = KeyedVectors.load_word2vec_format(w2v_name, binary=True)

### Get our vocabulary

Get all the the vectors from the word2vec for our vocabulary. Our vocabulary can include all the words used in the word2vec model or be limited to the words in our dataset.

We can change this behaviour with the flag ```dataset_vocabulary```. ```False``` will use all the words from the word2vec model and ```True``` will limit them to just the words that are in our dataset and in the model at the same time.

There is a ```count_threshold``` to remove those words appearing very few times because they are probably errors.

As we have to split each documents in individual words, we already save this inside each document with the key "counter".

After this cell, ```X``` is a matrix including all the vectors we are going to use.

In [4]:
dataset_vocabulary = True
count_threshold = 5

if dataset_vocabulary:
    vocab = Counter()
    for doc in docs:
        doc["counter"] = Counter()
        words = doc["text"].split()
        words = [word.strip(string.punctuation) for word in words]
        for word in words:
            if word in w2v:
                doc["counter"][word] += 1
                vocab[word] += 1
    vocab = {word: count for word, count in vocab.items() if count > count_threshold}
    vocab_array = np.array(list(vocab))
else:
    vocab = w2v.index2word

X = np.zeros((len(vocab), w2v.vector_size), dtype=np.float32)
for index, word in enumerate(vocab):
    X[index, :] += w2v[word]
    
print("Vocabulary length: {}".format(X.shape[0]))
print("Vector length: {}".format(X.shape[1]))

Vocabulary length: 62100
Vector length: 300


Let's take a look at the most common and uncommon words in our dataset

In [5]:
print(Counter(vocab).most_common()[:10])
print(Counter(vocab).most_common()[-10:])

[('the', 1244626), ('in', 526190), ('said', 330822), ('on', 294388), ('for', 233096), ('The', 208167), ('that', 164547), ('was', 163462), ('at', 151405), ('is', 150826)]
[('Asri', 6), ('Kleinberg', 6), ('Belani', 6), ('DSSI', 6), ('CACL', 6), ('Delon', 6), ('Ozimek', 6), ('Komag', 6), ('blockers', 6), ('Reifenrath', 6)]


### Initial Cluster

Train the first Kmeans cluster using the complete set of words in our dataset.

In [6]:
n_clusters = 50
# kmeans_name = 'kmeans' + str(n_clusters) + '.pkl'

In [7]:
kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, compute_labels=True)
kmeans.fit(X)
# joblib.dump(kmeans, kmeans_name)

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=50,
        n_init=3, random_state=0, reassignment_ratio=0.01, tol=0.0,
        verbose=0)

### Print each cluster

To have an idea of the clusters we are workings with we can print some of their neighbords. The option selected here is to print the most representative neighbords (those that appear more times in the dataset)

In [8]:
def print_clusters(kmeans, vocab, topn=5):   
    vocab_array = np.array(list(vocab))
    for (i, center) in enumerate(kmeans.cluster_centers_):
        cluster_vocab = vocab_array[np.where(kmeans.labels_ == i)[0]]
        cluster_counter = Counter()
        for v in cluster_vocab:
            cluster_counter[v] = vocab[v]
        print(cluster_counter.most_common(topn))

# print_clusters(kmeans, vocab)

### Build a bow model 

Using the clusters from the kmeans classifier build a bow for each document. This bag of words can be normalized usign the frequency of each word with ```useFrequency=True```. 

How? 
* For each document
    * For each word
        * Obtain the w2v vector for that word
        * Obtain the cluster for that vector
        * Add 1 to that cluster in the document bow

To improve the performance the cluster for each word is saved in a dictionary. This way for each word we first check that dictionary instead of first the w2v model and then the classifier.


In [9]:
def clusters_bow(docs, kmeans, w2v, useFrequency=True, verbose=False):
    n_docs = len(docs)
    n_clusters = kmeans.n_clusters
    bo_clusters = np.zeros((n_docs, n_clusters))

    hashed_clusters = {}
    for (i, doc) in enumerate(docs):
        for word, count in doc["counter"].items():
            if word in hashed_clusters:
                cluster = hashed_clusters[word]
            else:
                cluster = kmeans.predict([w2v[word]])[0]
                hashed_clusters[word] = cluster
            bo_clusters[i][cluster] += count
            
    if useFrequency:
        sums = np.sum(bo_clusters, axis=1)
        sums += 0.0001
        boc = bo_clusters / sums.reshape((sums.shape[0], 1))
    else:
        # Add a small amount to the denominator to avoid zero division
        boc = np.round(bo_clusters / (bo_clusters + 0.001))
        
    if verbose:
        for i in range(10):
            print('Document %d: %s (sum = %.2f)' % (i + 1, np.array2string(boc[i, :]), np.sum(boc[i, :])))
        
    return boc

# bows = clusters_bow(docs, kmeans, w2v)

### Lasso model

Once we have our bags of words we can study how good are our clusters. To do so we use a Lasso function feeding it with the bows. 

#### Build and train the model

We build a Lasso model using the sklearn functions. Lasso is configured to only use positive coefficients (because it is easir to visualize them). 

If we do not have an ```alpha``` value the function uses cross validation to obtain it. 

#### Keep, split or discard clusters

Using the lasso coefficients we can decide how good is a particular cluster. There are different ways to make this decision: 

 - With ```use_mean=False``` and no value for ```deactivate_threshold``` we remove clusters with a coefficient lower than ```deactivate_value``` (typically near 0), split those with a coefficient lower than ```split_threshold``` times the maximum coefficient, and kept those with the higher coefficients. 
 
- With ```use_mean=False``` and a value for ```deactivate_threshold``` we remove clusters with a coefficient lower than ```deactivate_threshold``` times the maximum coefficient, split those with a coefficient lower than ```split_threshold``` times the maximum coefficient, and kept those with the higher coefficients. 

- With ```use_mean=True``` we remove clusters with a coefficient lower than the mean coefficient minus the standard deviation, split those between that value and mean plus standard coefficient, and kept those with a coefficient higher than this last value (mean + std)

  

In [10]:
def fit_lasso(bows, labels, alpha=None, verbose=False):
    if alpha:
        clf = linear_model.Lasso(alpha=alpha, positive=True)
    else:
        clf = linear_model.LassoCV(positive=True)
    clf.fit(bows, labels)
    
    if verbose:
        print('Lasso coefficients: %s' % (np.array2string(clf.coef_, suppress_small=True)))
        
    if alpha:
        return clf, clf.alpha_
    else:
        return clf, alpha

# lasso = fit_lasso(bows, labels)

In [11]:
def study_lasso(lasso_values, deactivate_value=0.1, deactivate_threshold=None, 
                split_threshold=0.6, use_mean=False, verbose=False):  
    
    if use_mean:
        mean = np.mean([i for i in np.abs(lasso_values) if i > 0])
        std = np.std([i for i in np.abs(lasso_values) if i > 0])
        deactivate_value = mean - std
        split_value = mean + std
    else:
        max_w = np.max(np.abs(lasso_values))
        split_value = split_threshold * max_w
        if deactivate_threshold:
            deactivate_value = deactivate_threshold * max_w
        
    if verbose:
        print('Deactivate value: %.2f' % (deactivate_value))
        print('Split value: %.2f' % (split_value))
    
    # deactivate = [x <= deactivate_value for x in np.abs(clf.coef_)]
    split = [x > deactivate_value and x <= split_value for x in np.abs(lasso_values)]
    keep = [x > split_value for x in np.abs(lasso_values)]
    
    return keep, split

# keep, split = study_lasso(lasso.coef_)

# print('Keeping %d clusters' % (len([x for x in keep if x == True])))
# print('Spliting %d clusters' % (len([x for x in split if x == True])))

# new_count = len([x for x in keep if x == True]) + 2 * len([x for x in split if x == True])
# print('Using %d clusters in the next iteration' % (new_count))

### Build new clusters

Once we know which clusters to keep and which to split, we can build the new ones. 

To split one cluster we need to select all the points belonging to that cluster and classify then using 2 neighbourds. 

In [12]:
def update_centers(kmeans, X, keep, split):
    n_centers = len([x for x in keep if x == True]) + 2 * len([x for x in split if x == True])
    updated_centers = np.empty((n_centers, kmeans.cluster_centers_.shape[1]))
    new_idx = 0
    for i in range(len(keep)):
        if keep[i]:
            updated_centers[new_idx, :] = kmeans.cluster_centers_[i, :]
            new_idx += 1
        if split[i]:
            # create kmeans with this data
            newX = X[np.where(kmeans.labels_ == i)[0], :]
            if (newX.shape[0] < 2):
                continue
            small_class = MiniBatchKMeans(n_clusters=2, random_state=0, compute_labels=True)
            small_class.fit(newX)
            updated_centers[new_idx, :] = small_class.cluster_centers_[0, :]
            updated_centers[new_idx + 1, :] = small_class.cluster_centers_[1, :]
            new_idx += 2
            
    return updated_centers[:new_idx, :]
        
# new_centers = update_centers(kmeans, X, keep, split)

### Update the classifier

We have to use the new clusters to classify the data in the following steps. One possible to solution to use these new clusters is to update the classifier centers and the relevant attributes. Then we can use the classifier's function ```predict``` as before. Moreover, to keep using this classifier to build the next cluster we also need to update the ```labels_``` and ```counts_``` parameters. 

In [13]:
def update_kmeans(kmeans, new_centers, x):
    kmeans.cluster_centers_ = new_centers
    kmeans.n_clusters = len(new_centers)
    kmeans.labels_, _ = kmeans._labels_inertia_minibatch(x)
    kmeans.counts_ = np.zeros(kmeans.n_clusters, dtype=np.int32)
    for i in range(kmeans.n_clusters):
        kmeans.counts_[i] = np.sum(kmeans.labels_ == i)
    
    return kmeans

# kmeans = update_kmeans(kmeans, new_centers, X)

### Validation

In [14]:
def validate(lasso, bows, y_true, threshold=0):
    y_predicted = lasso.predict(bows)
    print(y_predicted)
    y_predicted = [1 if i > threshold else 0 for i in y_predicted]
    accuracy = accuracy_score(y_val, y_predicted)
    kappa = cohen_kappa_score(y_val, y_predicted)
    
    print('Accuracy: %.3f' % (accuracy))
    print('Kappa: %.3f' % (kappa))

## Play time

Start by saving the first kmeans (so we can use it multiple times) and printing the first set of clusters. 

In [15]:
original_kmeans = deepcopy(kmeans)
print_clusters(kmeans, vocab)

[('detect', 117), ('detection', 84), ('detector', 40), ('detectors', 36), ('detecting', 26)]
[('PCT', 12607), ('British', 12083), ('London', 11016), ('York', 10679), ('Australian', 6770)]
[('refiners', 347), ('CFDT', 125), ('Grandpuits', 46), ('Sucden', 14), ('BEUC', 13)]
[('Russian', 6800), ('Russia', 5594), ('Moscow', 3607), ('Yeltsin', 3203), ('Czech', 2957)]
[('Vatican', 396), ('Catholics', 241), ('priest', 184), ('Archbishop', 149), ('priests', 143)]
[('John', 6081), ('David', 3602), ('Bill', 3246), ('Michael', 3023), ('Jan', 2870)]
[('tightened', 319), ('clamp', 96), ('clamped', 52), ('clamping', 37)]
[('Hellaby', 15)]
[('Minister', 21548), ('minister', 10086), ('India', 6127), ('ministry', 5024), ('army', 4791)]
[('blockbuster', 40), ('sequel', 25)]
[('respected', 445), ('distinguished', 73), ('illustrious', 27), ('inducted', 24), ('eminent', 21)]
[('not', 84663), ('But', 28660), ('against', 28079), ('no', 26582), ('if', 21143)]
[('Arab', 3118), ('Saudi', 2723), ('Islamic', 2389

### First configuration

Using the mean and standard deviation to select which clusters to keep or divide.

The output for each epoch is:

 - Current number of clusters
 - Lasso coefficients
 - Predicted values for the validation set
 - Accuracy
 - Kappa
 - Value for deactivating clusters (those with lower coefficient)
 - Value for splitting clusters (those with lower coefficient and not deactivated)
 - Clusters to be kept
 - Clusters to be split
 - Clusters

In [16]:
epochs = 20

kmeans = deepcopy(original_kmeans)

alpha = None

for i in range(epochs):
    print('Epoch %d of %d' % (i + 1, epochs))
    print('Number of clusters: %d' % (kmeans.n_clusters))
    bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=False, verbose=False)
    lasso, alpha = fit_lasso(bows_train, y_train, alpha=alpha, verbose=True)
    
    bows_val = clusters_bow(x_val, kmeans, w2v, verbose=False)
    validate(lasso, bows_val, y_val, threshold=0.02)
    
    keep, split = study_lasso(lasso.coef_, use_mean=True, verbose=True)

    print('Keeping %d clusters' % (len([x for x in keep if x == True])))
    print('Spliting %d clusters' % (len([x for x in split if x == True])))
    
    next_centers = update_centers(kmeans, X, keep, split)
    
    kmeans = update_kmeans(kmeans, next_centers, X)
    print_clusters(kmeans, vocab)    

Epoch 1 of 20
Number of clusters: 50
Lasso coefficients: [ 0.184  0.     0.     0.063  0.186  0.099  0.     0.     0.205  0.     0.134
  0.115  0.107  0.084  0.173  0.025  0.     0.     0.     0.059  0.     0.
  0.     0.186  0.065  0.079  0.     0.132  0.122  0.163  0.     0.     0.
  0.043  0.186  0.     0.     0.     0.     0.262  0.     0.021  0.     0.
  0.041  0.149  0.169  0.     0.     0.   ]
[ 0.016  0.024  0.019 ...,  0.017  0.048  0.015]
Accuracy: 0.781
Kappa: 0.563
Deactivate value: 0.06
Split value: 0.18
Keeping 5 clusters
Spliting 15 clusters
[('telescope', 131), ('particles', 55), ('sensors', 50), ('detector', 40), ('detectors', 36)]
[('detected', 202), ('detect', 117), ('diagnostic', 103), ('detection', 84), ('alerts', 54)]
[('German', 10280), ('crowns', 3284), ('Sweden', 2230), ('Bosnia', 1888), ('Austria', 1789)]
[('Russian', 6800), ('Russia', 5594), ('Moscow', 3607), ('Yeltsin', 3203), ('Czech', 2957)]
[('Vatican', 396), ('Catholics', 241), ('priest', 184), ('Archbis

[ 0.046  0.045  0.052 ...,  0.058  0.043  0.053]
Accuracy: 0.508
Kappa: 0.008
Deactivate value: 0.02
Split value: 0.15
Keeping 3 clusters
Spliting 24 clusters
[('research', 3374), ('medical', 1973), ('Health', 1547), ('Medical', 648), ('scientists', 611)]
[('disease', 1671), ('cancer', 1518), ('patients', 1032), ('cells', 410), ('HIV', 404)]
[('system', 8553), ('data', 7679), ('technology', 2859), ('computer', 2246), ('systems', 2192)]
[('reported', 10298), ('control', 6376), ('found', 6194), ('information', 4804), ('test', 3579)]
[('Dole', 1756), ('Lynch', 1566), ('Jacques', 1502), ('Le', 1386), ('Lebed', 929)]
[('York', 10679), ('Chicago', 4617), ('Page', 3822), ('White', 3447), ('She', 3313)]
[('European', 23424), ('EU', 10599), ('German', 10280), ('Germany', 7676), ('Russian', 6800)]
[('China', 10364), ('Japanese', 5562), ('Chinese', 4920), ('Korea', 3624), ('Taiwan', 3383)]
[('envoy', 1153), ('Rome', 1019), ('Pope', 781), ('Vatican', 396), ('pilgrims', 323)]
[('religious', 1068), 

Lasso coefficients: [ 0.138  0.     0.     0.     0.004  0.     0.113  0.     0.062  0.148
  0.052  0.017  0.121  0.     0.018  0.058  0.     0.     0.     0.     0.119
  0.22   0.134  0.049  0.083  0.019  0.019  0.     0.038  0.     0.059
  0.024  0.     0.233  0.059  0.136  0.017  0.     0.179  0.031  0.162  0.
  0.042  0.033  0.031  0.078  0.028  0.063  0.013  0.     0.025]
[-0.073 -0.076 -0.063 ..., -0.061 -0.076 -0.069]
Accuracy: 0.497
Kappa: 0.000
Deactivate value: 0.01
Split value: 0.14
Keeping 6 clusters
Spliting 27 clusters
[('drug', 2983), ('OCT', 2061), ('drugs', 1818), ('treatment', 1754), ('disease', 1671)]
[('astronauts', 219), ('spacewalk', 99), ('spacewalks', 57), ('spacesuits', 30), ('Astronauts', 21)]
[('Rome', 1019), ('Pope', 781), ('Vatican', 396), ('Ciampi', 237), ('throne', 158)]
[('peace', 9547), ('treaty', 2766), ('democracy', 1936), ('constitution', 1330), ('constitutional', 1317)]
[('St', 2369), ('Catholic', 1136), ('Dublin', 948), ('College', 290), ('Bishop',

Lasso coefficients: [ 0.028  0.005  0.233  0.032  0.039  0.066  0.116  0.006  0.007  0.     0.
  0.     0.053  0.056  0.     0.022  0.01   0.141  0.     0.084  0.     0.049
  0.     0.037  0.057  0.025  0.021  0.028  0.041  0.062  0.005  0.     0.
  0.02   0.123  0.     0.     0.212  0.02   0.     0.     0.162  0.041
  0.136  0.     0.     0.016  0.236  0.01   0.     0.006  0.024  0.     0.046
  0.08   0.042  0.     0.031  0.042  0.08 ]
[ 0.107  0.106  0.104 ...,  0.108  0.104  0.104]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.00
Split value: 0.12
Keeping 7 clusters
Spliting 35 clusters
[('sales', 14388), ('Inc', 13405), ('PCT', 12607), ('yen', 9919), ('earnings', 7025)]
[('rates', 13108), ('index', 10344), ('health', 4877), ('insurance', 3466), ('research', 3374)]
[('unc', 602), ('cellular', 529), ('virus', 494), ('researchers', 483), ('cells', 410)]
[('tonnes', 11785), ('sugar', 2229), ('copper', 2208), ('corn', 1693), ('metals', 1122)]
[('astronauts', 219), ('spacewalk', 99), 

[('shares', 22284), ('stock', 12117), ('trading', 10998), ('cents', 10510), ('earnings', 7025)]
[('company', 30511), ('sales', 14388), ('companies', 13408), ('Inc', 13405), ('profit', 8352)]
[('rate', 15168), ('rates', 13108), ('results', 8049), ('data', 7679), ('levels', 5607)]
[('health', 4877), ('hospital', 2463), ('heart', 2277), ('medical', 1973), ('drugs', 1818)]
[('cells', 410), ('protein', 302), ('immune', 199), ('gene', 169), ('tissue', 147)]
[('unc', 602), ('cellular', 529), ('researchers', 483), ('cell', 337), ('humans', 287)]
[('copper', 2208), ('steel', 1772), ('material', 1374), ('metal', 1350), ('metals', 1122)]
[('tonnes', 11785), ('wheat', 4442), ('crude', 3901), ('tonne', 3619), ('imports', 2946)]
[('astronauts', 219), ('spacewalk', 99), ('spacewalks', 57), ('Astronauts', 21), ('undocking', 9)]
[('Italy', 5356), ('Italian', 4273), ('Milan', 1479), ('lire', 1465), ('Portugal', 1455)]
[('French', 10375), ('France', 9362), ('Paris', 4298), ('Belgian', 1994), ('Le', 1386)

Lasso coefficients: [ 0.     0.     0.     0.064  0.05   0.004  0.     0.     0.218  0.036
  0.008  0.     0.014  0.     0.     0.033  0.072  0.023  0.012  0.     0.056
  0.     0.14   0.     0.     0.004  0.     0.04   0.     0.012  0.     0.
  0.     0.     0.     0.099  0.024  0.     0.02   0.     0.     0.     0.
  0.009  0.     0.008  0.022  0.003  0.051  0.026  0.078  0.008  0.     0.048
  0.106  0.     0.019  0.     0.055  0.018  0.018  0.     0.024  0.016  0.
  0.     0.     0.     0.     0.     0.062  0.     0.05   0.     0.062  0.
  0.     0.     0.     0.136  0.     0.026  0.     0.204  0.     0.     0.
  0.002  0.013  0.125  0.     0.014  0.276  0.153  0.     0.005  0.     0.002
  0.073  0.     0.     0.135  0.225  0.     0.     0.     0.11   0.154  0.
  0.     0.     0.     0.067  0.07   0.     0.     0.     0.     0.034  0.
  0.074  0.002  0.     0.021  0.     0.     0.084  0.     0.     0.     0.
  0.     0.01   0.053  0.     0.027  0.     0.     0.     0.     0.003
  0.

Lasso coefficients: [ 0.     0.     0.     0.     0.     0.055  0.025  0.059  0.018  0.037
  0.028  0.     0.     0.     0.     0.     0.133  0.025  0.035  0.     0.
  0.     0.158  0.     0.     0.013  0.     0.     0.     0.     0.     0.07
  0.     0.016  0.03   0.     0.     0.008  0.012  0.041  0.     0.005  0.
  0.062  0.     0.     0.     0.073  0.     0.     0.038  0.     0.     0.062
  0.015  0.     0.002  0.038  0.     0.     0.     0.     0.007  0.     0.
  0.006  0.     0.     0.11   0.036  0.     0.     0.     0.     0.     0.
  0.     0.     0.     0.     0.     0.     0.     0.     0.029  0.003  0.
  0.     0.041  0.     0.063  0.004  0.     0.     0.131  0.     0.058  0.
  0.094  0.042  0.     0.     0.     0.     0.046  0.029  0.     0.043  0.
  0.     0.072  0.     0.     0.     0.04   0.032  0.013  0.001  0.027
  0.061  0.     0.     0.     0.008  0.023  0.     0.     0.     0.     0.
  0.     0.     0.     0.022  0.034  0.     0.     0.     0.029  0.021  0.
  0.    

Lasso coefficients: [ 0.     0.     0.     0.     0.     0.     0.002  0.     0.     0.     0.053
  0.042  0.007  0.     0.05   0.     0.01   0.     0.     0.096  0.023  0.
  0.     0.004  0.011  0.     0.     0.     0.     0.     0.     0.     0.183
  0.017  0.02   0.     0.024  0.022  0.     0.     0.     0.006  0.     0.116
  0.     0.     0.009  0.     0.038  0.     0.     0.     0.     0.     0.
  0.     0.027  0.     0.048  0.     0.052  0.04   0.     0.016  0.028  0.
  0.     0.     0.004  0.002  0.006  0.006  0.     0.     0.017  0.038
  0.041  0.     0.     0.     0.     0.     0.028  0.023  0.025  0.     0.094
  0.     0.     0.     0.035  0.086  0.     0.     0.     0.     0.     0.029
  0.     0.     0.     0.048  0.     0.056  0.024  0.     0.     0.     0.
  0.007  0.042  0.     0.     0.     0.     0.     0.     0.     0.     0.
  0.005  0.02   0.     0.     0.     0.     0.028  0.006  0.     0.     0.
  0.     0.105  0.04   0.     0.     0.     0.019  0.     0.     0.  

[('knife', 115), ('sticks', 104), ('cannon', 74), ('knives', 73), ('sword', 62)]
[('wand', 16), ('flashlight', 9), ('flashlights', 9), ('compass', 8), ('binoculars', 6)]
[('gun', 552), ('guns', 428), ('bullets', 311), ('ammunition', 297), ('weapon', 247)]
[('shooting', 922), ('shoot', 226), ('sniper', 41), ('Weapons', 37), ('Shooting', 30)]
[('ship', 1440), ('crew', 1106), ('vessel', 893), ('maritime', 202), ('Admiral', 81)]
[('vessels', 1948), ('ships', 1007), ('navy', 438), ('naval', 362), ('submarine', 330)]
[('Sea', 1644), ('coast', 1602), ('aboard', 602), ('boat', 536), ('boats', 493)]
[('sail', 172), ('anchorage', 146), ('knots', 80), ('crane', 59), ('rudder', 48)]
[('rig', 200), ('rigs', 71), ('subsea', 16), ('topsides', 6), ('dayrates', 6)]
[('cargo', 2909), ('shipping', 1352), ('freight', 813), ('Cargo', 671), ('container', 651)]
[('dockworkers', 11)]
[('port', 2878), ('Port', 1203), ('ports', 887), ('Maritime', 216), ('shipyard', 148)]
[('fly', 870), ('flying', 753), ('helico

Lasso coefficients: [ 0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
  0.     0.     0.011  0.004  0.     0.     0.     0.     0.     0.     0.069
  0.043  0.019  0.033  0.     0.     0.     0.024  0.029  0.     0.     0.
  0.056  0.     0.     0.     0.     0.076  0.009  0.     0.036  0.07   0.
  0.011  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
  0.     0.     0.     0.     0.     0.     0.     0.     0.     0.205  0.
  0.     0.02   0.     0.     0.03   0.     0.033  0.     0.     0.     0.
  0.     0.     0.06   0.004  0.     0.     0.117  0.     0.     0.004  0.
  0.     0.     0.     0.     0.     0.049  0.     0.     0.     0.     0.
  0.     0.005  0.     0.     0.     0.01   0.     0.027  0.     0.03   0.
  0.     0.038  0.1    0.     0.     0.079  0.     0.     0.033  0.     0.019
  0.     0.     0.     0.014  0.     0.     0.     0.025  0.008  0.     0.006
  0.009  0.     0.002  0.     0.092  0.     0.     0.     0.     0.007


[('Zairean', 1241), ('Sese', 539), ('Seko', 482), ('Lama', 285), ('Shaba', 127)]
[('B', 3949), ('H', 2017), ('S', 1627), ('K', 1409), ('L', 1258)]
[('FW', 9)]
[('sec', 20), ('ws', 9)]
[('b', 1799), ('c', 1333), ('unc', 602), ('Pol', 361), ('approx', 303)]
[('Sao', 1174), ('Suu', 469), ('Phnom', 456), ('Kyi', 331), ('Aung', 190)]
[('Hun', 1184), ('Zemin', 305), ('Chi', 243), ('Tin', 224), ('Pa', 197)]
[('pesetas', 564), ('escudos', 414), ('Pan', 243), ('peseta', 164), ('Palo', 162)]
[('San', 2381), ('Los', 1349), ('El', 976), ('Buenos', 871), ('Las', 246)]
[('Telebras', 237), ('Grupo', 190), ('RBD', 186), ('bolsa', 138), ('Companhia', 122)]
[('Banorte', 13)]
[('Caqueta', 27), ('Antioquia', 24), ('Putumayo', 22), ('Florencia', 15), ('Valledupar', 15)]
[('Norte', 50), ('Ica', 42), ('Neuquen', 30), ('Ayacucho', 24), ('Arequipa', 22)]
[('Lima', 781), ('Havana', 362), ('Caracas', 326), ('Puerto', 322), ('Bogota', 319)]
[('Latin', 1491), ('Cuba', 1311), ('Argentine', 1166), ('Colombia', 971),

Lasso coefficients: [ 0.     0.     0.    ...,  0.006  0.     0.   ]
[ 0.023 -0.004  0.024 ...,  0.009 -0.004  0.013]
Accuracy: 0.678
Kappa: 0.357
Deactivate value: -0.00
Split value: 0.09
Keeping 40 clusters
Spliting 1768 clusters
[('cents', 10510), ('Net', 8432), ('net', 7774), ('Total', 5194), ('gross', 2297)]
[('revenues', 3424), ('revenue', 2716), ('Turnover', 1198), ('Revenues', 890), ('Revenue', 787)]
[('profit', 8352), ('earnings', 7025), ('Income', 3060), ('EPS', 1504), ('Earnings', 1236)]
[('FFO', 51), ('GAAP', 20)]
[('shrs', 1059), ('Q3', 1046), ('Q1', 998), ('Q2', 705), ('Q4', 624)]
[('mth', 200)]
[('warrants', 695), ('CUSIP', 422), ('debentures', 273), ('Issuer', 264), ('pursuant', 121)]
[('exercisable', 44)]
[('Repurchase', 20), ('Issuance', 16), ('Borrowings', 15), ('Maturities', 12), ('Repurchases', 6)]
[('convertible', 710), ('payable', 530), ('subordinated', 209), ('Convertible', 45), ('callable', 32)]
[('shareholders', 4140), ('shareholder', 1409), ('holders', 621), 

[('Kasparov', 36)]
[('Soyuz', 77), ('Baikonur', 27), ('cosmodrome', 21)]
[('Caspian', 224), ('Nakhodka', 34), ('Sakhalin', 22)]
[('Ceyhan', 100)]
[('Gazprom', 225), ('LUKoil', 158), ('MOL', 132), ('Novorossiisk', 77), ('Rosneft', 64)]
[('Tupras', 60), ('Petkim', 6)]
[('UES', 149), ('Norilsk', 140), ('Aeroflot', 52), ('Potanin', 33), ('AvtoVAZ', 28)]
[('EBRD', 199), ('Menatep', 26), ('Vneshekonombank', 24), ('VEB', 14), ('Vneshtorgbank', 11)]
[('Khrunichev', 13)]
[('Svyazinvest', 66), ('MICEX', 55), ('MGTS', 55), ('Mosenergo', 33), ('OAO', 25)]
[('Penang', 134), ('Bernama', 100), ('Malacca', 23), ('Kuching', 19), ('Merdeka', 8)]
[('Sarawak', 86), ('Sabah', 76), ('Perak', 50), ('Johor', 48), ('Selangor', 28)]
[('Malays', 48), ('Malaysians', 43), ('Taib', 42), ('Singaporeans', 42), ('Badawi', 37)]
[('PAP', 236), ('MCA', 104), ('Pas', 100), ('UMNO', 73), ('DAP', 55)]
[('Kuala', 796), ('Tanjung', 119), ('Indah', 95), ('Putra', 76), ('Rimba', 32)]
[('Negara', 196), ('Nusantara', 68)]
[('Mala

[('claims', 2570), ('claim', 1778), ('answer', 848), ('conclusion', 577), ('truth', 507)]
[('evidence', 2666), ('indications', 468), ('proof', 368), ('motive', 164), ('conclusive', 66)]
[('tape', 332), ('videotape', 71), ('affidavit', 59), ('transcript', 56), ('dossier', 53)]
[('statements', 1278), ('witnesses', 1256), ('testimony', 665), ('witness', 630), ('innocence', 134)]
[('verify', 91), ('ascertain', 36), ('verifying', 16), ('authenticate', 8)]
[('verification', 59), ('Verification', 18)]
[('contrary', 312), ('unfounded', 128), ('untrue', 79), ('baseless', 61), ('groundless', 54)]
[('fabricated', 93), ('fabrication', 64), ('fabricating', 16), ('concocted', 10), ('Fabricated', 6)]
[('intransigence', 51), ('inaction', 33), ('stonewalling', 16), ('intransigent', 8), ('obstructionist', 8)]
[('meddling', 82), ('filibustering', 27), ('posturing', 18), ('pandering', 14), ('dithering', 14)]
[('complacency', 52), ('laxity', 18), ('indiscipline', 17)]
[('aggression', 166), ('negligence', 1

MemoryError: 

### Second configuration

Deactivating clusters with a coefficient lower than 0.01 and splitting those with lower value than 0.6 times the max(abs(coefficient))

In [17]:
epochs = 20

kmeans = deepcopy(original_kmeans)

alpha = None

for i in range(epochs):
    print('Epoch %d of %d' % (i + 1, epochs))
    print('Number of clusters: %d' % (kmeans.n_clusters))
    bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=False, verbose=False)
    lasso, alpha = fit_lasso(bows_train, y_train, alpha=alpha, verbose=True)
    
    bows_val = clusters_bow(x_val, kmeans, w2v, verbose=False)
    validate(lasso, bows_val, y_val, threshold=0.02)
    
    keep, split = study_lasso(lasso.coef_, deactivate_value=0.01, verbose=True)

    print('Keeping %d clusters' % (len([x for x in keep if x == True])))
    print('Spliting %d clusters' % (len([x for x in split if x == True])))
    
    next_centers = update_centers(kmeans, X, keep, split)
    
    kmeans = update_kmeans(kmeans, next_centers, X)
    print_clusters(kmeans, vocab)

Epoch 1 of 20
Number of clusters: 50
Lasso coefficients: [ 0.184  0.     0.     0.063  0.186  0.099  0.     0.     0.205  0.     0.134
  0.115  0.107  0.084  0.173  0.025  0.     0.     0.     0.059  0.     0.
  0.     0.186  0.065  0.079  0.     0.132  0.122  0.163  0.     0.     0.
  0.043  0.186  0.     0.     0.     0.     0.262  0.     0.021  0.     0.
  0.041  0.149  0.169  0.     0.     0.   ]
[ 0.016  0.024  0.019 ...,  0.017  0.048  0.015]
Accuracy: 0.781
Kappa: 0.563
Deactivate value: 0.01
Split value: 0.16
Keeping 9 clusters
Spliting 16 clusters
[('detect', 117), ('detection', 84), ('detector', 40), ('detectors', 36), ('detecting', 26)]
[('German', 10280), ('Germany', 7676), ('crowns', 3284), ('Sweden', 2230), ('Bosnia', 1888)]
[('Russian', 6800), ('Russia', 5594), ('Moscow', 3607), ('Yeltsin', 3203), ('Czech', 2957)]
[('Vatican', 396), ('Catholics', 241), ('priest', 184), ('Archbishop', 149), ('priests', 143)]
[('Co', 9971), ('John', 6081), ('B', 3949), ('David', 3602), ('B

Lasso coefficients: [ 0.092  0.     0.001  0.001  0.064  0.061  0.124  0.     0.     0.111
  0.135  0.122  0.     0.097  0.126  0.     0.109  0.2    0.058  0.177
  0.067  0.     0.13   0.408  0.205  0.002  0.088  0.042  0.045  0.11
  0.072  0.026  0.     0.     0.209  0.     0.247  0.053  0.079  0.     0.
  0.     0.     0.06   0.125  0.13 ]
[ 0.086  0.091  0.093 ...,  0.083  0.08   0.085]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.01
Split value: 0.24
Keeping 2 clusters
Spliting 28 clusters
[('accuracy', 3136), ('technology', 2859), ('systems', 2192), ('OCT', 2061), ('electronic', 1139)]
[('data', 7679), ('test', 3579), ('verified', 2949), ('reduce', 2854), ('Security', 2672)]
[('pct', 8369), ('Plc', 4453), ('francs', 3977), ('stg', 3368), ('euro', 1691)]
[('European', 23424), ('EU', 10599), ('German', 10280), ('Germany', 7676), ('Italy', 5356)]
[('China', 10364), ('parliament', 8303), ('Russian', 6800), ('Russia', 5594), ('Chinese', 4920)]
[('yen', 9919), ('currency', 9378), ('

Lasso coefficients: [ 0.     0.     0.     0.136  0.     0.02   0.098  0.057  0.     0.013
  0.178  0.072  0.     0.029  0.     0.     0.066  0.     0.042  0.113  0.
  0.     0.     0.095  0.161  0.     0.     0.15   0.008  0.109  0.009
  0.226  0.14   0.089  0.     0.024  0.014  0.052  0.04   0.062  0.035
  0.173  0.067  0.134  0.07   0.173  0.     0.     0.204  0.     0.     0.
  0.016  0.     0.074  0.024  0.018  0.     0.058  0.025  0.   ]
[ 0.056  0.034  0.05  ...,  0.054  0.017  0.048]
Accuracy: 0.517
Kappa: 0.029
Deactivate value: 0.01
Split value: 0.14
Keeping 9 clusters
Spliting 27 clusters
[('European', 23424), ('EU', 10599), ('German', 10280), ('parliament', 8303), ('Germany', 7676)]
[('Swedish', 1655), ('Polish', 1584), ('Boris', 1390), ('Finnish', 1311), ('Finland', 1284)]
[('Russian', 6800), ('Russia', 5594), ('Moscow', 3607), ('Yeltsin', 3203), ('Soviet', 2302)]
[('Rome', 1019), ('Pope', 781), ('Vatican', 396), ('pilgrims', 323), ('monarch', 112)]
[('diplomat', 1372), ('

Lasso coefficients: [ 0.059  0.055  0.015  0.051  0.039  0.09   0.     0.021  0.034  0.021  0.
  0.001  0.063  0.018  0.101  0.     0.008  0.062  0.     0.081  0.018  0.
  0.145  0.021  0.168  0.     0.126  0.067  0.104  0.121  0.186  0.     0.009
  0.044  0.03   0.006  0.003  0.013  0.046  0.061  0.     0.046  0.     0.
  0.     0.     0.106  0.178  0.038  0.     0.169  0.11   0.029  0.158  0.
  0.25   0.018  0.237  0.     0.     0.147  0.     0.019  0.     0.075
  0.063  0.041  0.     0.     0.   ]
[ 0.078  0.076  0.078 ...,  0.081  0.078  0.079]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.01
Split value: 0.15
Keeping 7 clusters
Spliting 37 clusters
[('Iraq', 3651), ('Arab', 3118), ('Iran', 2618), ('Iraqi', 2606), ('Arafat', 2478)]
[('Israel', 6220), ('Israeli', 5985), ('Palestinian', 4107), ('Netanyahu', 3166), ('Palestinians', 2733)]
[('EU', 10599), ('NATO', 3638), ('Turkey', 2447), ('Turkish', 2364), ('Socialist', 2058)]
[('pct', 8369), ('Russian', 6800), ('Russia', 5594), ('

[ 0.065  0.058  0.065 ...,  0.07   0.054  0.064]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.01
Split value: 0.18
Keeping 2 clusters
Spliting 46 clusters
[('Iraq', 3651), ('Arab', 3118), ('Pakistan', 3003), ('Saudi', 2723), ('Iran', 2618)]
[('yen', 9919), ('Taiwan', 3383), ('Yeltsin', 3203), ('Tokyo', 2936), ('Korean', 2776)]
[('Israeli', 5985), ('Palestinian', 4107), ('Palestinians', 2733), ('Gaza', 1006), ('Hamas', 323)]
[('Israel', 6220), ('Netanyahu', 3166), ('Hebron', 1961), ('Jerusalem', 1887), ('Israelis', 833)]
[('Bosnia', 1888), ('Bosnian', 1630), ('Serb', 1397), ('Albania', 1270), ('Belgrade', 1209)]
[('European', 23424), ('EU', 10599), ('German', 10280), ('parliament', 8303), ('Germany', 7676)]
[('ore', 418), ('volcanic', 56), ('oxide', 54), ('geological', 52), ('sands', 47)]
[('ancient', 273), ('fossil', 102), ('discoveries', 94), ('archives', 86), ('cave', 78)]
[('stadium', 775), ('tunnel', 704), ('Hotel', 563), ('pyramid', 359), ('Palace', 318)]
[('Rome', 1019), ('Ro

Lasso coefficients: [ 0.003  0.     0.114  0.     0.08   0.033  0.     0.02   0.     0.     0.011
  0.052  0.002  0.     0.     0.069  0.074  0.108  0.054  0.071  0.06   0.
  0.068  0.     0.     0.     0.014  0.     0.     0.     0.1    0.013
  0.033  0.     0.041  0.     0.     0.036  0.056  0.062  0.     0.128
  0.096  0.     0.     0.064  0.016  0.108  0.     0.     0.097  0.038
  0.351  0.057  0.073  0.042  0.081  0.     0.034  0.007  0.     0.009
  0.153  0.     0.016  0.023  0.06   0.099  0.065  0.034  0.001  0.     0.
  0.089  0.03   0.     0.031  0.109  0.039  0.07   0.027  0.088  0.     0.025
  0.108  0.051  0.     0.033  0.     0.     0.009  0.     0.     0.015]
[ 0.042  0.031  0.042 ...,  0.05   0.029  0.04 ]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.01
Split value: 0.21
Keeping 1 clusters
Spliting 53 clusters
[('Hebron', 1961), ('Ramallah', 157), ('Haifa', 77), ('Nablus', 72), ('Bekaa', 62)]
[('Israel', 6220), ('Israeli', 5985), ('Palestinian', 4107), ('Netanyahu', 

Lasso coefficients: [ 0.057  0.027  0.103  0.05   0.     0.     0.     0.     0.     0.052
  0.073  0.     0.037  0.     0.     0.125  0.018  0.038  0.144  0.038
  0.182  0.066  0.139  0.014  0.022  0.111  0.     0.072  0.     0.013  0.
  0.     0.     0.029  0.021  0.     0.059  0.     0.003  0.016  0.05
  0.092  0.111  0.056  0.167  0.     0.     0.084  0.003  0.     0.047  0.
  0.     0.102  0.015  0.     0.35   0.     0.113  0.084  0.     0.     0.
  0.024  0.     0.036  0.026  0.038  0.271  0.012  0.005  0.123  0.016  0.
  0.019  0.072  0.008  0.07   0.02   0.023  0.     0.014  0.067  0.024
  0.062  0.023  0.02   0.029  0.     0.081  0.092  0.     0.093  0.101
  0.134  0.     0.063  0.     0.     0.162  0.     0.089  0.     0.     0.
  0.     0.   ]
[ 0.021  0.017  0.024 ...,  0.018  0.015  0.016]
Accuracy: 0.774
Kappa: 0.547
Deactivate value: 0.01
Split value: 0.21
Keeping 2 clusters
Spliting 62 clusters
[('Katyusha', 83), ('Haifa', 77), ('Kiryat', 74), ('Sidon', 56), ('Aqaba', 5

Lasso coefficients: [ 0.     0.12   0.067  0.01   0.114  0.046  0.     0.     0.038  0.     0.058
  0.053  0.037  0.     0.     0.     0.     0.142  0.     0.     0.065
  0.074  0.071  0.025  0.065  0.047  0.     0.081  0.018  0.026  0.007
  0.024  0.     0.117  0.09   0.012  0.055  0.032  0.016  0.     0.     0.
  0.     0.     0.066  0.     0.     0.     0.05   0.063  0.075  0.047
  0.083  0.043  0.     0.069  0.117  0.055  0.083  0.048  0.     0.063
  0.169  0.089  0.     0.02   0.278  0.083  0.029  0.142  0.     0.007  0.
  0.     0.009  0.     0.021  0.     0.049  0.24   0.     0.     0.103
  0.102  0.001  0.     0.015  0.     0.059  0.016  0.045  0.     0.03   0.
  0.     0.     0.017  0.     0.085  0.     0.002  0.021  0.029  0.051  0.
  0.     0.     0.018  0.176  0.     0.034  0.     0.014  0.     0.089
  0.135  0.019  0.     0.13   0.063  0.     0.012  0.019  0.13   0.058
  0.039]
[ 0.108  0.108  0.113 ...,  0.109  0.104  0.109]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 

Lasso coefficients: [ 0.007  0.003  0.085  0.014  0.02   0.     0.     0.061  0.048  0.     0.
  0.     0.019  0.035  0.     0.     0.     0.09   0.095  0.     0.     0.
  0.06   0.056  0.024  0.     0.071  0.     0.07   0.     0.     0.039
  0.088  0.042  0.     0.     0.     0.     0.     0.     0.129  0.046
  0.148  0.     0.     0.021  0.01   0.006  0.     0.032  0.     0.001  0.
  0.     0.     0.007  0.     0.     0.051  0.     0.046  0.     0.056  0.
  0.     0.001  0.     0.076  0.097  0.101  0.202  0.     0.     0.114
  0.045  0.     0.     0.     0.177  0.128  0.     0.026  0.     0.363  0.
  0.212  0.     0.044  0.     0.042  0.021  0.062  0.028  0.02   0.283
  0.079  0.002  0.072  0.15   0.008  0.     0.     0.     0.056  0.005
  0.049  0.046  0.017  0.     0.     0.     0.     0.     0.04   0.     0.
  0.024  0.     0.12   0.     0.     0.     0.     0.072  0.     0.042
  0.04   0.053  0.054  0.173  0.07   0.001  0.097  0.     0.067  0.     0.
  0.     0.     0.02   0.061 

Lasso coefficients: [ 0.098  0.     0.     0.029  0.     0.     0.067  0.     0.033  0.036  0.
  0.     0.     0.     0.     0.     0.065  0.158  0.042  0.     0.058
  0.01   0.     0.065  0.046  0.047  0.055  0.032  0.     0.     0.001
  0.102  0.     0.009  0.     0.152  0.     0.089  0.     0.     0.023
  0.007  0.     0.002  0.003  0.     0.044  0.     0.042  0.     0.126
  0.072  0.     0.     0.     0.097  0.1    0.     0.204  0.046  0.066
  0.11   0.04   0.     0.094  0.137  0.013  0.071  0.     0.     0.332
  0.051  0.098  0.     0.038  0.107  0.     0.     0.     0.048  0.     0.094
  0.     0.038  0.032  0.253  0.05   0.     0.02   0.059  0.03   0.084
  0.125  0.     0.     0.042  0.089  0.     0.     0.017  0.079  0.009
  0.045  0.     0.     0.003  0.     0.055  0.042  0.038  0.006  0.081
  0.046  0.086  0.083  0.     0.104  0.059  0.     0.     0.039  0.038
  0.028  0.     0.027  0.023  0.019  0.     0.031  0.118  0.004  0.058
  0.024  0.046  0.028  0.   ]
[ 0.062  0.066  

Lasso coefficients: [ 0.09   0.015  0.037  0.038  0.048  0.003  0.036  0.021  0.009  0.017  0.
  0.     0.08   0.135  0.057  0.     0.     0.     0.     0.     0.012  0.
  0.024  0.043  0.     0.048  0.081  0.011  0.032  0.     0.093  0.072
  0.004  0.117  0.075  0.02   0.026  0.018  0.047  0.     0.     0.015  0.
  0.036  0.007  0.04   0.     0.091  0.     0.05   0.12   0.048  0.134  0.
  0.047  0.05   0.021  0.091  0.     0.     0.043  0.012  0.114  0.02   0.
  0.     0.     0.288  0.026  0.062  0.     0.062  0.1    0.     0.013  0.
  0.     0.02   0.     0.009  0.     0.054  0.     0.01   0.307  0.     0.034
  0.     0.     0.023  0.027  0.     0.     0.254  0.     0.046  0.106  0.
  0.031  0.003  0.     0.     0.     0.03   0.067  0.018  0.     0.     0.068
  0.     0.069  0.     0.077  0.037  0.029  0.006  0.075  0.106  0.003  0.
  0.058  0.232  0.     0.005  0.     0.     0.031  0.     0.085  0.     0.042
  0.     0.046  0.     0.025  0.     0.045  0.056  0.002  0.057  0.075
  0.

Lasso coefficients: [ 0.     0.113  0.026  0.059  0.     0.029  0.025  0.018  0.058  0.071  0.
  0.     0.     0.004  0.     0.     0.08   0.     0.062  0.119  0.069
  0.077  0.016  0.043  0.     0.     0.044  0.084  0.     0.091  0.061
  0.064  0.     0.     0.     0.     0.084  0.083  0.046  0.     0.     0.057
  0.     0.184  0.     0.069  0.     0.002  0.007  0.021  0.     0.043  0.
  0.028  0.086  0.     0.     0.059  0.104  0.066  0.054  0.048  0.077  0.
  0.05   0.     0.     0.075  0.     0.067  0.054  0.     0.     0.088  0.
  0.184  0.     0.     0.095  0.     0.     0.105  0.129  0.014  0.312
  0.113  0.     0.     0.14   0.032  0.     0.     0.009  0.025  0.     0.08
  0.002  0.006  0.002  0.292  0.068  0.     0.012  0.024  0.     0.019
  0.151  0.079  0.     0.     0.135  0.058  0.     0.068  0.     0.088
  0.002  0.     0.209  0.     0.195  0.     0.     0.     0.     0.     0.029
  0.053  0.091  0.074  0.008  0.081  0.     0.043  0.027  0.285  0.009  0.
  0.     0.     0

Lasso coefficients: [ 0.     0.1    0.     0.     0.     0.005  0.     0.013  0.     0.04
  0.052  0.     0.021  0.007  0.     0.     0.001  0.047  0.     0.01
  0.081  0.019  0.     0.04   0.     0.     0.022  0.     0.074  0.     0.025
  0.023  0.038  0.009  0.     0.08   0.     0.028  0.09   0.     0.041
  0.017  0.     0.092  0.     0.     0.     0.118  0.052  0.098  0.01   0.06
  0.013  0.001  0.05   0.     0.     0.     0.042  0.     0.     0.     0.042
  0.     0.06   0.023  0.014  0.105  0.     0.044  0.     0.274  0.03
  0.021  0.06   0.097  0.053  0.     0.     0.024  0.006  0.023  0.094  0.
  0.     0.101  0.011  0.     0.111  0.     0.     0.     0.314  0.     0.
  0.     0.006  0.015  0.024  0.     0.087  0.328  0.     0.077  0.     0.
  0.     0.014  0.029  0.     0.144  0.     0.045  0.005  0.109  0.023
  0.07   0.     0.023  0.     0.029  0.084  0.164  0.164  0.002  0.001
  0.003  0.04   0.002  0.055  0.012  0.085  0.024  0.044  0.091  0.035
  0.016  0.     0.186  0.037

Lasso coefficients: [ 0.117  0.005  0.     0.002  0.     0.106  0.071  0.103  0.046  0.     0.04
  0.065  0.068  0.     0.     0.     0.     0.038  0.     0.031  0.     0.059
  0.     0.     0.     0.035  0.042  0.016  0.     0.061  0.074  0.007
  0.095  0.005  0.022  0.087  0.005  0.     0.027  0.063  0.105  0.069  0.
  0.042  0.     0.064  0.     0.     0.     0.048  0.035  0.063  0.077  0.
  0.045  0.043  0.055  0.05   0.035  0.     0.     0.059  0.     0.156  0.
  0.042  0.285  0.025  0.     0.     0.     0.066  0.05   0.     0.129  0.
  0.026  0.008  0.     0.     0.     0.     0.134  0.055  0.024  0.     0.107
  0.     0.008  0.297  0.024  0.018  0.     0.019  0.046  0.     0.259
  0.026  0.085  0.     0.05   0.016  0.     0.048  0.177  0.022  0.058
  0.061  0.089  0.     0.     0.111  0.     0.     0.042  0.     0.     0.
  0.     0.106  0.012  0.     0.086  0.     0.134  0.     0.007  0.     0.101
  0.109  0.031  0.064  0.     0.055  0.127  0.049  0.     0.     0.009
  0.047  0

Lasso coefficients: [ 0.061  0.     0.029  0.011  0.105  0.     0.011  0.076  0.016  0.011  0.
  0.     0.039  0.037  0.044  0.096  0.006  0.082  0.     0.054  0.021
  0.02   0.     0.042  0.029  0.021  0.008  0.     0.053  0.04   0.025
  0.031  0.032  0.     0.     0.     0.018  0.087  0.     0.005  0.     0.
  0.054  0.001  0.     0.     0.     0.     0.     0.106  0.     0.018
  0.017  0.     0.042  0.     0.113  0.002  0.     0.     0.     0.     0.
  0.063  0.058  0.     0.053  0.     0.     0.     0.334  0.048  0.     0.
  0.     0.089  0.06   0.079  0.083  0.     0.     0.153  0.     0.     0.
  0.     0.101  0.     0.     0.284  0.022  0.005  0.     0.014  0.003
  0.004  0.041  0.048  0.243  0.051  0.012  0.098  0.053  0.02   0.026  0.
  0.     0.003  0.     0.056  0.276  0.     0.037  0.024  0.06   0.041  0.
  0.008  0.122  0.125  0.     0.016  0.041  0.189  0.     0.028  0.287  0.
  0.     0.06   0.     0.071  0.015  0.057  0.111  0.     0.049  0.044  0.
  0.     0.084  0.   

Lasso coefficients: [ 0.     0.032  0.     0.     0.049  0.009  0.046  0.033  0.043  0.     0.
  0.     0.     0.032  0.     0.     0.037  0.09   0.022  0.045  0.134  0.
  0.     0.017  0.05   0.     0.     0.034  0.     0.001  0.     0.036
  0.043  0.048  0.     0.068  0.     0.     0.     0.     0.     0.     0.011
  0.026  0.     0.     0.     0.074  0.     0.023  0.067  0.04   0.     0.13
  0.13   0.     0.     0.037  0.     0.     0.08   0.009  0.     0.09   0.
  0.043  0.028  0.07   0.033  0.018  0.207  0.016  0.041  0.11   0.022
  0.015  0.     0.075  0.     0.     0.063  0.044  0.035  0.048  0.069
  0.308  0.     0.016  0.036  0.     0.     0.     0.     0.056  0.301
  0.05   0.039  0.     0.     0.099  0.     0.058  0.     0.018  0.049
  0.037  0.     0.     0.065  0.173  0.015  0.069  0.     0.02   0.078
  0.015  0.     0.     0.116  0.069  0.     0.017  0.021  0.041  0.     0.073
  0.089  0.133  0.     0.     0.     0.     0.027  0.048  0.038  0.     0.
  0.119  0.     0.137

Lasso coefficients: [ 0.     0.029  0.     0.     0.     0.035  0.01   0.007  0.055  0.009
  0.046  0.     0.     0.     0.033  0.071  0.002  0.049  0.053  0.     0.092
  0.046  0.     0.     0.019  0.015  0.     0.031  0.     0.     0.005
  0.007  0.024  0.     0.     0.053  0.     0.028  0.     0.003  0.06
  0.008  0.062  0.     0.035  0.034  0.056  0.     0.025  0.099  0.106  0.
  0.006  0.025  0.012  0.021  0.01   0.046  0.     0.008  0.     0.     0.
  0.066  0.     0.041  0.005  0.     0.309  0.     0.     0.053  0.028
  0.056  0.095  0.1    0.     0.     0.     0.05   0.114  0.026  0.099  0.
  0.026  0.     0.028  0.     0.021  0.     0.     0.222  0.005  0.     0.
  0.004  0.     0.     0.211  0.069  0.     0.     0.042  0.083  0.028
  0.047  0.     0.001  0.     0.026  0.     0.     0.     0.039  0.186  0.
  0.031  0.     0.031  0.047  0.019  0.01   0.     0.     0.093  0.     0.
  0.068  0.09   0.012  0.097  0.     0.052  0.     0.025  0.     0.     0.08
  0.     0.     0.   

[('crowns', 3284), ('Dutch', 3093), ('Jan', 2870), ('AG', 2429), ('Sweden', 2230)]
[('pay', 7322), ('contract', 6647), ('cash', 5635), ('sold', 5481), ('buying', 5202)]
[('company', 30511), ('bank', 14926), ('investors', 8297), ('firm', 7916), ('shareholders', 4140)]
[('stolen', 475), ('theft', 282), ('stole', 223), ('stealing', 152), ('steal', 134)]
[('rebounds', 274), ('assists', 230), ('kills', 107), ('steals', 12), ('digs', 11)]
[('bribery', 245), ('bribes', 236), ('kickbacks', 96), ('bribe', 69), ('payoffs', 57)]
[('smuggling', 365), ('smuggled', 163), ('extortion', 144), ('mafia', 117), ('blackmail', 92)]
[('fraud', 1353), ('scam', 85), ('defraud', 52), ('defrauded', 38), ('defrauding', 34)]
[('embezzlement', 89), ('embezzling', 46), ('misappropriation', 36), ('misappropriated', 24), ('misappropriating', 11)]
[('laundered', 28), ('launder', 19)]
[('laundering', 228)]
[('tonnes', 11785), ('unit', 6081), ('operations', 5900), ('southern', 4096), ('air', 3723)]
[('military', 8769), 

### Third configuration

Deactivating clusters with a coefficient lower than 0.2 times the max(abs(coefficient)) and splitting those with lower value than 0.6 times the max(abs(coefficient))

In [18]:
epochs = 20

kmeans = deepcopy(original_kmeans)

alpha = None 

for i in range(epochs):
    print('Epoch %d of %d' % (i + 1, epochs))
    print('Number of clusters: %d' % (kmeans.n_clusters))
    bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=False, verbose=False)
    lasso, alpha = fit_lasso(bows_train, y_train, alpha=alpha, verbose=True)
    
    bows_val = clusters_bow(x_val, kmeans, w2v, verbose=False)
    validate(lasso, bows_val, y_val, threshold=0.02)
    
    keep, split = study_lasso(lasso.coef_, deactivate_threshold=0.1, verbose=True)

    print('Keeping %d clusters' % (len([x for x in keep if x == True])))
    print('Spliting %d clusters' % (len([x for x in split if x == True])))
    
    next_centers = update_centers(kmeans, X, keep, split)
    
    kmeans = update_kmeans(kmeans, next_centers, X)
    print_clusters(kmeans, vocab)

Epoch 1 of 20
Number of clusters: 50
Lasso coefficients: [ 0.184  0.     0.     0.063  0.186  0.099  0.     0.     0.205  0.     0.134
  0.115  0.107  0.084  0.173  0.025  0.     0.     0.     0.059  0.     0.
  0.     0.186  0.065  0.079  0.     0.132  0.122  0.163  0.     0.     0.
  0.043  0.186  0.     0.     0.     0.     0.262  0.     0.021  0.     0.
  0.041  0.149  0.169  0.     0.     0.   ]
[ 0.016  0.024  0.019 ...,  0.017  0.048  0.015]
Accuracy: 0.781
Kappa: 0.563
Deactivate value: 0.03
Split value: 0.16
Keeping 9 clusters
Spliting 14 clusters
[('detect', 117), ('detection', 84), ('detector', 40), ('detectors', 36), ('detecting', 26)]
[('German', 10280), ('Germany', 7676), ('crowns', 3284), ('Sweden', 2230), ('Bosnia', 1888)]
[('Russian', 6800), ('Russia', 5594), ('Moscow', 3607), ('Yeltsin', 3203), ('Czech', 2957)]
[('Vatican', 396), ('Catholics', 241), ('priest', 184), ('Archbishop', 149), ('priests', 143)]
[('Co', 9971), ('John', 6081), ('B', 3949), ('David', 3602), ('B

Lasso coefficients: [ 0.005  0.     0.042  0.066  0.     0.111  0.106  0.     0.     0.188
  0.079  0.014  0.     0.139  0.     0.065  0.116  0.134  0.     0.     0.061
  0.045  0.069  0.066  0.031  0.087  0.019  0.096  0.103  0.     0.011
  0.059  0.     0.096  0.19   0.323  0.     0.045  0.     0.135  0.     0.017
  0.089  0.     0.   ]
[ 0.009 -0.01   0.001 ...,  0.013 -0.003  0.012]
Accuracy: 0.557
Kappa: 0.120
Deactivate value: 0.03
Split value: 0.19
Keeping 1 clusters
Spliting 23 clusters
[('European', 23424), ('EU', 10599), ('German', 10280), ('France', 9362), ('Europe', 8932)]
[('parliament', 8303), ('NATO', 3638), ('parliamentary', 3476), ('Netanyahu', 3166), ('Arafat', 2478)]
[('Milan', 1479), ('Croatia', 1003), ('Serbian', 874), ('Barcelona', 851), ('Croatian', 821)]
[('Italy', 5356), ('Paris', 4298), ('km', 3934), ('Czech', 2957), ('Turkish', 2364)]
[('Russian', 6800), ('Russia', 5594), ('Moscow', 3607), ('Yeltsin', 3203), ('Soviet', 2302)]
[('China', 10364), ('Chinese', 49

Lasso coefficients: [ 0.053  0.2    0.     0.     0.076  0.     0.     0.     0.141  0.173
  0.007  0.03   0.033  0.     0.     0.062  0.     0.236  0.082  0.26   0.
  0.     0.015  0.13   0.062  0.     0.045  0.075  0.     0.     0.076  0.
  0.087  0.     0.     0.     0.217  0.     0.104  0.03   0.04   0.213
  0.044  0.178  0.015  0.13   0.     0.009  0.     0.     0.106]
[-0.008  0.001 -0.004 ..., -0.012 -0.013 -0.011]
Accuracy: 0.513
Kappa: 0.032
Deactivate value: 0.03
Split value: 0.16
Keeping 7 clusters
Spliting 19 clusters
[('European', 23424), ('EU', 10599), ('parliament', 8303), ('Germany', 7676), ('Russian', 6800)]
[('yen', 9919), ('pct', 8369), ('GDP', 4458), ('Plc', 4453), ('shareholders', 4140)]
[('Czech', 2957), ('Sweden', 2230), ('Austria', 1789), ('Bosnian', 1630), ('Milan', 1479)]
[('St', 2369), ('Catholic', 1136), ('Rome', 1019), ('Pope', 781), ('Vatican', 396)]
[('ministry', 5024), ('Jewish', 2561), ('Christian', 1461), ('religious', 1068), ('church', 898)]
[('compan

Lasso coefficients: [ 0.106  0.     0.064  0.126  0.03   0.058  0.013  0.053  0.198  0.026  0.
  0.     0.045  0.     0.209  0.112  0.265  0.     0.     0.034  0.     0.05
  0.051  0.064  0.     0.     0.     0.     0.046  0.261  0.     0.     0.097
  0.157  0.     0.     0.     0.258  0.161  0.     0.     0.111  0.097]
[ 0.123  0.121  0.131 ...,  0.121  0.12   0.123]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.03
Split value: 0.16
Keeping 6 clusters
Spliting 17 clusters
[('Russian', 6800), ('Russia', 5594), ('Moscow', 3607), ('Yeltsin', 3203), ('bln', 2674)]
[('political', 12465), ('party', 11253), ('EU', 10599), ('China', 10364), ('yen', 9919)]
[('Turkish', 2364), ('Bosnia', 1888), ('Bosnian', 1630), ('Serb', 1397), ('Cyprus', 1305)]
[('European', 23424), ('German', 10280), ('Europe', 8932), ('GMT', 8406), ('pct', 8369)]
[('club', 2206), ('Portugal', 1455), ('striker', 963), ('Barcelona', 851), ('Juventus', 633)]
[('England', 3425), ('stg', 3368), ('pence', 1491), ('Wales', 1244

Lasso coefficients: [ 0.179  0.     0.094  0.029  0.194  0.016  0.     0.     0.089  0.     0.023
  0.111  0.     0.075  0.     0.06   0.107  0.     0.008  0.183  0.     0.098
  0.024  0.     0.     0.382  0.012  0.032  0.05   0.082  0.04   0.033
  0.005  0.052  0.009  0.003  0.038  0.     0.191  0.086  0.003  0.007
  0.017  0.284  0.268  0.116  0.049  0.     0.     0.004  0.     0.     0.17
  0.151]
[ 0.085  0.08   0.084 ...,  0.086  0.08   0.079]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.04
Split value: 0.23
Keeping 3 clusters
Spliting 21 clusters
[('party', 11253), ('China', 10364), ('minister', 10086), ('yen', 9919), ('election', 9302)]
[('European', 23424), ('EU', 10599), ('France', 9362), ('analysts', 8719), ('GMT', 8406)]
[('Milosevic', 923), ('Dutroux', 454), ('Maskhadov', 354), ('Karadzic', 234), ('Denktash', 220)]
[('Russian', 6800), ('Moscow', 3607), ('Czech', 2957), ('Turkish', 2364), ('Poland', 2269)]
[('striker', 963), ('defender', 665), ('Juventus', 633), ('midfie

Lasso coefficients: [ 0.     0.     0.16   0.017  0.022  0.068  0.094  0.048  0.117  0.116  0.
  0.059  0.008  0.058  0.     0.134  0.006  0.     0.     0.256  0.045
  0.108  0.     0.135  0.     0.079  0.067  0.023  0.036  0.036  0.063  0.
  0.055  0.     0.     0.022  0.018  0.     0.031  0.169  0.     0.     0.
  0.167  0.172  0.177]
[ 0.067  0.07   0.067 ...,  0.066  0.064  0.066]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.03
Split value: 0.15
Keeping 6 clusters
Spliting 18 clusters
[('Serb', 1397), ('Milosevic', 923), ('OSCE', 889), ('Chernomyrdin', 876), ('Erbakan', 748)]
[('European', 23424), ('EU', 10599), ('Russian', 6800), ('Russia', 5594), ('Moscow', 3607)]
[('German', 10280), ('pct', 8369), ('Germany', 7676), ('crowns', 3284), ('Czech', 2957)]
[('club', 2206), ('Newcastle', 712), ('Manchester', 657), ('cricket', 644), ('rugby', 638)]
[('striker', 963), ('defender', 665), ('midfielder', 620), ('goalkeeper', 483), ('strikers', 421)]
[('London', 11016), ('York', 10679), 

Lasso coefficients: [ 0.136  0.     0.056  0.137  0.173  0.02   0.03   0.     0.05   0.101
  0.101  0.018  0.049  0.139  0.123  0.039  0.     0.036  0.074  0.     0.154
  0.102  0.     0.     0.15   0.     0.     0.     0.     0.     0.     0.
  0.     0.053  0.05   0.11   0.     0.03   0.047  0.013  0.121  0.153
  0.192  0.178]
[ 0.099  0.084  0.096 ...,  0.103  0.083  0.102]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.02
Split value: 0.12
Keeping 11 clusters
Spliting 16 clusters
[('Serb', 1397), ('Milosevic', 923), ('Chernomyrdin', 876), ('Erbakan', 748), ('Serbs', 669)]
[('European', 23424), ('EU', 10599), ('German', 10280), ('Europe', 8932), ('Germany', 7676)]
[('pct', 8369), ('Russian', 6800), ('Ministry', 4503), ('bln', 2674), ('Social', 2429)]
[('Newcastle', 712), ('Liverpool', 517), ('UEFA', 473), ('Chelsea', 372), ('Arsenal', 315)]
[('penalty', 1252), ('ball', 1106), ('striker', 963), ('defender', 665), ('midfielder', 620)]
[('MON', 992), ('Manchester', 657), ('Villa', 28

Lasso coefficients: [ 0.144  0.094  0.     0.006  0.091  0.     0.02   0.115  0.06   0.071
  0.041  0.159  0.007  0.     0.116  0.064  0.124  0.039  0.117  0.006
  0.079  0.134  0.13   0.118  0.     0.     0.093  0.     0.007  0.     0.106
  0.057  0.083  0.02   0.073  0.008  0.092  0.039  0.033  0.069  0.037
  0.065  0.     0.     0.     0.169  0.115  0.18   0.184  0.177]
[ 0.167  0.167  0.167 ...,  0.17   0.167  0.168]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.02
Split value: 0.11
Keeping 14 clusters
Spliting 21 clusters
[('Moscow', 3607), ('Yeltsin', 3203), ('Soviet', 2302), ('zlotys', 1123), ('Ukraine', 1105)]
[('Milosevic', 923), ('Tutsis', 537), ('Karadzic', 234), ('Djindjic', 120), ('Mladic', 95)]
[('Bosnia', 1888), ('Bosnian', 1630), ('Serb', 1397), ('Cyprus', 1305), ('Albania', 1270)]
[('Bayern', 369), ('Klinsmann', 126), ('Capello', 107), ('Eriksson', 76), ('Maradona', 67)]
[('club', 2206), ('Euro', 559), ('Football', 559), ('UEFA', 473), ('Fiat', 280)]
[('basket', 580

Lasso coefficients: [ 0.042  0.074  0.04   0.     0.027  0.007  0.003  0.044  0.     0.031
  0.002  0.075  0.07   0.236  0.059  0.177  0.038  0.111  0.     0.037
  0.26   0.063  0.     0.098  0.     0.064  0.     0.     0.062  0.114
  0.009  0.067  0.108  0.057  0.015  0.     0.     0.112  0.033  0.034
  0.042  0.     0.011  0.182  0.18   0.126  0.165  0.145  0.161]
[ 0.135  0.134  0.135 ...,  0.138  0.136  0.135]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.03
Split value: 0.16
Keeping 7 clusters
Spliting 26 clusters
[('Russian', 6800), ('Russia', 5594), ('NATO', 3638), ('Moscow', 3607), ('Yeltsin', 3203)]
[('EU', 10599), ('pct', 8369), ('parliament', 8303), ('Plc', 4453), ('francs', 3977)]
[('Boris', 1390), ('Lebed', 929), ('Viktor', 731), ('Vladimir', 466), ('Sergei', 441)]
[('Republic', 2830), ('Social', 2429), ('Socialist', 2058), ('federation', 860), ('Sinn', 683)]
[('Capello', 107), ('Eriksson', 76), ('Dalglish', 65), ('Gullit', 63), ('Hoddle', 59)]
[('UEFA', 473), ('Bayern'

Lasso coefficients: [ 0.023  0.07   0.047  0.091  0.     0.002  0.08   0.     0.091  0.     0.099
  0.085  0.     0.116  0.206  0.091  0.007  0.008  0.015  0.013  0.067
  0.174  0.111  0.     0.     0.077  0.075  0.146  0.     0.     0.     0.083
  0.03   0.     0.027  0.059  0.051  0.06   0.084  0.     0.234  0.147
  0.004  0.     0.001  0.204  0.021  0.257  0.085  0.     0.003  0.041]
[ 0.069  0.073  0.072 ...,  0.068  0.064  0.068]
Accuracy: 0.503
Kappa: 0.000
Deactivate value: 0.03
Split value: 0.15
Keeping 5 clusters
Spliting 24 clusters
[('European', 23424), ('German', 10280), ('Europe', 8932), ('Germany', 7676), ('Russia', 5594)]
[('EU', 10599), ('parliament', 8303), ('NATO', 3638), ('Yeltsin', 3203), ('Socialist', 2058)]
[('Polish', 1584), ('zlotys', 1123), ('zloty', 378), ('Vaclav', 269), ('Gyula', 115)]
[('Helmut', 1000), ('Klaus', 741), ('Hans', 663), ('der', 422), ('Juergen', 287)]
[('Russian', 6800), ('Soviet', 2302), ('Boris', 1390), ('Lebed', 929), ('Viktor', 731)]
[('Sw

In [None]:
# epochs = 20

# kmeans = deepcopy(original_kmeans)

# for i in range(epochs):
#     print('Epoch %d of %d' % (i + 1, epochs))
#     print('Number of clusters: %d' % (kmeans.n_clusters))
#     bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=True, verbose=False)
#     lasso = fit_lasso(bows_train, y_train, verbose=True)
    
#     bows_val = clusters_bow(x_val, kmeans, w2v, verbose=False)
#     validate(lasso, bows_val, y_val, threshold=0.02)
    
#     keep, split = study_lasso(lasso.coef_, deactivate_value=0.01, use_mean=True, verbose=True)

#     print('Keeping %d clusters' % (len([x for x in keep if x == True])))
#     print('Spliting %d clusters' % (len([x for x in split if x == True])))
    
#     next_centers = update_centers(kmeans, X, keep, split)
    
#     kmeans = update_kmeans(kmeans, next_centers, X)
#     print_clusters(kmeans, vocab)

In [None]:
# epochs = 20

# kmeans = deepcopy(original_kmeans)

# for i in range(epochs):
#     print('Epoch %d of %d' % (i + 1, epochs))
#     print('Number of clusters: %d' % (kmeans.n_clusters))
#     bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=True, verbose=False)
#     lasso = fit_lasso(bows_train, y_train, verbose=True)
    
#     bows_val = clusters_bow(x_val, kmeans, w2v, verbose=False)
#     validate(lasso, bows_val, y_val, threshold=0.02)
    
#     keep, split = study_lasso(lasso.coef_, deactivate_value=0.01, verbose=True)

#     print('Keeping %d clusters' % (len([x for x in keep if x == True])))
#     print('Spliting %d clusters' % (len([x for x in split if x == True])))
    
#     next_centers = update_centers(kmeans, X, keep, split)
    
#     kmeans = update_kmeans(kmeans, next_centers, X)
#     print_clusters(kmeans, vocab)

In [None]:
# epochs = 20

# kmeans = deepcopy(original_kmeans)

# for i in range(epochs):
#     print('Epoch %d of %d' % (i + 1, epochs))
#     print('Number of clusters: %d' % (kmeans.n_clusters))
#     bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=True, verbose=False)
#     lasso = fit_lasso(bows_train, y_train, verbose=True)
    
#     bows_val = clusters_bow(x_val, kmeans, w2v, verbose=False)
#     validate(lasso, bows_val, y_val, threshold=0.02)
    
#     keep, split = study_lasso(lasso.coef_, deactivate_threshold=0.2, verbose=True)

#     print('Keeping %d clusters' % (len([x for x in keep if x == True])))
#     print('Spliting %d clusters' % (len([x for x in split if x == True])))
    
#     next_centers = update_centers(kmeans, X, keep, split)
    
#     kmeans = update_kmeans(kmeans, next_centers, X)
#     print_clusters(kmeans, vocab)