# Seleccionando palabras

In [1]:
import string
import random
import numpy as np
import reuters_reader
from kmc2 import kmc2
from collections import Counter
from sklearn.externals import joblib
from gensim.models.keyedvectors import KeyedVectors
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from copy import deepcopy

np.set_printoptions(precision=3)

### RCV1 Dataset

Use the function ```reuters_reader.reader(path)``` to retrieve the available documents from the rcv1 dataset stored in `path`. This function returns a generator (```reader```) which yields a single document each time we call ```next(reader)```. Each document is a dictionary with the followitn useful keys:
 - "title" is the title of the document
 - "text" is the body of the document
 - "bip:topics:1.0" is the list of topics
 
There are a total of 804420 available documents, although some may have no topic.

#### Building the dataset
We build a balanced dataset that contains ```n_docs```. To get a balanced dataset we iterate through the documents generator until we have ```n_docs / 2``` documents with the desired topic and the same amount without it. 

#### Get the labels
Select a topic we want to classify using the variable topic (there is a list of the topics https://gist.github.com/gavinmh/6253739). Then build the list of labels using a 1 for those documents with that topic and 0 otherwise

#### Training and validation set
Finally, we split the dataset using the ```train_split``` value. 

In [2]:
path = 'rcv1'
n_docs = 100000
train_split = 0.8
topic = 'GCAT'

docs = []
reader = reuters_reader.reader(path)

topic_true = 0
topic_false = 0

while len(docs) < n_docs:
    doc = next(reader)
    if topic in doc['bip:topics:1.0']:
        topic_true += 1
        if topic_true <= n_docs // 2:
            docs.append(doc)
    else:
        topic_false += 1
        if topic_false <= n_docs // 2:
            docs.append(doc)
     
random.shuffle(docs)

labels = np.zeros((n_docs), dtype=np.int16)
labels = [1 if topic in doc['bip:topics:1.0'] else 0 for doc in docs]

print('{} docs with topic {} (from {})'.format(np.sum(labels), topic, n_docs))

split_point = int(n_docs * train_split)
x_train, y_train = docs[:split_point], labels[:split_point]
x_val, y_val = docs[split_point:], labels[split_point:]

print('Training with {} docs'.format(len(x_train)))
print('Validating with {} docs'.format(len(x_val)))

19961028 / 146544newsML.xml failed to parse XML.
19970601 / 629003newsML.xml failed to parse XML.
19970725 / 756041newsML.xml failed to parse XML.
50000 docs with topic GCAT (from 100000)
Training with 80000 docs
Validating with 20000 docs


### Word2vec model

We are loading the well known word2vec model from __[Google](https://code.google.com/archive/p/word2vec/)__ which is stored in the binary file `GoogleNews-vectors-negative300.bin`.

Load the word2vec model

In [3]:
w2v_name = 'GoogleNews-vectors-negative300.bin'
w2v = KeyedVectors.load_word2vec_format(w2v_name, binary=True)

### Get our vocabulary

Get all the the vectors from the word2vec for our vocabulary. Our vocabulary can include all the words used in the word2vec model or be limited to the words in our dataset.

We can change this behaviour with the flag ```dataset_vocabulary```. ```False``` will use all the words from the word2vec model and ```True``` will limit them to just the words that are in our dataset and in the model at the same time.

There is a ```count_threshold``` to remove those words appearing very few times because they are probably errors.

As we have to split each documents in individual words, we already save this inside each document with the key "counter".

After this cell, ```X``` is a matrix including all the vectors we are going to use.

In [4]:
dataset_vocabulary = True
count_threshold = 5

if dataset_vocabulary:
    vocab = Counter()
    for doc in docs:
        doc["counter"] = Counter()
        doc["word_count"] = 0
        words = doc["text"].split()
        words = [word.strip(string.punctuation) for word in words]
        for word in words:
            if word in w2v:
                doc["counter"][word] += 1
                doc["word_count"] += 1
                vocab[word] += 1
    vocab = {word: count for word, count in vocab.items() if count > count_threshold}
    vocab_array = np.array(list(vocab))
else:
    vocab = w2v.index2word

X = np.zeros((len(vocab), w2v.vector_size), dtype=np.float32)
for index, word in enumerate(vocab):
    X[index, :] += w2v[word]
    
print("Vocabulary length: {}".format(X.shape[0]))
print("Vector length: {}".format(X.shape[1]))

Vocabulary length: 62100
Vector length: 300


In [5]:
from scipy.sparse import csr_matrix

word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = dict(enumerate(vocab))
indptr = [0]
indices = []
data = []

idf_docs = Counter()
for doc in docs:
    for word in doc["counter"]:
        idf_docs[word] += 1

for doc in docs:
    max_f = doc["counter"].most_common(1)[0][1]
    for word in doc["counter"]:
        if word in word2idx:
            indices.append(word2idx[word])
            tf = doc["counter"][word] / max_f
            idf = np.log(n_docs / idf_docs[word])
            data.append(tf * idf)
    indptr.append(len(indices))
    
matrix = csr_matrix((data, indices, indptr), dtype=float)

### Baseline

In [6]:
# split_point = int(n_docs * train_split)
# baseline_x_train, baseline_y_train = matrix[:split_point], labels[:split_point]
# baseline_x_val, baseline_y_val = matrix[split_point:], labels[split_point:]

# lasso = fit_lasso(baseline_x_train, baseline_y_train)
# validate(lasso, baseline_x_val, baseline_y_val)

### Initial clusters


In [7]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
scores, _ = chi2(matrix, labels)
sorted_idx = np.argsort(scores, kind="mergesort")[-50:]
words = [idx2word[idx] for idx in sorted_idx]
initial_centers = np.array([w2v[word] for word in words])
words

['Shares',
 'Revs',
 'DATE',
 'tonnes',
 'Year',
 'Quarterly',
 'stocks',
 'Prior',
 'Record',
 'Avg',
 'profit',
 'Latest',
 'futures',
 'Pay',
 'rating',
 'Ended',
 'price',
 'loss',
 'Co',
 'rose',
 'traders',
 'net',
 'Total',
 'NOTE',
 'company',
 'prices',
 'stock',
 'bonds',
 'Income',
 'sales',
 'Shr',
 'police',
 'Ltd',
 'income',
 'stories',
 'index',
 'pct',
 'percent',
 'Amount',
 'cents',
 'market',
 'his',
 'million',
 'yen',
 'Inc',
 'per',
 'share',
 'shares',
 'vs',
 'Net']

Let's take a look at the most common and uncommon words in our dataset

In [8]:
print(Counter(vocab).most_common()[:10])
print(Counter(vocab).most_common()[-10:])

[('the', 1244626), ('in', 526190), ('said', 330822), ('on', 294388), ('for', 233096), ('The', 208167), ('that', 164547), ('was', 163462), ('at', 151405), ('is', 150826)]
[('stampers', 6), ('Schwertfeger', 6), ('CACL', 6), ('supertasters', 6), ('Ozimek', 6), ('SARU', 6), ('Kometani', 6), ('blockers', 6), ('Balestrino', 6), ('ISESCO', 6)]


### Initial Cluster

Train the first Kmeans cluster using the complete set of words in our dataset.

In [9]:
n_clusters = 50
# kmeans_name = 'kmeans' + str(n_clusters) + '.pkl'

In [10]:
from sklearn.cluster import MiniBatchKMeans

class Kmeans:

    def __init__(self, n_clusters):
        self.cluster_centers_ = []
        self.labels_ = []
        self.n_clusters = n_clusters
        self.kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0, compute_labels=True)
    
    def fit(self, X):
        self.kmeans.fit(X)
        self.cluster_centers_ = kmc2(X, self.n_clusters)
        self.update(self.cluster_centers_, X)
    
    def update(self, new_centers, X):
        self.cluster_centers_ = self.kmeans.cluster_centers_ = new_centers
        self.n_clusters = len(new_centers)
        self.labels_, _ = self.kmeans._labels_inertia_minibatch(X)
        self.kmeans.labels_ = self.labels_
        self.counts_ = self.kmeans.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
        for i in range(self.n_clusters):
            self.counts_[i] = np.sum(self.labels_ == i)
            self.kmeans.counts_[i] = np.sum(self.kmeans.labels_ == i)
        
    def predict(self, x):
        return self.kmeans.predict(x)
            
        
kmeans = Kmeans(n_clusters=n_clusters)
kmeans.fit(X)

### Print each cluster

To have an idea of the clusters we are workings with we can print some of their neighbords. The option selected here is to print the most representative neighbords (those that appear more times in the dataset)

In [11]:
def print_clusters(kmeans, vocab, topn=5): 
    vocab_array = np.array(list(vocab))
    clusters_len = np.empty((kmeans.n_clusters), dtype=int)
    for (i, center) in enumerate(kmeans.cluster_centers_):
        cluster_vocab = vocab_array[np.where(kmeans.labels_ == i)[0]]
        clusters_len[i] = len(cluster_vocab)
        cluster_counter = Counter({w: vocab[w] for w in cluster_vocab})
#         for v in cluster_vocab:
#             cluster_counter[v] = vocab[v]
        print(cluster_counter.most_common(topn))
    print("Clusters mean length: %d" % (np.mean(clusters_len)))
    print("Clusters min length: %d" % (np.min(clusters_len)))
    print("Clusters max length: %d" % (np.max(clusters_len)))
    

print_clusters(kmeans, vocab)

[('Liberal', 874), ('Chretien', 331), ('Liberals', 168), ('Axworthy', 110), ('Mulroney', 63)]
[('barrels', 2418), ('refinery', 1543), ('Petroleum', 1531), ('bpd', 1375), ('pipeline', 1225)]
[('MDL', 12), ('demurrage', 11), ('McMoRan', 6)]
[('War', 2700), ('Digital', 355), ('Pen', 236), ('Extraordinary', 234), ('Book', 233)]
[('Securities', 6042), ('property', 3343), ('land', 3137), ('Holdings', 2313), ('Investment', 2049)]
[('analyst', 5205), ('Research', 1822), ('Results', 1293), ('Statistics', 1074), ('Latest', 996)]
[('Conrail', 495), ('highway', 476), ('Rt', 256), ('Highway', 114), ('Interstate', 87)]
[('Instruments', 176), ('Controls', 58), ('sensors', 50), ('detectors', 36), ('Components', 28)]
[('setting', 1714), ('creating', 981), ('dealing', 870), ('critics', 638), ('ignored', 483)]
[('pesos', 2058), ('baht', 1416), ('ringgit', 1249), ('Philippine', 1237), ('rupiah', 1196)]
[('weapons', 1940), ('guns', 428), ('artillery', 387), ('ammunition', 297), ('weapon', 247)]
[('British'

### Build a bow model 

Using the clusters from the kmeans classifier build a bow for each document. This bag of words can be normalized usign the frequency of each word with ```useFrequency=True```. 

How? 
* For each document
    * For each word
        * Obtain the w2v vector for that word
        * Obtain the cluster for that vector
        * Add 1 to that cluster in the document bow

To improve the performance the cluster for each word is saved in a dictionary. This way for each word we first check that dictionary instead of first the w2v model and then the classifier.


In [12]:
def clusters_bow(docs_to_bow, kmeans, w2v, reference_bows=None, useFrequency=True, verbose=False):
    n_docs = len(docs_to_bow)
    n_clusters = kmeans.n_clusters
    bo_clusters = np.zeros((n_docs, n_clusters))

    hashed_clusters = {}
    for (i, doc) in enumerate(docs_to_bow):
        for word, count in doc["counter"].items():
            if word in hashed_clusters:
                cluster = hashed_clusters[word]
            else:
                cluster = kmeans.predict([w2v[word]])[0]
                hashed_clusters[word] = cluster
            bo_clusters[i][cluster] += count
     
    
    if reference_bows is None:
        normalized = np.round(bo_clusters / (bo_clusters + 0.001))    
    else:
        normalized = np.round(reference_bows / (reference_bows + 0.001))
        n_docs = len(reference_bows)
            
    if useFrequency:
        max_doc = np.max(bo_clusters, axis=1)
        tf = bo_clusters / max_doc.reshape((bo_clusters.shape[0], 1)).repeat(bo_clusters.shape[1], axis=1)
#         print("tf")
#         print(tf)
        count = np.sum(normalized, axis=0) + 1
#         print("count")
#         print(count)
        idf = np.log(n_docs / count)
#         print("idf")
#         print(idf)
        boc = tf * idf
    else:
        boc = normalized
    
    if verbose:
        for i in range(10):
            print('Document %d: %s (sum = %.2f)' % (i + 1, np.array2string(boc[i, :]), np.sum(boc[i, :])))
        
    return boc

# bows = clusters_bow(docs, kmeans, w2v)
# print(bows)

### Lasso model

Once we have our bags of words we can study how good are our clusters. To do so we use a Lasso function feeding it with the bows. 

#### Build and train the model

We build a Lasso model using the sklearn functions. Lasso is configured to only use positive coefficients (because it is easir to visualize them). 

If we do not have an ```alpha``` value the function uses cross validation to obtain it. 

#### Keep, split or discard clusters

Using the lasso coefficients we can decide how good is a particular cluster. There are different ways to make this decision: 

 - With ```use_mean=False``` and no value for ```deactivate_threshold``` we remove clusters with a coefficient lower than ```deactivate_value``` (typically near 0), split those with a coefficient lower than ```split_threshold``` times the maximum coefficient, and kept those with the higher coefficients. 
 
- With ```use_mean=False``` and a value for ```deactivate_threshold``` we remove clusters with a coefficient lower than ```deactivate_threshold``` times the maximum coefficient, split those with a coefficient lower than ```split_threshold``` times the maximum coefficient, and kept those with the higher coefficients. 

- With ```use_mean=True``` we remove clusters with a coefficient lower than the mean coefficient minus the standard deviation, split those between that value and mean plus standard coefficient, and kept those with a coefficient higher than this last value (mean + std)

  

In [13]:
def fit_lasso(bows, labels, alpha=None, verbose=False):
    if alpha:
        clf = linear_model.Lasso(alpha=alpha, positive=True)
    else:
        clf = linear_model.LassoCV(positive=True)
    clf.fit(bows, labels)
    
    if verbose:
        print('Lasso coefficients: %s' % (np.array2string(clf.coef_, suppress_small=True)))
        
    if alpha:
        return clf, clf.alpha_
    else:
        return clf, alpha

# lasso = fit_lasso(bows, labels)

In [14]:
def study_lasso(lasso_values, deactivate_value=0.1, deactivate_threshold=None, 
                split_threshold=0.6, use_mean=False, verbose=False):  
    
    if use_mean:
        mean = np.mean([i for i in np.abs(lasso_values) if i > 0])
        std = np.std([i for i in np.abs(lasso_values) if i > 0])
        deactivate_value = mean - std
        split_value = mean + std
    else:
        max_w = np.max(np.abs(lasso_values))
        split_value = split_threshold * max_w
        if deactivate_threshold:
            deactivate_value = deactivate_threshold * max_w
        
    if verbose:
        print('Deactivate value: %.2f' % (deactivate_value))
        print('Split value: %.2f' % (split_value))
    
    deactivate = [x <= deactivate_value for x in np.abs(lasso_values)]
    split = [x > deactivate_value and x <= split_value for x in np.abs(lasso_values)]
    keep = [x > split_value for x in np.abs(lasso_values)]
    
    return keep, split, deactivate

# keep, split = study_lasso(lasso.coef_)

# print('Keeping %d clusters' % (len([x for x in keep if x == True])))
# print('Spliting %d clusters' % (len([x for x in split if x == True])))

# new_count = len([x for x in keep if x == True]) + 2 * len([x for x in split if x == True])
# print('Using %d clusters in the next iteration' % (new_count))

### Build new clusters

Once we know which clusters to keep and which to split, we can build the new ones. 

To split one cluster we need to select all the points belonging to that cluster and classify then using 2 neighbourds. 

In [15]:
def update_centers(kmeans, X, keep, split):
    n_centers = len([x for x in keep if x == True]) + 2 * len([x for x in split if x == True])
    updated_centers = np.empty((n_centers, kmeans.cluster_centers_.shape[1]))
    new_idx = 0
    for i in range(len(keep)):
        if keep[i]:
            updated_centers[new_idx, :] = kmeans.cluster_centers_[i, :]
            new_idx += 1
        if split[i]:
            # create kmeans with this data
            newX = X[np.where(kmeans.labels_ == i)[0], :]
            if (newX.shape[0] < 2):
                continue
            small_class = MiniBatchKMeans(n_clusters=2, random_state=0, compute_labels=True)
            small_class.fit(newX)
            updated_centers[new_idx, :] = small_class.cluster_centers_[0, :]
            updated_centers[new_idx + 1, :] = small_class.cluster_centers_[1, :]
            new_idx += 2
            
    return updated_centers[:new_idx, :]
        
# new_centers = update_centers(kmeans, X, keep, split)

In [16]:
def update_dataset(kmeans, X, vocab, deactivate, keep_if_big=None):
    if keep_if_big:
        clusters = [i for i in range(len(deactivate)) if deactivate[i] and 
                len(np.where(kmeans.labels_ == i)[0]) <= keep_if_big]
    else:
        clusters = [i for i in range(len(deactivate)) if deactivate[i]]
    words = np.empty((0,), dtype=np.int)
    for cluster_i in clusters:
        words = np.concatenate((words, np.array(np.where(kmeans.labels_ == cluster_i)[0])))
    mask = np.ones(X.shape[0], dtype=bool)
    mask[words] = False
    X = X[mask, :]
    new_vocab = {w: c for i, (w, c) in enumerate(vocab.items()) if mask[i]}
    return X, new_vocab

# X2, vocab2 = update_dataset(kmeans, X, vocab, deactivate)

### Update the classifier

We have to use the new clusters to classify the data in the following steps. One possible to solution to use these new clusters is to update the classifier centers and the relevant attributes. Then we can use the classifier's function ```predict``` as before. Moreover, to keep using this classifier to build the next cluster we also need to update the ```labels_``` and ```counts_``` parameters. 

In [17]:
def update_kmeans(kmeans, new_centers, x):
    kmeans.cluster_centers_ = new_centers
    kmeans.n_clusters = len(new_centers)
    kmeans.labels_, _ = kmeans._labels_inertia_minibatch(x)
    kmeans.counts_ = np.zeros(kmeans.n_clusters, dtype=np.int32)
    for i in range(kmeans.n_clusters):
        kmeans.counts_[i] = np.sum(kmeans.labels_ == i)
    
    return kmeans

# kmeans = update_kmeans(kmeans, new_centers, X)

### Validation

In [18]:
def validate(lasso, bows, y_true, threshold=None):
    y_predicted = lasso.predict(bows)
    print("Predicted:")
    print(y_predicted[:10])
    print(y_predicted[-10:])
    print("True:")
    print(y_true[:10])
    print(y_true[-10:])
    if threshold is None:
        threshold = np.mean(y_predicted)
    y_predicted = [1 if i > threshold else 0 for i in y_predicted]
    accuracy = accuracy_score(y_val, y_predicted)
    kappa = cohen_kappa_score(y_val, y_predicted)
    
    print('Accuracy: %.3f' % (accuracy))
    print('Kappa: %.3f' % (kappa))

## Play time

Start by saving the first kmeans (so we can use it multiple times) and printing the first set of clusters. 

In [19]:
original_kmeans = deepcopy(kmeans)
original_x = deepcopy(X)
original_vocab = deepcopy(vocab)
print_clusters(kmeans, vocab)

[('Liberal', 874), ('Chretien', 331), ('Liberals', 168), ('Axworthy', 110), ('Mulroney', 63)]
[('barrels', 2418), ('refinery', 1543), ('Petroleum', 1531), ('bpd', 1375), ('pipeline', 1225)]
[('MDL', 12), ('demurrage', 11), ('McMoRan', 6)]
[('War', 2700), ('Digital', 355), ('Pen', 236), ('Extraordinary', 234), ('Book', 233)]
[('Securities', 6042), ('property', 3343), ('land', 3137), ('Holdings', 2313), ('Investment', 2049)]
[('analyst', 5205), ('Research', 1822), ('Results', 1293), ('Statistics', 1074), ('Latest', 996)]
[('Conrail', 495), ('highway', 476), ('Rt', 256), ('Highway', 114), ('Interstate', 87)]
[('Instruments', 176), ('Controls', 58), ('sensors', 50), ('detectors', 36), ('Components', 28)]
[('setting', 1714), ('creating', 981), ('dealing', 870), ('critics', 638), ('ignored', 483)]
[('pesos', 2058), ('baht', 1416), ('ringgit', 1249), ('Philippine', 1237), ('rupiah', 1196)]
[('weapons', 1940), ('guns', 428), ('artillery', 387), ('ammunition', 297), ('weapon', 247)]
[('British'

### First configuration

Using the mean and standard deviation to select which clusters to keep or divide.

The output for each epoch is:

 - Current number of clusters
 - Lasso coefficients
 - Predicted values for the validation set
 - Accuracy
 - Kappa
 - Value for deactivating clusters (those with lower coefficient)
 - Value for splitting clusters (those with lower coefficient and not deactivated)
 - Clusters to be kept
 - Clusters to be split
 - Clusters

In [20]:
# epochs = 20

# kmeans = deepcopy(original_kmeans)

# alpha = None

# for i in range(epochs):
#     print('Epoch %d of %d' % (i + 1, epochs))
#     print('Number of clusters: %d' % (kmeans.n_clusters))
#     bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=True, verbose=False)
#     lasso, alpha = fit_lasso(bows_train, y_train, alpha=alpha, verbose=True)
    
#     bows_val = clusters_bow(x_val, kmeans, w2v, reference_bows=bows_train, useFrequency=True, verbose=False)
#     validate(lasso, bows_val, y_val)
    
#     keep, split = study_lasso(lasso.coef_, use_mean=True, verbose=True)

#     print('Keeping %d clusters' % (len([x for x in keep if x == True])))
#     print('Spliting %d clusters' % (len([x for x in split if x == True])))
    
#     next_centers = update_centers(kmeans, X, keep, split)
    
#     kmeans = update_kmeans(kmeans, next_centers, X)
#     print_clusters(kmeans, vocab)    

### Second configuration

Deactivating clusters with a coefficient lower than 0.01 and splitting those with lower value than 0.6 times the max(abs(coefficient))

In [21]:
epochs = 20

kmeans = deepcopy(original_kmeans)
X = deepcopy(original_x)
vocab = deepcopy(original_vocab)

alpha = None

for i in range(epochs):
    print('Epoch %d of %d' % (i + 1, epochs))
    print('Number of clusters: %d' % (kmeans.n_clusters))
    bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=True, verbose=False)
    lasso, alpha = fit_lasso(bows_train, y_train, alpha=alpha, verbose=True)
    
    bows_val = clusters_bow(x_val, kmeans, w2v, reference_bows=bows_train, useFrequency=True, verbose=False)
    validate(lasso, bows_val, y_val)
    
    keep, split, deactivate = study_lasso(lasso.coef_, deactivate_value=0.01, verbose=True)

    print('Keeping %d clusters' % (len([x for x in keep if x == True])))
    print('Spliting %d clusters' % (len([x for x in split if x == True])))
    print('Deactivating %d clusters' % (len([x for x in deactivate if x == True])))
    
    next_centers = update_centers(kmeans, X, keep, split)
    print("Before", X.shape)
    X, vocab = update_dataset(kmeans, X, vocab, deactivate, keep_if_big=1000)
    print("After", X.shape)
    
#     kmeans = update_kmeans(kmeans, next_centers, X)
    kmeans.update(next_centers, X)
    print_clusters(kmeans, vocab)

Epoch 1 of 20
Number of clusters: 50
Lasso coefficients: [  1.18    0.      0.      0.      0.      0.      0.      0.      2.927
   0.      7.383   1.353   4.09    1.519   1.855   0.      2.057   2.342
   0.173   0.      3.655   0.      0.      0.      1.906   7.396   1.02
   0.      0.      2.085   0.      0.      0.      0.      0.917   0.      0.
   0.     14.484   1.608   0.      7.305   0.      0.349   1.876   0.
   0.118   0.      1.278   0.   ]
Predicted:
[ 0.302  0.768  0.302  0.458  0.302  0.302  0.302  0.667  0.404  0.798]
[ 0.571  0.43   0.516  0.386  0.32   0.339  0.527  0.302  0.39   0.302]
True:
[0, 1, 1, 1, 0, 0, 1, 1, 0, 1]
[1, 1, 1, 0, 0, 1, 1, 1, 0, 0]
Accuracy: 0.706
Kappa: 0.412
Deactivate value: 1.45
Split value: 8.69
Keeping 1 clusters
Spliting 14 clusters
Deactivating 35 clusters
Before (62100, 300)
After (59446, 300)
[('its', 80655), ('state', 19427), ('under', 17366), ('Commission', 8876), ('cut', 8637)]
[('the', 1244626), ('in', 526190), ('on', 294388), ('for

Lasso coefficients: [ 0.]
Predicted:
[ 0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5]
[ 0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5  0.5]
True:
[0, 1, 1, 1, 0, 0, 1, 1, 0, 1]
[1, 1, 1, 0, 0, 1, 1, 1, 0, 0]
Accuracy: 0.500
Kappa: 0.000
Deactivate value: 0.00
Split value: 0.00
Keeping 0 clusters
Spliting 0 clusters
Deactivating 1 clusters
Before (57286, 300)
After (57286, 300)


ValueError: Found array with 0 sample(s) (shape=(0, 300)) while a minimum of 1 is required by check_pairwise_arrays.

### Third configuration

Deactivating clusters with a coefficient lower than 0.2 times the max(abs(coefficient)) and splitting those with lower value than 0.6 times the max(abs(coefficient))

In [None]:
# epochs = 20

# kmeans = deepcopy(original_kmeans)
# X = deepcopy(original_x)
# vocab = deepcopy(original_vocab)

# alpha = None 

# for i in range(epochs):
#     print('Epoch %d of %d' % (i + 1, epochs))
#     print('Number of clusters: %d' % (kmeans.n_clusters))
#     bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=True, verbose=False)
#     lasso, alpha = fit_lasso(bows_train, y_train, alpha=alpha, verbose=True)
    
#     bows_val = clusters_bow(x_val, kmeans, w2v, reference_bows=bows_train, useFrequency=True, verbose=False)
#     validate(lasso, bows_val, y_val)
    
#     keep, split, deactivate = study_lasso(lasso.coef_, deactivate_threshold=0.1, verbose=True)

#     print('Keeping %d clusters' % (len([x for x in keep if x == True])))
#     print('Spliting %d clusters' % (len([x for x in split if x == True])))
#     print('Deactivating %d clusters' % (len([x for x in deactivate if x == True])))
    
#     next_centers = update_centers(kmeans, X, keep, split)
#     print("Before", X.shape)
#     X = update_dataset(kmeans, X, deactivate)
#     print("After", X.shape)    
#     next_centers = update_centers(kmeans, X, keep, split)
    
#     kmeans = update_kmeans(kmeans, next_centers, X)
#     print_clusters(kmeans, vocab)

In [None]:
# epochs = 20

# kmeans = deepcopy(original_kmeans)

# for i in range(epochs):
#     print('Epoch %d of %d' % (i + 1, epochs))
#     print('Number of clusters: %d' % (kmeans.n_clusters))
#     bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=True, verbose=False)
#     lasso = fit_lasso(bows_train, y_train, verbose=True)
    
#     bows_val = clusters_bow(x_val, kmeans, w2v, verbose=False)
#     validate(lasso, bows_val, y_val, threshold=0.02)
    
#     keep, split = study_lasso(lasso.coef_, deactivate_value=0.01, use_mean=True, verbose=True)

#     print('Keeping %d clusters' % (len([x for x in keep if x == True])))
#     print('Spliting %d clusters' % (len([x for x in split if x == True])))
    
#     next_centers = update_centers(kmeans, X, keep, split)
    
#     kmeans = update_kmeans(kmeans, next_centers, X)
#     print_clusters(kmeans, vocab)

In [None]:
# epochs = 20

# kmeans = deepcopy(original_kmeans)

# for i in range(epochs):
#     print('Epoch %d of %d' % (i + 1, epochs))
#     print('Number of clusters: %d' % (kmeans.n_clusters))
#     bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=True, verbose=False)
#     lasso = fit_lasso(bows_train, y_train, verbose=True)
    
#     bows_val = clusters_bow(x_val, kmeans, w2v, verbose=False)
#     validate(lasso, bows_val, y_val, threshold=0.02)
    
#     keep, split = study_lasso(lasso.coef_, deactivate_value=0.01, verbose=True)

#     print('Keeping %d clusters' % (len([x for x in keep if x == True])))
#     print('Spliting %d clusters' % (len([x for x in split if x == True])))
    
#     next_centers = update_centers(kmeans, X, keep, split)
    
#     kmeans = update_kmeans(kmeans, next_centers, X)
#     print_clusters(kmeans, vocab)

In [None]:
# epochs = 20

# kmeans = deepcopy(original_kmeans)

# for i in range(epochs):
#     print('Epoch %d of %d' % (i + 1, epochs))
#     print('Number of clusters: %d' % (kmeans.n_clusters))
#     bows_train = clusters_bow(x_train, kmeans, w2v, useFrequency=True, verbose=False)
#     lasso = fit_lasso(bows_train, y_train, verbose=True)
    
#     bows_val = clusters_bow(x_val, kmeans, w2v, verbose=False)
#     validate(lasso, bows_val, y_val, threshold=0.02)
    
#     keep, split = study_lasso(lasso.coef_, deactivate_threshold=0.2, verbose=True)

#     print('Keeping %d clusters' % (len([x for x in keep if x == True])))
#     print('Spliting %d clusters' % (len([x for x in split if x == True])))
    
#     next_centers = update_centers(kmeans, X, keep, split)
    
#     kmeans = update_kmeans(kmeans, next_centers, X)
#     print_clusters(kmeans, vocab)