# Seleccionando palabras

In [1]:
import copy
import string
from collections import Counter

import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from sklearn import linear_model
from sklearn.cluster import MiniBatchKMeans
from sklearn.externals import joblib

import reuters_reader

np.set_printoptions(precision=3)

### RCV1 Dataset

Use the function ```reuters_reader.reader(path)``` to retrieve the available documents from the rcv1 dataset stored in `path`. This function yields documents, each of which is a dictionary with the data from a specific rcv1 entry. There are a total of 804420 available documents, although some may have no topic.

Some useful keys in this dictionary are:
 - "title"
 - "text" which is the body of document
 - "bip:topics:1.0" is a list of topics

In [2]:
n_docs = 50000
reader = reuters_reader.reader('rcv1')
# Optional: count every document in the corpus (slow, and it exhausts the reader).
#total_documents = 0
#for doc in reader:
#    total_documents += 1
#print('Total documents: {}'.format(total_documents))
print('Using: {}'.format(n_docs))

# Pull the first n_docs documents out of the reader.
docs = [next(reader) for _ in range(n_docs)]

# Preview the first ten documents: title, topics, and both ends of the body.
for doc in docs[:10]:
    print(doc['title'])
    print(doc['bip:topics:1.0'])
    print(doc['text'][:100], doc['text'][-100:])

Using: 500000
None
['C12', 'CCAT', 'GCAT', 'GCRIM']
South African Airways (SAA) SAA.CN has accused domestic rival Comair, whose new franchise agreement   had obtained much more detailed information on passengers.

- Johannesburg newsroom +27 11 482 1003
None
['C11', 'CCAT']
German retail group Karstadt AG plans to move to a holding structure as soon as it completes its int .2 million marks in 1995 from a depressed 41.9 million in 1994.

--Frankfurt Newsroom, +49 69 756525
None
['E12', 'ECAT', 'GCAT', 'GPOL']
An Argentine bishop on Saturday attacked government plans to reform the labour market and said what  piness and creativity, that our pensioners lead a languid existence, nearly like a prolonged agony."
None
['GCAT', 'GVIO']
A Jewish settler fired shots at Arab houses in the West Bank town of Hebron on Sunday but caused no  

Settlers have said a partial Israeli army pullout from the town would put their lives in jeopardy.
None
['E12', 'E21', 'E211', 'ECAT', 'GCAT', 'GPOL']
Prime Mi

### Word2vec model

We are going to load a well known word2vec model from __[Google](https://code.google.com/archive/p/word2vec/)__ which is stored in the binary file `GoogleNews-vectors-negative300.bin`.

With the model we define a numpy matrix `X`, which contains as many vectors as there are words in the word2vec vocabulary. We need this dataset to train the kmeans algorithm.

Load the word2vec model

In [None]:
# Binary word2vec file holding the pre-trained Google News vectors.
w2v_name = 'GoogleNews-vectors-negative300.bin'
w2v = KeyedVectors.load_word2vec_format(fname=w2v_name, binary=True)

### Get our vocabulary

Get all the vectors from the word2vec model for our vocabulary. Our vocabulary can include all the words used in the word2vec model or be limited to the words in our dataset.

We can change the behaviour with the flag ```dataset_vocabulary```. ```False``` will use all the words from the word2vec model and ```True``` will limit them to just the words that are in our dataset and in the model at the same time.

There is a ```count_threshold``` to remove those words appearing very few times because they are errors.

After this cell, ```X``` is a matrix including all the vectors we are going to use. We also store the words for each document in ```doc["counter"]```. 

In [None]:
dataset_vocabulary = True
count_threshold = 5

# The per-document word counts are consumed later by the bag-of-clusters step
# whichever vocabulary is selected, so they are always computed. Previously they
# were only filled in when dataset_vocabulary was True.
corpus_counts = Counter()
for doc in docs:
    tokens = (token.strip(string.punctuation) for token in doc["text"].split())
    # Only words known to the word2vec model are counted.
    doc["counter"] = Counter(token for token in tokens if token in w2v)
    corpus_counts.update(doc["counter"])

if dataset_vocabulary:
    # Keep words seen strictly more than count_threshold times in the corpus.
    # Note: the per-document counters are NOT filtered by this threshold.
    vocab = {word: count for word, count in corpus_counts.items() if count > count_threshold}
else:
    # Whole word2vec vocabulary, mapped to its corpus frequency (0 if unseen) so
    # that vocab is a word -> count mapping in both modes.
    vocab = {word: corpus_counts[word] for word in w2v.index2word}

# One row per vocabulary word, in the iteration order of vocab.
X = np.zeros((len(vocab), w2v.vector_size), dtype=np.float32)
for index, word in enumerate(vocab):
    X[index, :] = w2v[word]

print("Vocabulary length: {}".format(X.shape[0]))
print("Vector length: {}".format(X.shape[1]))

Let's take a look at the most common and uncommon words in our dataset

In [None]:
# Rank the vocabulary by frequency once, then show both ends of the ranking.
ranked_vocab = Counter(vocab).most_common()
print(ranked_vocab[:10])
print(ranked_vocab[-10:])

### Get the labels ###
Select a topic we want to classify using the variable ```topic``` (there is a list of the topics __[here](https://gist.github.com/gavinmh/6253739)__). Then build the list of labels using a ```1``` for those documents with that topic and 0 otherwise 

In [None]:
topic = 'ECAT'

# Binary target: 1 when the document carries `topic`, 0 otherwise.
# NOTE(review): the dataset notes say some documents have no topic; the `or ()`
# guard covers a missing/None topic list -- confirm what the reader yields.
labels = np.array(
    [1 if topic in (doc.get('bip:topics:1.0') or ()) else 0 for doc in docs],
    dtype=np.int16,
)

print('{} docs with topic {} (from {})'.format(np.sum(labels), topic, n_docs))

### Initial Cluster

Train the first Kmeans cluster

In [None]:
# Number of initial clusters, and the file the fitted model is saved to.
n_clusters = 50
kmeans_name = 'kmeans{}.pkl'.format(n_clusters)

In [None]:
# Fit the initial k-means model on the word vectors and save it to disk.
classifier = MiniBatchKMeans(
    n_clusters=n_clusters,
    random_state=0,
    compute_labels=True,
).fit(X)
joblib.dump(classifier, kmeans_name)

In [None]:
# def print_clusters(classifier, w2v, topn=3):
# #     w2v.init_sims(replace=True)
#     for (i, center) in enumerate(classifier.cluster_centers_):
#         #print(center)
#         similar = w2v.wv.most_similar_cosmul(positive=[center], topn=topn)
#         similar = [t[0] for t in similar]
#         print("Cluster %d: %s" % (i + 1, ', '.join(similar)))
        
#print_clusters(classifier, w2v)    


### Build a bow model 

Using the clusters from the kmeans classifier build a bow for each document. This bag of words can be normalized using the frequency of each word with ```useFrequency=True```. 

How? 
* For each document
    * For each word
        * Obtain the w2v vector for that word
        * Obtain the cluster for that vector
        * Add 1 to that cluster in the document bow

To improve the performance the cluster for each word is saved in a dictionary. This way for each word we first check that dictionary instead of the model and then the classifier.


In [None]:
def clusters_bow(docs, classifier, w2v, useFrequency=True, printing=False):
    """Build a bag-of-clusters matrix with one row per document.

    Every word of a document is mapped to the cluster predicted for its vector,
    and the word's count is added to that cluster's column.

    Parameters
    ----------
    docs : sequence of dict
        Each document must provide ``doc["counter"]``, a word -> count mapping.
    classifier : object
        Must expose ``n_clusters`` and ``predict``.
    w2v : mapping
        Word -> vector lookup; indexed as ``w2v[word]``.
    useFrequency : bool
        If True, each row is divided by its total so it sums to 1. If False,
        the row is a 0/1 indicator of which clusters appear in the document.
    printing : bool
        If True, print the rows of (at most) the first ten documents.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(docs), classifier.n_clusters)``.
    """
    n_docs = len(docs)
    n_clusters = classifier.n_clusters
    bo_clusters = np.zeros((n_docs, n_clusters))

    # Cache word -> cluster so each distinct word hits the classifier only once.
    hashed_clusters = {}
    for i, doc in enumerate(docs):
        for word, count in doc["counter"].items():
            cluster = hashed_clusters.get(word)
            if cluster is None:
                cluster = classifier.predict([w2v[word]])[0]
                hashed_clusters[word] = cluster
            bo_clusters[i, cluster] += count

    if useFrequency:
        sums = bo_clusters.sum(axis=1, keepdims=True)
        # A document with no known word has a zero total. Leave its row at zero
        # instead of producing NaN, which would break the downstream model fit.
        boc = np.divide(bo_clusters, sums, out=np.zeros_like(bo_clusters), where=sums != 0)
    else:
        # 1.0 where the cluster occurs in the document, 0.0 elsewhere.
        boc = (bo_clusters != 0).astype(bo_clusters.dtype)

    if printing:
        # Never index past the last document when fewer than ten are given.
        for i in range(min(10, n_docs)):
            print('Document %d: %s' % (i + 1, np.array2string(boc[i, :])))

    return boc

# bows = clusters_bow(docs, classifier, w2v)

Use these bows to classify

### Keep, split or discard clusters

Once we have our bags of words we can study how good our clusters are. To do so we fit a Lasso model on the bows. 

From this model we obtain one coefficient per cluster, which we use to decide how good a particular cluster is. Only clusters whose absolute coefficient is higher than ```deactivate_value``` are retained, and some of these must be split. A retained cluster is split when its absolute coefficient is at most ```split_threshold``` times the maximum absolute coefficient; otherwise it is kept as is. 

In [None]:
def study_clusters(bows, labels, deactivate_value=0.1, split_threshold=0.6, printing=False):
    """Decide which clusters to keep and which to split from Lasso coefficients.

    A ``LassoCV`` model is fitted on ``bows`` against ``labels``; the absolute
    value of each coefficient is then compared with two cut-offs:

    * ``split_value = split_threshold * max(|coef|)``
    * ``deactivate_value``

    Parameters
    ----------
    bows : array-like
        Feature matrix with one column per cluster.
    labels : array-like
        Target value per row of ``bows``.
    deactivate_value : float
        Clusters with ``|coef| <= deactivate_value`` are never split.
    split_threshold : float
        Fraction of the largest absolute coefficient used as the split cut-off.
    printing : bool
        If True, print the coefficients and the split cut-off.

    Returns
    -------
    keep, split : list of bool
        ``keep[i]`` is True when ``|coef_i| > split_value``.
        ``split[i]`` is True when ``deactivate_value < |coef_i| <= split_value``.
        A cluster that is False in both lists is discarded by the caller.
    """
    clf = linear_model.LassoCV()
    clf.fit(bows, labels)

    weights = np.abs(clf.coef_)
    max_w = np.max(weights)
    # The cut-off is relative to the strongest coefficient and driven by
    # split_threshold (it was previously computed from deactivate_value).
    split_value = split_threshold * max_w

    if printing:
        print('Lasso coefficients: %s' % (np.array2string(clf.coef_, suppress_small=True)))
        print('Split value (using threshold %.2f): %.2f' % (split_threshold, split_value))

    split = [bool(deactivate_value < w <= split_value) for w in weights]
    keep = [bool(w > split_value) for w in weights]

    return keep, split

# keep, split = study_clusters(bows, labels)

# print('Keeping %d clusters' % (len([x for x in keep if x == True])))
# print('Spliting %d clusters' % (len([x for x in split if x == True])))

# new_count = len([x for x in keep if x == True]) + 2 * len([x for x in split if x == True])
# print('Using %d clusters in the next iteration' % (new_count))

Once we know which clusters to keep and which to split, we can build the new ones. 

To split one cluster we need to select all the points belonging to that cluster and cluster them again into 2 groups (k-means with 2 centers). 

In [None]:
def update_centers(classifier, X, keep, split):
    """Build the next set of cluster centers from keep/split decisions.

    Parameters
    ----------
    classifier : object
        Must expose ``cluster_centers_`` (2-D array) and ``labels_`` (the
        cluster index assigned to each row of ``X``).
    X : numpy.ndarray
        Data the classifier was fitted on; rows are aligned with ``labels_``.
    keep : sequence of bool
        ``keep[i]`` True copies center ``i`` unchanged.
    split : sequence of bool
        ``split[i]`` True replaces center ``i`` with the two centers of a
        2-cluster ``MiniBatchKMeans`` fitted on the points labelled ``i``.

    Returns
    -------
    numpy.ndarray
        Float64 array of shape ``(n_new_centers, n_features)``. Clusters that
        are neither kept nor split are dropped.
    """
    n_features = classifier.cluster_centers_.shape[1]
    centers = []
    for i, (keep_i, split_i) in enumerate(zip(keep, split)):
        if keep_i:
            centers.append(classifier.cluster_centers_[i, :])
        if split_i:
            members = X[classifier.labels_ == i]
            if members.shape[0] < 2:
                # Two centers cannot be fitted on fewer than two points, so
                # retain the existing center instead of crashing.
                if not keep_i:
                    centers.append(classifier.cluster_centers_[i, :])
                continue
            small_class = MiniBatchKMeans(n_clusters=2, random_state=0, compute_labels=True)
            small_class.fit(members)
            centers.extend(small_class.cluster_centers_)

    # reshape keeps a well-formed (0, n_features) result when nothing survives.
    return np.asarray(centers, dtype=np.float64).reshape(-1, n_features)
        
# new_centers = update_centers(classifier, X, keep, split)
    
# print(classifier.cluster_centers_)
# print(new_centers)
# print(classifier.counts_)
# print(classifier.labels_)
# print(classifier.labels_.shape)

### Build a new classifier

We have to use the new clusters to classify the data in the following steps. One possible solution is to update the classifier's centers and the relevant attributes. Then we can use the classifier's function ```predict``` as before. Moreover, to keep using this classifier to build the next cluster we also need to update the ```labels_``` and ```counts_``` parameters. 

In [None]:
def update_classifier(classifier, new_centers, x):
    """Install new centers on ``classifier`` and refresh its derived attributes.

    The classifier is modified in place (and also returned): its
    ``cluster_centers_``, ``n_clusters``, ``labels_`` and ``counts_``
    attributes are overwritten.

    Parameters
    ----------
    classifier : object
        Clustering model exposing ``predict``.
    new_centers : numpy.ndarray
        Array with one row per new center.
    x : numpy.ndarray
        Data to be re-assigned to the new centers.

    Returns
    -------
    The same ``classifier`` object, updated.
    """
    classifier.cluster_centers_ = new_centers
    classifier.n_clusters = len(new_centers)
    # Use the public predict() for the assignment instead of the private
    # _labels_inertia_minibatch helper, which is not part of the public API.
    classifier.labels_ = classifier.predict(x)
    # Number of points assigned to each center (zero for empty clusters).
    classifier.counts_ = np.bincount(
        classifier.labels_, minlength=classifier.n_clusters
    ).astype(np.int32)

    return classifier

# classifier = update_classifier(classifier, new_centers, X)

In [None]:
epochs = 10
# update_classifier modifies `classifier` in place, so a plain assignment would
# only alias the object that is about to change. Take an independent snapshot.
original_classifier = copy.deepcopy(classifier)

for i in range(epochs):
    print('Epoch %d of %d' % (i + 1, epochs))
    print('Number of clusters: %d' % (classifier.n_clusters))

    # 1. Describe every document with the current clusters.
    bows = clusters_bow(docs, classifier, w2v, printing=True)
    # 2. Score the clusters against the labels.
    keep, split = study_clusters(bows, labels, printing=True)

    print('Keeping %d clusters' % (sum(1 for flag in keep if flag)))
    print('Spliting %d clusters' % (sum(1 for flag in split if flag)))

    # 3. Build the refined centers and install them on the classifier.
    next_centers = update_centers(classifier, X, keep, split)

    classifier = update_classifier(classifier, next_centers, X)
#     print_clusters(classifier, w2v)  
    