# Seleccionando palabras

In [1]:
import string
import random
import numpy as np
import reuters_reader
from sklearn.cluster import MiniBatchKMeans
from collections import Counter
from sklearn.externals import joblib
from gensim.models.keyedvectors import KeyedVectors
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from copy import deepcopy
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from scipy.sparse import csr_matrix
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from pandas import DataFrame
from IPython.display import display

np.set_printoptions(precision=3)

### RCV1 Dataset

Use the function ```reuters_reader.reader(path)``` to retrieve the available documents from the rcv1 dataset stored in `path`. This function returns a generator (```reader```) which yields a single document each time we call ```next(reader)```. Each document is a dictionary with the followitn useful keys:
 - "title" is the title of the document
 - "text" is the body of the document
 - "bip:topics:1.0" is the list of topics
 
There are a total of 804420 available documents, although some may have no topic.

#### Building the dataset
We build a balanced dataset that contains ```n_docs```. To get a balanced dataset we iterate through the documents generator until we have ```n_docs / 2``` documents with the desired topic and the same amount without it. 

#### Get the labels
Select a topic we want to classify using the variable topic (there is a list of the topics https://gist.github.com/gavinmh/6253739). Then build the list of labels using a 1 for those documents with that topic and 0 otherwise

#### Training and validation set
Finally, we split the dataset using the ```train_split``` value. 

In [4]:
path = 'rcv1'
n_docs = 100000
train_split = 0.8
topic = 'GCAT'

docs = []
reader = reuters_reader.reader(path)

topic_true = 0
topic_false = 0

while len(docs) < n_docs:
    doc = next(reader)
    docs.append(doc)
     
random.shuffle(docs)

labels = np.zeros((n_docs), dtype=np.int16)
labels = [1 if topic in doc['bip:topics:1.0'] else 0 for doc in docs]

print('{} docs with topic {} (from {})'.format(np.sum(labels), topic, n_docs))

split_point = int(n_docs * train_split)
x_train, y_train = docs[:split_point], labels[:split_point]
x_val, y_val = docs[split_point:], labels[split_point:]

print('Training with {} docs'.format(len(x_train)))
print('Validating with {} docs'.format(len(x_val)))

19960917 / 59589newsML.xml failed to parse XML.
28464 docs with topic GCAT (from 100000)
Training with 80000 docs
Validating with 20000 docs


### Word2vec model

We are loading the well known word2vec model from __[Google](https://code.google.com/archive/p/word2vec/)__ which is stored in the binary file `GoogleNews-vectors-negative300.bin`.

Load the word2vec model

In [5]:
w2v_name = 'GoogleNews-vectors-negative300.bin'
w2v = KeyedVectors.load_word2vec_format(w2v_name, binary=True)

### Get our vocabulary

Get all the the vectors from the word2vec for our vocabulary. Our vocabulary can include all the words used in the word2vec model or be limited to the words in our dataset.

We can change this behaviour with the flag ```dataset_vocabulary```. ```False``` will use all the words from the word2vec model and ```True``` will limit them to just the words that are in our dataset and in the model at the same time.

There is a ```count_threshold``` to remove those words appearing very few times because they are probably errors.

As we have to split each documents in individual words, we already save this inside each document with the key "counter".

After this cell, ```X``` is a matrix including all the vectors we are going to use.

In [6]:
words_list = []
words_embedding = []
words_count = []
word2idx = {}
idx2word = {}
word2cluster = {}
cluster2words = {}

count_threshold = 5

for doc in docs:
    doc["counter"] = Counter()
    doc["word_count"] = 0
    words = doc["text"].split()
    words = [word.strip(string.punctuation) for word in words]
    for word in words:
        if word in w2v:
            doc["counter"][word] += 1
            doc["word_count"] += 1
    for word, count in doc["counter"].items():
        try:
            words_count[word2idx[word]] += count
        except:
            words_list.append(word)
            words_count.append(count)
            word2idx[word] = len(words_list) - 1
                
keep_it = [count > count_threshold for count in words_count] 
    
words_list = [word for idx, word in enumerate(words_list) if keep_it[idx]]
words_count = [count for idx, count in enumerate(words_count) if keep_it[idx]]
word2idx = {word: idx for idx, word in enumerate(words_list)}
    
words_embedding = np.zeros((len(words_list), w2v.vector_size), dtype=np.float32)
for idx, word in enumerate(words_list):
    words_embedding[idx, :] += w2v[word]
    
idx2word = dict(enumerate(words_list))

print("Vocabulary length: {}".format(len(words_list)))

for _ in range(100):
    idx = random.randint(0, len(words_list) - 1)
    word = words_list[idx]
    assert(word2idx[word] == idx)
    assert((w2v[word] == words_embedding[idx]).all())

Vocabulary length: 56631


In [7]:
indptr = [0]
indices = []
data = []

for doc in docs:
    for word in doc["counter"]:
        if word in word2idx:
            indices.append(word2idx[word])
            data.append(1)
    indptr.append(len(indices))
    
matrix = csr_matrix((data, indices, indptr), dtype=float)

In [8]:
def fit_svm(X_train, y_train, grid_search=False):
    if grid_search:
        c_parameters = [2 ** i for i in range(-15, 15)]
#     print(c_parameters)
        tuned_parameters = [{'C': c_parameters}]
        clf = GridSearchCV(LinearSVC(dual=False), tuned_parameters, cv=10,
                           scoring='accuracy', return_train_score=True)
        clf.fit(X_train, y_train)    
    else:
        features2c = {50: 8, 100: 8, 150: 8, 200: 8, 250: 1, 300: 0.5, 350: 0.25, 400: 0.25, 
                    500: 0.25, 750: 0.25, 1000: 0.03, 1500: 0.03, 2000: 0.015625, 5000: 0.0078125, 10000: 0.0078125}
        n_features = X_train.shape[1]
        
        features_thresholds = sorted(list(features2c.keys()))
        for i, val in enumerate(features_thresholds):
            if n_features < val:
                break

        if i == 0:
            selected_c = features2c[features_thresholds[0]]
        elif i == len(features_thresholds) - 1:
            selected_c = features2c[features_thresholds[len(features_thresholds) - 1]]
        else:
            left_threshold = features_thresholds[i - 1]
            right_threshold = features_thresholds[i]
            if n_features - left_threshold > right_threshold - n_features:
                selected_c = features2c[features_thresholds[i]]
            else:
                selected_c = features2c[features_thresholds[i - 1]]
        
        clf = LinearSVC(dual=False, C=selected_c)
        clf.fit(X_train, y_train)
    return clf

In [9]:
def fit_lasso(bows, labels, alpha=None, verbose=False):
    if alpha:
        clf = linear_model.Lasso(alpha=alpha, positive=True)
    else:
        clf = linear_model.LassoCV(positive=True)
    clf.fit(bows, labels)
    
    if verbose:
        print('Lasso coefficients: %s' % (np.array2string(clf.coef_, suppress_small=True)))
        
    if alpha:
        return clf, clf.alpha_
    else:
        return clf, alpha

In [10]:
def validate(classifier, bows, y_true, threshold=None):
    y_predicted = classifier.predict(bows)
    if threshold is None:
        threshold = np.mean(y_predicted)
    y_predicted = [1 if i > threshold else 0 for i in y_predicted]
    accuracy = accuracy_score(y_val, y_predicted)
    kappa = cohen_kappa_score(y_val, y_predicted)
    
    print('Accuracy: %.3f' % (accuracy))
    print('Kappa: %.3f' % (kappa))

### Baseline

In [11]:
grid_search = False

scores, _ = chi2(matrix, labels)
sorted_idx = np.argsort(scores, kind="mergesort")

split_point = int(n_docs * train_split)

features = [50, 100, 150, 200, 250, 300, 350, 400, 500, 750, 1000, 1500, 2000, 5000, 10000, 'all']

for number_features in features:
    print("Using %s features" % (str(number_features)))
    
    if number_features != 'all':
        idx = sorted_idx[-number_features:]
    else:
        idx = list(range(matrix.shape[1]))
    
    baseline_x_train, baseline_y_train = matrix[:split_point, idx], labels[:split_point]
    baseline_x_val, baseline_y_val = matrix[split_point:, idx], labels[split_point:]

    print("Using lasso classifier:")
    lasso, alpha = fit_lasso(baseline_x_train, baseline_y_train)
    validate(lasso, baseline_x_val, baseline_y_val)
    
    print("Using SVM classifier:")
    svm_classifier = fit_svm(baseline_x_train, baseline_y_train, grid_search=grid_search)
    if grid_search:
        df = DataFrame(svm_classifier.cv_results_ )
        display(df[['param_C', 'rank_test_score']])
    validate(svm_classifier, baseline_x_val, baseline_y_val)
    
    print("")

Using 50 features
Using lasso classifier:
Accuracy: 0.847
Kappa: 0.629
Using SVM classifier:
Accuracy: 0.871
Kappa: 0.653

Using 100 features
Using lasso classifier:
Accuracy: 0.870
Kappa: 0.682
Using SVM classifier:
Accuracy: 0.894
Kappa: 0.722

Using 150 features
Using lasso classifier:
Accuracy: 0.882
Kappa: 0.713
Using SVM classifier:
Accuracy: 0.907
Kappa: 0.758

Using 200 features
Using lasso classifier:
Accuracy: 0.886
Kappa: 0.722
Using SVM classifier:
Accuracy: 0.915
Kappa: 0.778

Using 250 features
Using lasso classifier:
Accuracy: 0.890
Kappa: 0.734
Using SVM classifier:
Accuracy: 0.919
Kappa: 0.790

Using 300 features
Using lasso classifier:
Accuracy: 0.892
Kappa: 0.737
Using SVM classifier:
Accuracy: 0.923
Kappa: 0.801

Using 350 features
Using lasso classifier:
Accuracy: 0.897
Kappa: 0.749
Using SVM classifier:
Accuracy: 0.926
Kappa: 0.810

Using 400 features
Using lasso classifier:
Accuracy: 0.897
Kappa: 0.750
Using SVM classifier:
Accuracy: 0.929
Kappa: 0.818

Using 500

KeyboardInterrupt: 