In [1]:
import pandas as pd
import nltk.data
import _pickle as cPickle
import time
import sys
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans, MiniBatchKMeans
from nltk.corpus import stopwords
from gensim.models.keyedvectors import KeyedVectors
from nltk.stem import SnowballStemmer
from scipy.sparse import find
import scipy.sparse as sparse
import unicodedata
import re

# 2 What is the goal of the BagOfCentroids script?
This script focuses on collecting data from the Extract_data script, as well as loading or training the W2V and Kmeans models. The objective is to generate a bag of centroids, from which we can create instances of the same length regardless of the size of the documents contained in the dataset.

## 2.1 Cleaning the stop words of the text
This time, to generate our training set, we remove the stop words from each text in order to reduce the noise seen by the algorithm that will later be used to classify the documents.


In [2]:
# Patterns are written as raw strings: the previous non-raw literals relied on
# "\w" / "\d" being passed through as invalid escape sequences, which current
# Python versions warn about. The pattern text itself is unchanged.
_NON_WORD_RE = re.compile(r"[^\w\d]")
_ALL_DIGITS_RE = re.compile(r"^\d+$")
_SPANISH_STOPWORDS = None


def _spanish_stopwords():
    """Return the NLTK Spanish stop words as a frozenset, loading them only once."""
    global _SPANISH_STOPWORDS
    if _SPANISH_STOPWORDS is None:
        # Set membership is much faster than scanning the list NLTK returns.
        _SPANISH_STOPWORDS = frozenset(stopwords.words("spanish"))
    return _SPANISH_STOPWORDS


def review_to_wordlist(raw_review, stemmer=False):
    """Convert a raw text into a list of cleaned tokens.

    Parameters
    ----------
    raw_review : str
        The raw document text.
    stemmer : object or False, optional
        When truthy, an object exposing ``stem(word)``; every kept token is
        replaced by ``stemmer.stem(token)``. When falsy (default), tokens are
        kept as-is except that tokens made only of digits become ``"DIGITO"``.

    Returns
    -------
    list of str
        The tokens of ``raw_review`` with Spanish stop words removed.
        Note that this is a list, not a joined string.
    """
    # 1. Replace every non-word character with a space.
    letters_only = _NON_WORD_RE.sub(" ", raw_review)

    # 2. Split into individual words.
    #    Case is deliberately left untouched for this W2V model, so the
    #    stop-word comparison below is case-sensitive as well.
    words = letters_only.split()

    # 3. Stop words are loaded once and reused across calls.
    stops = _spanish_stopwords()

    # 4. Remove stop words, then either stem or mask pure-digit tokens.
    if stemmer:
        meaningful_words = [stemmer.stem(w) for w in words if w not in stops]
    else:
        # Replace every all-digit token with the placeholder "DIGITO".
        meaningful_words = [_ALL_DIGITS_RE.sub("DIGITO", w) for w in words if w not in stops]

    # 5. Return the token list (callers join it themselves if they need a string).
    return meaningful_words

## 2.2 Loading the W2V model
This function is intended to load a W2V model. This allows us to obtain the word vectors that will be used to calculate the centroids.

In [2]:
def load_W2V_model(path):
    """Load word vectors stored in binary word2vec format at ``path``.

    Prints a confirmation line once loading has finished and returns the
    object produced by ``KeyedVectors.load_word2vec_format``.
    """
    word_vectors = KeyedVectors.load_word2vec_format(path, binary=True)
    print("Loaded W2V model")
    return word_vectors

## 2.3 Training Kmeans model
Once we have the word vectors, we can train a k-means model. The objective is to group the words into clusters in an unsupervised way, which allows us to generate the data set we need for the classification algorithm.

In [4]:
def train_kmeans(model, path, wpc, random_state=None):
    """Cluster the word vectors of ``model`` with mini-batch k-means.

    Parameters
    ----------
    model : word-vector model
        Must expose ``vectors`` (2-D array, one row per word) and the
        vocabulary in row order as ``index_to_key`` (gensim >= 4) or
        ``index2word`` (gensim 3.x).
    path : str
        File the fitted clustering model is pickled to.
    wpc : int or float
        Target average number of words per cluster; the number of clusters
        is ``vocabulary_size / wpc`` (at least 1).
    random_state : int or None, optional
        Forwarded to ``MiniBatchKMeans`` so a run can be made reproducible.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    dict
        Maps every vocabulary word to the index of its cluster.

    Raises
    ------
    ValueError
        If ``wpc`` is not positive.
    """
    if wpc <= 0:
        raise ValueError("wpc must be a positive number of words per cluster")

    start = time.time() # Start time

    # k = vocabulary size / wpc.
    # Original author's notes: an average of ~15 words per cluster was used
    # for the "vhufa" vectors and ~280 for the "wiki" vectors.
    word_vectors = model.vectors.astype(np.float64)
    # Never ask for zero clusters when the vocabulary is smaller than wpc.
    num_clusters = max(1, int(word_vectors.shape[0] / wpc))

    # MiniBatchKMeans is used instead of KMeans because of the latter's
    # high memory consumption on this vocabulary.
    kmeans_clustering = MiniBatchKMeans(
        n_clusters=num_clusters,
        batch_size=num_clusters,
        random_state=random_state,
    )
    kmeans_clustering.fit( word_vectors )

    # Persist the fitted model so load_kmeans can reuse it later.
    with open(path, 'wb') as fid:
        cPickle.dump(kmeans_clustering, fid)

    idx = kmeans_clustering.predict( word_vectors )

    # Report how long fitting, saving and predicting took.
    end = time.time()
    elapsed = end - start
    print ("Time taken for K Means clustering: ", elapsed, "seconds.")

    # gensim 4 renamed `index2word` to `index_to_key`; support both.
    if hasattr(model, "index_to_key"):
        vocabulary = model.index_to_key
    else:
        vocabulary = model.index2word

    # Word -> cluster index, relying on vocabulary order matching vector rows.
    return dict(zip( vocabulary, idx ))

## 2.4 Loading Kmeans model
Once we have the word vectors, we can also load a previously trained k-means model with this function instead of training a new one. As in the training step, the objective is to assign the words to clusters obtained in an unsupervised way, which allows us to generate the data set we need for the classification algorithm.

In [5]:
def load_kmeans(model, path):
    """Load a pickled clustering model and assign every vocabulary word to a cluster.

    Parameters
    ----------
    model : word-vector model
        Must expose ``vectors`` (2-D array, one row per word) and the
        vocabulary in row order as ``index_to_key`` (gensim >= 4) or
        ``index2word`` (gensim 3.x).
    path : str
        Pickle file holding an object with a ``predict`` method (as written
        by ``train_kmeans``).

    Returns
    -------
    dict
        Maps every vocabulary word to the index of its predicted cluster.
    """
    word_vectors = model.vectors.astype(np.float64)

    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only load model files that you produced yourself or otherwise trust.
    with open(path, 'rb') as fid:
        km_model = cPickle.load(fid)
    print("Loaded Kmeans model")

    idx = km_model.predict( word_vectors )

    # gensim 4 renamed `index2word` to `index_to_key`; support both.
    if hasattr(model, "index_to_key"):
        vocabulary = model.index_to_key
    else:
        vocabulary = model.index2word

    return dict(zip( vocabulary, idx ))

## 2.5 Creating bag of centroids
So far we have focused on obtaining the clusters; in this part we create the instance for each document. To do so, we look up the cluster that each word of the text belongs to and represent the document by the number of its words that fall into each cluster.

In [6]:
def create_bag_of_centroids( wordlist, word_centroid_map, num_centroids=None ):
    """Count how many words of a document fall into each cluster.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens of one document.
    word_centroid_map : dict
        Maps a word to the (0-based) index of its cluster.
    num_centroids : int or None, optional
        Length of the returned vector. When None (default) it is derived as
        the highest cluster index in ``word_centroid_map`` plus one, which
        scans the whole map on every call; callers processing many documents
        can compute it once and pass it in.

    Returns
    -------
    numpy.ndarray
        1-D ``uint16`` array of length ``num_centroids``; element ``i`` is the
        number of tokens in ``wordlist`` assigned to cluster ``i``. Tokens
        missing from ``word_centroid_map`` are ignored. An empty map yields
        an empty vector.
    """
    if num_centroids is None:
        # Cluster indices are 0-based, so the count is the highest index + 1.
        # `default=-1` makes an empty map produce a zero-length vector
        # instead of raising.
        num_centroids = max( word_centroid_map.values(), default=-1 ) + 1

    # Pre-allocate the bag of centroids vector (for speed).
    # NOTE: uint16 counters wrap around silently above 65535.
    bag_of_centroids = np.zeros( int(num_centroids), dtype=np.uint16 )

    # One dictionary lookup per token; out-of-vocabulary tokens are skipped.
    # `is not None` is required because cluster index 0 is a valid value.
    for word in wordlist:
        index = word_centroid_map.get( word )
        if index is not None:
            bag_of_centroids[index] += 1

    return bag_of_centroids

In [7]:
# Run configuration, followed by loading the models and the dataset.
wpc = 10  # average words per cluster: sizes k in train_kmeans and num_clusters below
path_W2V = "../W2V/sbw_vectors.bin"
path_Kmeans = "../KMeans/sbw_vectors.pkl"
path_data = "../hufa_test_wiki_w2v/test"
path_centroids = "../hufa_test_wiki_w2v/minitest_centroids_test.npz"
# `stemmer` is passed to review_to_wordlist by the cell that builds
# clean_train_reviews, so it has to exist on a fresh kernel; it used to be
# defined only in the commented-out line below, which ends in a NameError.
# NOTE(review): False (no stemming) is assumed here because the tokens are
# looked up in a pretrained W2V vocabulary - confirm, or enable the
# SnowballStemmer line instead.
stemmer = False
#stemmer = SnowballStemmer('spanish')

# Word -> cluster index for the whole W2V vocabulary, using the saved k-means
# model; swap in the commented line to train (and save) a new one instead.
word_centroid_map = load_kmeans(load_W2V_model(path_W2V), path_Kmeans)
#word_centroid_map = train_kmeans(load_W2V_model(path_W2V), path_Kmeans, wpc)
# Tab-separated dataset; the first column becomes the DataFrame index.
df = pd.read_csv(path_data, sep='\t', index_col=0)
print ("Loaded dataset")

Loaded W2V model
Loaded Kmeans model
Loaded dataset


In [9]:
# nltk.download()
# NOTE: `ids_df` is defined in a cell that appears further down in this
# notebook (execution count In [8]); run that cell first.
num_clusters = int(len(word_centroid_map)/wpc)
clean_train_reviews = [review_to_wordlist( text, stemmer ) for text in df["text"][ids_df]]

# Build one 1 x n_clusters sparse row per document and stack them once.
# Stacking inside the loop (seeded with None) re-copied the whole matrix on
# every iteration, which is quadratic in the number of documents.
centroid_rows = [
    sparse.csr_matrix(create_bag_of_centroids( review, word_centroid_map ))
    for review in clean_train_reviews
]

# Stays None when there are no documents, as before; otherwise a CSR matrix
# with one row per document.
train_centroids = sparse.vstack(centroid_rows, format="csr") if centroid_rows else None

In [14]:
# Persist the sparse bag-of-centroids matrix to `path_centroids` (.npz).
print("Saving new instances...")
# Earlier dense variant that also appended the label column, kept for reference:
#np.save(path_centroids, np.concatenate((train_centroids, np.array([df.label]).T), axis=1))
# train_centroids = sparse.load_npz("../hufa_train_wiki_w2v/scentroids_train.npz")
sparse.save_npz(path_centroids, train_centroids)
print("Finished")

Saving new instances...
Finished


### hstack Xtest, Ytest or Xtrain, Ytrain

In [2]:
# Reload previously saved sparse centroid matrices for both splits.
# NOTE: this rebinds `train_centroids`, replacing any matrix built earlier
# in the session under the same name.
train_centroids = sparse.load_npz("../hufa_train_wiki_w2v/centroids_train.npz")
test_centroids = sparse.load_npz("../hufa_test_wiki_w2v/centroids_test.npz")

In [3]:
# Load the train and test datasets; the "label" column is read from them in
# the next cell. Files are tab-separated and the first column becomes the
# DataFrame index.
path_data_test = "../hufa_test_wiki_w2v/test"
path_data_train = "../hufa_train_wiki_w2v/train"
df_test = pd.read_csv(path_data_test, sep='\t', index_col=0)
df_train = pd.read_csv(path_data_train, sep='\t', index_col=0)
print ("Loaded dataset")

Loaded dataset


In [4]:
# Append the label as the last column of each feature matrix.
# NOTE: `test_centroids` / `train_centroids` are overwritten with the
# label-augmented matrices, so running this cell a second time appends a
# second label column.

# Test split: labels as an (n_rows, 1) sparse column, then [X | y].
Ytest = sparse.csr_matrix(np.array(df_test["label"]).reshape(-1, 1))
Xtest = sparse.csr_matrix(test_centroids)
test_centroids = sparse.hstack([Xtest, Ytest], format="csr")

# Train split: same construction.
Ytrain = sparse.csr_matrix(np.array(df_train["label"]).reshape(-1, 1))
Xtrain = sparse.csr_matrix(train_centroids)
train_centroids = sparse.hstack([Xtrain, Ytrain], format="csr")

In [11]:
# Save the current `train_centroids` / `test_centroids` matrices
# (features plus label column when the preceding cell has been run).
sparse.save_npz("../hufa_train_wiki_w2v/full_centroids_train.npz", train_centroids)
sparse.save_npz("../hufa_test_wiki_w2v/full_centroids_test.npz", test_centroids)

In [5]:
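# Disabled experiment: split each matrix into rows whose last (label) column is
# non-zero ("positivos") and the remaining rows ("negativos").
# test_p_row_index = test_centroids[:,-1].nonzero()[0]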
# test_n_row_index = np.delete(np.arange(0, test_centroids.shape[0]), test_p_row_index)
# test_positivos = test_centroids[test_p_row_index,:]
# test_negativos = test_centroids[test_n_row_index,:]

# train_p_row_index = train_centroids[:,-1].nonzero()[0]
# train_n_row_index = np.delete(np.arange(0, train_centroids.shape[0]), train_p_row_index)
# train_positivos = train_centroids[train_p_row_index,:]
# train_negativos = train_centroids[train_n_row_index,:]

In [8]:
# Hard-coded identifiers of the rows of `df` to process as a small subset.
# NOTE(review): the output file name used later ("fn_samples") suggests these
# are false-negative samples - confirm.
ids_df = [28673, 54473, 54478, 54482, 54501, 54508, 54509, 54532, 54539,
       54558, 54562, 54580, 54585, 54617, 54632, 54637, 54639, 54641,
       54643, 54657, 54696, 54698, 54715, 54730, 54746, 54767, 54769,
       54770, 54774, 54791, 54814, 54827, 54830, 54838, 54841, 54844,
       54848, 54852, 54855, 54876, 54917, 54926, 54928, 54932]
# Number of selected documents (displayed as the cell output).
len(df["text"][ids_df])

44

In [13]:
# Sanity check: feature matrix, label column and combined matrix should have
# the same number of rows, and the combined matrix one extra column.
train_centroids.shape, Ytest.shape, test_centroids.shape

((44, 3827), (44, 1), (44, 3828))

In [10]:
# Labels of the selected rows as an (n_rows, 1) sparse column.
Ytest = sparse.csr_matrix(np.array(df["label"][ids_df]).reshape(-1, 1))

In [11]:
# Features come from `train_centroids` (the matrix built for the selected
# rows); the label column is appended as the last column.
Xtest = sparse.csr_matrix(train_centroids)
test_centroids = sparse.hstack([Xtest, Ytest], format="csr")

In [12]:
# Export the selected rows of `df` as a tab-separated file.
# NOTE(review): this selects by position (`iloc`), whereas the other cells
# select with `df["text"][ids_df]` / `df["label"][ids_df]`; the two only pick
# the same rows if the index values of `df` coincide with row positions -
# verify against the dataset.
df.iloc[ids_df].to_csv("../hufa_test_stem_skip/fn_samples", sep='\t')