# 2- Data preprocessing e ingeniería de características

Ingeniería de características. Estrategias y métodos a analizar en este notebook:

- Bag-of-words
- n-grams
- Term Frequency-Inverse Document Frequency (TF-IDF)

Referencias: https://github.com/dipanjanS

# Import necessary dependencies and settings

In [24]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alberto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Sample corpus of text documents

Definimos un texto sencillo con una serie de frases y etiquetas

In [43]:
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'    
]
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 
                          'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

Unnamed: 0,Document,Category
0,El cielo azul es maravilloso.,tiempo
1,Me encanta este cielo azul!,tiempo
2,El zorro marrón salta sobre el perro perezoso.,animales
3,El zorro marrón es rápido y el perro azul es p...,animales
4,El cielo es muy azul y está muy bonito hoy,tiempo
5,El perro es perezoso pero el zorro marrón es r...,animales


# Simple text pre-processing

In [46]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [47]:
norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['cielo azul maravilloso', 'encanta cielo azul',
       'zorro marrn salta perro perezoso',
       'zorro marrn rpido perro azul perezoso !',
       'cielo azul est bonito hoy', 'perro perezoso zorro marrn rpido !'],
      dtype='<U39')

# Bag-of-Words Model

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1],
       [1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1],
       [1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1]], dtype=int64)

In [49]:
vocab = cv.get_feature_names()
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,azul,bonito,cielo,encanta,est,hoy,maravilloso,marrn,perezoso,perro,rpido,salta,zorro
0,1,0,1,0,0,0,1,0,0,0,0,0,0
1,1,0,1,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,1,1,0,1,1
3,1,0,0,0,0,0,0,1,1,1,1,0,1
4,1,1,1,0,1,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,1,1,1,0,1


# Bag of n-grams Model

In [50]:
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()
vocab = bv.get_feature_names()
pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,azul est,azul maravilloso,azul perezoso,bonito hoy,cielo azul,encanta cielo,est bonito,marrn rpido,marrn salta,perezoso zorro,perro azul,perro perezoso,rpido perro,salta perro,zorro marrn
0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1
3,0,0,1,0,0,0,0,1,0,0,1,0,1,0,1
4,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1


# TF-IDF Model

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,azul,bonito,cielo,encanta,est,hoy,maravilloso,marrn,perezoso,perro,rpido,salta,zorro
0,0.44,0.0,0.51,0.0,0.0,0.0,0.74,0.0,0.0,0.0,0.0,0.0,0.0
1,0.44,0.0,0.51,0.74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.41,0.41,0.0,0.59,0.41
3,0.35,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.4,0.4,0.48,0.0,0.4
4,0.3,0.51,0.35,0.0,0.51,0.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.43,0.43,0.43,0.51,0.0,0.43


# Document Similarity

In [52]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.453926,0.0,0.151644,0.313826,0.0
1,0.453926,1.0,0.0,0.151644,0.313826,0.0
2,0.0,0.0,1.0,0.654475,0.0,0.697533
3,0.151644,0.151644,0.654475,1.0,0.104841,0.93827
4,0.313826,0.313826,0.0,0.104841,1.0,0.0
5,0.0,0.0,0.697533,0.93827,0.0,1.0


## Clustering documents using similarity features

In [53]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=2)
km.fit_transform(similarity_df)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,El cielo azul es maravilloso.,tiempo,0
1,Me encanta este cielo azul!,tiempo,0
2,El zorro marrón salta sobre el perro perezoso.,animales,1
3,El zorro marrón es rápido y el perro azul es p...,animales,1
4,El cielo es muy azul y está muy bonito hoy,tiempo,0
5,El perro es perezoso pero el zorro marrón es r...,animales,1


# Topic models

In [54]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_topics=2, max_iter=100, random_state=42)
dt_matrix = lda.fit_transform(tv_matrix)
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])
features

Unnamed: 0,T1,T2
0,0.803876,0.196124
1,0.803872,0.196128
2,0.164032,0.835968
3,0.182072,0.817928
4,0.831978,0.168022
5,0.161401,0.838599


## Show topics and their weights

In [55]:
tt_matrix = lda.components_
for topic_weights in tt_matrix:
    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
    topic = sorted(topic, key=lambda x: -x[1])
    topic = [item for item in topic if item[1] > 0.6]
    print(topic)
    print()


[('cielo', 1.8633678213052478), ('azul', 1.746404591997586), ('maravilloso', 1.2261460223542635), ('encanta', 1.2260658988418311), ('est', 1.0016731739278135), ('hoy', 1.0015784254698974), ('bonito', 1.0015120529520418)]

[('marrn', 1.7251536231820133), ('perro', 1.7251142695717183), ('zorro', 1.7249704534103825), ('perezoso', 1.7249062089963374), ('rpido', 1.4734848357659662), ('salta', 1.0743700451161469), ('azul', 0.7790302587406899)]



## Clustering documents using topic model features

In [56]:
km = KMeans(n_clusters=2)
km.fit_transform(features)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,El cielo azul es maravilloso.,tiempo,1
1,Me encanta este cielo azul!,tiempo,1
2,El zorro marrón salta sobre el perro perezoso.,animales,0
3,El zorro marrón es rápido y el perro azul es p...,animales,0
4,El cielo es muy azul y está muy bonito hoy,tiempo,1
5,El perro es perezoso pero el zorro marrón es r...,animales,0


# Word Embeddings

In [37]:
from gensim.models import word2vec

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]

# Set values for various parameters
feature_size = 10    # Word vector dimensionality  
window_context = 10          # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words

w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, 
                          window=window_context, min_count = min_word_count,
                          sample=sample)

In [57]:
w2v_model.wv['cielo']

KeyError: "word 'cielo' not in vocabulary"

In [39]:
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector
    
    
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [40]:
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)
pd.DataFrame(w2v_feature_array)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.005688,0.010045,0.009654,0.011344,-0.003961,-0.00602,-0.007675,0.00297,-0.001411,-0.018327
1,0.000102,0.005705,0.007002,0.012783,-0.011234,-0.003832,-0.015921,0.001519,0.00511,-0.019446
2,0.009695,0.008582,0.020755,0.020775,0.017472,0.014232,0.001473,-0.005121,-0.002159,0.004147
3,0.005189,0.010487,0.011684,0.02275,0.011136,0.005468,-9.3e-05,-0.004319,-0.002197,-0.001337
4,-0.018018,0.011073,0.002856,0.000967,-0.008941,0.011948,0.005797,0.00429,0.002976,0.001147
5,0.007353,0.017026,0.023971,0.025173,0.017347,0.012788,0.008473,-0.00136,6.4e-05,0.007368


In [41]:
from sklearn.cluster import AffinityPropagation

ap = AffinityPropagation()
ap.fit(w2v_feature_array)
cluster_labels = ap.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animals,1
3,The brown fox is quick and the blue dog is lazy!,animals,1
4,The sky is very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animals,1
