# Text Classification

# Text Normalization

In [1]:
from contractions import CONTRACTION_MAP
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer

In [4]:
# English stopwords loaded from NLTK's stopwords corpus; read by remove_stopwords.
stopword_list = nltk.corpus.stopwords.words("english")
# Single WordNet lemmatizer instance shared by lemmatize_text.
wnl = WordNetLemmatizer()

In [5]:
def tokenize_text(text):
    """Split ``text`` into word tokens with NLTK and strip whitespace around each token.

    Returns a list of token strings.
    """
    return [piece.strip() for piece in nltk.word_tokenize(text)]

In [7]:
# Alternation over every known contraction. Keys are escaped so they match
# literally, and ordered longest-first so a long contraction is tried before
# any shorter contraction that happens to be a prefix of it.
contraction_pattern = re.compile(
    '({})'.format('|'.join(
        re.escape(key) for key in sorted(CONTRACTION_MAP, key=len, reverse=True))),
    flags=re.IGNORECASE | re.DOTALL)


def expand_contractions(text):
    """Replace every contraction found in ``text`` with its expanded form.

    The lookup uses ``CONTRACTION_MAP``; the case of the first matched
    character is preserved in the expansion. Any apostrophes that remain after
    the expansion are removed.

    Parameters
    ----------
    text : str
        Text to expand.

    Returns
    -------
    str
        The text with contractions expanded and apostrophes stripped.
    """

    def expand_match(contraction):
        match = contraction.group(0)
        # The pattern is case-insensitive, so the matched text may differ in
        # case from the stored key: try the exact text first, then lowercase.
        expanded = CONTRACTION_MAP.get(match) or CONTRACTION_MAP.get(match.lower())
        if not expanded:
            # No usable mapping for this casing: leave the text untouched.
            return match
        # Keep the original first character so capitalisation survives.
        return match[0] + expanded[1:]

    expanded_text = contraction_pattern.sub(expand_match, text)
    return re.sub("'", "", expanded_text)

In [40]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a Treebank-style POS tag to the corresponding WordNet POS constant.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields ``None``.
    """
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Resolve the constant only for the branch that matched.
            return getattr(wordnet, attr_name)
    return None
    
def pos_tag_text(text):
    """POS-tag ``text`` and translate each tag to its WordNet equivalent.

    Parameters
    ----------
    text : str or sequence of str
        Either raw text or an already tokenized sequence of words. A raw
        string is tokenized first: ``nltk.pos_tag`` works on a sequence of
        tokens, so handing it a string would tag every single character.

    Returns
    -------
    list of (str, object)
        ``(word, wordnet_pos)`` pairs; ``wordnet_pos`` is ``None`` when
        ``get_wordnet_pos`` has no mapping for the tag.
    """
    tokens = tokenize_text(text) if isinstance(text, str) else text
    return [(word, get_wordnet_pos(tag)) for word, tag in nltk.pos_tag(tokens)]

def lemmatize_text(text):
    """Lemmatize the words returned by ``pos_tag_text(text)``.

    Words with a WordNet POS are lemmatized with the shared ``wnl``
    lemmatizer; words without one are kept as they are. The results are
    joined with single spaces and returned as one string.
    """
    lemmas = []
    for word, pos_tag in pos_tag_text(text):
        if pos_tag:
            lemmas.append(wnl.lemmatize(word, pos_tag))
        else:
            lemmas.append(word)
    return ' '.join(lemmas)

In [42]:
# Character class matching any single character of string.punctuation.
pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))

def remove_special_characters(text):
    """Strip punctuation characters from every token of ``text``.

    Tokens that become empty after stripping are dropped; the remaining
    tokens are joined with single spaces.
    """
    stripped_tokens = (pattern.sub('', token) for token in tokenize_text(text))
    return ' '.join(token for token in stripped_tokens if token)

In [43]:
def remove_stopwords(text):
    """Remove every token of ``text`` that appears in ``stopword_list``.

    The comparison is exact (case-sensitive), as before. The surviving tokens
    are joined with single spaces and returned as one string.
    """
    # Hash-based membership test instead of scanning the list for each token.
    stopwords = set(stopword_list)
    return ' '.join(token for token in tokenize_text(text) if token not in stopwords)

In [44]:
# Normalization steps applied, in this order, to every document.
pipeline = [expand_contractions, lemmatize_text, remove_special_characters, remove_stopwords]

def normalize_corpus(corpus, tokenize=False):
    """Normalize every document of ``corpus`` by running it through ``pipeline``.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.
    tokenize : bool, default False
        When true, each normalized document is returned as a list of tokens
        instead of a string.

    Returns
    -------
    list
        Exactly one entry per input document, in input order: the normalized
        string, or its token list when ``tokenize`` is true.
    """
    normalized_corpus = []

    for text in corpus:
        for step in pipeline:
            text = step(text)

        # Append once per document. The earlier version appended the string
        # and then, with tokenize=True, the token list as well, so the result
        # had two entries per document.
        normalized_corpus.append(tokenize_text(text) if tokenize else text)

    return normalized_corpus

# Feature Extraction

In [45]:
# Toy corpus of four short documents used to fit the feature extractors below.
CORPUS = [
    'the sky is blue',
    'sky is blue and sky is beautiful',
    'the beautiful sky is so blue',
    'i love blue cheese'
]

# Single unseen document, transformed with the already fitted extractors.
new_doc = ['loving this blue sky today']

In [48]:
import pandas as pd

In [58]:
def gf(vector, labels):
    """Wrap a feature matrix in a DataFrame whose columns are named by ``labels``."""
    frame = pd.DataFrame(vector, columns=labels)
    return frame

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

# Count-based vectorizer over 1- and 2-word n-grams, fitted on CORPUS.
bow_vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 2))
bow_features = bow_vectorizer.fit_transform(CORPUS)

# Show the document-term counts as a table, one column per extracted feature.
# NOTE(review): newer scikit-learn releases expose get_feature_names_out()
# instead of get_feature_names() — confirm against the installed version.
print(gf(bow_features.todense(), bow_vectorizer.get_feature_names()))

   and  and sky  beautiful  beautiful sky  blue  blue and  blue cheese  \
0    0        0          0              0     1         0            0   
1    1        1          1              0     1         1            0   
2    0        0          1              1     1         0            0   
3    0        0          0              0     1         0            1   

   cheese  is  is beautiful   ...     is so  love  love blue  sky  sky is  so  \
0       0   1             0   ...         0     0          0    1       1   0   
1       0   2             1   ...         0     0          0    2       2   0   
2       0   1             0   ...         1     0          0    1       1   1   
3       1   0             0   ...         0     1          1    0       0   0   

   so blue  the  the beautiful  the sky  
0        0    1              0        1  
1        0    0              0        0  
2        1    1              1        0  
3        0    0              0        0  

[4 rows x 21

In [65]:
# Reuse the vectorizer fitted on CORPUS (transform only, no refit) for the new document.
new_doc_features = bow_vectorizer.transform(new_doc)
# Transposed so the features are listed vertically, one per row.
print(gf(new_doc_features.todense(), bow_vectorizer.get_feature_names()).T)

               0
and            0
and sky        0
beautiful      0
beautiful sky  0
blue           1
blue and       0
blue cheese    0
cheese         0
is             0
is beautiful   0
is blue        0
is so          0
love           0
love blue      0
sky            1
sky is         0
so             0
so blue        0
the            0
the beautiful  0
the sky        0


In [68]:
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf_transformer(bow_matrix):
    """Fit a ``TfidfTransformer`` on ``bow_matrix``.

    The transformer is configured with ``norm='l2'``, ``smooth_idf=True`` and
    ``use_idf=True``.

    Returns
    -------
    tuple
        ``(transformer, tfidf_matrix)``: the fitted transformer and the
        transformed matrix.
    """
    settings = dict(norm='l2', smooth_idf=True, use_idf=True)
    fitted = TfidfTransformer(**settings)
    weighted = fitted.fit_transform(bow_matrix)
    return fitted, weighted

In [69]:
import numpy as np

# Column names come from the bag-of-words vectorizer fitted earlier.
# NOTE(review): newer scikit-learn releases expose get_feature_names_out()
# instead of get_feature_names() — confirm against the installed version.
feature_names = bow_vectorizer.get_feature_names()

# Turn the raw counts into TF-IDF weights and display them rounded to 2 decimals.
tfidf_trans, tfidf_features = tfidf_transformer(bow_features)
features = np.round(tfidf_features.todense(), 2)
print(gf(features, feature_names))

    and  and sky  beautiful  beautiful sky  blue  blue and  blue cheese  \
0  0.00     0.00       0.00           0.00  0.27      0.00         0.00   
1  0.31     0.31       0.24           0.00  0.16      0.31         0.00   
2  0.00     0.00       0.28           0.36  0.19      0.00         0.00   
3  0.00     0.00       0.00           0.00  0.25      0.00         0.48   

   cheese    is  is beautiful   ...     is so  love  love blue   sky  sky is  \
0    0.00  0.33          0.00   ...      0.00  0.00       0.00  0.33    0.33   
1    0.00  0.40          0.31   ...      0.00  0.00       0.00  0.40    0.40   
2    0.00  0.23          0.00   ...      0.36  0.00       0.00  0.23    0.23   
3    0.48  0.00          0.00   ...      0.00  0.48       0.48  0.00    0.00   

     so  so blue   the  the beautiful  the sky  
0  0.00     0.00  0.41           0.00     0.52  
1  0.00     0.00  0.00           0.00     0.00  
2  0.36     0.36  0.28           0.36     0.00  
3  0.00     0.00  0.00     

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_extractor(corpus, ngram_range=(1, 1)):
    """Fit a ``TfidfVectorizer`` directly on raw documents.

    Parameters
    ----------
    corpus : iterable of str
        Documents to fit on and transform.
    ngram_range : tuple, default (1, 1)
        Passed through to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(vectorizer, features)``: the fitted vectorizer and the TF-IDF
        matrix for ``corpus``.
    """
    settings = dict(min_df=1,
                    norm='l2',
                    smooth_idf=True,
                    use_idf=True,
                    ngram_range=ngram_range)
    fitted = TfidfVectorizer(**settings)
    matrix = fitted.fit_transform(corpus)
    return fitted, matrix

In [72]:
# Fit on the raw corpus with the default ngram_range=(1, 1).
# Note: this rebinds tfidf_features, replacing the matrix computed by tfidf_transformer above.
tfidf_vectorizer, tfidf_features = tfidf_extractor(CORPUS)

# NOTE(review): newer scikit-learn releases expose get_feature_names_out()
# instead of get_feature_names() — confirm against the installed version.
print(gf(np.round(tfidf_features.todense(), 2), tfidf_vectorizer.get_feature_names()))

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [73]:
# Transform the unseen document with the vectorizer fitted on CORPUS (no refit).
nd_tfidf = tfidf_vectorizer.transform(new_doc)

print(gf(np.round(nd_tfidf.todense(), 2), tfidf_vectorizer.get_feature_names()))

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


## Advanced Word Vectorization Models

In [75]:
import gensim
import nltk

# Word2Vec is trained on token lists, so tokenize both the corpus and the new document.
TOKENIZED_CORPUS = [nltk.word_tokenize(sentence) for sentence in CORPUS]
tokenized_new_doc = [nltk.word_tokenize(sentence) for sentence in new_doc]

# Train 10-dimensional word vectors on the toy corpus.
# NOTE(review): no seed is passed, so the vectors printed below presumably change between runs — confirm.
# NOTE(review): newer gensim releases appear to name the `size` argument `vector_size` — confirm against the installed version.
model = gensim.models.Word2Vec(TOKENIZED_CORPUS, size=10, window=10, min_count=2, sample=1e-3)



In [77]:
# Look the vector up through model.wv (as averaged_word_vectorizer already does);
# indexing the model object itself is the deprecated path.
print(model.wv['sky'])

[-0.04033279  0.04343031 -0.01067983  0.00521895  0.00383821  0.03068873
  0.00573959  0.04973888  0.02468563 -0.01511299]


  """Entry point for launching an IPython kernel.


In [78]:
# Look the vector up through model.wv (as averaged_word_vectorizer already does);
# indexing the model object itself is the deprecated path.
print(model.wv['blue'])

[ 0.03642776  0.03646963  0.0220179  -0.0389447   0.02501153 -0.03793925
  0.04849996  0.00477016 -0.03181459  0.00453657]


  """Entry point for launching an IPython kernel.


In [79]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the vectors of the in-vocabulary words of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of the document.
    model : object
        Either a model exposing its vectors on a ``wv`` attribute, or any
        object that can be indexed by word directly.
    vocabulary : container of str
        Words for which a vector may be looked up; other words are skipped.
    num_features : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``: the mean of the vectors of
        the known words, or all zeros when no word is in ``vocabulary``.
    """
    # Indexing a Word2Vec model directly is deprecated in favour of model.wv.
    # Objects without a `wv` attribute are indexed as-is.
    vectors = getattr(model, 'wv', model)

    feature_vector = np.zeros((num_features,), dtype='float64')
    nwords = 0

    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector = np.add(feature_vector, vectors[word])

    # Guard against division by zero when no word was known.
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector

In [82]:
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word vector per tokenized document.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenized documents.
    model : object
        Trained word-vector model; its vectors are read from ``model.wv`` when
        that attribute exists, otherwise from ``model`` itself.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        One row per document, as produced by ``average_word_vectors``.
    """
    vectors = getattr(model, 'wv', model)
    # The vocabulary list is called `index_to_key` in gensim 4 and
    # `index2word` in earlier releases; accept whichever is present.
    known_words = getattr(vectors, 'index_to_key', None)
    if known_words is None:
        known_words = vectors.index2word
    vocabulary = set(known_words)

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]

    return np.array(features)

In [84]:
# One averaged 10-dimensional vector per document of the tokenized corpus.
avg_word_vec_features = averaged_word_vectorizer(corpus=TOKENIZED_CORPUS, model=model, num_features=10)

# Rounded to 3 decimals for display only.
print(np.round(avg_word_vec_features, 3))

[[ 0.018  0.038 -0.002  0.001 -0.006 -0.01   0.019  0.033  0.019  0.007]
 [-0.001  0.034 -0.006 -0.004 -0.012  0.002  0.012  0.029  0.024  0.005]
 [ 0.009  0.033 -0.007 -0.008 -0.014 -0.004  0.01   0.026  0.021  0.006]
 [ 0.036  0.036  0.022 -0.039  0.025 -0.038  0.048  0.005 -0.032  0.005]]


  if __name__ == '__main__':


In [85]:
# Same averaging for the unseen document, using the model trained on the corpus.
nd_avg_word_vec_features = averaged_word_vectorizer(corpus=tokenized_new_doc, model=model, num_features=10)

# Rounded to 3 decimals for display only.
print(np.round(nd_avg_word_vec_features, 3))

[[-0.002  0.04   0.006 -0.017  0.014 -0.004  0.027  0.027 -0.004 -0.005]]


  if __name__ == '__main__':


## Evaluating Classification Models

In [2]:
from sklearn import metrics
import numpy as np
import pandas as pd
from collections import Counter

# Hand-written binary labels (20 each) used to illustrate the evaluation metrics.
actual_labels = [0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1]
predicted_labels = [0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0]

# Per-class frequency of the actual and of the predicted labels.
ac = Counter(actual_labels)
pc = Counter(predicted_labels)

In [3]:
# (label, count) pairs for the actual labels, most frequent first.
print('Actual counts:', ac.most_common())

Actual counts: [(0, 10), (1, 10)]


In [4]:
# (label, count) pairs for the predicted labels, most frequent first.
print('Predicted counts:', pc.most_common())

Predicted counts: [(0, 11), (1, 9)]


In [7]:
# Rows are actual classes, columns are predicted classes, both in the order [0, 1].
cm = metrics.confusion_matrix(y_true=actual_labels, y_pred=predicted_labels, labels=[0, 1])

# Display names for labels 0 and 1, in that order.
class_names = ['spam', 'ham']

# MultiIndex.from_product builds the same two-level headers without the
# constructor's `labels=` keyword, which recent pandas releases no longer accept.
print(pd.DataFrame(data=cm,
                   columns=pd.MultiIndex.from_product([['Predicted:'], class_names]),
                   index=pd.MultiIndex.from_product([['Actual:'], class_names])))

             Predicted:    
                   spam ham
Actual: spam          5   5
        ham           6   4


Accuracy is the proportion of all predictions that the model got right:
$\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}$

Precision is the proportion of positive-class predictions that are actually correct
(relevant), out of all predictions made for the positive class:
$\text{Precision} = \frac{TP}{TP + FP}$. This is also known as positive predictive value

Recall is the proportion of actual positive-class instances that were correctly
predicted: $\text{Recall} = \frac{TP}{TP + FN}$. This is also known as hit rate, coverage, or sensitivity

F1 score is another accuracy measure, computed as the harmonic mean of precision
and recall: $F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$

# Building a Multi-Class Classification System

In [10]:
from sklearn.datasets import fetch_20newsgroups
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases without model_selection
    from sklearn.cross_validation import train_test_split

def get_data():
    """Fetch the full 20 Newsgroups dataset.

    All subsets are requested (``subset='all'``), shuffled, with headers,
    footers and quotes removed from every post.
    """
    stripped_parts = ('headers', 'footers', 'quotes')
    return fetch_20newsgroups(subset='all', shuffle=True, remove=stripped_parts)

def prepare_datasets(corpus, labels, test_size=.33):
    """Split documents and labels into train and test parts.

    The split uses a fixed ``random_state=42``.

    Returns
    -------
    tuple
        ``(train_X, test_X, train_Y, test_Y)``.
    """
    train_docs, test_docs, train_targets, test_targets = train_test_split(
        corpus, labels, test_size=test_size, random_state=42)
    return train_docs, test_docs, train_targets, test_targets

def remove_empty_docs(corpus, labels):
    """Drop documents that are empty or whitespace-only, together with their labels.

    Returns
    -------
    tuple of (list, list)
        ``(filtered_corpus, filtered_labels)``, in the original order.
    """
    kept_pairs = [(doc, label) for doc, label in zip(corpus, labels) if doc.strip()]
    filtered_corpus = [doc for doc, _ in kept_pairs]
    filtered_labels = [label for _, label in kept_pairs]
    return filtered_corpus, filtered_labels

In [11]:
# Fetch the dataset (the output below shows it being downloaded on first use).
dataset = get_data()

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [12]:
# Names of the newsgroup classes; integer labels index into this list.
print(dataset.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [13]:
# Raw posts and their integer class labels.
corpus, labels = dataset.data, dataset.target
# Drop posts that are empty or whitespace-only, keeping labels aligned.
corpus, labels = remove_empty_docs(corpus, labels)

In [14]:
# Inspect one document, its integer label and the matching class name.
print('Sample document:', corpus[10])
print('Class label:', labels[10])
print('Actual class label:', dataset.target_names[labels[10]])

Sample document: the blood of the lamb.

This will be a hard task, because most cultures used most animals
for blood sacrifices. It has to be something related to our current
post-modernism state. Hmm, what about used computers?

Cheers,
Kent
Class label: 19
Actual class label: talk.religion.misc


In [15]:
# Train/test split; test_size=.3 overrides prepare_datasets' default of .33.
train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus, labels, test_size=.3)

# TODO: continue from p. 206