## Text classification blueprint

- prepare train and test datasets
- text normalization
- feature extraction
- model training
- model prediction and evaluation
- model deployment

In [3]:
# Maps English contractions to their expanded forms.
# Both "I..." and "i..." spellings are listed explicitly so a lookup can
# succeed on either the original or the lowercased contraction.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [68]:
import re
import nltk
import string
import pandas as pd
import numpy as np
from pattern.en import tag

from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import scipy.sparse as sp
from numpy.linalg import norm

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [5]:
# English stopwords from the NLTK stopwords corpus; read by remove_stopwords().
stopword_list = nltk.corpus.stopwords.words('english')
# Shared WordNet lemmatizer instance; read by lemmatize_text().
wnl = WordNetLemmatizer()

In [6]:
def tokenize_text(text):
    """Split ``text`` into word tokens using ``nltk.word_tokenize``.

    Returns a list of tokens, each with leading and trailing whitespace
    stripped.
    """
    return [word.strip() for word in nltk.word_tokenize(text)]

In [8]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and strip any remaining apostrophes.

    Parameters
    ----------
    text : str
        Input text.
    contraction_mapping : dict
        Maps a contraction (e.g. "can't") to its expansion (e.g. "cannot").
        Matching is case-insensitive; the first character of the matched
        text is kept so that the original capitalisation survives.

    Returns
    -------
    str
        The text with every known contraction expanded and all remaining
        apostrophe characters removed.
    """
    if not contraction_mapping:
        # An empty alternation would match the empty string at every
        # position, so skip straight to the apostrophe clean-up.
        return re.sub("'", '', text)

    # Regex alternation accepts the first branch that matches, so longer
    # contractions must come first; otherwise "can't" shadows "can't've"
    # and the leftover "'ve" is mangled by the apostrophe removal below.
    ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
    # Escape keys so they are always matched literally.
    contraction_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        # Try the exact spelling first, then the lowercased one.
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower()))
        if expanded_contraction is None:
            # The case-insensitive match has no entry under either
            # spelling; leave the text untouched instead of failing.
            return match
        # Keep the original first character to preserve capitalisation.
        return match[0] + expanded_contraction[1:]

    expanded_text = contraction_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", '', expanded_text)
    return expanded_text

In [10]:
# Annotate text tokens with POS tags.
def pos_tag_text(text):
    """Tag ``text`` and return ``(lowercased_word, wordnet_pos)`` pairs.

    Tags produced by ``tag`` are translated to WordNet POS constants based
    on their first letter; tags with no WordNet counterpart become ``None``.
    """
    def penn_to_wn_tags(pos_tag):
        # Translate a Penn Treebank tag into the matching WordNet constant.
        if pos_tag.startswith('J'):
            return wn.ADJ
        if pos_tag.startswith('V'):
            return wn.VERB
        if pos_tag.startswith('N'):
            return wn.NOUN
        if pos_tag.startswith('R'):
            return wn.ADV
        return None

    annotated = []
    for word, pos_tag in tag(text):
        annotated.append((word.lower(), penn_to_wn_tags(pos_tag)))
    return annotated

In [11]:
# Lemmatize based on POS tags
def lemmatize_text(text):
    """Return ``text`` with each POS-tagged word replaced by its lemma.

    Words whose tag has no WordNet equivalent are kept as returned by
    ``pos_tag_text``. The result is a single space-joined string.
    """
    lemmas = []
    for word, pos_tag in pos_tag_text(text):
        if pos_tag:
            lemmas.append(wnl.lemmatize(word, pos_tag))
        else:
            lemmas.append(word)
    return ' '.join(lemmas)

In [12]:
def remove_special_characters(text):
    """Strip every ``string.punctuation`` character from ``text``.

    The text is tokenized first; tokens that become empty after the
    punctuation is removed are dropped, and the rest are space-joined.
    """
    words = tokenize_text(text)
    punctuation_re = re.compile('[{}]'.format(re.escape(string.punctuation)))
    cleaned_words = []
    for word in words:
        cleaned = punctuation_re.sub('', word)
        if cleaned:
            cleaned_words.append(cleaned)
    return ' '.join(cleaned_words)

In [13]:
def remove_stopwords(text):
    """Remove stopwords from ``text``.

    The text is split with ``tokenize_text`` and every token found in the
    module-level ``stopword_list`` is dropped. The comparison is exact
    (case-sensitive). Returns the remaining tokens joined by single spaces.
    """
    # The helper defined in this notebook is tokenize_text; the name
    # `tokenize` is not defined at module level.
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [14]:
def normalize_corpus(corpus, tokenize=False):
    """Normalize every document in ``corpus``.

    Each document goes through contraction expansion, lemmatization,
    special-character removal and stopword removal, in that order.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.
    tokenize : bool, optional
        When True, each normalized document is returned as a list of
        tokens instead of a string.

    Returns
    -------
    list
        One entry per input document: a string, or a list of tokens when
        ``tokenize`` is True.
    """
    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text, CONTRACTION_MAP)
        text = lemmatize_text(text)
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
        # Append exactly once so the output stays aligned with the input.
        normalized_corpus.append(text)

    return normalized_corpus

In [15]:
# Small training corpus used to fit the vectorizers in the cells below.
CORPUS = [
    'the sky is blue',
    'sky is blue and sky is beautiful',
    'the beautiful sky is blue',
    'i love blue cheese'
]
# Single unseen document, transformed with the vectorizers fitted on CORPUS.
new_doc = ['loving this blue sky today']

In [17]:
def bow_extractor(corpus, ngram_range=(1, 1)):
    """Fit a bag-of-words ``CountVectorizer`` on ``corpus``.

    Returns a ``(vectorizer, features)`` tuple: the fitted vectorizer and
    the count features it produced for ``corpus``.
    """
    count_vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    return count_vectorizer, count_vectorizer.fit_transform(corpus)

In [18]:
# Build bow vectorizer and get features.
bow_vectorizer, bow_features = bow_extractor(CORPUS)
# Dense copy for display only; bow_features itself is reused by later cells
# (tfidf_transformer and the from-scratch tf/df computations).
features = bow_features.todense()
features

matrix([[0, 0, 1, 0, 1, 0, 1, 1],
        [1, 1, 1, 0, 2, 0, 2, 0],
        [0, 1, 1, 0, 1, 0, 1, 1],
        [0, 0, 1, 1, 0, 1, 0, 0]])

In [19]:
# Extract features from new document using built vectorizer.
# Only 'blue' and 'sky' receive nonzero counts in the output below.
new_doc_features = bow_vectorizer.transform(new_doc)
# The name is rebound to the dense form; later cells use this dense version.
new_doc_features = new_doc_features.todense()
new_doc_features

matrix([[0, 0, 1, 0, 0, 0, 1, 0]])

In [23]:
# Print feature names.
# get_feature_names() was removed from recent scikit-learn releases in
# favour of get_feature_names_out(); use the new accessor when it exists and
# convert to a plain list so the displayed value keeps the same form.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = bow_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = bow_vectorizer.get_feature_names()
feature_names

['and', 'beautiful', 'blue', 'cheese', 'is', 'love', 'sky', 'the']

In [24]:
def display_features(features, feature_names):
    """Wrap ``features`` in a DataFrame whose columns are ``feature_names``."""
    return pd.DataFrame(features, columns=feature_names)

In [25]:
# Tabulate the bag-of-words counts for CORPUS, one column per vocabulary term.
display_features(features, feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,0,0,1,0,1,0,1,1
1,1,1,1,0,2,0,2,0
2,0,1,1,0,1,0,1,1
3,0,0,1,1,0,1,0,0


In [27]:
# Tabulate the bag-of-words counts for new_doc against the same vocabulary.
display_features(new_doc_features, feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,0,0,1,0,0,0,1,0


In [41]:
def tfidf_transformer(bow_matrix):
    """Fit a ``TfidfTransformer`` on a bag-of-words count matrix.

    The transformer is configured with L2 normalisation, idf weighting and
    idf smoothing. Returns a ``(transformer, tfidf_matrix)`` tuple.
    """
    tfidf = TfidfTransformer(use_idf=True, smooth_idf=True, norm='l2')
    return tfidf, tfidf.fit_transform(bow_matrix)

In [43]:
# get_feature_names() was removed from recent scikit-learn releases in
# favour of get_feature_names_out(); use the new accessor when it exists.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = bow_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = bow_vectorizer.get_feature_names()

# Build tfidf transformer and show train corpus tfidf features.
tfidf_trans, tfidf_features = tfidf_transformer(bow_features)
# `features` is rebound here from the raw counts to the rounded tfidf values.
features = np.round(tfidf_features.todense(), 2)
display_features(features, feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,0.0,0.0,0.4,0.0,0.49,0.0,0.49,0.6
1,0.44,0.35,0.23,0.0,0.56,0.0,0.56,0.0
2,0.0,0.52,0.34,0.0,0.42,0.0,0.42,0.52
3,0.0,0.0,0.35,0.66,0.0,0.66,0.0,0.0


In [44]:
# Show tfidf features for new_doc using built tfidf transformer.
# Input is the dense bag-of-words row computed for new_doc earlier.
nd_tfidf = tfidf_trans.transform(new_doc_features)
# Rounded to two decimals for display only.
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,0.0,0.0,0.63,0.0,0.0,0.0,0.77,0.0


# Building the tfidf from scratch.

In [46]:
# Compute term frequency.
# The raw bag-of-words counts are used directly as term frequencies.
tf = bow_features.todense()
# Convert to a float64 ndarray for the arithmetic in the following cells.
tf = np.array(tf, dtype='float64')

# Show term frequencies.
display_features(tf, feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
1,1.0,1.0,1.0,0.0,2.0,0.0,2.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
3,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0


In [52]:
# Build the document frequency vector (one value per vocabulary term).
# Differences of consecutive CSC indptr entries give the number of stored
# entries in each column, i.e. how many documents contain each term.
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
# NOTE(review): this cell rebinds `df` from itself, so re-running it alone
# would add 1 again.
df = 1 + df # To smoothen idf later.

In [53]:
# Show document frequency.
# Values already include the +1 smoothing applied in the previous cell;
# wrapped in a list so the vector is displayed as a single row.
display_features([df], feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,2,3,5,2,4,2,4,3


In [55]:
# Compute inverse document frequencies.
# Smoothed form: idf = 1 + ln((1 + n_docs) / (1 + doc_freq)); the "+ 1" on
# doc_freq was already applied to `df` in an earlier cell.
total_docs = 1 + len(CORPUS)
idf = 1.0 + np.log(float(total_docs) / df)

In [56]:
# Show inverse document frequencies.
# Rounded to two decimals for display; `idf` itself is left unrounded.
display_features([np.round(idf, 2)], feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,1.92,1.51,1.0,1.92,1.22,1.92,1.22,1.51


In [57]:
# Compute idf diagonal matrix.
# The idf vector is placed on the main diagonal of a square
# (total_features x total_features) sparse matrix.
total_features = bow_features.shape[1]
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
# NOTE(review): `idf` is rebound from the idf vector to the dense diagonal
# matrix, so this cell is not idempotent; re-running it requires re-running
# the cell that computes the idf vector first.
idf = idf_diag.todense()

In [59]:
# Print the idf diagonal matrix.
# Rounded to two decimals for display only.
np.round(idf, 2)

array([[1.92, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 1.51, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 1.92, 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.22, 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 1.92, 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.22, 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.51]])

In [61]:
# Compute tfidf feature matrix.
# tf is (documents x terms) and idf is the (terms x terms) diagonal matrix,
# so `*` must act as a matrix product here for the shapes to be compatible.
# NOTE(review): presumably this relies on idf being the np.matrix returned
# by todense() - confirm before changing idf to a plain ndarray.
tfidf = tf * idf
display_features(np.round(tfidf, 2), feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,0.0,0.0,1.0,0.0,1.22,0.0,1.22,1.51
1,1.92,1.51,1.0,0.0,2.45,0.0,2.45,0.0
2,0.0,1.51,1.0,0.0,1.22,0.0,1.22,1.51
3,0.0,0.0,1.0,1.92,0.0,1.92,0.0,0.0


In [62]:
# Compute L2 norms.
# axis=1 gives one norm per row, i.e. one per document.
norms = norm(tfidf, axis=1)
norms

array([2.50494598, 4.35010407, 2.92529459, 2.88865719])

In [63]:
# Print the norms for each document.
# Rounded to two decimals for display only.
np.round(norms, 2)

array([2.5 , 4.35, 2.93, 2.89])

In [64]:
# Compute normalized tfidf.
# norms[:, None] turns the per-document norms into a column so each row of
# tfidf is divided by its own norm.
norm_tfidf = tfidf / norms[:, None]

In [65]:
# Show final feature matrix.
# The displayed values match the TfidfTransformer output shown earlier.
display_features(np.round(norm_tfidf, 2), feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,0.0,0.0,0.4,0.0,0.49,0.0,0.49,0.6
1,0.44,0.35,0.23,0.0,0.56,0.0,0.56,0.0
2,0.0,0.52,0.34,0.0,0.42,0.0,0.42,0.52
3,0.0,0.0,0.35,0.66,0.0,0.66,0.0,0.0


In [67]:
# Compute new doc terms freq from bow freqs.
nd_tf = new_doc_features
# Convert to a float64 ndarray, as was done for the training term frequencies.
nd_tf = np.array(nd_tf, dtype='float64')

# Compute tfidf using idf matrix from train corpus.
# `idf` here is the dense diagonal matrix built earlier, not the idf vector.
nd_tfidf = nd_tf * idf
nd_norms = norm(nd_tfidf, axis = 1)
# Divide the row by its L2 norm, mirroring the training-corpus steps.
norm_nd_tfidf = nd_tfidf / nd_norms[:, None]

# Show new_doc tfidf feature vector.
display_features(np.round(norm_nd_tfidf, 2), feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,0.0,0.0,0.63,0.0,0.0,0.0,0.77,0.0


In [69]:
def tfidf_extractor(corpus, ngram_range=(1, 1)):
    """Fit a ``TfidfVectorizer`` directly on raw text documents.

    The vectorizer uses L2 normalisation, idf weighting and idf smoothing.
    Returns a ``(vectorizer, features)`` tuple: the fitted vectorizer and
    the tfidf features it produced for ``corpus``.
    """
    options = {
        'min_df': 1,
        'norm': 'l2',
        'smooth_idf': True,
        'use_idf': True,
        'ngram_range': ngram_range,
    }
    tfidf = TfidfVectorizer(**options)
    return tfidf, tfidf.fit_transform(corpus)

In [70]:
# Build tfidf vectorizer and get training corpus feature vectors.
# NOTE: `tfidf_features` is rebound here; it previously held the
# tfidf_transformer() output.
tfidf_vectorizer, tfidf_features = tfidf_extractor(CORPUS)
# Column labels come from the bag-of-words vectorizer's feature_names.
display_features(np.round(tfidf_features.todense(), 2),
                 feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,0.0,0.0,0.4,0.0,0.49,0.0,0.49,0.6
1,0.44,0.35,0.23,0.0,0.56,0.0,0.56,0.0
2,0.0,0.52,0.34,0.0,0.42,0.0,0.42,0.52
3,0.0,0.0,0.35,0.66,0.0,0.66,0.0,0.0


In [71]:
# Get tfidf feature vector for the new document.
# The raw text goes straight to the fitted vectorizer; `nd_tfidf` is rebound
# from the value computed in earlier cells.
nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.todense(), 2), feature_names)

Unnamed: 0,and,beautiful,blue,cheese,is,love,sky,the
0,0.0,0.0,0.63,0.0,0.0,0.0,0.77,0.0
