## Bag of Words and TF-IDF

* `CountVectorizer`: bag of words (token counts)
* `TfidfTransformer`: TF-IDF values computed from an existing count matrix
* `TfidfVectorizer`: bag of words and TF-IDF in one step

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: tokenizer models, the stop word
# lists, and the WordNet data for the lemmatizer.
nltk.download('punkt')
# Newer NLTK releases load the `punkt_tab` resource in word_tokenize instead
# of the pickled `punkt` models; fetching both keeps the notebook runnable
# on old and new NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
# Kept as the last expression so the cell still displays this call's result.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/alok-kumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alok-
[nltk_data]     kumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/alok-
[nltk_data]     kumar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Five short example documents; each string is treated as one document.
corpus = [
    "The first time you see The Second Renaissance it may look boring.",
    "Look at it at least twice and definitely watch part 2.",
    "It will change your view of the matrix.",
    "Are the human people the ones who started the war?",
    "Is AI a bad thing ?",
]

### Stop word removal and lemmatization

In [3]:
# English stop word list from NLTK; tokenize() drops any token found in it
stop_words = stopwords.words('english')
# shared WordNet lemmatizer instance used by tokenize()
lemmatizer = WordNetLemmatizer()

Use the skills you learned so far to create a function `tokenize` that takes in a string of text and applies the following:

* case normalization (convert to all lowercase)
* punctuation removal
* tokenization, lemmatization, and stop word removal using `nltk`


In [10]:
def tokenize(text):
    """
    Turn a raw string into a list of lemmatized tokens with stop words removed.

    Input: string of text
    Output: list of tokens after case normalization, punctuation removal,
            tokenization, stop word removal and lemmatization

    Relies on the notebook-level `stop_words` and `lemmatizer` objects.
    """
    # case normalization
    text = text.lower()

    # punctuation removal: substitute a space instead of deleting the character,
    # so words separated only by punctuation stay separate ("war,peace",
    # "well-known") and contractions break into pieces ("don't" -> "don", "t")
    # rather than fusing into a new word ("dont") that would slip past the
    # stop word filter. \w already covers digits, so no separate \d is needed.
    text = re.sub(r'[^\w\s]', ' ', text)

    # tokenization
    text_tokens = word_tokenize(text)

    # stop words are filtered on the raw token; the survivors are then lemmatized
    tokens = [lemmatizer.lemmatize(word) for word in text_tokens if word not in stop_words]

    return tokens

## CountVectorizer: Bag of Words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# initialize the vectorizer object with the custom tokenizer.
# token_pattern is only consulted by the built-in tokenizer; setting it to
# None makes it explicit that it is unused and avoids scikit-learn's
# "token_pattern will not be used" warning on current versions.
vect = CountVectorizer(tokenizer=tokenize, token_pattern=None)

In [12]:
# learn the vocabulary from the corpus and build the document-term count matrix
x = vect.fit_transform(corpus)

In [13]:
# show the counts as a dense array: one row per document, one column per vocabulary term
x.toarray()

array([[0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
        0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0]], dtype=int64)

In [22]:
# check the vocabulary of vect: maps each token to its column index in the count matrix
vect.vocabulary_

{'2': 0,
 'ai': 1,
 'bad': 2,
 'boring': 3,
 'change': 4,
 'definitely': 5,
 'first': 6,
 'human': 7,
 'least': 8,
 'look': 9,
 'matrix': 10,
 'may': 11,
 'one': 12,
 'part': 13,
 'people': 14,
 'renaissance': 15,
 'second': 16,
 'see': 17,
 'started': 18,
 'thing': 19,
 'time': 20,
 'twice': 21,
 'view': 22,
 'war': 23,
 'watch': 24}

## Tfidf Transformer

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer

# smooth_idf=False turns off idf smoothing (scikit-learn enables it by default)
transformer = TfidfTransformer(smooth_idf=False)

In [16]:
# use the count vectorizer to compute tf-idf values
# use the count vectorizer output (x) to compute tf-idf values
tfidf = transformer.fit_transform(x)

In [17]:
# dense view of the tf-idf weights, same row/column layout as the count matrix
tfidf.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.36419547,  0.        ,
         0.        ,  0.36419547,  0.        ,  0.        ,  0.26745392,
         0.        ,  0.36419547,  0.        ,  0.        ,  0.        ,
         0.36419547,  0.36419547,  0.36419547,  0.        ,  0.        ,
         0.36419547,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.39105193,  0.        ,  0.        ,  0.        ,  0.        ,
         0.39105193,  0.        ,  0.        ,  0.39105193,  0.28717648,
         0.        ,  0.        ,  0.        ,  0.39105193,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.39105193,  0.        ,  0.        ,  0.39105193],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.57735027,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.57735027,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0

## Tfidf Vectorizer

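`TfidfVectorizer` = `CountVectorizer` + `TfidfTransformer`. Note that the vectorizer below is created with default settings, so its tokenization and idf smoothing differ from the `tokenize`-based pipeline above and the resulting matrix is not identical.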

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: created with default settings — the custom tokenize() and smooth_idf=False
# used for the CountVectorizer/TfidfTransformer pair are not passed here, so this
# result is not expected to match `tfidf`.
vectorizer = TfidfVectorizer()

In [19]:
# compute bag of words count and tf-idf values
# compute bag of words count and tf-idf values in a single step
# NOTE: `X` (upper case) is a different object from the count matrix `x` built earlier
X = vectorizer.fit_transform(corpus)

In [20]:
# dense tf-idf matrix from the default-configured vectorizer; it has 35 columns
# versus 25 for `vect`, because the vocabulary comes from the default tokenization
X.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.30298183,  0.        ,  0.        ,  0.30298183,  0.        ,
         0.        ,  0.20291046,  0.        ,  0.24444384,  0.        ,
         0.30298183,  0.        ,  0.        ,  0.        ,  0.        ,
         0.30298183,  0.30298183,  0.30298183,  0.        ,  0.40582093,
         0.        ,  0.30298183,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.30298183,  0.        ],
       [ 0.        ,  0.30015782,  0.        ,  0.60031564,  0.        ,
         0.        ,  0.        ,  0.30015782,  0.        ,  0.        ,
         0.        ,  0.20101919,  0.30015782,  0.24216544,  0.        ,
         0.        ,  0.        ,  0.        ,  0.30015782,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.30015782,  0.        ,  0.        ,
         0.30015782,  0.        ,  0.        ,  0.