# Bag of Words

In [8]:
import numpy as np
import pandas as pd
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [9]:
# Two toy documents that make up the corpus used throughout this notebook.
# doc1 starts with a space on purpose-or-by-accident; clean_text() strips it.
doc1 = " This is a very good and plain paper. This is really good and interesting"
doc2 = "This paper is very interesting, awesome"

In [10]:
# We can also tokenize a sentence or text by using word_tokenize
tokens = word_tokenize(doc1)
# Note: punctuation comes back as its own token (see the '.' in the output),
# so punctuation still has to be removed when cleaning the text
tokens

['This',
 'is',
 'a',
 'very',
 'good',
 'and',
 'plain',
 'paper',
 '.',
 'This',
 'is',
 'really',
 'good',
 'and',
 'interesting']

In [11]:
def clean_text(sent):
    """Collapse whitespace, tokenize, and drop stop words, punctuation and short tokens.

    Parameters
    ----------
    sent : str
        Text to clean. Tokens are compared against the stop-word list with an
        exact (case-sensitive) membership test, so the calls in this notebook
        lower-case the text before passing it in.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    sent = sent.strip(" ")
    # Raw string: "\s" inside a plain string literal is an invalid escape sequence
    sent = re.sub(r"\s+", " ", sent)
    tokens = word_tokenize(sent)
    # Punctuation marks are removed together with the stop words;
    # a set makes each membership test O(1) instead of scanning a list
    stop_updated = set(stopwords.words("english")) | set(punctuation)
    # Keep only tokens that have more than 2 characters and are not in the stop set
    final_word = [term for term in tokens if len(term) > 2 and term not in stop_updated]
    return " ".join(final_word)

In [12]:
# Lower-case before cleaning: clean_text() matches stop words with an exact `in` test,
# so capitalised tokens such as 'This' would presumably not match the stop list
doc1_clean = clean_text(doc1.lower())
doc1_clean

'good plain paper really good interesting'

In [13]:
# Same cleaning for the second document (lower-cased first, then passed to clean_text)
doc2_clean = clean_text(doc2.lower())
doc2_clean

'paper interesting awesome'

In [14]:
# One row per cleaned document, held in a single `cleaned_text` column
doc = pd.DataFrame({'cleaned_text': [doc1_clean, doc2_clean]})
doc

Unnamed: 0,cleaned_text
0,good plain paper really good interesting
1,paper interesting awesome


## Extract features from the text
- CountVectorizer extracts features from the text
- CountVectorizer applies transformations to the text so that we arrive at a matrix of numbers (word counts)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with default settings (the full parameter list is printed when it is fitted below)
cv = CountVectorizer()

- fit: extract distinct words from the text corpus to form a bag of words

In [17]:
# Learn the vocabulary: collect the distinct words found in the `cleaned_text` column
cv.fit(doc['cleaned_text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [18]:
# Obtaining the bag-of-words vocabulary, in column order
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) replaces it, and .tolist() keeps the plain-list display
cv.get_feature_names_out().tolist()

['awesome', 'good', 'interesting', 'paper', 'plain', 'really']

In [19]:
# Mapping from each word to its column index in the document-term matrix
cv.vocabulary_

{'good': 1,
 'plain': 4,
 'paper': 3,
 'really': 5,
 'interesting': 2,
 'awesome': 0}

In [21]:
# Encode every document as a row of word counts using the fitted vocabulary
x = cv.transform(doc['cleaned_text'])
x
# The result is a sparse matrix with 2 rows (documents) and 6 columns (features = unique words)

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [22]:
# Dense view of the counts: one row per document, one column per vocabulary word
x.toarray()

array([[0, 2, 1, 1, 1, 1],
       [1, 0, 1, 1, 0, 0]], dtype=int64)

In [24]:
x.todense()
# Same counts as toarray(), but returned as a numpy `matrix` object instead of an ndarray

matrix([[0, 2, 1, 1, 1, 1],
        [1, 0, 1, 1, 0, 0]], dtype=int64)

In [26]:
# Document-Term Matrix (term-frequency matrix): one row per document, one column per feature
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) is its replacement
dtm = pd.DataFrame(x.toarray(), columns = cv.get_feature_names_out())
dtm

Unnamed: 0,awesome,good,interesting,paper,plain,really
0,0,2,1,1,1,1
1,1,0,1,1,0,0


In [27]:
# Transpose the document-term matrix
dtm.T
# Term-Document Matrix: the features become the index and the documents become the columns

Unnamed: 0,0,1
awesome,0,1
good,2,0
interesting,1,1
paper,1,1
plain,1,0
really,1,0


## n Gram Analysis
- unigram: a token consists of exactly 1 word
- bigram: a token consists of exactly 2 words
- trigram: a token consists of exactly 3 words

### Example: How are you doing
- unigram: "how", "are", "you", "doing"
- bigram: "how are", "are you", "you doing"
- trigram: "how are you", "are you doing"

## Any combination of unigrams, bigrams and trigrams can be obtained from CountVectorizer
- By default, the vectorizers in sklearn extract only unigrams: ngram_range defaults to (1,1), which means every feature is exactly one word long
- To extract bigrams, set ngram_range = (2,2)
- To extract trigrams, set ngram_range = (3,3)
- To extract both unigrams and bigrams, set ngram_range = (1,2)
- To extract unigrams, bigrams and trigrams, set ngram_range = (1,3)

In [28]:
# Using CountVectorizer with ngram_range
# (2,2) restricts the features to bigrams only
count_vect_bg = CountVectorizer(ngram_range = (2,2))

In [31]:
# fit_transform performs fit and transform in a single call:
# it learns the bigram vocabulary and encodes the documents with it
x_bg = count_vect_bg.fit_transform(doc['cleaned_text'])

In [32]:
x_bg
# A sparse matrix is created: 2 documents x 7 distinct bigrams

<2x7 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [33]:
# Dense view of the bigram counts: one row per document, one column per bigram
x_bg.toarray()

array([[1, 1, 0, 0, 1, 1, 1],
       [0, 0, 1, 1, 0, 0, 0]], dtype=int64)

In [34]:
# Each feature is now a combination of 2 adjacent words
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) replaces it, and .tolist() keeps the plain-list display
count_vect_bg.get_feature_names_out().tolist()

['good interesting',
 'good plain',
 'interesting awesome',
 'paper interesting',
 'paper really',
 'plain paper',
 'really good']

In [35]:
# Bigram document-term matrix as a labelled DataFrame
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) is its replacement
df = pd.DataFrame(x_bg.toarray(), columns = count_vect_bg.get_feature_names_out())
df

Unnamed: 0,good interesting,good plain,interesting awesome,paper interesting,paper really,plain paper,really good
0,1,1,0,0,1,1,1
1,0,0,1,1,0,0,0


## max_features: maximum features in the document term matrix
- selecting top features from document term matrix based on the frequency

In [36]:
# Using CountVectorizer with the max_features parameter
# NOTE: this rebinds `cv`, replacing the unigram vectorizer created earlier in the notebook
cv = CountVectorizer(max_features = 3)

In [37]:
# Refit on the same corpus; `x` is overwritten and now has only the 3 retained columns
x = cv.fit_transform(doc['cleaned_text'])
x

<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [38]:
# Document-term matrix restricted to the 3 retained features
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) is its replacement
df = pd.DataFrame(x.toarray(), columns = cv.get_feature_names_out())
df

Unnamed: 0,good,interesting,paper
0,2,1,1
1,0,1,1


In [39]:
df.sum()
# Column-wise sum: the total frequency of each word across the whole dataset

good           2
interesting    2
paper          2
dtype: int64

In [40]:
# Using CountVectorizer with ngram_range and max_features together:
# bigram features only, capped at 3 columns
cv_ng_mf = CountVectorizer(ngram_range = (2,2), max_features = 3)
x = cv_ng_mf.fit_transform(doc['cleaned_text'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) is its replacement
df = pd.DataFrame(x.toarray(), columns = cv_ng_mf.get_feature_names_out())
df

Unnamed: 0,good interesting,good plain,interesting awesome
0,1,1,0
1,0,0,1
