In [29]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Toy corpus of two short documents used by the vectorizer examples below.
X = [
    "Some say the world will end in fire,",
    "Some say in ice.",
]

In [4]:
# Number of documents in the corpus.
len(X)

2

In [5]:
# Bag-of-words model for text data: a CountVectorizer with default settings
# learns the corpus vocabulary when it is fitted on X.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
vect.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
# Learned mapping from each token to its integer feature index.
vect.vocabulary_

{'end': 0,
 'fire': 1,
 'ice': 2,
 'in': 3,
 'say': 4,
 'some': 5,
 'the': 6,
 'will': 7,
 'world': 8}

In [8]:
# Encode every document in X as a row of token counts over the fitted vocabulary.
X_bag_of_words = vect.transform(X)

In [9]:
# Show the matrix summary; the transform result is a sparse matrix.
X_bag_of_words

<2x9 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [10]:
# (number of documents, number of vocabulary entries)
X_bag_of_words.shape

(2, 9)

In [11]:
# Convert to a dense array so the individual counts can be inspected.
X_bag_of_words.toarray()

array([[1, 1, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 0, 0, 0]])

In [16]:
# For each document, recover the vocabulary tokens that have a non-zero count.
vect.inverse_transform(X_bag_of_words)

[array(['end', 'fire', 'in', 'say', 'some', 'the', 'will', 'world'],
       dtype='<U5'), array(['ice', 'in', 'say', 'some'], dtype='<U5')]

In [18]:
# TF-IDF (term frequency - inverse document frequency) weighting, fitted on the same corpus X.

from sklearn.feature_extraction.text import TfidfVectorizer

tf_vect = TfidfVectorizer()
tf_vect.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [21]:
# Limit the displayed precision for this printout only. np.set_printoptions()
# would change NumPy's global print state and silently alter the output of
# every later cell when the notebook is run top to bottom.
with np.printoptions(precision=2):
    print(tf_vect.transform(X).toarray())

[[0.39 0.39 0.   0.28 0.28 0.28 0.39 0.39 0.39]
 [0.   0.   0.63 0.45 0.45 0.45 0.   0.   0.  ]]


In [22]:
# N-gram features: ngram_range=(2, 2) restricts the vocabulary to word bigrams.

bgrams = CountVectorizer(ngram_range=(2,2))

bgrams.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [23]:
# Bigram-to-index mapping learned from X.
bgrams.vocabulary_

{'end in': 0,
 'in fire': 1,
 'in ice': 2,
 'say in': 3,
 'say the': 4,
 'some say': 5,
 'the world': 6,
 'will end': 7,
 'world will': 8}

In [25]:
# Dense bigram count matrix, one row per document in X.
bgrams.transform(X).toarray()

array([[1, 1, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 0, 1, 0, 0, 0]])

In [26]:
# ngram_range=(1, 2) keeps both single words and word bigrams in one vocabulary.
n_grams = CountVectorizer(ngram_range=(1,2))
n_grams.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [27]:
# Combined unigram/bigram-to-index mapping learned from X.
n_grams.vocabulary_

{'end': 0,
 'end in': 1,
 'fire': 2,
 'ice': 3,
 'in': 4,
 'in fire': 5,
 'in ice': 6,
 'say': 7,
 'say in': 8,
 'say the': 9,
 'some': 10,
 'some say': 11,
 'the': 12,
 'the world': 13,
 'will': 14,
 'will end': 15,
 'world': 16,
 'world will': 17}

In [28]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old method only on releases that
# predate the replacement. .tolist() keeps the displayed value a plain list.
if hasattr(n_grams, "get_feature_names_out"):
    ngram_feature_names = n_grams.get_feature_names_out().tolist()
else:
    ngram_feature_names = n_grams.get_feature_names()
ngram_feature_names

['end',
 'end in',
 'fire',
 'ice',
 'in',
 'in fire',
 'in ice',
 'say',
 'say in',
 'say the',
 'some',
 'some say',
 'the',
 'the world',
 'will',
 'will end',
 'world',
 'world will']

In [29]:
# Dense count matrix over the combined unigram + bigram vocabulary.
n_grams.transform(X).toarray()

array([[1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]])

In [8]:
# Character vectorizer: analyzer="char" builds n-grams from characters instead
# of words; ngram_range=(1, 2) keeps single characters and character pairs.
# X and the CountVectorizer import are repeated here, so this cell does not
# rely on the earlier cells having been run.
X = ["Some say the world will end in fire,",
     "Some say in ice."]

from sklearn.feature_extraction.text import CountVectorizer

char_vect = CountVectorizer(ngram_range=(1,2), analyzer="char")
char_vect.fit(X)

CountVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old method only on releases that
# predate the replacement. .tolist() keeps the printed value a plain list.
if hasattr(char_vect, "get_feature_names_out"):
    char_feature_names = char_vect.get_feature_names_out().tolist()
else:
    char_feature_names = char_vect.get_feature_names()
print(char_feature_names)

[' ', ' e', ' f', ' i', ' s', ' t', ' w', ',', '.', 'a', 'ay', 'c', 'ce', 'd', 'd ', 'e', 'e ', 'e,', 'e.', 'en', 'f', 'fi', 'h', 'he', 'i', 'ic', 'il', 'in', 'ir', 'l', 'l ', 'ld', 'll', 'm', 'me', 'n', 'n ', 'nd', 'o', 'om', 'or', 'r', 're', 'rl', 's', 'sa', 'so', 't', 'th', 'w', 'wi', 'wo', 'y', 'y ']


In [14]:
# The Zen of Python, used as a slightly larger corpus: one document per line.
zen = """Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!"""

# str.splitlines() already returns a list, so no identity comprehension is
# needed; it also would not produce a trailing empty document if the text
# ever ended with a newline.
lines = zen.splitlines()

In [15]:
# Show the individual documents obtained from the text.
lines

['Beautiful is better than ugly.',
 'Explicit is better than implicit.',
 'Simple is better than complex.',
 'Complex is better than complicated.',
 'Flat is better than nested.',
 'Sparse is better than dense.',
 'Readability counts.',
 "Special cases aren't special enough to break the rules.",
 'Although practicality beats purity.',
 'Errors should never pass silently.',
 'Unless explicitly silenced.',
 'In the face of ambiguity, refuse the temptation to guess.',
 'There should be one-- and preferably only one --obvious way to do it.',
 "Although that way may not be obvious at first unless you're Dutch.",
 'Now is better than never.',
 'Although never is often better than *right* now.',
 "If the implementation is hard to explain, it's a bad idea.",
 'If the implementation is easy to explain, it may be a good idea.',
 "Namespaces are one honking great idea -- let's do more of those!"]

In [16]:
# Number of documents in the Zen-of-Python corpus.
len(lines)

19

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer and transform the corpus in a single step.
# NOTE(review): this is fitted on the two-sentence corpus X, not on the `lines`
# built just above - confirm that is intended.
vect1 = TfidfVectorizer()
vect1.fit_transform(X)

<2x9 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [37]:
# Despite the name, `counts` holds the TF-IDF weights produced by vect1
# (a TfidfVectorizer), not raw token counts.
counts = vect1.transform(X).toarray()
print(counts)

[[0.39166832 0.39166832 0.         0.27867523 0.27867523 0.27867523
  0.39166832 0.39166832 0.39166832]
 [0.         0.         0.63009934 0.44832087 0.44832087 0.44832087
  0.         0.         0.        ]]


In [41]:
# Token-to-index mapping learned by the TF-IDF vectorizer.
vect1.vocabulary_

{'end': 0,
 'fire': 1,
 'ice': 2,
 'in': 3,
 'say': 4,
 'some': 5,
 'the': 6,
 'will': 7,
 'world': 8}

In [47]:
# `counts` holds TF-IDF weights from vect1 rather than raw counts, so this
# selects the column with the largest summed weight across documents.
# np.argmax returns the first index when several columns tie.
most_common = np.argmax(counts.sum(axis=0))

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old method only on releases that
# predate the replacement.
if hasattr(vect1, "get_feature_names_out"):
    tfidf_feature_names = vect1.get_feature_names_out().tolist()
else:
    tfidf_feature_names = vect1.get_feature_names()

print('most common token index : ',most_common)
print('most common token is : ',tfidf_feature_names[most_common])

most common token index :  3
most common token is :  in


In [56]:
# Fit a word-trigram model: ngram_range=(3, 3) keeps only sequences of exactly
# three tokens, learned here from the Zen-of-Python `lines`.

from sklearn.feature_extraction.text import CountVectorizer

vect2 = CountVectorizer(ngram_range=(3,3))
vect2.fit(lines)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [57]:
# Trigram-to-index mapping learned from `lines`.
vect2.vocabulary_

{'although never is': 0,
 'although practicality beats': 1,
 'although that way': 2,
 'ambiguity refuse the': 3,
 'and preferably only': 4,
 'are one honking': 5,
 'aren special enough': 6,
 'at first unless': 7,
 'be good idea': 8,
 'be obvious at': 9,
 'be one and': 10,
 'beautiful is better': 11,
 'better than complex': 12,
 'better than complicated': 13,
 'better than dense': 14,
 'better than implicit': 15,
 'better than nested': 16,
 'better than never': 17,
 'better than right': 18,
 'better than ugly': 19,
 'break the rules': 20,
 'cases aren special': 21,
 'complex is better': 22,
 'do more of': 23,
 'easy to explain': 24,
 'enough to break': 25,
 'errors should never': 26,
 'explain it bad': 27,
 'explain it may': 28,
 'explicit is better': 29,
 'face of ambiguity': 30,
 'first unless you': 31,
 'flat is better': 32,
 'great idea let': 33,
 'hard to explain': 34,
 'honking great idea': 35,
 'idea let do': 36,
 'if the implementation': 37,
 'implementation is easy': 38,
 'impl

In [61]:
# Dense trigram count matrix, one row per entry of `lines`; NumPy abbreviates
# the printout of a large array with "...".
counts1 = vect2.transform(lines).toarray()
print(counts1)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [59]:
# Same trigram transform kept in sparse form; the displayed summary reports
# the matrix shape and the number of stored (non-zero) entries.
bag_of_word = vect2.transform(lines)
bag_of_word

<19x88 sparse matrix of type '<class 'numpy.int64'>'
	with 97 stored elements in Compressed Sparse Row format>