# Module 1 Text Feature Vectorization

In [None]:
# import numpy as np
# np.set_printoptions(precision=2)

In [None]:
# corpus = [
# 'It was the best of times',
# 'It was the worst of times',
# 'It was the age of wisdom',
# 'It was the age age age of foolishness'
# ]

In [1]:
# Read the corpus, one document per line. The context manager closes the file
# handle as soon as the lines are loaded instead of leaving it open for the
# lifetime of the kernel. Trailing newlines are kept exactly as readlines()
# returns them.
with open("data/corpus.txt", "r") as corpus_file:
    corpus = corpus_file.readlines()

In [2]:
# Display the loaded lines to check what was read from the file
# (readlines() keeps each line's trailing newline).
corpus

['It was the best of times\n',
 'It was the worst of times\n',
 'It was the age of wisdom\n',
 'It was the age age age of foolishness']

## Count Vectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus and build the per-document term-count
# matrix in a single step.
cv = CountVectorizer()
X = cv.fit_transform(corpus)

# Equivalent two-step form:
# cv.fit(corpus)
# X = cv.transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so it
# is converted to a plain list to keep the printout compact.
print(cv.get_feature_names_out().tolist())

['age', 'best', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']


In [6]:
# Convert the matrix returned by fit_transform into a dense NumPy array so the
# individual term counts can be inspected (one row per document).
X.toarray()

array([[0, 1, 0, 1, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 1, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 1, 0, 1, 1, 0],
       [3, 0, 1, 1, 1, 1, 0, 1, 0, 0]], dtype=int64)

In [8]:
import pandas as pd

# Show the count matrix as a table whose columns are labelled with the
# vocabulary terms. get_feature_names_out() replaces get_feature_names(),
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,age,best,foolishness,it,of,the,times,was,wisdom,worst
0,0,1,0,1,1,1,1,1,0,0
1,0,0,0,1,1,1,1,1,0,1
2,1,0,0,1,1,1,0,1,1,0
3,3,0,1,1,1,1,0,1,0,0


In [5]:
# Print every sentence followed by its vector, leaving a blank line between
# documents.
for text, vector in zip(corpus, X.toarray()):
    print(text.strip(), vector, "", sep="\n")

It was the best of times
[0 1 0 1 1 1 1 1 0 0]

It was the worst of times
[0 0 0 1 1 1 1 1 0 1]

It was the age of wisdom
[1 0 0 1 1 1 0 1 1 0]

It was the age age age of foolishness
[3 0 1 1 1 1 0 1 0 0]



## N-gram

In [11]:
# ngram_range=(1, 2) makes the vocabulary contain both single words and
# two-word sequences.
cv = CountVectorizer(ngram_range=(1, 2))
X = cv.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (returns a NumPy array).
print(cv.get_feature_names_out().tolist())

['age', 'age age', 'age of', 'best', 'best of', 'foolishness', 'it', 'it was', 'of', 'of foolishness', 'of times', 'of wisdom', 'the', 'the age', 'the best', 'the worst', 'times', 'was', 'was the', 'wisdom', 'worst', 'worst of']


In [14]:
# Dense view of whatever X currently holds. NOTE: X is rebound by several cells
# in this notebook, so run this directly after the N-gram cell above (Restart &
# Run All) to see the n-gram counts rather than a matrix left over from
# another section.
X.toarray()

array([[0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1],
       [1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0],
       [3, 2, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0]],
      dtype=int64)

In [10]:
# Pair each sentence with its row of X and print both, separated from the next
# document by an empty line.
for text, vector in zip(corpus, X.toarray()):
    print(text.strip(), vector, "", sep="\n")

It was the best of times
[0 0 0 1 1 0 1 1 1 0 1 0 1 0 1 0 1 1 1 0 0 0]

It was the worst of times
[0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1]

It was the age of wisdom
[1 0 1 0 0 0 1 1 1 0 0 1 1 1 0 0 0 1 1 1 0 0]

It was the age age age of foolishness
[3 2 1 0 0 1 1 1 1 1 0 0 1 1 0 0 0 1 1 0 0 0]



## TF-IDF Vectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer with default settings on the corpus and transform
# the same documents in one call.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (returns a NumPy array).
print(vectorizer.get_feature_names_out().tolist())

['age', 'best', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']


In [19]:
# Dense NumPy view of the TF-IDF matrix produced by the cell above
# (one row per document, one column per vocabulary term).
X.toarray()

array([[0.        , 0.60735961, 0.        , 0.31694544, 0.31694544,
        0.31694544, 0.4788493 , 0.31694544, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.31694544, 0.31694544,
        0.31694544, 0.4788493 , 0.31694544, 0.        , 0.60735961],
       [0.4788493 , 0.        , 0.        , 0.31694544, 0.31694544,
        0.31694544, 0.        , 0.31694544, 0.60735961, 0.        ],
       [0.85328005, 0.        , 0.36075918, 0.18825911, 0.18825911,
        0.18825911, 0.        , 0.18825911, 0.        , 0.        ]])

In [20]:
import pandas as pd

# Tabulate the TF-IDF weights with one labelled column per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,age,best,foolishness,it,of,the,times,was,wisdom,worst
0,0.0,0.60736,0.0,0.316945,0.316945,0.316945,0.478849,0.316945,0.0,0.0
1,0.0,0.0,0.0,0.316945,0.316945,0.316945,0.478849,0.316945,0.0,0.60736
2,0.478849,0.0,0.0,0.316945,0.316945,0.316945,0.0,0.316945,0.60736,0.0
3,0.85328,0.0,0.360759,0.188259,0.188259,0.188259,0.0,0.188259,0.0,0.0


In [None]:
# Print each sentence with its row of X underneath, followed by a blank line.
for text, vector in zip(corpus, X.toarray()):
    print(text.strip(), vector, "", sep="\n")

## Hash Vectorizer

In [34]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash tokens into a fixed 10-column space. norm=None leaves the raw values
# unscaled and alternate_sign=False keeps every contribution positive.
vectorizer = HashingVectorizer(n_features=10, norm=None, alternate_sign=False)
X = vectorizer.fit_transform(corpus)

In [35]:
# Show each sentence and its hashed vector; the custom line ending reproduces
# the trailing space and blank line between documents.
for text, vector in zip(corpus, X.toarray()):
    print(text.strip())
    print(vector, end=" \n\n")

It was the best of times
[0. 3. 0. 0. 0. 0. 1. 0. 2. 0.] 

It was the worst of times
[0. 2. 0. 0. 1. 0. 1. 0. 2. 0.] 

It was the age of wisdom
[0. 1. 0. 0. 0. 1. 1. 0. 2. 1.] 

It was the age age age of foolishness
[1. 1. 0. 0. 0. 0. 1. 0. 2. 3.] 



In [31]:
from sklearn.feature_extraction.text import HashingVectorizer

# Same 10-column hashing without normalization, but with alternate_sign=True
# so that hashed contributions can carry a negative sign.
vectorizer = HashingVectorizer(n_features=10, norm=None, alternate_sign=True)
X = vectorizer.fit_transform(corpus)

In [33]:
# Print every sentence with its signed hashed vector, one blank line apart.
for text, vector in zip(corpus, X.toarray()):
    print(text.strip())
    print(vector, end=" \n\n")

It was the best of times
[ 0. -1.  0.  0.  0.  0.  1.  0.  0.  0.] 

It was the worst of times
[ 0.  0.  0.  0. -1.  0.  1.  0.  0.  0.] 

It was the age of wisdom
[ 0.  1.  0.  0.  0. -1.  1.  0.  0.  1.] 

It was the age age age of foolishness
[-1.  1.  0.  0.  0.  0.  1.  0.  0.  3.] 



In [36]:
from sklearn.feature_extraction.text import HashingVectorizer

# Signed 10-column hashing with L1 normalization: each row is scaled so that
# the absolute values of its entries sum to 1.
vectorizer = HashingVectorizer(n_features=10, norm='l1', alternate_sign=True)
X = vectorizer.fit_transform(corpus)

In [38]:
# Print every sentence with its L1-normalized hashed vector, one blank line
# apart.
for text, vector in zip(corpus, X.toarray()):
    print(text.strip())
    print(vector, end=" \n\n")

It was the best of times
[ 0.  -0.5  0.   0.   0.   0.   0.5  0.   0.   0. ] 

It was the worst of times
[ 0.   0.   0.   0.  -0.5  0.   0.5  0.   0.   0. ] 

It was the age of wisdom
[ 0.    0.25  0.    0.    0.   -0.25  0.25  0.    0.    0.25] 

It was the age age age of foolishness
[-0.16666667  0.16666667  0.          0.          0.          0.
  0.16666667  0.          0.          0.5       ] 



In [39]:
from sklearn.feature_extraction.text import HashingVectorizer

# Signed 10-column hashing with L2 normalization: each row is scaled to unit
# Euclidean length.
vectorizer = HashingVectorizer(n_features=10, norm='l2', alternate_sign=True)
X = vectorizer.fit_transform(corpus)

In [41]:
# Print every sentence with its L2-normalized hashed vector, one blank line
# apart.
for text, vector in zip(corpus, X.toarray()):
    print(text.strip())
    print(vector, end=" \n\n")

It was the best of times
[ 0.         -0.70710678  0.          0.          0.          0.
  0.70710678  0.          0.          0.        ] 

It was the worst of times
[ 0.          0.          0.          0.         -0.70710678  0.
  0.70710678  0.          0.          0.        ] 

It was the age of wisdom
[ 0.   0.5  0.   0.   0.  -0.5  0.5  0.   0.   0.5] 

It was the age age age of foolishness
[-0.28867513  0.28867513  0.          0.          0.          0.
  0.28867513  0.          0.          0.8660254 ] 

