# Module 1: Text Feature Vectorization

In [7]:
import numpy as np

# Print floats with two decimals so the TF-IDF and hashing vectors shown further
# down stay compact. The recorded float outputs in this notebook (e.g. 0.61, 0.32)
# were produced with this setting; without it a fresh kernel prints 8 decimals.
np.set_printoptions(precision=2)

In [46]:
# Inline copy of the same four sentences that the file-based cell loads from
# data/corpus.txt (uncomment to define the corpus without the file).
# Note: unlike lines returned by readlines(), these strings have no trailing "\n".
# corpus = [
# 'It was the best of times',
# 'It was the worst of times',
# 'It was the age of wisdom',
# 'It was the age age age of foolishness'
# ]

In [1]:
# Read the corpus, one document per line. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
# The encoding is explicit so the result does not depend on the platform default.
# readlines() keeps each line's trailing newline; the display loops strip it.
with open("data/corpus.txt", "r", encoding="utf-8") as corpus_file:
    corpus = corpus_file.readlines()

In [2]:
# Inspect the raw lines: every line except the last still ends with "\n".
corpus

['It was the best of times\n',
 'It was the worst of times\n',
 'It was the age of wisdom\n',
 'It was the age age age of foolishness']

## Count Vectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: learn the vocabulary and build the document-term count matrix
# in a single step.
cv = CountVectorizer()
X = cv.fit_transform(corpus)

# Equivalent two-step form:
# cv.fit(corpus)
# X = cv.transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns an ndarray, so convert it to a plain list to
# keep the familiar "['age', 'best', ...]" printout.
print(cv.get_feature_names_out().tolist())

['age', 'best', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']


In [4]:
# Densify X for display: one row per corpus line, one column per vocabulary
# term, in the order of the feature names printed by the previous cell.
X.toarray()

array([[0, 1, 0, 1, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 1, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 1, 0, 1, 1, 0],
       [3, 0, 1, 1, 1, 1, 0, 1, 0, 0]], dtype=int64)

In [5]:
# Pair every sentence with its row of X and print them together, followed by
# a blank separator line.
for sentence, counts in zip(corpus, X.toarray()):
    print(sentence.strip(), counts, "", sep="\n")

It was the best of times
[0 1 0 1 1 1 1 1 0 0]

It was the worst of times
[0 0 0 1 1 1 1 1 0 1]

It was the age of wisdom
[1 0 0 1 1 1 0 1 1 0]

It was the age age age of foolishness
[3 0 1 1 1 1 0 1 0 0]



## N-gram

In [6]:
# ngram_range=(1, 2) keeps single tokens and adds adjacent token pairs, so the
# vocabulary contains entries such as 'age' as well as 'age age'.
cv = CountVectorizer(ngram_range=(1, 2))
X = cv.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns an ndarray; .tolist() keeps the list printout.
print(cv.get_feature_names_out().tolist())

['age', 'age age', 'age of', 'best', 'best of', 'foolishness', 'it', 'it was', 'of', 'of foolishness', 'of times', 'of wisdom', 'the', 'the age', 'the best', 'the worst', 'times', 'was', 'was the', 'wisdom', 'worst', 'worst of']


In [7]:
# Walk the documents and the matching dense rows of X side by side; each pair
# is followed by an empty line.
dense_rows = X.toarray()
for line, row in zip(corpus, dense_rows):
    print(line.strip())
    print(row, end="\n\n")

It was the best of times
[0 0 0 1 1 0 1 1 1 0 1 0 1 0 1 0 1 1 1 0 0 0]

It was the worst of times
[0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1]

It was the age of wisdom
[1 0 1 0 0 0 1 1 1 0 0 1 1 1 0 0 0 1 1 1 0 0]

It was the age age age of foolishness
[3 2 1 0 0 1 1 1 1 1 0 0 1 1 0 0 0 1 1 0 0 0]



## TF-IDF Vectorizer

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same vocabulary as the plain count vectorizer, but each entry is a TF-IDF
# weight instead of a raw count.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns an ndarray; .tolist() keeps the list printout.
print(vectorizer.get_feature_names_out().tolist())

['age', 'best', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']


In [69]:
# Display each stripped sentence above its row of X, leaving an empty line
# after every pair.
for doc, weights in zip(corpus, X.toarray()):
    print(f"{doc.strip()}\n{weights}\n")

It was the best of times
[0.   0.61 0.   0.32 0.32 0.32 0.48 0.32 0.   0.  ]

It was the worst of times
[0.   0.   0.   0.32 0.32 0.32 0.48 0.32 0.   0.61]

It was the age of wisdom
[0.48 0.   0.   0.32 0.32 0.32 0.   0.32 0.61 0.  ]

It was the age age age of foolishness
[0.85 0.   0.36 0.19 0.19 0.19 0.   0.19 0.   0.  ]



## Hashing Vectorizer

In [84]:
from sklearn.feature_extraction.text import HashingVectorizer

# Width of the hashed feature space. It is deliberately tiny here so the rows
# are easy to read; distinct tokens can land in the same column.
N_HASH_FEATURES = 7

# No vocabulary is stored: tokens are mapped to column indices by hashing.
vectorizer = HashingVectorizer(n_features=N_HASH_FEATURES)
X = vectorizer.fit_transform(corpus)



In [85]:
# Print every document with its row of X, then a blank spacer line.
for entry, hashed in zip(corpus, X.toarray()):
    output_lines = [entry.strip(), str(hashed), ""]
    print("\n".join(output_lines))

It was the best of times
[ 0.   0.  -0.5 -0.5  0.5  0.5  0. ]

It was the worst of times
[ 0.  -0.5  0.  -0.5  0.5  0.5  0. ]

It was the age of wisdom1
[0.  0.5 0.5 0.  0.5 0.5 0. ]

It was the age age age of foolishness
[0.   0.95 0.   0.   0.   0.32 0.  ]

