## Machine Learning :: Text feature extraction (tf-idf) – Part I

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [1]:
# Training corpus: the documents the vectorizer is fitted on.
train_set = (
    "The sky is blue.",
    "The sun is bright.",
)

# Test corpus: the documents that are later encoded with the fitted vectorizer.
test_set = (
    "The sun in the sky is bright.",
    "We can see the shining sun, the bright sun.",
)


#### CountVectorizer
- 문서를 토큰 리스트로 변환한다.
- 각 문서에서 토큰의 출현 빈도를 센다.
- 각 문서를 BOW 인코딩 벡터로 변환한다.

In [4]:
# Build a bag-of-words vectorizer with default settings. As the printed parameters
# show, the defaults lowercase the text (lowercase=True) and extract word tokens of
# two or more characters (token_pattern); accent stripping and stop-word filtering
# are NOT applied unless strip_accents / stop_words are set explicitly.
vectorizer = CountVectorizer()
print(vectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [8]:
# Learn the vocabulary (token -> column index) from the training documents.
# fit() is used instead of fit_transform() because the document-term matrix
# that fit_transform() returns was never used in this cell.
vectorizer.fit(train_set)
print(vectorizer.vocabulary_)

{'the': 5, 'sky': 3, 'is': 2, 'blue': 0, 'sun': 4, 'bright': 1}


In [10]:
# Encode the test documents with the vocabulary learned from the training set.
smatrix = vectorizer.transform(test_set)
# Sparse listing: each line is (document index, vocabulary index) followed by how
# many times that vocabulary term occurs in that document.
print(smatrix)

  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	2
  (1, 1)	1
  (1, 4)	2
  (1, 5)	2


In [11]:
# Dense view of the counts: 2 x 6 (2 rows = number of test documents,
# 6 columns = number of vocabulary features).
smatrix.todense()

matrix([[0, 1, 1, 1, 1, 2],
        [0, 1, 0, 0, 2, 2]])

## Machine Learning :: Text feature extraction (tf-idf) – Part II

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [15]:
# Documents used to learn the vocabulary.
train_set = (
    "The sky is blue.",
    "The sun is bright.",
)

# Documents that are counted and TF-IDF weighted further down.
test_set = (
    "The sun in the sky is bright.",
    "We can see the shining sun, the bright sun.",
)


In [17]:
# Learn the vocabulary (token -> column index) from the training documents.
# fit() is sufficient here: the matrix fit_transform() would return is not used.
count_vectorizer = CountVectorizer()
count_vectorizer.fit(train_set)
# Print the vocabulary of the vectorizer fitted in this cell. The original printed
# `vectorizer.vocabulary_`, a variable from Part I, which fails on a fresh kernel
# when only Part II is run and would silently show a stale vocabulary otherwise.
print(count_vectorizer.vocabulary_)

{'the': 5, 'sky': 3, 'is': 2, 'blue': 0, 'sun': 4, 'bright': 1}


In [18]:
# Term-frequency counts of the test documents, using the vocabulary learned from train_set.
freq_term_matrix = count_vectorizer.transform(test_set)

In [19]:
# Show the counts densely: one row per test document, one column per vocabulary index.
print(freq_term_matrix.todense())

[[0 1 1 1 1 2]
 [0 1 0 0 2 2]]


In [26]:
# TF-IDF weighting with L2 normalization of each document row.
tfidf = TfidfTransformer(norm="l2")
# NOTE(review): the IDF weights are fitted on freq_term_matrix, which was built from
# test_set rather than train_set — confirm this is intended for the demonstration.
tfidf.fit(freq_term_matrix)
# One IDF weight per vocabulary index; terms found in fewer documents get larger weights.
print(tfidf.idf_)

[2.09861229 1.         1.40546511 1.40546511 1.         1.        ]


In [28]:
# Apply the fitted IDF weights (and L2 row normalization) to the test-set counts.
tf_idf_matrix = tfidf.transform(freq_term_matrix)
# Dense view: one row per test document, one column per vocabulary index.
print(tf_idf_matrix.todense())

[[0.         0.31701073 0.44554752 0.44554752 0.31701073 0.63402146]
 [0.         0.33333333 0.         0.         0.66666667 0.66666667]]
