In [2]:
 
import pandas as pd
 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
 
# Deliberately tiny corpus: it exists only to illustrate how CountVectorizer
# and TfidfTransformer are used, and is far too small for any real analysis.
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]
 

In [5]:

# Learn the vocabulary of the corpus and count term occurrences in one step.
# The result is a sparse document-term matrix with one row per document.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(docs)

In [37]:
# Shape of the count matrix produced by cv.fit_transform(docs):
# (number of documents, number of distinct vocabulary terms).
word_count_vector.shape

(5, 16)

In [39]:
# Peek at the first ten vocabulary terms (alphabetical, matching column order).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. It returns an ndarray, so convert the slice to a list to
# display it as a plain Python list.
cv.get_feature_names_out()[:10].tolist()

['ate',
 'away',
 'cat',
 'end',
 'finally',
 'from',
 'had',
 'house',
 'little',
 'mouse']

In [9]:
# Learn the inverse-document-frequency (IDF) weights from the raw term counts.
# use_idf=True enables IDF re-weighting; smooth_idf=True adds one to every
# document frequency so that no term can cause a division by zero.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [11]:
# Tabulate the learned IDF weight of every vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the terms in the same (column) order as idf_.
# NOTE: the column holds pure IDF values (tfidf_transformer.idf_), not tf-idf
# scores; the existing column label is kept so other references still resolve.
df_idf = pd.DataFrame(
    tfidf_transformer.idf_,
    index=cv.get_feature_names_out(),
    columns=["tf_idf_weights"],
)

# Sort ascending: terms that occur in the most documents (lowest IDF) come first.
df_idf.sort_values(by=["tf_idf_weights"])

Unnamed: 0,tf_idf_weights
mouse,1.0
the,1.0
cat,1.693147
house,1.693147
ate,2.098612
away,2.098612
end,2.098612
finally,2.098612
from,2.098612
had,2.098612


In [13]:
# Term counts for the documents, using the vocabulary already learned by `cv`.
count_vector = cv.transform(docs)

# Re-weight those counts with the fitted IDF values to obtain tf-idf scores.
tf_idf_vector = tfidf_transformer.transform(count_vector)

In [15]:
# Vocabulary terms in the column order of the tf-idf matrix.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
feature_names = cv.get_feature_names_out()

# Row 0 of the tf-idf matrix is the sparse vector of the first document.
first_document_vector = tf_idf_vector[0]

# Show the scores as one column, highest first. toarray() yields a regular
# ndarray, whereas todense() would return the discouraged np.matrix type.
df = pd.DataFrame(
    first_document_vector.T.toarray(),
    index=feature_names,
    columns=["tfidf"],
)
df.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0


# -------------------------------

In [26]:
### A bag-of-words model can be built with the CountVectorizer class.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet',
])

# No linguistic analysis happens here: the vectorizer simply pulls word tokens
# out of each string (lower-cased, as the printed vocabulary shows) and counts
# how often each one occurs per document.
count = CountVectorizer()
bag = count.fit_transform(docs)

# Vocabulary contents: each key is a term, each value is its column index.
print(count.vocabulary_)
# Per-document occurrence count of every vocabulary term.
print(bag.toarray())

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [29]:
### tf-idf
np.set_printoptions(precision=2)
from sklearn.feature_extraction.text import TfidfTransformer

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps `words` a plain list so it prints as before.
words = count.get_feature_names_out().tolist()
print(words)

tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
# Reuse the counts already held in `bag` instead of re-fitting the vectorizer
# on the same documents; the resulting matrix is identical.
tdm2 = tfidf.fit_transform(bag).toarray()
# Dense matrix of tf-idf scores: one row per document, one column per term.
print(tdm2)

['and', 'is', 'shining', 'sun', 'sweet', 'the', 'weather']
[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


In [30]:
### tf-idf
np.set_printoptions(precision=2)
from sklearn.feature_extraction.text import TfidfTransformer

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps `words` a plain list so it prints as before.
words = count.get_feature_names_out().tolist()
print(words)

tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
tdm2 = tfidf.fit_transform(bag)
# Matrix of tf-idf scores. Print the result computed above rather than
# fitting the vectorizer and the transformer a second time.
print(tdm2.toarray())

# List the non-zero tf-idf scores of the first document, ordered by column
# index (which is alphabetical term order).
for i, n in sorted(zip(tdm2[0].indices, tdm2[0].data)):
    print(words[i], n)


['and', 'is', 'shining', 'sun', 'sweet', 'the', 'weather']
[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]
is 0.4337078595086741
shining 0.5584778353707552
sun 0.5584778353707552
the 0.4337078595086741


# -----------------------------

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA

train_set = ["The sky is blue.", "The sun is bright."]  # documents
test_set = ["The sun in the sky is bright."]  # query
stopWords = stopwords.words('english')

vectorizer = CountVectorizer(stop_words=stopWords)
transformer = TfidfTransformer()

# The vocabulary is learned from the training documents only; the query is
# then mapped onto that same vocabulary.
trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
testVectorizerArray = vectorizer.transform(test_set).toarray()
print('Fit Vectorizer to train set', trainVectorizerArray)
print('Transform Vectorizer to test set', testVectorizerArray)

# Learn the IDF weights from the training counts.
transformer.fit(trainVectorizerArray)
print(transformer.transform(trainVectorizerArray).toarray())

# Weight the query with the IDF learned from the training set. Re-fitting the
# transformer on the query would discard those weights and derive IDF from the
# query alone, so the transformer is deliberately NOT fitted again here.
tfidf = transformer.transform(testVectorizerArray)
# toarray() yields a regular ndarray; todense() would return np.matrix.
print(tfidf.toarray())

Fit Vectorizer to train set [[1 0 1 0]
 [0 1 0 1]]
Transform Vectorizer to test set [[0 1 1 1]]
[[0.71 0.   0.71 0.  ]
 [0.   0.71 0.   0.71]]
[[0.   0.58 0.58 0.58]]


FileNotFoundError: [Errno 2] No such file or directory: 'the house had a tiny little mouse'