## Use TfidfTransformer and TfidfVectorizer in scikit-learn

sklearn中的这两个类（TfidfTransformer 与 TfidfVectorizer）颇为相似，容易混淆，本文试着解释二者的区别与用法。

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus: five short documents that all mention a mouse.
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

要使用TfidfTransformer，需要先通过CountVectorizer得到词频矩阵：后者负责分词并统计每个词在各文档中出现的次数（即TF），还可以限制词表（lexicon）大小、去掉停用词等等。

In [3]:
# Learn the vocabulary from `docs` and count how often each term occurs in
# each document (one row per document, one column per vocabulary term).
cv = CountVectorizer()
word_counts = cv.fit_transform(docs)
# Convert the result to a dense array purely so the counts can be displayed.
word_counts.toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 0],
       [0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 2, 0],
       [1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 2, 0]])

In [4]:
# Show the learned vocabulary; entry i names column i of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out(). It returns an ndarray, which is converted
# to a plain list to keep the displayed output a list of strings.
cv.get_feature_names_out().tolist()

['ate',
 'away',
 'cat',
 'end',
 'finally',
 'from',
 'had',
 'house',
 'little',
 'mouse',
 'of',
 'ran',
 'saw',
 'story',
 'the',
 'tiny']

这确实就是想要的TF结果，接下来计算IDF。

In [5]:
# Fit the IDF weights on the term-count matrix.
# With smooth_idf=True the weight of a term is ln((1 + n) / (1 + df)) + 1,
# where n is the number of documents and df the number of documents that
# contain the term; a term present in every document therefore gets 1.0.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_counts)

# Tabulate one IDF weight per vocabulary term, lowest (most common) first.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and can be used directly as a DataFrame index.
df_idf = pd.DataFrame(
    tfidf_transformer.idf_,
    index=cv.get_feature_names_out(),
    columns=['idf_weights'],
)
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
mouse,1.0
the,1.0
cat,1.693147
house,1.693147
ate,2.098612
away,2.098612
end,2.098612
finally,2.098612
from,2.098612
had,2.098612


现在可以计算TF-IDF了，注意：现在使用两个transform方法：

In [6]:
# `docs` is reused here only as an example; in practice the same two
# transform calls can be applied to new, unseen documents.
# Step 1: map documents to term counts using the already-fitted vocabulary.
count_vector = cv.transform(docs)
# Step 2: turn the term counts into TF-IDF scores with the fitted transformer.
tfidf_vector = tfidf_transformer.transform(count_vector)

In [7]:
# Take the TF-IDF row of the first document (still a sparse 1 x n_terms row).
first_doc_vec = tfidf_vector[0]

# Transpose to a column so each vocabulary term becomes one DataFrame row.
# .toarray() yields a plain ndarray (rather than the np.matrix returned by
# .todense()), and get_feature_names_out() replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
df = pd.DataFrame(
    first_doc_vec.T.toarray(),
    index=cv.get_feature_names_out(),
    columns=['tfidf'],
)
# Highest-scoring terms first.
df.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0


## 使用TfidfVectorizer - All at once

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer performs counting and IDF weighting in one fit_transform call.
# NOTE: norm=None disables row normalization, so the scores shown for this
# section are raw tf * idf values (e.g. 2.098612 for "had") and do not match
# the scores from the TfidfTransformer section (e.g. 0.493562), where the
# transformer was created without a norm argument and therefore used its
# default normalization (l2 per the scikit-learn documentation).
tfidf_vectorizer = TfidfVectorizer(use_idf=True, norm=None)
tfidf_vectorizer_vecs = tfidf_vectorizer.fit_transform(docs)

In [11]:
# Take the TF-IDF row of the first document from the one-step vectorizer.
first_doc_vec = tfidf_vectorizer_vecs[0]

# Transpose to a column so each vocabulary term becomes one DataFrame row.
# .toarray() yields a plain ndarray (rather than the np.matrix returned by
# .todense()), and get_feature_names_out() replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
df = pd.DataFrame(
    first_doc_vec.T.toarray(),
    index=tfidf_vectorizer.get_feature_names_out(),
    columns=['tfidf'],
)
# Highest-scoring terms first.
df.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
had,2.098612
little,2.098612
tiny,2.098612
house,1.693147
mouse,1.0
the,1.0
ate,0.0
away,0.0
cat,0.0
end,0.0
